Skip to content

Commit 66f6785

Browse files
committedOct 7, 2021
initial commit
1 parent 48122f9 commit 66f6785

File tree

3 files changed

+1477
-0
lines changed

3 files changed

+1477
-0
lines changed
 

‎GetData.ipynb

+478
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,478 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"# Import libraries\n",
10+
"from nba_api.stats.static import teams\n",
11+
"from nba_api.stats.endpoints import leaguegamefinder, playbyplayv2\n",
12+
"import pandas as pd\n",
13+
"from tqdm import tqdm\n",
14+
"import datetime\n",
15+
"import time\n",
16+
"import numpy as np\n",
17+
"import itertools\n",
18+
"import sqlalchemy\n",
19+
"\n",
20+
"# connect to a local postgres database\n",
21+
"engine = sqlalchemy.create_engine('postgresql://postgres:password@localhost:5432/NBA')"
22+
]
23+
},
24+
{
25+
"cell_type": "markdown",
26+
"metadata": {},
27+
"source": [
28+
"## Get raw dataset"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": 2,
34+
"metadata": {},
35+
"outputs": [],
36+
"source": [
37+
"# Get teams\n",
38+
"nba_teams = teams.get_teams()"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": 3,
44+
"metadata": {},
45+
"outputs": [],
46+
"source": [
47+
"# Get team ids\n",
48+
"team_id = [team['id'] for team in nba_teams]"
49+
]
50+
},
51+
{
52+
"cell_type": "code",
53+
"execution_count": 4,
54+
"metadata": {},
55+
"outputs": [
56+
{
57+
"name": "stderr",
58+
"output_type": "stream",
59+
"text": [
60+
"100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:50<00:00, 1.68s/it]\n"
61+
]
62+
}
63+
],
64+
"source": [
65+
"# Get all games by team id\n",
66+
"df = pd.DataFrame()\n",
67+
"for team in tqdm(team_id):\n",
68+
" gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team)\n",
69+
" df = pd.concat([df,gamefinder.get_data_frames()[0]])"
70+
]
71+
},
72+
{
73+
"cell_type": "markdown",
74+
"metadata": {},
75+
"source": [
76+
"## Clean data and convert to training data"
77+
]
78+
},
79+
{
80+
"cell_type": "code",
81+
"execution_count": 5,
82+
"metadata": {
83+
"scrolled": true
84+
},
85+
"outputs": [],
86+
"source": [
87+
"# Get the counts of each game id\n",
88+
"df_games = df['GAME_ID'].value_counts().reset_index()"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": 6,
94+
"metadata": {},
95+
"outputs": [],
96+
"source": [
97+
"# Get game ids that show up twice\n",
98+
"games_id = df_games[df_games['GAME_ID']==2]['index']"
99+
]
100+
},
101+
{
102+
"cell_type": "code",
103+
"execution_count": 7,
104+
"metadata": {},
105+
"outputs": [],
106+
"source": [
107+
"# Filter for those game ids\n",
108+
"df = df[df['GAME_ID'].isin(games_id)]"
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": 8,
114+
"metadata": {},
115+
"outputs": [],
116+
"source": [
117+
"# Get preseason game ids\n",
118+
"pregames_id = df[df['SEASON_ID'].str.contains('1\\d{4}')].sort_values('GAME_DATE')['GAME_ID'].unique()"
119+
]
120+
},
121+
{
122+
"cell_type": "code",
123+
"execution_count": 9,
124+
"metadata": {},
125+
"outputs": [],
126+
"source": [
127+
"# Filter out preseason games\n",
128+
"df = df[~df['GAME_ID'].isin(pregames_id)]"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 10,
134+
"metadata": {},
135+
"outputs": [
136+
{
137+
"name": "stderr",
138+
"output_type": "stream",
139+
"text": [
140+
"100%|████████████████████████████████████████████████████████████████████████████| 45919/45919 [30:04<00:00, 25.44it/s]\n"
141+
]
142+
}
143+
],
144+
"source": [
145+
"# Rename the opponent (home) team's stat columns with an _oppos suffix\n",
146+
"c = df.columns.tolist()[9:-1]\n",
147+
"l = []\n",
148+
"for i in range(len(c)):\n",
149+
" l.append(c[i] + '_oppos')\n",
150+
"\n",
151+
"# away games\n",
152+
"away = df[df['MATCHUP'].str.contains('@')].reset_index(drop=True)\n",
153+
"\n",
154+
"# home games\n",
155+
"home = df[df['MATCHUP'].str.contains('vs.')]\n",
156+
"\n",
157+
"# convert a home team row and an away team row to 1 away and home team row\n",
158+
"new = pd.DataFrame()\n",
159+
"for index in tqdm(range(away.shape[0])):\n",
160+
" tmp = home[home['GAME_ID']==away.iloc[index,:]['GAME_ID']].iloc[:,9:-1].reset_index(drop=True)\n",
161+
" tmp.columns = l\n",
162+
" new = pd.concat([new, pd.concat([away.iloc[index:index+1,:].reset_index(drop=True), tmp],axis=1)])"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": null,
168+
"metadata": {},
169+
"outputs": [],
170+
"source": [
171+
"# Save the data to a database\n",
172+
"new.to_sql('raw',con=engine,if_exists='append',index=False)"
173+
]
174+
},
175+
{
176+
"cell_type": "code",
177+
"execution_count": 52,
178+
"metadata": {},
179+
"outputs": [],
180+
"source": [
181+
"# Can restart notebook here to save memory\n",
182+
"# Read in the data\n",
183+
"all_games = pd.read_sql_query(\"select * from raw\", con=engine)"
184+
]
185+
},
186+
{
187+
"cell_type": "code",
188+
"execution_count": 53,
189+
"metadata": {},
190+
"outputs": [],
191+
"source": [
192+
"# Convert game data to datetime type\n",
193+
"all_games['date'] = pd.to_datetime(all_games['game_date'])"
194+
]
195+
},
196+
{
197+
"cell_type": "code",
198+
"execution_count": 54,
199+
"metadata": {},
200+
"outputs": [],
201+
"source": [
202+
"# function to convert date to the number of days after the first game\n",
203+
"def convert_days(date):\n",
204+
" d = pd.Timestamp(1983,10,28)\n",
205+
" return (date - d).days"
206+
]
207+
},
208+
{
209+
"cell_type": "code",
210+
"execution_count": 55,
211+
"metadata": {},
212+
"outputs": [],
213+
"source": [
214+
"# convert date to number of days\n",
215+
"all_games['days'] = all_games['date'].apply(convert_days)"
216+
]
217+
},
218+
{
219+
"cell_type": "code",
220+
"execution_count": 56,
221+
"metadata": {},
222+
"outputs": [],
223+
"source": [
224+
"# compute score difference (score total computation is commented out below)\n",
225+
"all_games['diff'] = all_games['pts'] - all_games['pts_oppos']\n",
226+
"#all_games['DIFF'] = all_games['PTS'] + all_games['PTS_oppos']"
227+
]
228+
},
229+
{
230+
"cell_type": "code",
231+
"execution_count": 57,
232+
"metadata": {},
233+
"outputs": [],
234+
"source": [
235+
"# get home team abbreviation\n",
236+
"all_games['hometeam'] = all_games['matchup'].str.extract(r'\\w* @ (\\w*)')"
237+
]
238+
},
239+
{
240+
"cell_type": "code",
241+
"execution_count": 58,
242+
"metadata": {},
243+
"outputs": [],
244+
"source": [
245+
"# treat play-in games like regular season\n",
246+
"all_games['season_id'] = all_games['season_id'].replace('52019','22019')"
247+
]
248+
},
249+
{
250+
"cell_type": "code",
251+
"execution_count": 59,
252+
"metadata": {},
253+
"outputs": [],
254+
"source": [
255+
"# Convert to training data format\n",
256+
"d = {'game_id':[], 'diff':[], 'opposite':[], 'daysdiff':[]}\n",
257+
"for away, home in itertools.product(all_games['team_abbreviation'].unique(), all_games['team_abbreviation'].unique()):\n",
258+
" if away != home:\n",
259+
" one = all_games[(all_games['team_abbreviation']==away) & (all_games['hometeam'] == home)].reset_index(drop=True)\n",
260+
" two = all_games[(all_games['team_abbreviation']==home) & (all_games['hometeam'] == away)].reset_index(drop=True)\n",
261+
" tmp = pd.concat([one,two]).sort_values('days')\n",
262+
" if tmp.shape[0] != 0:\n",
263+
" for i in range (tmp.shape[0]-1):\n",
264+
" d['game_id'].append(tmp.iloc[i,:]['game_id'])\n",
265+
" d['diff'].append(tmp.iloc[i+1,:]['diff'])\n",
266+
" if tmp.iloc[i,:]['hometeam'] != tmp.iloc[i+1,:]['hometeam']:\n",
267+
" d['opposite'].append(1)\n",
268+
" else:\n",
269+
" d['opposite'].append(0)\n",
270+
" d['daysdiff'].append(tmp.iloc[i+1,:]['days']-tmp.iloc[i,:]['days'])"
271+
]
272+
},
273+
{
274+
"cell_type": "code",
275+
"execution_count": 60,
276+
"metadata": {
277+
"scrolled": false
278+
},
279+
"outputs": [],
280+
"source": [
281+
"# merge back to get the games we want\n",
282+
"work = pd.DataFrame(d).merge(all_games, left_on=['game_id'], right_on=[\"game_id\"]).drop_duplicates()"
283+
]
284+
},
285+
{
286+
"cell_type": "code",
287+
"execution_count": 62,
288+
"metadata": {},
289+
"outputs": [],
290+
"source": [
291+
"# Rename columns\n",
292+
"work['1diff'] = work['diff_y']\n",
293+
"work['2diff'] = work['diff_x']\n",
294+
"work.drop(columns=['diff_y','diff_x'],inplace=True)"
295+
]
296+
},
297+
{
298+
"cell_type": "code",
299+
"execution_count": null,
300+
"metadata": {},
301+
"outputs": [],
302+
"source": [
303+
"# Drop column\n",
304+
"work.drop(columns=['PLUS_MINUS'],inplace=True)"
305+
]
306+
},
307+
{
308+
"cell_type": "code",
309+
"execution_count": null,
310+
"metadata": {},
311+
"outputs": [],
312+
"source": [
313+
"# Replace null averages with 0\n",
314+
"work['FG3_PCT'] = work['FG3_PCT'].fillna(0)"
315+
]
316+
},
317+
{
318+
"cell_type": "code",
319+
"execution_count": 63,
320+
"metadata": {},
321+
"outputs": [],
322+
"source": [
323+
"work['awayteam'] = work['team_abbreviation']"
324+
]
325+
},
326+
{
327+
"cell_type": "code",
328+
"execution_count": 64,
329+
"metadata": {},
330+
"outputs": [],
331+
"source": [
332+
"# Drop column\n",
333+
"work.drop(columns=['date'],inplace=True)"
334+
]
335+
},
336+
{
337+
"cell_type": "code",
338+
"execution_count": 42,
339+
"metadata": {},
340+
"outputs": [],
341+
"source": [
342+
"# save to database\n",
343+
"work.to_sql('train_total',engine, index=False)"
344+
]
345+
},
346+
{
347+
"cell_type": "code",
348+
"execution_count": 52,
349+
"metadata": {},
350+
"outputs": [],
351+
"source": [
352+
"# make column names lowercase\n",
353+
"work.columns = [item.lower() for item in work.columns]"
354+
]
355+
},
356+
{
357+
"cell_type": "code",
358+
"execution_count": 65,
359+
"metadata": {},
360+
"outputs": [],
361+
"source": [
362+
"# save to database\n",
363+
"work.to_sql('train',engine, index=False)"
364+
]
365+
},
366+
{
367+
"cell_type": "markdown",
368+
"metadata": {},
369+
"source": [
370+
"## Get Play-by-Play Data"
371+
]
372+
},
373+
{
374+
"cell_type": "code",
375+
"execution_count": 3,
376+
"metadata": {},
377+
"outputs": [],
378+
"source": [
379+
"# get game ids\n",
380+
"games = pd.read_sql_query(\"select distinct game_id from raw where season_id >= '21996' and season_id <= '22025'\", con=engine)['game_id']"
381+
]
382+
},
383+
{
384+
"cell_type": "code",
385+
"execution_count": 4,
386+
"metadata": {},
387+
"outputs": [],
388+
"source": [
389+
"# function to convert score string to score difference\n",
390+
"def calc_scorediff(x):\n",
391+
" if x != None:\n",
392+
" return eval(x)"
393+
]
394+
},
395+
{
396+
"cell_type": "code",
397+
"execution_count": null,
398+
"metadata": {
399+
"scrolled": true
400+
},
401+
"outputs": [],
402+
"source": [
403+
"# Get play-by-play time series data for each game\n",
404+
"games = np.setdiff1d(games, pd.read_sql_query(\"select distinct game_id from playbyplay\", con=engine))\n",
405+
"for game in tqdm(games):\n",
406+
" play = playbyplayv2.PlayByPlayV2(game).get_data_frames()[0]\n",
407+
" #print(play)\n",
408+
" if play.shape[0] != 0:\n",
409+
" score = play['SCORE'].apply(calc_scorediff).fillna(method='ffill').fillna(0)\n",
410+
"\n",
411+
" # Record the last index of 0:00 of each quarter\n",
412+
" zeroindex = 0\n",
413+
" ends = []\n",
414+
" l = play['PCTIMESTRING'].to_list()\n",
415+
" for i in range (len(l)):\n",
416+
" if l[i] == '0:00':\n",
417+
" zeroindex = i\n",
418+
" else:\n",
419+
" if zeroindex != 0:\n",
420+
" ends.append(zeroindex)\n",
421+
" zeroindex=0\n",
422+
" ends.append(i)\n",
423+
" if len(ends) < 4:\n",
424+
" print(game, ends)\n",
425+
" continue\n",
426+
" \n",
427+
" # 4 quarters\n",
428+
" q1 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=36)) for item in l[:ends[0]+1]]\n",
429+
" q2 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=24)) for item in l[ends[0]+1:ends[1]+1]]\n",
430+
" q3 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=12)) for item in l[ends[1]+1:ends[2]+1]]\n",
431+
" q4 = [datetime.datetime.strptime(item, \"%M:%S\") for item in l[ends[2]+1:ends[3]+1]]\n",
432+
" times = [(datetime.datetime(1900,1,1,0,48) - event).total_seconds() for event in q1+q2+q3+q4]\n",
433+
"\n",
434+
" timedf = pd.DataFrame([times,score]).transpose()\n",
435+
" tdata = []\n",
436+
" try:\n",
437+
" for i in range (360):\n",
438+
" tdata.append(timedf[timedf[0] <= 8*(i+1)].iloc[-1][1])\n",
439+
" except IndexError:\n",
440+
" continue\n",
441+
" tdf = pd.DataFrame(tdata).transpose()\n",
442+
" tdf.columns = [\"t\"+str(col) for col in tdf.columns]\n",
443+
" tdf.insert(0,'game_id',game)\n",
444+
" \n",
445+
" # overtime indicator\n",
446+
" if len(ends) > 4:\n",
447+
" tdf.insert(1,'overtime', 1)\n",
448+
" else:\n",
449+
" tdf.insert(1,'overtime', 0)\n",
450+
" \n",
451+
" # save to database\n",
452+
" tdf.to_sql('playbyplay', con=engine, if_exists='append', index=False)\n",
453+
" time.sleep(1)"
454+
]
455+
}
456+
],
457+
"metadata": {
458+
"kernelspec": {
459+
"display_name": "Python 3",
460+
"language": "python",
461+
"name": "python3"
462+
},
463+
"language_info": {
464+
"codemirror_mode": {
465+
"name": "ipython",
466+
"version": 3
467+
},
468+
"file_extension": ".py",
469+
"mimetype": "text/x-python",
470+
"name": "python",
471+
"nbconvert_exporter": "python",
472+
"pygments_lexer": "ipython3",
473+
"version": "3.7.11"
474+
}
475+
},
476+
"nbformat": 4,
477+
"nbformat_minor": 4
478+
}

‎Models.ipynb

+598
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,598 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"# Import libraries\n",
10+
"import nba_api.stats.endpoints\n",
11+
"from nba_api.stats.endpoints import leaguegamefinder, playbyplayv2\n",
12+
"from nba_api.stats.static import teams\n",
13+
"import pandas as pd\n",
14+
"from tqdm import tqdm\n",
15+
"import time\n",
16+
"import datetime\n",
17+
"import numpy as np\n",
18+
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
19+
"from sklearn.naive_bayes import MultinomialNB\n",
20+
"from sklearn.neighbors import KNeighborsClassifier\n",
21+
"from sklearn.cluster import KMeans\n",
22+
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
23+
"from sklearn.model_selection import train_test_split\n",
24+
"from sklearn.preprocessing import normalize, MinMaxScaler, StandardScaler\n",
25+
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
26+
"from imblearn.over_sampling import RandomOverSampler\n",
27+
"import requests, sqlalchemy\n",
28+
"from bs4 import BeautifulSoup\n",
29+
"import itertools\n",
30+
"\n",
31+
"# connect to postgres database\n",
32+
"engine = sqlalchemy.create_engine('postgresql://postgres:password@localhost:5432/NBA')\n",
33+
"\n",
34+
"pd.set_option('display.max_rows', 500)\n",
35+
"pd.set_option('display.max_columns', 500)\n",
36+
"pd.set_option('display.width', 1000)"
37+
]
38+
},
39+
{
40+
"cell_type": "markdown",
41+
"metadata": {},
42+
"source": [
43+
"# Score difference"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": 3,
49+
"metadata": {},
50+
"outputs": [],
51+
"source": [
52+
"# read training data from database\n",
53+
"work = pd.read_sql_query(\"select * from train\", con=engine)\n",
54+
"\n",
55+
"# playoffs dummy variable\n",
56+
"work['playoff'] = work['season_id'].str.extract(r'(\\d)\\d{4}').astype(int)\n",
57+
"work['playoff'] = work['playoff'].replace(2,0)\n",
58+
"work['playoff'] = work['playoff'].replace(4,1)\n",
59+
"work['season_id'] = work['season_id'].str.replace('^\\d','2')\n",
60+
"\n",
61+
"# Get season after 1995\n",
62+
"work = work[work['season_id'] >= '21996']"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": 7,
68+
"metadata": {},
69+
"outputs": [],
70+
"source": [
71+
"# drop columns we don't need\n",
72+
"train = work.drop(columns=['team_id','team_abbreviation','game_date','matchup','min','days'])\n",
73+
"\n",
74+
"# compute difference in stats\n",
75+
"train['blk_diff'] = train['blk'] - train['blk_oppos']\n",
76+
"train['oreb_diff'] = train['oreb'] - train['oreb_oppos']\n",
77+
"train['reb_diff'] = train['reb'] - train['reb_oppos']\n",
78+
"train['ast_diff'] = train['ast'] - train['ast_oppos']\n",
79+
"train['stl_diff'] = train['stl'] - train['stl_oppos']\n",
80+
"train['tov_diff'] = train['tov'] - train['tov_oppos']\n",
81+
"train['pf_diff'] = train['pf'] - train['pf_oppos']"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 8,
87+
"metadata": {},
88+
"outputs": [],
89+
"source": [
90+
"# read in the play-by-play data from database\n",
91+
"plays = pd.read_sql_query(\"select * from playbyplay\", con=engine)"
92+
]
93+
},
94+
{
95+
"cell_type": "code",
96+
"execution_count": 9,
97+
"metadata": {},
98+
"outputs": [],
99+
"source": [
100+
"# Cluster the play-by-play data \n",
101+
"kmeans = KMeans(6, random_state=0).fit(plays.drop(columns=['game_id']))\n",
102+
"\n",
103+
"# new feature as cluster\n",
104+
"plays['clusters'] = kmeans.labels_"
105+
]
106+
},
107+
{
108+
"cell_type": "code",
109+
"execution_count": 10,
110+
"metadata": {},
111+
"outputs": [],
112+
"source": [
113+
"# merge to training data\n",
114+
"train = train.merge(plays, left_on=['game_id'], right_on=['game_id'])"
115+
]
116+
},
117+
{
118+
"cell_type": "code",
119+
"execution_count": null,
120+
"metadata": {},
121+
"outputs": [],
122+
"source": [
123+
"# Get dummy variables\n",
124+
"final = pd.concat([\n",
125+
" pd.get_dummies(train['season_id']), \n",
126+
" pd.get_dummies(train['wl'],drop_first=True), \n",
127+
" pd.get_dummies(train['clusters'],prefix='cluster'),\n",
128+
" train.drop(columns=['awayteam','season_id','wl','hometeam','game_id'])], axis=1)\n",
129+
"\n",
130+
"# Split into train and test data\n",
131+
"tr, te = train_test_split(final,test_size=0.1,random_state=0)"
132+
]
133+
},
134+
{
135+
"cell_type": "markdown",
136+
"metadata": {},
137+
"source": [
138+
"## Model training"
139+
]
140+
},
141+
{
142+
"cell_type": "code",
143+
"execution_count": 13,
144+
"metadata": {},
145+
"outputs": [],
146+
"source": [
147+
"# Random forest\n",
148+
"rfr = RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1, random_state=0)"
149+
]
150+
},
151+
{
152+
"cell_type": "code",
153+
"execution_count": 14,
154+
"metadata": {},
155+
"outputs": [
156+
{
157+
"data": {
158+
"text/plain": [
159+
"RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1,\n",
160+
" random_state=0)"
161+
]
162+
},
163+
"execution_count": 14,
164+
"metadata": {},
165+
"output_type": "execute_result"
166+
}
167+
],
168+
"source": [
169+
"# Fit model\n",
170+
"rfr.fit(tr.drop(columns=['2diff']), tr['2diff'])"
171+
]
172+
},
173+
{
174+
"cell_type": "code",
175+
"execution_count": 15,
176+
"metadata": {},
177+
"outputs": [
178+
{
179+
"data": {
180+
"text/plain": [
181+
"9.373045180192985"
182+
]
183+
},
184+
"execution_count": 15,
185+
"metadata": {},
186+
"output_type": "execute_result"
187+
}
188+
],
189+
"source": [
190+
"# MAE on testing\n",
191+
"mean_absolute_error(te['2diff'],rfr.predict(te.drop(columns=['2diff'])))"
192+
]
193+
},
194+
{
195+
"cell_type": "code",
196+
"execution_count": 16,
197+
"metadata": {},
198+
"outputs": [
199+
{
200+
"data": {
201+
"text/plain": [
202+
"8.3139611855241"
203+
]
204+
},
205+
"execution_count": 16,
206+
"metadata": {},
207+
"output_type": "execute_result"
208+
}
209+
],
210+
"source": [
211+
"# MAE on training\n",
212+
"mean_absolute_error(tr['2diff'],rfr.predict(tr.drop(columns=['2diff'])))"
213+
]
214+
},
215+
{
216+
"cell_type": "code",
217+
"execution_count": 17,
218+
"metadata": {},
219+
"outputs": [],
220+
"source": [
221+
"# Team abbreviation conversion between our data and MSN\n",
222+
"team_dict = {'ATL': 'ATL',\n",
223+
" 'BKN': 'BKN',\n",
224+
" 'BOS': 'BOS',\n",
225+
" 'CHA': 'CHA',\n",
226+
" 'CHI': 'CHI',\n",
227+
" 'CLE': 'CLE',\n",
228+
" 'DAL': 'DAL',\n",
229+
" 'DEN': 'DEN',\n",
230+
" 'DET': 'DET',\n",
231+
" 'GS': 'GSW',\n",
232+
" 'HOU': 'HOU',\n",
233+
" 'IND': 'IND',\n",
234+
" 'LAC': 'LAC',\n",
235+
" 'LAL': 'LAL',\n",
236+
" 'MEM': 'MEM',\n",
237+
" 'MIA': 'MIA',\n",
238+
" 'MIL': 'MIL',\n",
239+
" 'MIN': 'MIN',\n",
240+
" 'NO': 'NOP',\n",
241+
" 'NY': 'NYK',\n",
242+
" 'OKC': 'OKC',\n",
243+
" 'ORL': 'ORL',\n",
244+
" 'PHI': 'PHI',\n",
245+
" 'PHO': 'PHX',\n",
246+
" 'POR': 'POR',\n",
247+
" 'SA': 'SAS',\n",
248+
" 'SAC': 'SAC',\n",
249+
" 'TOR': 'TOR',\n",
250+
" 'UTA': 'UTA',\n",
251+
" 'WAS': 'WAS'}"
252+
]
253+
},
254+
{
255+
"cell_type": "markdown",
256+
"metadata": {},
257+
"source": [
258+
"# Get game schedule tomorrow"
259+
]
260+
},
261+
{
262+
"cell_type": "code",
263+
"execution_count": 18,
264+
"metadata": {},
265+
"outputs": [],
266+
"source": [
267+
"# webscrape from MSN\n",
268+
"headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}\n",
269+
"today = str(int(str(datetime.date.today()).replace('-','')))\n",
270+
"games_today = requests.get('https://www.msn.com/en-us/sports/nba/schedule', headers=headers)\n",
271+
"html_soup = BeautifulSoup(games_today.content, 'html.parser')"
272+
]
273+
},
274+
{
275+
"cell_type": "code",
276+
"execution_count": 19,
277+
"metadata": {},
278+
"outputs": [],
279+
"source": [
280+
"# convert date to the number of days after the first game\n",
281+
"def convert_days(date):\n",
282+
" d = pd.Timestamp(1983,10,28)\n",
283+
" return (date - d).days"
284+
]
285+
},
286+
{
287+
"cell_type": "code",
288+
"execution_count": 20,
289+
"metadata": {},
290+
"outputs": [],
291+
"source": [
292+
"# get the predictors from past data\n",
293+
"teamstoday = []\n",
294+
"page = html_soup.find_all('div',{'id':today})[0].find_all('td')\n",
295+
"for i in range (len(page)):\n",
296+
" if i % 5 == 2:\n",
297+
" teamstoday.append(page[i].text.split('\\n')[1].strip())\n",
298+
" \n",
299+
"all_games = pd.read_sql_query(\"select * from raw\", con=engine)\n",
300+
"s = \"\" # email string\n",
301+
"for i in range (0, len(teamstoday), 2):\n",
302+
" away = team_dict[teamstoday[i]]\n",
303+
" home = team_dict[teamstoday[i+1]]\n",
304+
" s += away + ',' + home + '\\n'\n",
305+
" all_games['days'] = pd.to_datetime(all_games['game_date']).apply(convert_days)\n",
306+
" all_games['hometeam'] = all_games['matchup'].str.extract(r'\\w* @ (\\w*)')\n",
307+
" thiscomp1 = all_games[(all_games['team_abbreviation'] == away) & (all_games['hometeam'] == home)].reset_index(drop=True)\n",
308+
" thiscomp2 = all_games[(all_games['team_abbreviation'] == home) & (all_games['hometeam'] == away)].reset_index(drop=True)\n",
309+
" target = pd.concat([thiscomp1, thiscomp2]).sort_values('days').iloc[-1].drop(labels=['team_id','team_name','game_date','matchup','min'])\n",
310+
" daysdiff = (pd.Timestamp.today() - pd.Timestamp(1983,10,28)).days - target['days']\n",
311+
" \n",
312+
" data = pd.Series([0]*final.shape[1], index = final.drop(columns=['2diff']).columns)\n",
313+
" data['playoff'] = 0\n",
314+
" data['22020'] = 1\n",
315+
" if target['hometeam'] == home:\n",
316+
" data['opposite'] = 0\n",
317+
" else:\n",
318+
" data['opposite'] = 1\n",
319+
"\n",
320+
" if target['wl'] == 'W':\n",
321+
" data['W'] = 1\n",
322+
" else:\n",
323+
" data['W'] = 0\n",
324+
" \n",
325+
" # compute the predictors\n",
326+
" data['1diff'] = target['pts'] - target['pts_oppos']\n",
327+
" data['daysdiff'] = daysdiff\n",
328+
" data[28:64] = target[4:-2]\n",
329+
" data[66:115] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['team_abbreviation'])].drop(columns=['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']).iloc[0]\n",
330+
" data[115:-7] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['hometeam'])].drop(columns=['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']).iloc[0]\n",
331+
" data['blk_diff'] = data['blk'] - data['blk_oppos']\n",
332+
" data['oreb_diff'] = data['oreb'] - data['oreb_oppos']\n",
333+
" data['reb_diff'] = data['reb'] - data['reb_oppos']\n",
334+
" data['ast_diff'] = data['ast'] - data['ast_oppos']\n",
335+
" data['stl_diff'] = data['stl'] - data['stl_oppos']\n",
336+
" data['tov_diff'] = data['tov'] - data['tov_oppos']\n",
337+
" data['pf_diff'] = data['pf'] - data['pf_oppos']\n",
338+
" \n",
339+
" # prediction\n",
340+
" s += str(rfr.predict(data.values.reshape(1,-1))) + '\\n' # email string"
341+
]
342+
},
343+
{
344+
"cell_type": "markdown",
345+
"metadata": {},
346+
"source": [
347+
"## Over/Under"
348+
]
349+
},
350+
{
351+
"cell_type": "code",
352+
"execution_count": 21,
353+
"metadata": {},
354+
"outputs": [],
355+
"source": [
356+
"# Read in the training data from the database\n",
357+
"work = pd.read_sql_query(\"select * from train_total\", con=engine)\n",
358+
"\n",
359+
"# playoffs indicator\n",
360+
"work['playoff'] = work['season_id'].str.extract(r'(\\d)\\d{4}').astype(int)\n",
361+
"work['playoff'] = work['playoff'].replace(2,0)\n",
362+
"work['playoff'] = work['playoff'].replace(4,1)\n",
363+
"\n",
364+
"# Get season after 1995\n",
365+
"work['season_id'] = work['season_id'].str.replace('^\\d','2')\n",
366+
"work = work[work['season_id'] >= '21996']\n",
367+
"\n",
368+
"# drop columns we don't need\n",
369+
"train = work.drop(columns=['team_id','team_abbreviation','game_date','matchup','min','days'])\n",
370+
"\n",
371+
"# compute stat difference\n",
372+
"train['blk_diff'] = train['blk'] - train['blk_oppos']\n",
373+
"train['oreb_diff'] = train['oreb'] - train['oreb_oppos']\n",
374+
"train['reb_diff'] = train['reb'] - train['reb_oppos']\n",
375+
"train['ast_diff'] = train['ast'] - train['ast_oppos']\n",
376+
"train['stl_diff'] = train['stl'] - train['stl_oppos']\n",
377+
"train['tov_diff'] = train['tov'] - train['tov_oppos']\n",
378+
"train['pf_diff'] = train['pf'] - train['pf_oppos']"
379+
]
380+
},
381+
{
382+
"cell_type": "code",
383+
"execution_count": 22,
384+
"metadata": {},
385+
"outputs": [],
386+
"source": [
387+
"# dummy variables\n",
388+
"final = pd.concat([\n",
389+
" pd.get_dummies(train['season_id']), \n",
390+
" pd.get_dummies(train['wl'],drop_first=True), \n",
391+
" train.drop(columns=['awayteam','season_id','wl','hometeam','game_id'])], axis=1)\n",
392+
"\n",
393+
"# split into train and test dataset\n",
394+
"tr, te = train_test_split(final,test_size=0.1,random_state=0)"
395+
]
396+
},
397+
{
398+
"cell_type": "markdown",
399+
"metadata": {},
400+
"source": [
401+
"# Model training"
402+
]
403+
},
404+
{
405+
"cell_type": "code",
406+
"execution_count": 23,
407+
"metadata": {},
408+
"outputs": [],
409+
"source": [
410+
"# random forest\n",
411+
"rfr = RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1, random_state=0)"
412+
]
413+
},
414+
{
415+
"cell_type": "code",
416+
"execution_count": 24,
417+
"metadata": {},
418+
"outputs": [
419+
{
420+
"data": {
421+
"text/plain": [
422+
"RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1,\n",
423+
" random_state=0)"
424+
]
425+
},
426+
"execution_count": 24,
427+
"metadata": {},
428+
"output_type": "execute_result"
429+
}
430+
],
431+
"source": [
432+
"# fit the model\n",
433+
"rfr.fit(tr.drop(columns=['2diff']), tr['2diff'])"
434+
]
435+
},
436+
{
437+
"cell_type": "code",
438+
"execution_count": 25,
439+
"metadata": {},
440+
"outputs": [
441+
{
442+
"data": {
443+
"text/plain": [
444+
"14.881988864855604"
445+
]
446+
},
447+
"execution_count": 25,
448+
"metadata": {},
449+
"output_type": "execute_result"
450+
}
451+
],
452+
"source": [
453+
"# MAE for testing\n",
454+
"mean_absolute_error(te['2diff'],rfr.predict(te.drop(columns=['2diff'])))"
455+
]
456+
},
457+
{
458+
"cell_type": "code",
459+
"execution_count": 26,
460+
"metadata": {},
461+
"outputs": [
462+
{
463+
"data": {
464+
"text/plain": [
465+
"13.022569548413044"
466+
]
467+
},
468+
"execution_count": 26,
469+
"metadata": {},
470+
"output_type": "execute_result"
471+
}
472+
],
473+
"source": [
474+
"# MAE for training\n",
475+
"mean_absolute_error(tr['2diff'],rfr.predict(tr.drop(columns=['2diff'])))"
476+
]
477+
},
478+
{
479+
"cell_type": "markdown",
480+
"metadata": {},
481+
"source": [
482+
"## Get game schedule tomorrow"
483+
]
484+
},
485+
{
486+
"cell_type": "code",
487+
"execution_count": 27,
488+
"metadata": {},
489+
"outputs": [],
490+
"source": [
491+
"# webscrape from MSN\n",
492+
"teamstoday = []\n",
493+
"page = html_soup.find_all('div',{'id':today})[0].find_all('td')\n",
494+
"for i in range (len(page)):\n",
495+
" if i % 5 == 2:\n",
496+
" teamstoday.append(page[i].text.split('\\n')[1].strip())\n",
497+
"\n",
498+
"# get raw data from database\n",
499+
"all_games = pd.read_sql_query(\"select * from raw\", con=engine)\n",
500+
"\n",
501+
"# get predictors from past data\n",
502+
"for i in range (0, len(teamstoday), 2):\n",
503+
" away = team_dict[teamstoday[i]]\n",
504+
" home = team_dict[teamstoday[i+1]]\n",
505+
" s += away + ',' + home + '\\n' # email string\n",
506+
" all_games['days'] = pd.to_datetime(all_games['game_date']).apply(convert_days)\n",
507+
" all_games['hometeam'] = all_games['matchup'].str.extract(r'\\w* @ (\\w*)')\n",
508+
" thiscomp1 = all_games[(all_games['team_abbreviation'] == away) & (all_games['hometeam'] == home)].reset_index(drop=True)\n",
509+
" thiscomp2 = all_games[(all_games['team_abbreviation'] == home) & (all_games['hometeam'] == away)].reset_index(drop=True)\n",
510+
" target = pd.concat([thiscomp1, thiscomp2]).sort_values('days').iloc[-1].drop(labels=['team_id','team_name','game_date','matchup','min'])\n",
511+
" daysdiff = (pd.Timestamp.today() - pd.Timestamp(1983,10,28)).days - target['days']\n",
512+
" \n",
513+
" data = pd.Series([0]*final.shape[1], index = final.drop(columns=['2diff']).columns)\n",
514+
" data['playoff'] = 0\n",
515+
" data['22020'] = 1\n",
516+
" if target['hometeam'] == home:\n",
517+
" data['opposite'] = 0\n",
518+
" else:\n",
519+
" data['opposite'] = 1\n",
520+
"\n",
521+
" if target['wl'] == 'W':\n",
522+
" data['W'] = 1\n",
523+
" else:\n",
524+
" data['W'] = 0\n",
525+
" \n",
526+
" # compute the predictors\n",
527+
" data['1diff'] = target['pts'] - target['pts_oppos']\n",
528+
" data['daysdiff'] = daysdiff\n",
529+
" data[28:64] = target[4:-2]\n",
530+
" data[66:115] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['team_abbreviation'])].drop(columns=['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']).iloc[0]\n",
531+
" data[115:-7] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['hometeam'])].drop(columns=['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']).iloc[0]\n",
532+
" data['blk_diff'] = data['blk'] - data['blk_oppos']\n",
533+
" data['oreb_diff'] = data['oreb'] - data['oreb_oppos']\n",
534+
" data['reb_diff'] = data['reb'] - data['reb_oppos']\n",
535+
" data['ast_diff'] = data['ast'] - data['ast_oppos']\n",
536+
" data['stl_diff'] = data['stl'] - data['stl_oppos']\n",
537+
" data['tov_diff'] = data['tov'] - data['tov_oppos']\n",
538+
" data['pf_diff'] = data['pf'] - data['pf_oppos']\n",
539+
" \n",
540+
" # prediction\n",
541+
" s += str(rfr.predict(data.values.reshape(1,-1))) + '\\n' # email string"
542+
]
543+
},
544+
{
545+
"cell_type": "markdown",
546+
"metadata": {},
547+
"source": [
548+
"# Email the predictions"
549+
]
550+
},
551+
{
552+
"cell_type": "code",
553+
"execution_count": 29,
554+
"metadata": {},
555+
"outputs": [],
556+
"source": [
557+
"import smtplib, ssl\n",
558+
"\n",
559+
"port = 465 # For SSL\n",
560+
"smtp_server = \"smtp.gmail.com\"\n",
561+
"sender_email = \"leowei08@gmail.com\"\n",
562+
"receiver_email1 = \"leowei08@gmail.com\" \n",
563+
"password = 'password'\n",
564+
"message = \"\"\"\\\n",
565+
"Subject: Predictions Today {today}\n",
566+
"\n",
567+
"\n",
568+
"{content}.\"\"\"\n",
569+
"\n",
570+
"context = ssl.create_default_context()\n",
571+
"with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:\n",
572+
" server.login(sender_email, password)\n",
573+
" server.sendmail(sender_email, receiver_email1, message.format(today=str(datetime.date.today()).replace('-',''), content=s))"
574+
]
575+
}
576+
],
577+
"metadata": {
578+
"kernelspec": {
579+
"display_name": "Python 3",
580+
"language": "python",
581+
"name": "python3"
582+
},
583+
"language_info": {
584+
"codemirror_mode": {
585+
"name": "ipython",
586+
"version": 3
587+
},
588+
"file_extension": ".py",
589+
"mimetype": "text/x-python",
590+
"name": "python",
591+
"nbconvert_exporter": "python",
592+
"pygments_lexer": "ipython3",
593+
"version": "3.7.11"
594+
}
595+
},
596+
"nbformat": 4,
597+
"nbformat_minor": 4
598+
}

‎UpdateGames.ipynb

+401
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,401 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"# Import library\n",
10+
"from nba_api.stats.static import teams\n",
11+
"from nba_api.stats.endpoints import leaguegamefinder, playbyplayv2\n",
12+
"import pandas as pd\n",
13+
"from tqdm import tqdm\n",
14+
"import matplotlib.pyplot as plt\n",
15+
"import numpy as np\n",
16+
"import requests, datetime, time, sqlalchemy\n",
17+
"pd.set_option('display.max_rows', 500)\n",
18+
"pd.set_option('display.max_columns', 500)\n",
19+
"pd.set_option('display.width', 1000)"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 2,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"# data type for each column\n",
29+
"types = {'opposite': 'int',\n",
30+
" 'daysdiff': 'int',\n",
31+
" 'season_id': 'str',\n",
32+
" 'team_id': 'int',\n",
33+
" 'team_abbreviation': 'str',\n",
34+
" 'team_name': 'str',\n",
35+
" 'game_id': 'str',\n",
36+
" 'game_date': 'str',\n",
37+
" 'matchup': 'str',\n",
38+
" 'wl': 'str',\n",
39+
" 'min': 'int',\n",
40+
" 'pts': 'int',\n",
41+
" 'fgm': 'int',\n",
42+
" 'fga': 'int',\n",
43+
" 'fg_pct': 'float',\n",
44+
" 'fg3m': 'int',\n",
45+
" 'fg3a': 'float',\n",
46+
" 'fg3_pct': 'float',\n",
47+
" 'ftm': 'int',\n",
48+
" 'fta': 'int',\n",
49+
" 'ft_pct': 'float',\n",
50+
" 'oreb': 'float',\n",
51+
" 'dreb': 'float',\n",
52+
" 'reb': 'float',\n",
53+
" 'ast': 'int',\n",
54+
" 'stl': 'float',\n",
55+
" 'blk': 'int',\n",
56+
" 'tov': 'int',\n",
57+
" 'pf': 'int',\n",
58+
" 'pts_oppos': 'int',\n",
59+
" 'fgm_oppos': 'int',\n",
60+
" 'fga_oppos': 'int',\n",
61+
" 'fg_pct_oppos': 'float',\n",
62+
" 'fg3m_oppos': 'int',\n",
63+
" 'fg3a_oppos': 'float',\n",
64+
" 'fg3_pct_oppos': 'float',\n",
65+
" 'ftm_oppos': 'int',\n",
66+
" 'fta_oppos': 'int',\n",
67+
" 'ft_pct_oppos': 'float',\n",
68+
" 'oreb_oppos': 'float',\n",
69+
" 'dreb_oppos': 'float',\n",
70+
" 'reb_oppos': 'float',\n",
71+
" 'ast_oppos': 'int',\n",
72+
" 'stl_oppos': 'float',\n",
73+
" 'blk_oppos': 'int',\n",
74+
" 'tov_oppos': 'int',\n",
75+
" 'pf_oppos': 'int',\n",
76+
" 'days': 'int',\n",
77+
" 'hometeam': 'str',\n",
78+
" '1diff': 'int',\n",
79+
" '2diff': 'int',\n",
80+
" 'awayteam': 'str'}"
81+
]
82+
},
83+
{
84+
"cell_type": "markdown",
85+
"metadata": {},
86+
"source": [
87+
"## Update game history "
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 3,
93+
"metadata": {},
94+
"outputs": [],
95+
"source": [
96+
"# connect to local database\n",
97+
"engine = sqlalchemy.create_engine('postgresql://postgres:password@localhost:5432/NBA')"
98+
]
99+
},
100+
{
101+
"cell_type": "code",
102+
"execution_count": 4,
103+
"metadata": {},
104+
"outputs": [],
105+
"source": [
106+
"# get the game results today\n",
107+
"today = datetime.date.today() - datetime.timedelta(days=1)\n",
108+
"l = str(today).split('-')\n",
109+
"today = l[1]+\"/\"+l[2]+\"/\"+l[0]\n",
110+
"df = leaguegamefinder.LeagueGameFinder(date_from_nullable=today).get_data_frames()[0]"
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": 5,
116+
"metadata": {},
117+
"outputs": [
118+
{
119+
"name": "stderr",
120+
"output_type": "stream",
121+
"text": [
122+
"100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 130.35it/s]\n"
123+
]
124+
}
125+
],
126+
"source": [
127+
"# Get the colunm names for the home team to be PTS_home\n",
128+
"c = df.columns.tolist()[9:-1]\n",
129+
"l = []\n",
130+
"for i in range(len(c)):\n",
131+
" l.append(c[i] + '_oppos')\n",
132+
"\n",
133+
"# away team abbreviation\n",
134+
"away = df[df['MATCHUP'].str.contains('@')].reset_index(drop=True)\n",
135+
"\n",
136+
"# home team abbreviation\n",
137+
"home = df[df['MATCHUP'].str.contains('vs.')]\n",
138+
"\n",
139+
"# convert a home team row and an away team row to 1 away and home team row\n",
140+
"new = pd.DataFrame()\n",
141+
"for i in tqdm(range(away.shape[0])):\n",
142+
" tmp = home[home['GAME_ID']==away.iloc[i,:]['GAME_ID']].iloc[:,9:-1].reset_index(drop=True)\n",
143+
" tmp.columns = l\n",
144+
" new = pd.concat([new, pd.concat([away.iloc[i:i+1,:].reset_index(drop=True), tmp],axis=1)])"
145+
]
146+
},
147+
{
148+
"cell_type": "code",
149+
"execution_count": 6,
150+
"metadata": {},
151+
"outputs": [],
152+
"source": [
153+
"# drop column and fill averages with 0\n",
154+
"new.drop(columns=['PLUS_MINUS'],inplace=True)\n",
155+
"new = new.fillna(0)"
156+
]
157+
},
158+
{
159+
"cell_type": "code",
160+
"execution_count": 7,
161+
"metadata": {},
162+
"outputs": [],
163+
"source": [
164+
"# make column names lowercase\n",
165+
"cols = [item.lower() for item in new.columns.to_list()]"
166+
]
167+
},
168+
{
169+
"cell_type": "code",
170+
"execution_count": 8,
171+
"metadata": {},
172+
"outputs": [],
173+
"source": [
174+
"new.columns = cols"
175+
]
176+
},
177+
{
178+
"cell_type": "code",
179+
"execution_count": 15,
180+
"metadata": {},
181+
"outputs": [],
182+
"source": [
183+
"# add them to the database\n",
184+
"new.reset_index(drop=True).to_sql('raw',engine,if_exists='append',index=False)"
185+
]
186+
},
187+
{
188+
"cell_type": "markdown",
189+
"metadata": {},
190+
"source": [
191+
"## Update training data"
192+
]
193+
},
194+
{
195+
"cell_type": "code",
196+
"execution_count": 10,
197+
"metadata": {},
198+
"outputs": [],
199+
"source": [
200+
"# get home team abbreviation for todays games\n",
201+
"new['hometeam'] = new['matchup'].str.extract(r'\\w* @ (\\w*)')"
202+
]
203+
},
204+
{
205+
"cell_type": "code",
206+
"execution_count": 11,
207+
"metadata": {},
208+
"outputs": [],
209+
"source": [
210+
"# get all past game data\n",
211+
"games = pd.read_sql_query(\"select * from raw\", con=engine)"
212+
]
213+
},
214+
{
215+
"cell_type": "code",
216+
"execution_count": 12,
217+
"metadata": {},
218+
"outputs": [],
219+
"source": [
220+
"# convert date to number of days after the first game\n",
221+
"def convert_days(date):\n",
222+
" d = pd.Timestamp(1983,10,28)\n",
223+
" return (date - d).days\n",
224+
"\n",
225+
"# compute\n",
226+
"games['days'] = pd.to_datetime(games['game_date']).apply(convert_days)\n",
227+
"\n",
228+
"# get home team abbreviation for past games\n",
229+
"games['hometeam'] = games['matchup'].str.extract(r'\\w* @ (\\w*)')"
230+
]
231+
},
232+
{
233+
"cell_type": "code",
234+
"execution_count": 17,
235+
"metadata": {},
236+
"outputs": [],
237+
"source": [
238+
"# Find the last time before today both teams faced each other\n",
239+
"z = 0\n",
240+
"for i in range (new.shape[0]):\n",
241+
" z += 1\n",
242+
" home = new.iloc[i]['hometeam']\n",
243+
" away = new.iloc[i]['team_abbreviation']\n",
244+
" one = games[(games['team_abbreviation']==home) & (games['hometeam'] == away)].reset_index(drop=True)\n",
245+
" two = games[(games['team_abbreviation']==away) & (games['hometeam'] == home)].reset_index(drop=True)\n",
246+
" tmp = pd.concat([one,two]).sort_values('days').iloc[-2:,:]\n",
247+
" \n",
248+
" d = {}\n",
249+
" if tmp.shape[0] > 1:\n",
250+
" if tmp.iloc[0]['team_abbreviation'] != tmp.iloc[1]['team_abbreviation']:\n",
251+
" d['opposite'] = 1\n",
252+
" else:\n",
253+
" d['opposite'] = 0\n",
254+
" d['daysdiff'] = tmp.iloc[1]['days']-tmp.iloc[0]['days']\n",
255+
" tmp1 = pd.Series(d).append(tmp.iloc[0])\n",
256+
" d = {}\n",
257+
" \n",
258+
" # compute score difference\n",
259+
" d['1diff'] = tmp.iloc[0]['pts'] - tmp.iloc[0]['pts_oppos']\n",
260+
" d['2diff'] = tmp.iloc[1]['pts'] - tmp.iloc[1]['pts_oppos']\n",
261+
" d['awayteam'] = tmp.iloc[0]['team_abbreviation']\n",
262+
" tmp2 = tmp1.append(pd.Series(d))\n",
263+
" \n",
264+
" # save to database\n",
265+
" tmp2.to_frame().transpose().astype(types).to_sql('train',con=engine,if_exists='append',index=False)"
266+
]
267+
},
268+
{
269+
"cell_type": "code",
270+
"execution_count": 23,
271+
"metadata": {},
272+
"outputs": [],
273+
"source": [
274+
"# Find the last time before today both teams faced each other\n",
275+
"z = 0\n",
276+
"for i in range (new.shape[0]):\n",
277+
" z += 1\n",
278+
" home = new.iloc[i]['hometeam']\n",
279+
" away = new.iloc[i]['team_abbreviation']\n",
280+
" one = games[(games['team_abbreviation']==home) & (games['hometeam'] == away)].reset_index(drop=True)\n",
281+
" two = games[(games['team_abbreviation']==away) & (games['hometeam'] == home)].reset_index(drop=True)\n",
282+
" tmp = pd.concat([one,two]).sort_values('days').iloc[-2:,:]\n",
283+
" \n",
284+
" d = {}\n",
285+
" if tmp.shape[0] > 1:\n",
286+
" if tmp.iloc[0]['team_abbreviation'] != tmp.iloc[1]['team_abbreviation']:\n",
287+
" d['opposite'] = 1\n",
288+
" else:\n",
289+
" d['opposite'] = 0\n",
290+
" d['daysdiff'] = tmp.iloc[1]['days']-tmp.iloc[0]['days']\n",
291+
" tmp1 = pd.Series(d).append(tmp.iloc[0])\n",
292+
" d = {}\n",
293+
" \n",
294+
" # compute score total\n",
295+
" d['1diff'] = tmp.iloc[0]['pts'] + tmp.iloc[0]['pts_oppos']\n",
296+
" d['2diff'] = tmp.iloc[1]['pts'] + tmp.iloc[1]['pts_oppos']\n",
297+
" d['awayteam'] = tmp.iloc[0]['team_abbreviation']\n",
298+
" tmp2 = tmp1.append(pd.Series(d))\n",
299+
" \n",
300+
" # save to database\n",
301+
" tmp2.to_frame().transpose().astype(types).to_sql('train_total',con=engine,if_exists='append',index=False)"
302+
]
303+
},
304+
{
305+
"cell_type": "markdown",
306+
"metadata": {},
307+
"source": [
308+
"## Update Play-by-Play Data"
309+
]
310+
},
311+
{
312+
"cell_type": "code",
313+
"execution_count": 21,
314+
"metadata": {},
315+
"outputs": [],
316+
"source": [
317+
"# function to convert score string to score difference\n",
318+
"def calc_scorediff(x):\n",
319+
" if x != None:\n",
320+
" return eval(x)"
321+
]
322+
},
323+
{
324+
"cell_type": "code",
325+
"execution_count": 22,
326+
"metadata": {},
327+
"outputs": [],
328+
"source": [
329+
"# get play-by-play data for the games today\n",
330+
"for game in new['game_id']:\n",
331+
" play = playbyplayv2.PlayByPlayV2(game).get_data_frames()[0]\n",
332+
" if play.shape[0] != 0:\n",
333+
" score = play['SCORE'].apply(calc_scorediff).fillna(method='ffill').fillna(0)\n",
334+
"\n",
335+
" # Record the last index of 0:00 of each quarter\n",
336+
" zeroindex = 0\n",
337+
" ends = []\n",
338+
" l = play['PCTIMESTRING'].to_list()\n",
339+
" for i in range (len(l)):\n",
340+
" if l[i] == '0:00':\n",
341+
" zeroindex = i\n",
342+
" else:\n",
343+
" if zeroindex != 0:\n",
344+
" ends.append(zeroindex)\n",
345+
" zeroindex=0\n",
346+
" ends.append(i)\n",
347+
" if len(ends) < 4:\n",
348+
" continue\n",
349+
" \n",
350+
" # 4 quarters\n",
351+
" q1 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=36)) for item in l[:ends[0]+1]]\n",
352+
" q2 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=24)) for item in l[ends[0]+1:ends[1]+1]]\n",
353+
" q3 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=12)) for item in l[ends[1]+1:ends[2]+1]]\n",
354+
" q4 = [datetime.datetime.strptime(item, \"%M:%S\") for item in l[ends[2]+1:ends[3]+1]]\n",
355+
" times = [(datetime.datetime(1900,1,1,0,48) - event).total_seconds() for event in q1+q2+q3+q4]\n",
356+
"\n",
357+
" timedf = pd.DataFrame([times,score]).transpose()\n",
358+
" tdata = []\n",
359+
" try:\n",
360+
" for i in range (360):\n",
361+
" tdata.append(timedf[timedf[0] <= 8*(i+1)].iloc[-1][1])\n",
362+
" except IndexError:\n",
363+
" continue\n",
364+
" tdf = pd.DataFrame(tdata).transpose()\n",
365+
" tdf.columns = [\"t\"+str(col) for col in tdf.columns]\n",
366+
" tdf.insert(0,'game_id',game)\n",
367+
" \n",
368+
" # overtime indicator\n",
369+
" if len(ends) > 4:\n",
370+
" tdf.insert(1,'overtime', 1)\n",
371+
" else:\n",
372+
" tdf.insert(1,'overtime', 0)\n",
373+
" \n",
374+
" # save to database\n",
375+
" tdf.to_sql('playbyplay', con=engine, if_exists='append', index=False)\n",
376+
" time.sleep(3)"
377+
]
378+
}
379+
],
380+
"metadata": {
381+
"kernelspec": {
382+
"display_name": "Python 3",
383+
"language": "python",
384+
"name": "python3"
385+
},
386+
"language_info": {
387+
"codemirror_mode": {
388+
"name": "ipython",
389+
"version": 3
390+
},
391+
"file_extension": ".py",
392+
"mimetype": "text/x-python",
393+
"name": "python",
394+
"nbconvert_exporter": "python",
395+
"pygments_lexer": "ipython3",
396+
"version": "3.7.11"
397+
}
398+
},
399+
"nbformat": 4,
400+
"nbformat_minor": 4
401+
}

0 commit comments

Comments
 (0)
Please sign in to comment.