Skip to content

Commit 66f6785

Browse files
committedOct 7, 2021
initial commit
1 parent 48122f9 commit 66f6785

File tree

3 files changed

+1477
-0
lines changed

3 files changed

+1477
-0
lines changed
 

‎GetData.ipynb

+478
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,478 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"# Import libraries\n",
10+
"from nba_api.stats.static import teams\n",
11+
"from nba_api.stats.endpoints import leaguegamefinder, playbyplayv2\n",
12+
"import pandas as pd\n",
13+
"from tqdm import tqdm\n",
14+
"import datetime\n",
15+
"import time\n",
16+
"import numpy as np\n",
17+
"import itertools\n",
18+
"import sqlalchemy\n",
19+
"\n",
20+
"# connect to a local postgres database\n",
21+
"engine = sqlalchemy.create_engine('postgresql://postgres:password@localhost:5432/NBA')"
22+
]
23+
},
24+
{
25+
"cell_type": "markdown",
26+
"metadata": {},
27+
"source": [
28+
"## Get raw dataset"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": 2,
34+
"metadata": {},
35+
"outputs": [],
36+
"source": [
37+
"# Get teams\n",
38+
"nba_teams = teams.get_teams()"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": 3,
44+
"metadata": {},
45+
"outputs": [],
46+
"source": [
47+
"# Get team ids\n",
48+
"team_id = [team['id'] for team in nba_teams]"
49+
]
50+
},
51+
{
52+
"cell_type": "code",
53+
"execution_count": 4,
54+
"metadata": {},
55+
"outputs": [
56+
{
57+
"name": "stderr",
58+
"output_type": "stream",
59+
"text": [
60+
"100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:50<00:00, 1.68s/it]\n"
61+
]
62+
}
63+
],
64+
"source": [
65+
"# Get all games by team id\n",
66+
"df = pd.DataFrame()\n",
67+
"for team in tqdm(team_id):\n",
68+
" gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team)\n",
69+
" df = pd.concat([df,gamefinder.get_data_frames()[0]])"
70+
]
71+
},
72+
{
73+
"cell_type": "markdown",
74+
"metadata": {},
75+
"source": [
76+
"## Clean data and convert to training data"
77+
]
78+
},
79+
{
80+
"cell_type": "code",
81+
"execution_count": 5,
82+
"metadata": {
83+
"scrolled": true
84+
},
85+
"outputs": [],
86+
"source": [
87+
"# Get the counts of each game id\n",
88+
"df_games = df['GAME_ID'].value_counts().reset_index()"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": 6,
94+
"metadata": {},
95+
"outputs": [],
96+
"source": [
97+
"# Get game ids that show up twice\n",
98+
"games_id = df_games[df_games['GAME_ID']==2]['index']"
99+
]
100+
},
101+
{
102+
"cell_type": "code",
103+
"execution_count": 7,
104+
"metadata": {},
105+
"outputs": [],
106+
"source": [
107+
"# Filter for those game ids\n",
108+
"df = df[df['GAME_ID'].isin(games_id)]"
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": 8,
114+
"metadata": {},
115+
"outputs": [],
116+
"source": [
117+
"# Get preseason game ids\n",
118+
"pregames_id = df[df['SEASON_ID'].str.contains('1\\d{4}')].sort_values('GAME_DATE')['GAME_ID'].unique()"
119+
]
120+
},
121+
{
122+
"cell_type": "code",
123+
"execution_count": 9,
124+
"metadata": {},
125+
"outputs": [],
126+
"source": [
127+
"# Filter out preseason games\n",
128+
"df = df[~df['GAME_ID'].isin(pregames_id)]"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 10,
134+
"metadata": {},
135+
"outputs": [
136+
{
137+
"name": "stderr",
138+
"output_type": "stream",
139+
"text": [
140+
"100%|████████████████████████████████████████████████████████████████████████████| 45919/45919 [30:04<00:00, 25.44it/s]\n"
141+
]
142+
}
143+
],
144+
"source": [
145+
"# Rename the opponent (home) team's stat columns with an _oppos suffix\n",
146+
"c = df.columns.tolist()[9:-1]\n",
147+
"l = []\n",
148+
"for i in range(len(c)):\n",
149+
" l.append(c[i] + '_oppos')\n",
150+
"\n",
151+
"# away games\n",
152+
"away = df[df['MATCHUP'].str.contains('@')].reset_index(drop=True)\n",
153+
"\n",
154+
"# home games\n",
155+
"home = df[df['MATCHUP'].str.contains('vs.')]\n",
156+
"\n",
157+
"# convert a home team row and an away team row to 1 away and home team row\n",
158+
"new = pd.DataFrame()\n",
159+
"for index in tqdm(range(away.shape[0])):\n",
160+
" tmp = home[home['GAME_ID']==away.iloc[index,:]['GAME_ID']].iloc[:,9:-1].reset_index(drop=True)\n",
161+
" tmp.columns = l\n",
162+
" new = pd.concat([new, pd.concat([away.iloc[index:index+1,:].reset_index(drop=True), tmp],axis=1)])"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": null,
168+
"metadata": {},
169+
"outputs": [],
170+
"source": [
171+
"# Save the data to a database\n",
172+
"new.to_sql('raw',con=engine,if_exists='append',index=False)"
173+
]
174+
},
175+
{
176+
"cell_type": "code",
177+
"execution_count": 52,
178+
"metadata": {},
179+
"outputs": [],
180+
"source": [
181+
"# Can restart notebook here to save memory\n",
182+
"# Read in the data\n",
183+
"all_games = pd.read_sql_query(\"select * from raw\", con=engine)"
184+
]
185+
},
186+
{
187+
"cell_type": "code",
188+
"execution_count": 53,
189+
"metadata": {},
190+
"outputs": [],
191+
"source": [
192+
"# Convert game data to datetime type\n",
193+
"all_games['date'] = pd.to_datetime(all_games['game_date'])"
194+
]
195+
},
196+
{
197+
"cell_type": "code",
198+
"execution_count": 54,
199+
"metadata": {},
200+
"outputs": [],
201+
"source": [
202+
"# function to convert date to the number of days after the first game\n",
203+
"def convert_days(date):\n",
204+
" d = pd.Timestamp(1983,10,28)\n",
205+
" return (date - d).days"
206+
]
207+
},
208+
{
209+
"cell_type": "code",
210+
"execution_count": 55,
211+
"metadata": {},
212+
"outputs": [],
213+
"source": [
214+
"# convert date to number of days\n",
215+
"all_games['days'] = all_games['date'].apply(convert_days)"
216+
]
217+
},
218+
{
219+
"cell_type": "code",
220+
"execution_count": 56,
221+
"metadata": {},
222+
"outputs": [],
223+
"source": [
224+
"# compute score difference (score total computation is commented out below)\n",
225+
"all_games['diff'] = all_games['pts'] - all_games['pts_oppos']\n",
226+
"#all_games['DIFF'] = all_games['PTS'] + all_games['PTS_oppos']"
227+
]
228+
},
229+
{
230+
"cell_type": "code",
231+
"execution_count": 57,
232+
"metadata": {},
233+
"outputs": [],
234+
"source": [
235+
"# get home team abbreviation\n",
236+
"all_games['hometeam'] = all_games['matchup'].str.extract(r'\\w* @ (\\w*)')"
237+
]
238+
},
239+
{
240+
"cell_type": "code",
241+
"execution_count": 58,
242+
"metadata": {},
243+
"outputs": [],
244+
"source": [
245+
"# treat play-in games like regular season\n",
246+
"all_games['season_id'] = all_games['season_id'].replace('52019','22019')"
247+
]
248+
},
249+
{
250+
"cell_type": "code",
251+
"execution_count": 59,
252+
"metadata": {},
253+
"outputs": [],
254+
"source": [
255+
"# Convert to training data format\n",
256+
"d = {'game_id':[], 'diff':[], 'opposite':[], 'daysdiff':[]}\n",
257+
"for away, home in itertools.product(all_games['team_abbreviation'].unique(), all_games['team_abbreviation'].unique()):\n",
258+
" if away != home:\n",
259+
" one = all_games[(all_games['team_abbreviation']==away) & (all_games['hometeam'] == home)].reset_index(drop=True)\n",
260+
" two = all_games[(all_games['team_abbreviation']==home) & (all_games['hometeam'] == away)].reset_index(drop=True)\n",
261+
" tmp = pd.concat([one,two]).sort_values('days')\n",
262+
" if tmp.shape[0] != 0:\n",
263+
" for i in range (tmp.shape[0]-1):\n",
264+
" d['game_id'].append(tmp.iloc[i,:]['game_id'])\n",
265+
" d['diff'].append(tmp.iloc[i+1,:]['diff'])\n",
266+
" if tmp.iloc[i,:]['hometeam'] != tmp.iloc[i+1,:]['hometeam']:\n",
267+
" d['opposite'].append(1)\n",
268+
" else:\n",
269+
" d['opposite'].append(0)\n",
270+
" d['daysdiff'].append(tmp.iloc[i+1,:]['days']-tmp.iloc[i,:]['days'])"
271+
]
272+
},
273+
{
274+
"cell_type": "code",
275+
"execution_count": 60,
276+
"metadata": {
277+
"scrolled": false
278+
},
279+
"outputs": [],
280+
"source": [
281+
"# merge back to get the games we want\n",
282+
"work = pd.DataFrame(d).merge(all_games, left_on=['game_id'], right_on=[\"game_id\"]).drop_duplicates()"
283+
]
284+
},
285+
{
286+
"cell_type": "code",
287+
"execution_count": 62,
288+
"metadata": {},
289+
"outputs": [],
290+
"source": [
291+
"# Rename columns\n",
292+
"work['1diff'] = work['diff_y']\n",
293+
"work['2diff'] = work['diff_x']\n",
294+
"work.drop(columns=['diff_y','diff_x'],inplace=True)"
295+
]
296+
},
297+
{
298+
"cell_type": "code",
299+
"execution_count": null,
300+
"metadata": {},
301+
"outputs": [],
302+
"source": [
303+
"# Drop column\n",
304+
"work.drop(columns=['PLUS_MINUS'],inplace=True)"
305+
]
306+
},
307+
{
308+
"cell_type": "code",
309+
"execution_count": null,
310+
"metadata": {},
311+
"outputs": [],
312+
"source": [
313+
"# Replace null averages with 0\n",
314+
"work['FG3_PCT'] = work['FG3_PCT'].fillna(0)"
315+
]
316+
},
317+
{
318+
"cell_type": "code",
319+
"execution_count": 63,
320+
"metadata": {},
321+
"outputs": [],
322+
"source": [
323+
"work['awayteam'] = work['team_abbreviation']"
324+
]
325+
},
326+
{
327+
"cell_type": "code",
328+
"execution_count": 64,
329+
"metadata": {},
330+
"outputs": [],
331+
"source": [
332+
"# Drop column\n",
333+
"work.drop(columns=['date'],inplace=True)"
334+
]
335+
},
336+
{
337+
"cell_type": "code",
338+
"execution_count": 42,
339+
"metadata": {},
340+
"outputs": [],
341+
"source": [
342+
"# save to database\n",
343+
"work.to_sql('train_total',engine, index=False)"
344+
]
345+
},
346+
{
347+
"cell_type": "code",
348+
"execution_count": 52,
349+
"metadata": {},
350+
"outputs": [],
351+
"source": [
352+
"# make column names lowercase\n",
353+
"work.columns = [item.lower() for item in work.columns]"
354+
]
355+
},
356+
{
357+
"cell_type": "code",
358+
"execution_count": 65,
359+
"metadata": {},
360+
"outputs": [],
361+
"source": [
362+
"# save to database\n",
363+
"work.to_sql('train',engine, index=False)"
364+
]
365+
},
366+
{
367+
"cell_type": "markdown",
368+
"metadata": {},
369+
"source": [
370+
"## Get Play-by-Play Data"
371+
]
372+
},
373+
{
374+
"cell_type": "code",
375+
"execution_count": 3,
376+
"metadata": {},
377+
"outputs": [],
378+
"source": [
379+
"# get game ids\n",
380+
"games = pd.read_sql_query(\"select distinct game_id from raw where season_id >= '21996' and season_id <= '22025'\", con=engine)['game_id']"
381+
]
382+
},
383+
{
384+
"cell_type": "code",
385+
"execution_count": 4,
386+
"metadata": {},
387+
"outputs": [],
388+
"source": [
389+
"# function to convert score string to score difference\n",
390+
"def calc_scorediff(x):\n",
391+
" if x != None:\n",
392+
" return eval(x)"
393+
]
394+
},
395+
{
396+
"cell_type": "code",
397+
"execution_count": null,
398+
"metadata": {
399+
"scrolled": true
400+
},
401+
"outputs": [],
402+
"source": [
403+
"# Get play-by-play time series data for each game\n",
404+
"games = np.setdiff1d(games, pd.read_sql_query(\"select distinct game_id from playbyplay\", con=engine))\n",
405+
"for game in tqdm(games):\n",
406+
" play = playbyplayv2.PlayByPlayV2(game).get_data_frames()[0]\n",
407+
" #print(play)\n",
408+
" if play.shape[0] != 0:\n",
409+
" score = play['SCORE'].apply(calc_scorediff).fillna(method='ffill').fillna(0)\n",
410+
"\n",
411+
" # Record the last index of 0:00 of each quarter\n",
412+
" zeroindex = 0\n",
413+
" ends = []\n",
414+
" l = play['PCTIMESTRING'].to_list()\n",
415+
" for i in range (len(l)):\n",
416+
" if l[i] == '0:00':\n",
417+
" zeroindex = i\n",
418+
" else:\n",
419+
" if zeroindex != 0:\n",
420+
" ends.append(zeroindex)\n",
421+
" zeroindex=0\n",
422+
" ends.append(i)\n",
423+
" if len(ends) < 4:\n",
424+
" print(game, ends)\n",
425+
" continue\n",
426+
" \n",
427+
" # 4 quarters\n",
428+
" q1 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=36)) for item in l[:ends[0]+1]]\n",
429+
" q2 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=24)) for item in l[ends[0]+1:ends[1]+1]]\n",
430+
" q3 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=12)) for item in l[ends[1]+1:ends[2]+1]]\n",
431+
" q4 = [datetime.datetime.strptime(item, \"%M:%S\") for item in l[ends[2]+1:ends[3]+1]]\n",
432+
" times = [(datetime.datetime(1900,1,1,0,48) - event).total_seconds() for event in q1+q2+q3+q4]\n",
433+
"\n",
434+
" timedf = pd.DataFrame([times,score]).transpose()\n",
435+
" tdata = []\n",
436+
" try:\n",
437+
" for i in range (360):\n",
438+
" tdata.append(timedf[timedf[0] <= 8*(i+1)].iloc[-1][1])\n",
439+
" except IndexError:\n",
440+
" continue\n",
441+
" tdf = pd.DataFrame(tdata).transpose()\n",
442+
" tdf.columns = [\"t\"+str(col) for col in tdf.columns]\n",
443+
" tdf.insert(0,'game_id',game)\n",
444+
" \n",
445+
" # overtime indicator\n",
446+
" if len(ends) > 4:\n",
447+
" tdf.insert(1,'overtime', 1)\n",
448+
" else:\n",
449+
" tdf.insert(1,'overtime', 0)\n",
450+
" \n",
451+
" # save to database\n",
452+
" tdf.to_sql('playbyplay', con=engine, if_exists='append', index=False)\n",
453+
" time.sleep(1)"
454+
]
455+
}
456+
],
457+
"metadata": {
458+
"kernelspec": {
459+
"display_name": "Python 3",
460+
"language": "python",
461+
"name": "python3"
462+
},
463+
"language_info": {
464+
"codemirror_mode": {
465+
"name": "ipython",
466+
"version": 3
467+
},
468+
"file_extension": ".py",
469+
"mimetype": "text/x-python",
470+
"name": "python",
471+
"nbconvert_exporter": "python",
472+
"pygments_lexer": "ipython3",
473+
"version": "3.7.11"
474+
}
475+
},
476+
"nbformat": 4,
477+
"nbformat_minor": 4
478+
}

‎Models.ipynb

+598
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,598 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"# Import libraries\n",
10+
"import nba_api.stats.endpoints\n",
11+
"from nba_api.stats.endpoints import leaguegamefinder, playbyplayv2\n",
12+
"from nba_api.stats.static import teams\n",
13+
"import pandas as pd\n",
14+
"from tqdm import tqdm\n",
15+
"import time\n",
16+
"import datetime\n",
17+
"import numpy as np\n",
18+
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
19+
"from sklearn.naive_bayes import MultinomialNB\n",
20+
"from sklearn.neighbors import KNeighborsClassifier\n",
21+
"from sklearn.cluster import KMeans\n",
22+
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
23+
"from sklearn.model_selection import train_test_split\n",
24+
"from sklearn.preprocessing import normalize, MinMaxScaler, StandardScaler\n",
25+
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
26+
"from imblearn.over_sampling import RandomOverSampler\n",
27+
"import requests, sqlalchemy\n",
28+
"from bs4 import BeautifulSoup\n",
29+
"import itertools\n",
30+
"\n",
31+
"# connect to postgres database\n",
32+
"engine = sqlalchemy.create_engine('postgresql://postgres:password@localhost:5432/NBA')\n",
33+
"\n",
34+
"pd.set_option('display.max_rows', 500)\n",
35+
"pd.set_option('display.max_columns', 500)\n",
36+
"pd.set_option('display.width', 1000)"
37+
]
38+
},
39+
{
40+
"cell_type": "markdown",
41+
"metadata": {},
42+
"source": [
43+
"# Score difference"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": 3,
49+
"metadata": {},
50+
"outputs": [],
51+
"source": [
52+
"# read training data from database\n",
53+
"work = pd.read_sql_query(\"select * from train\", con=engine)\n",
54+
"\n",
55+
"# playoffs dummy variable\n",
56+
"work['playoff'] = work['season_id'].str.extract(r'(\\d)\\d{4}').astype(int)\n",
57+
"work['playoff'] = work['playoff'].replace(2,0)\n",
58+
"work['playoff'] = work['playoff'].replace(4,1)\n",
59+
"work['season_id'] = work['season_id'].str.replace('^\\d','2')\n",
60+
"\n",
61+
"# Get season after 1995\n",
62+
"work = work[work['season_id'] >= '21996']"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": 7,
68+
"metadata": {},
69+
"outputs": [],
70+
"source": [
71+
"# drop columns we don't need\n",
72+
"train = work.drop(columns=['team_id','team_abbreviation','game_date','matchup','min','days'])\n",
73+
"\n",
74+
"# compute difference in stats\n",
75+
"train['blk_diff'] = train['blk'] - train['blk_oppos']\n",
76+
"train['oreb_diff'] = train['oreb'] - train['oreb_oppos']\n",
77+
"train['reb_diff'] = train['reb'] - train['reb_oppos']\n",
78+
"train['ast_diff'] = train['ast'] - train['ast_oppos']\n",
79+
"train['stl_diff'] = train['stl'] - train['stl_oppos']\n",
80+
"train['tov_diff'] = train['tov'] - train['tov_oppos']\n",
81+
"train['pf_diff'] = train['pf'] - train['pf_oppos']"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 8,
87+
"metadata": {},
88+
"outputs": [],
89+
"source": [
90+
"# read in the play-by-play data from database\n",
91+
"plays = pd.read_sql_query(\"select * from playbyplay\", con=engine)"
92+
]
93+
},
94+
{
95+
"cell_type": "code",
96+
"execution_count": 9,
97+
"metadata": {},
98+
"outputs": [],
99+
"source": [
100+
"# Cluster the play-by-play data \n",
101+
"kmeans = KMeans(6, random_state=0).fit(plays.drop(columns=['game_id']))\n",
102+
"\n",
103+
"# new feature as cluster\n",
104+
"plays['clusters'] = kmeans.labels_"
105+
]
106+
},
107+
{
108+
"cell_type": "code",
109+
"execution_count": 10,
110+
"metadata": {},
111+
"outputs": [],
112+
"source": [
113+
"# merge to training data\n",
114+
"train = train.merge(plays, left_on=['game_id'], right_on=['game_id'])"
115+
]
116+
},
117+
{
118+
"cell_type": "code",
119+
"execution_count": null,
120+
"metadata": {},
121+
"outputs": [],
122+
"source": [
123+
"# Get dummy variables\n",
124+
"final = pd.concat([\n",
125+
" pd.get_dummies(train['season_id']), \n",
126+
" pd.get_dummies(train['wl'],drop_first=True), \n",
127+
" pd.get_dummies(train['clusters'],prefix='cluster'),\n",
128+
" train.drop(columns=['awayteam','season_id','wl','hometeam','game_id'])], axis=1)\n",
129+
"\n",
130+
"# Split into train and test data\n",
131+
"tr, te = train_test_split(final,test_size=0.1,random_state=0)"
132+
]
133+
},
134+
{
135+
"cell_type": "markdown",
136+
"metadata": {},
137+
"source": [
138+
"## Model training"
139+
]
140+
},
141+
{
142+
"cell_type": "code",
143+
"execution_count": 13,
144+
"metadata": {},
145+
"outputs": [],
146+
"source": [
147+
"# Random forest\n",
148+
"rfr = RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1, random_state=0)"
149+
]
150+
},
151+
{
152+
"cell_type": "code",
153+
"execution_count": 14,
154+
"metadata": {},
155+
"outputs": [
156+
{
157+
"data": {
158+
"text/plain": [
159+
"RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1,\n",
160+
" random_state=0)"
161+
]
162+
},
163+
"execution_count": 14,
164+
"metadata": {},
165+
"output_type": "execute_result"
166+
}
167+
],
168+
"source": [
169+
"# Fit model\n",
170+
"rfr.fit(tr.drop(columns=['2diff']), tr['2diff'])"
171+
]
172+
},
173+
{
174+
"cell_type": "code",
175+
"execution_count": 15,
176+
"metadata": {},
177+
"outputs": [
178+
{
179+
"data": {
180+
"text/plain": [
181+
"9.373045180192985"
182+
]
183+
},
184+
"execution_count": 15,
185+
"metadata": {},
186+
"output_type": "execute_result"
187+
}
188+
],
189+
"source": [
190+
"# MAE on testing\n",
191+
"mean_absolute_error(te['2diff'],rfr.predict(te.drop(columns=['2diff'])))"
192+
]
193+
},
194+
{
195+
"cell_type": "code",
196+
"execution_count": 16,
197+
"metadata": {},
198+
"outputs": [
199+
{
200+
"data": {
201+
"text/plain": [
202+
"8.3139611855241"
203+
]
204+
},
205+
"execution_count": 16,
206+
"metadata": {},
207+
"output_type": "execute_result"
208+
}
209+
],
210+
"source": [
211+
"# MAE on training\n",
212+
"mean_absolute_error(tr['2diff'],rfr.predict(tr.drop(columns=['2diff'])))"
213+
]
214+
},
215+
{
216+
"cell_type": "code",
217+
"execution_count": 17,
218+
"metadata": {},
219+
"outputs": [],
220+
"source": [
221+
"# Team abbreviation conversion between our data and MSN\n",
222+
"team_dict = {'ATL': 'ATL',\n",
223+
" 'BKN': 'BKN',\n",
224+
" 'BOS': 'BOS',\n",
225+
" 'CHA': 'CHA',\n",
226+
" 'CHI': 'CHI',\n",
227+
" 'CLE': 'CLE',\n",
228+
" 'DAL': 'DAL',\n",
229+
" 'DEN': 'DEN',\n",
230+
" 'DET': 'DET',\n",
231+
" 'GS': 'GSW',\n",
232+
" 'HOU': 'HOU',\n",
233+
" 'IND': 'IND',\n",
234+
" 'LAC': 'LAC',\n",
235+
" 'LAL': 'LAL',\n",
236+
" 'MEM': 'MEM',\n",
237+
" 'MIA': 'MIA',\n",
238+
" 'MIL': 'MIL',\n",
239+
" 'MIN': 'MIN',\n",
240+
" 'NO': 'NOP',\n",
241+
" 'NY': 'NYK',\n",
242+
" 'OKC': 'OKC',\n",
243+
" 'ORL': 'ORL',\n",
244+
" 'PHI': 'PHI',\n",
245+
" 'PHO': 'PHX',\n",
246+
" 'POR': 'POR',\n",
247+
" 'SA': 'SAS',\n",
248+
" 'SAC': 'SAC',\n",
249+
" 'TOR': 'TOR',\n",
250+
" 'UTA': 'UTA',\n",
251+
" 'WAS': 'WAS'}"
252+
]
253+
},
254+
{
255+
"cell_type": "markdown",
256+
"metadata": {},
257+
"source": [
258+
"# Get game schedule tomorrow"
259+
]
260+
},
261+
{
262+
"cell_type": "code",
263+
"execution_count": 18,
264+
"metadata": {},
265+
"outputs": [],
266+
"source": [
267+
"# webscrape from MSN\n",
268+
"headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}\n",
269+
"today = str(int(str(datetime.date.today()).replace('-','')))\n",
270+
"games_today = requests.get('https://www.msn.com/en-us/sports/nba/schedule', headers=headers)\n",
271+
"html_soup = BeautifulSoup(games_today.content, 'html.parser')"
272+
]
273+
},
274+
{
275+
"cell_type": "code",
276+
"execution_count": 19,
277+
"metadata": {},
278+
"outputs": [],
279+
"source": [
280+
"# convert date to the number of days after the first game\n",
281+
"def convert_days(date):\n",
282+
" d = pd.Timestamp(1983,10,28)\n",
283+
" return (date - d).days"
284+
]
285+
},
286+
{
287+
"cell_type": "code",
288+
"execution_count": 20,
289+
"metadata": {},
290+
"outputs": [],
291+
"source": [
292+
"# get the predictors from past data\n",
293+
"teamstoday = []\n",
294+
"page = html_soup.find_all('div',{'id':today})[0].find_all('td')\n",
295+
"for i in range (len(page)):\n",
296+
" if i % 5 == 2:\n",
297+
" teamstoday.append(page[i].text.split('\\n')[1].strip())\n",
298+
" \n",
299+
"all_games = pd.read_sql_query(\"select * from raw\", con=engine)\n",
300+
"s = \"\" # email string\n",
301+
"for i in range (0, len(teamstoday), 2):\n",
302+
" away = team_dict[teamstoday[i]]\n",
303+
" home = team_dict[teamstoday[i+1]]\n",
304+
" s += away + ',' + home + '\\n'\n",
305+
" all_games['days'] = pd.to_datetime(all_games['game_date']).apply(convert_days)\n",
306+
" all_games['hometeam'] = all_games['matchup'].str.extract(r'\\w* @ (\\w*)')\n",
307+
" thiscomp1 = all_games[(all_games['team_abbreviation'] == away) & (all_games['hometeam'] == home)].reset_index(drop=True)\n",
308+
" thiscomp2 = all_games[(all_games['team_abbreviation'] == home) & (all_games['hometeam'] == away)].reset_index(drop=True)\n",
309+
" target = pd.concat([thiscomp1, thiscomp2]).sort_values('days').iloc[-1].drop(labels=['team_id','team_name','game_date','matchup','min'])\n",
310+
" daysdiff = (pd.Timestamp.today() - pd.Timestamp(1983,10,28)).days - target['days']\n",
311+
" \n",
312+
" data = pd.Series([0]*final.shape[1], index = final.drop(columns=['2diff']).columns)\n",
313+
" data['playoff'] = 0\n",
314+
" data['22020'] = 1\n",
315+
" if target['hometeam'] == home:\n",
316+
" data['opposite'] = 0\n",
317+
" else:\n",
318+
" data['opposite'] = 1\n",
319+
"\n",
320+
" if target['wl'] == 'W':\n",
321+
" data['W'] = 1\n",
322+
" else:\n",
323+
" data['W'] = 0\n",
324+
" \n",
325+
" # compute the predictors\n",
326+
" data['1diff'] = target['pts'] - target['pts_oppos']\n",
327+
" data['daysdiff'] = daysdiff\n",
328+
" data[28:64] = target[4:-2]\n",
329+
" data[66:115] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['team_abbreviation'])].drop(columns=['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']).iloc[0]\n",
330+
" data[115:-7] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['hometeam'])].drop(columns=['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']).iloc[0]\n",
331+
" data['blk_diff'] = data['blk'] - data['blk_oppos']\n",
332+
" data['oreb_diff'] = data['oreb'] - data['oreb_oppos']\n",
333+
" data['reb_diff'] = data['reb'] - data['reb_oppos']\n",
334+
" data['ast_diff'] = data['ast'] - data['ast_oppos']\n",
335+
" data['stl_diff'] = data['stl'] - data['stl_oppos']\n",
336+
" data['tov_diff'] = data['tov'] - data['tov_oppos']\n",
337+
" data['pf_diff'] = data['pf'] - data['pf_oppos']\n",
338+
" \n",
339+
" # prediction\n",
340+
" s += str(rfr.predict(data.values.reshape(1,-1))) + '\\n' # email string"
341+
]
342+
},
343+
{
344+
"cell_type": "markdown",
345+
"metadata": {},
346+
"source": [
347+
"## Over/Under"
348+
]
349+
},
350+
{
351+
"cell_type": "code",
352+
"execution_count": 21,
353+
"metadata": {},
354+
"outputs": [],
355+
"source": [
356+
"# Read in the training data from the database\n",
357+
"work = pd.read_sql_query(\"select * from train_total\", con=engine)\n",
358+
"\n",
359+
"# playoffs indicator\n",
360+
"work['playoff'] = work['season_id'].str.extract(r'(\\d)\\d{4}').astype(int)\n",
361+
"work['playoff'] = work['playoff'].replace(2,0)\n",
362+
"work['playoff'] = work['playoff'].replace(4,1)\n",
363+
"\n",
364+
"# Get season after 1995\n",
365+
"work['season_id'] = work['season_id'].str.replace('^\\d','2')\n",
366+
"work = work[work['season_id'] >= '21996']\n",
367+
"\n",
368+
"# drop columns we don't need\n",
369+
"train = work.drop(columns=['team_id','team_abbreviation','game_date','matchup','min','days'])\n",
370+
"\n",
371+
"# compute stat difference\n",
372+
"train['blk_diff'] = train['blk'] - train['blk_oppos']\n",
373+
"train['oreb_diff'] = train['oreb'] - train['oreb_oppos']\n",
374+
"train['reb_diff'] = train['reb'] - train['reb_oppos']\n",
375+
"train['ast_diff'] = train['ast'] - train['ast_oppos']\n",
376+
"train['stl_diff'] = train['stl'] - train['stl_oppos']\n",
377+
"train['tov_diff'] = train['tov'] - train['tov_oppos']\n",
378+
"train['pf_diff'] = train['pf'] - train['pf_oppos']"
379+
]
380+
},
381+
{
382+
"cell_type": "code",
383+
"execution_count": 22,
384+
"metadata": {},
385+
"outputs": [],
386+
"source": [
387+
"# dummy variables\n",
388+
"final = pd.concat([\n",
389+
" pd.get_dummies(train['season_id']), \n",
390+
" pd.get_dummies(train['wl'],drop_first=True), \n",
391+
" train.drop(columns=['awayteam','season_id','wl','hometeam','game_id'])], axis=1)\n",
392+
"\n",
393+
"# split into train and test dataset\n",
394+
"tr, te = train_test_split(final,test_size=0.1,random_state=0)"
395+
]
396+
},
397+
{
398+
"cell_type": "markdown",
399+
"metadata": {},
400+
"source": [
401+
"# Model training"
402+
]
403+
},
404+
{
405+
"cell_type": "code",
406+
"execution_count": 23,
407+
"metadata": {},
408+
"outputs": [],
409+
"source": [
410+
"# random forest\n",
411+
"rfr = RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1, random_state=0)"
412+
]
413+
},
414+
{
415+
"cell_type": "code",
416+
"execution_count": 24,
417+
"metadata": {},
418+
"outputs": [
419+
{
420+
"data": {
421+
"text/plain": [
422+
"RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1,\n",
423+
" random_state=0)"
424+
]
425+
},
426+
"execution_count": 24,
427+
"metadata": {},
428+
"output_type": "execute_result"
429+
}
430+
],
431+
"source": [
432+
"# fit the model\n",
433+
"rfr.fit(tr.drop(columns=['2diff']), tr['2diff'])"
434+
]
435+
},
436+
{
437+
"cell_type": "code",
438+
"execution_count": 25,
439+
"metadata": {},
440+
"outputs": [
441+
{
442+
"data": {
443+
"text/plain": [
444+
"14.881988864855604"
445+
]
446+
},
447+
"execution_count": 25,
448+
"metadata": {},
449+
"output_type": "execute_result"
450+
}
451+
],
452+
"source": [
453+
"# MAE for testing\n",
454+
"mean_absolute_error(te['2diff'],rfr.predict(te.drop(columns=['2diff'])))"
455+
]
456+
},
457+
{
458+
"cell_type": "code",
459+
"execution_count": 26,
460+
"metadata": {},
461+
"outputs": [
462+
{
463+
"data": {
464+
"text/plain": [
465+
"13.022569548413044"
466+
]
467+
},
468+
"execution_count": 26,
469+
"metadata": {},
470+
"output_type": "execute_result"
471+
}
472+
],
473+
"source": [
474+
"# MAE for training\n",
475+
"mean_absolute_error(tr['2diff'],rfr.predict(tr.drop(columns=['2diff'])))"
476+
]
477+
},
478+
{
479+
"cell_type": "markdown",
480+
"metadata": {},
481+
"source": [
482+
"## Get game schedule tomorrow"
483+
]
484+
},
485+
{
486+
"cell_type": "code",
487+
"execution_count": 27,
488+
"metadata": {},
489+
"outputs": [],
490+
"source": [
491+
"# webscrape from MSN\n",
492+
"teamstoday = []\n",
493+
"page = html_soup.find_all('div',{'id':today})[0].find_all('td')\n",
494+
"for i in range (len(page)):\n",
495+
" if i % 5 == 2:\n",
496+
" teamstoday.append(page[i].text.split('\\n')[1].strip())\n",
497+
"\n",
498+
"# get raw data from database\n",
499+
"all_games = pd.read_sql_query(\"select * from raw\", con=engine)\n",
500+
"\n",
501+
"# get predictors from past data\n",
502+
"for i in range (0, len(teamstoday), 2):\n",
503+
" away = team_dict[teamstoday[i]]\n",
504+
" home = team_dict[teamstoday[i+1]]\n",
505+
" s += away + ',' + home + '\\n' # email string\n",
506+
" all_games['days'] = pd.to_datetime(all_games['game_date']).apply(convert_days)\n",
507+
" all_games['hometeam'] = all_games['matchup'].str.extract(r'\\w* @ (\\w*)')\n",
508+
" thiscomp1 = all_games[(all_games['team_abbreviation'] == away) & (all_games['hometeam'] == home)].reset_index(drop=True)\n",
509+
" thiscomp2 = all_games[(all_games['team_abbreviation'] == home) & (all_games['hometeam'] == away)].reset_index(drop=True)\n",
510+
" target = pd.concat([thiscomp1, thiscomp2]).sort_values('days').iloc[-1].drop(labels=['team_id','team_name','game_date','matchup','min'])\n",
511+
" daysdiff = (pd.Timestamp.today() - pd.Timestamp(1983,10,28)).days - target['days']\n",
512+
" \n",
513+
" data = pd.Series([0]*final.shape[1], index = final.drop(columns=['2diff']).columns)\n",
514+
" data['playoff'] = 0\n",
515+
" data['22020'] = 1\n",
516+
" if target['hometeam'] == home:\n",
517+
" data['opposite'] = 0\n",
518+
" else:\n",
519+
" data['opposite'] = 1\n",
520+
"\n",
521+
" if target['wl'] == 'W':\n",
522+
" data['W'] = 1\n",
523+
" else:\n",
524+
" data['W'] = 0\n",
525+
" \n",
526+
" # compute the predictors\n",
527+
" data['1diff'] = target['pts'] - target['pts_oppos']\n",
528+
" data['daysdiff'] = daysdiff\n",
529+
" data[28:64] = target[4:-2]\n",
530+
" data[66:115] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['team_abbreviation'])].drop(columns=['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']).iloc[0]\n",
531+
" data[115:-7] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['hometeam'])].drop(columns=['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']).iloc[0]\n",
532+
" data['blk_diff'] = data['blk'] - data['blk_oppos']\n",
533+
" data['oreb_diff'] = data['oreb'] - data['oreb_oppos']\n",
534+
" data['reb_diff'] = data['reb'] - data['reb_oppos']\n",
535+
" data['ast_diff'] = data['ast'] - data['ast_oppos']\n",
536+
" data['stl_diff'] = data['stl'] - data['stl_oppos']\n",
537+
" data['tov_diff'] = data['tov'] - data['tov_oppos']\n",
538+
" data['pf_diff'] = data['pf'] - data['pf_oppos']\n",
539+
" \n",
540+
" # prediction\n",
541+
" s += str(rfr.predict(data.values.reshape(1,-1))) + '\\n' # email string"
542+
]
543+
},
544+
{
545+
"cell_type": "markdown",
546+
"metadata": {},
547+
"source": [
548+
"# Email the predictions"
549+
]
550+
},
551+
{
552+
"cell_type": "code",
553+
"execution_count": 29,
554+
"metadata": {},
555+
"outputs": [],
556+
"source": [
557+
"import smtplib, ssl\n",
558+
"\n",
559+
"port = 465 # For SSL\n",
560+
"smtp_server = \"smtp.gmail.com\"\n",
561+
"sender_email = \"leowei08@gmail.com\"\n",
562+
"receiver_email1 = \"leowei08@gmail.com\" \n",
563+
"password = 'password'\n",
564+
"message = \"\"\"\\\n",
565+
"Subject: Predictions Today {today}\n",
566+
"\n",
567+
"\n",
568+
"{content}.\"\"\"\n",
569+
"\n",
570+
"context = ssl.create_default_context()\n",
571+
"with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:\n",
572+
" server.login(sender_email, password)\n",
573+
" server.sendmail(sender_email, receiver_email1, message.format(today=str(datetime.date.today()).replace('-',''), content=s))"
574+
]
575+
}
576+
],
577+
"metadata": {
578+
"kernelspec": {
579+
"display_name": "Python 3",
580+
"language": "python",
581+
"name": "python3"
582+
},
583+
"language_info": {
584+
"codemirror_mode": {
585+
"name": "ipython",
586+
"version": 3
587+
},
588+
"file_extension": ".py",
589+
"mimetype": "text/x-python",
590+
"name": "python",
591+
"nbconvert_exporter": "python",
592+
"pygments_lexer": "ipython3",
593+
"version": "3.7.11"
594+
}
595+
},
596+
"nbformat": 4,
597+
"nbformat_minor": 4
598+
}

‎UpdateGames.ipynb

+401
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,401 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"# Import library\n",
10+
"from nba_api.stats.static import teams\n",
11+
"from nba_api.stats.endpoints import leaguegamefinder, playbyplayv2\n",
12+
"import pandas as pd\n",
13+
"from tqdm import tqdm\n",
14+
"import matplotlib.pyplot as plt\n",
15+
"import numpy as np\n",
16+
"import requests, datetime, time, sqlalchemy\n",
17+
"pd.set_option('display.max_rows', 500)\n",
18+
"pd.set_option('display.max_columns', 500)\n",
19+
"pd.set_option('display.width', 1000)"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 2,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"# data type for each column\n",
29+
"types = {'opposite': 'int',\n",
30+
" 'daysdiff': 'int',\n",
31+
" 'season_id': 'str',\n",
32+
" 'team_id': 'int',\n",
33+
" 'team_abbreviation': 'str',\n",
34+
" 'team_name': 'str',\n",
35+
" 'game_id': 'str',\n",
36+
" 'game_date': 'str',\n",
37+
" 'matchup': 'str',\n",
38+
" 'wl': 'str',\n",
39+
" 'min': 'int',\n",
40+
" 'pts': 'int',\n",
41+
" 'fgm': 'int',\n",
42+
" 'fga': 'int',\n",
43+
" 'fg_pct': 'float',\n",
44+
" 'fg3m': 'int',\n",
45+
" 'fg3a': 'float',\n",
46+
" 'fg3_pct': 'float',\n",
47+
" 'ftm': 'int',\n",
48+
" 'fta': 'int',\n",
49+
" 'ft_pct': 'float',\n",
50+
" 'oreb': 'float',\n",
51+
" 'dreb': 'float',\n",
52+
" 'reb': 'float',\n",
53+
" 'ast': 'int',\n",
54+
" 'stl': 'float',\n",
55+
" 'blk': 'int',\n",
56+
" 'tov': 'int',\n",
57+
" 'pf': 'int',\n",
58+
" 'pts_oppos': 'int',\n",
59+
" 'fgm_oppos': 'int',\n",
60+
" 'fga_oppos': 'int',\n",
61+
" 'fg_pct_oppos': 'float',\n",
62+
" 'fg3m_oppos': 'int',\n",
63+
" 'fg3a_oppos': 'float',\n",
64+
" 'fg3_pct_oppos': 'float',\n",
65+
" 'ftm_oppos': 'int',\n",
66+
" 'fta_oppos': 'int',\n",
67+
" 'ft_pct_oppos': 'float',\n",
68+
" 'oreb_oppos': 'float',\n",
69+
" 'dreb_oppos': 'float',\n",
70+
" 'reb_oppos': 'float',\n",
71+
" 'ast_oppos': 'int',\n",
72+
" 'stl_oppos': 'float',\n",
73+
" 'blk_oppos': 'int',\n",
74+
" 'tov_oppos': 'int',\n",
75+
" 'pf_oppos': 'int',\n",
76+
" 'days': 'int',\n",
77+
" 'hometeam': 'str',\n",
78+
" '1diff': 'int',\n",
79+
" '2diff': 'int',\n",
80+
" 'awayteam': 'str'}"
81+
]
82+
},
83+
{
84+
"cell_type": "markdown",
85+
"metadata": {},
86+
"source": [
87+
"## Update game history "
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 3,
93+
"metadata": {},
94+
"outputs": [],
95+
"source": [
96+
"# connect to local database\n",
97+
"engine = sqlalchemy.create_engine('postgresql://postgres:password@localhost:5432/NBA')"
98+
]
99+
},
100+
{
101+
"cell_type": "code",
102+
"execution_count": 4,
103+
"metadata": {},
104+
"outputs": [],
105+
"source": [
106+
"# get the game results today\n",
107+
"today = datetime.date.today() - datetime.timedelta(days=1)\n",
108+
"l = str(today).split('-')\n",
109+
"today = l[1]+\"/\"+l[2]+\"/\"+l[0]\n",
110+
"df = leaguegamefinder.LeagueGameFinder(date_from_nullable=today).get_data_frames()[0]"
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": 5,
116+
"metadata": {},
117+
"outputs": [
118+
{
119+
"name": "stderr",
120+
"output_type": "stream",
121+
"text": [
122+
"100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 130.35it/s]\n"
123+
]
124+
}
125+
],
126+
"source": [
127+
"# Get the colunm names for the home team to be PTS_home\n",
128+
"c = df.columns.tolist()[9:-1]\n",
129+
"l = []\n",
130+
"for i in range(len(c)):\n",
131+
" l.append(c[i] + '_oppos')\n",
132+
"\n",
133+
"# away team abbreviation\n",
134+
"away = df[df['MATCHUP'].str.contains('@')].reset_index(drop=True)\n",
135+
"\n",
136+
"# home team abbreviation\n",
137+
"home = df[df['MATCHUP'].str.contains('vs.')]\n",
138+
"\n",
139+
"# convert a home team row and an away team row to 1 away and home team row\n",
140+
"new = pd.DataFrame()\n",
141+
"for i in tqdm(range(away.shape[0])):\n",
142+
" tmp = home[home['GAME_ID']==away.iloc[i,:]['GAME_ID']].iloc[:,9:-1].reset_index(drop=True)\n",
143+
" tmp.columns = l\n",
144+
" new = pd.concat([new, pd.concat([away.iloc[i:i+1,:].reset_index(drop=True), tmp],axis=1)])"
145+
]
146+
},
147+
{
148+
"cell_type": "code",
149+
"execution_count": 6,
150+
"metadata": {},
151+
"outputs": [],
152+
"source": [
153+
"# drop column and fill averages with 0\n",
154+
"new.drop(columns=['PLUS_MINUS'],inplace=True)\n",
155+
"new = new.fillna(0)"
156+
]
157+
},
158+
{
159+
"cell_type": "code",
160+
"execution_count": 7,
161+
"metadata": {},
162+
"outputs": [],
163+
"source": [
164+
"# make column names lowercase\n",
165+
"cols = [item.lower() for item in new.columns.to_list()]"
166+
]
167+
},
168+
{
169+
"cell_type": "code",
170+
"execution_count": 8,
171+
"metadata": {},
172+
"outputs": [],
173+
"source": [
174+
"new.columns = cols"
175+
]
176+
},
177+
{
178+
"cell_type": "code",
179+
"execution_count": 15,
180+
"metadata": {},
181+
"outputs": [],
182+
"source": [
183+
"# add them to the database\n",
184+
"new.reset_index(drop=True).to_sql('raw',engine,if_exists='append',index=False)"
185+
]
186+
},
187+
{
188+
"cell_type": "markdown",
189+
"metadata": {},
190+
"source": [
191+
"## Update training data"
192+
]
193+
},
194+
{
195+
"cell_type": "code",
196+
"execution_count": 10,
197+
"metadata": {},
198+
"outputs": [],
199+
"source": [
200+
"# get home team abbreviation for todays games\n",
201+
"new['hometeam'] = new['matchup'].str.extract(r'\\w* @ (\\w*)')"
202+
]
203+
},
204+
{
205+
"cell_type": "code",
206+
"execution_count": 11,
207+
"metadata": {},
208+
"outputs": [],
209+
"source": [
210+
"# get all past game data\n",
211+
"games = pd.read_sql_query(\"select * from raw\", con=engine)"
212+
]
213+
},
214+
{
215+
"cell_type": "code",
216+
"execution_count": 12,
217+
"metadata": {},
218+
"outputs": [],
219+
"source": [
220+
"# convert date to number of days after the first game\n",
221+
"def convert_days(date):\n",
222+
" d = pd.Timestamp(1983,10,28)\n",
223+
" return (date - d).days\n",
224+
"\n",
225+
"# compute\n",
226+
"games['days'] = pd.to_datetime(games['game_date']).apply(convert_days)\n",
227+
"\n",
228+
"# get home team abbreviation for past games\n",
229+
"games['hometeam'] = games['matchup'].str.extract(r'\\w* @ (\\w*)')"
230+
]
231+
},
232+
{
233+
"cell_type": "code",
234+
"execution_count": 17,
235+
"metadata": {},
236+
"outputs": [],
237+
"source": [
238+
"# Find the last time before today both teams faced each other\n",
239+
"z = 0\n",
240+
"for i in range (new.shape[0]):\n",
241+
" z += 1\n",
242+
" home = new.iloc[i]['hometeam']\n",
243+
" away = new.iloc[i]['team_abbreviation']\n",
244+
" one = games[(games['team_abbreviation']==home) & (games['hometeam'] == away)].reset_index(drop=True)\n",
245+
" two = games[(games['team_abbreviation']==away) & (games['hometeam'] == home)].reset_index(drop=True)\n",
246+
" tmp = pd.concat([one,two]).sort_values('days').iloc[-2:,:]\n",
247+
" \n",
248+
" d = {}\n",
249+
" if tmp.shape[0] > 1:\n",
250+
" if tmp.iloc[0]['team_abbreviation'] != tmp.iloc[1]['team_abbreviation']:\n",
251+
" d['opposite'] = 1\n",
252+
" else:\n",
253+
" d['opposite'] = 0\n",
254+
" d['daysdiff'] = tmp.iloc[1]['days']-tmp.iloc[0]['days']\n",
255+
" tmp1 = pd.Series(d).append(tmp.iloc[0])\n",
256+
" d = {}\n",
257+
" \n",
258+
" # compute score difference\n",
259+
" d['1diff'] = tmp.iloc[0]['pts'] - tmp.iloc[0]['pts_oppos']\n",
260+
" d['2diff'] = tmp.iloc[1]['pts'] - tmp.iloc[1]['pts_oppos']\n",
261+
" d['awayteam'] = tmp.iloc[0]['team_abbreviation']\n",
262+
" tmp2 = tmp1.append(pd.Series(d))\n",
263+
" \n",
264+
" # save to database\n",
265+
" tmp2.to_frame().transpose().astype(types).to_sql('train',con=engine,if_exists='append',index=False)"
266+
]
267+
},
268+
{
269+
"cell_type": "code",
270+
"execution_count": 23,
271+
"metadata": {},
272+
"outputs": [],
273+
"source": [
274+
"# Find the last time before today both teams faced each other\n",
275+
"z = 0\n",
276+
"for i in range (new.shape[0]):\n",
277+
" z += 1\n",
278+
" home = new.iloc[i]['hometeam']\n",
279+
" away = new.iloc[i]['team_abbreviation']\n",
280+
" one = games[(games['team_abbreviation']==home) & (games['hometeam'] == away)].reset_index(drop=True)\n",
281+
" two = games[(games['team_abbreviation']==away) & (games['hometeam'] == home)].reset_index(drop=True)\n",
282+
" tmp = pd.concat([one,two]).sort_values('days').iloc[-2:,:]\n",
283+
" \n",
284+
" d = {}\n",
285+
" if tmp.shape[0] > 1:\n",
286+
" if tmp.iloc[0]['team_abbreviation'] != tmp.iloc[1]['team_abbreviation']:\n",
287+
" d['opposite'] = 1\n",
288+
" else:\n",
289+
" d['opposite'] = 0\n",
290+
" d['daysdiff'] = tmp.iloc[1]['days']-tmp.iloc[0]['days']\n",
291+
" tmp1 = pd.Series(d).append(tmp.iloc[0])\n",
292+
" d = {}\n",
293+
" \n",
294+
" # compute score total\n",
295+
" d['1diff'] = tmp.iloc[0]['pts'] + tmp.iloc[0]['pts_oppos']\n",
296+
" d['2diff'] = tmp.iloc[1]['pts'] + tmp.iloc[1]['pts_oppos']\n",
297+
" d['awayteam'] = tmp.iloc[0]['team_abbreviation']\n",
298+
" tmp2 = tmp1.append(pd.Series(d))\n",
299+
" \n",
300+
" # save to database\n",
301+
" tmp2.to_frame().transpose().astype(types).to_sql('train_total',con=engine,if_exists='append',index=False)"
302+
]
303+
},
304+
{
305+
"cell_type": "markdown",
306+
"metadata": {},
307+
"source": [
308+
"## Update Play-by-Play Data"
309+
]
310+
},
311+
{
312+
"cell_type": "code",
313+
"execution_count": 21,
314+
"metadata": {},
315+
"outputs": [],
316+
"source": [
317+
"# function to convert score string to score difference\n",
318+
"def calc_scorediff(x):\n",
319+
" if x != None:\n",
320+
" return eval(x)"
321+
]
322+
},
323+
{
324+
"cell_type": "code",
325+
"execution_count": 22,
326+
"metadata": {},
327+
"outputs": [],
328+
"source": [
329+
"# get play-by-play data for the games today\n",
330+
"for game in new['game_id']:\n",
331+
" play = playbyplayv2.PlayByPlayV2(game).get_data_frames()[0]\n",
332+
" if play.shape[0] != 0:\n",
333+
" score = play['SCORE'].apply(calc_scorediff).fillna(method='ffill').fillna(0)\n",
334+
"\n",
335+
" # Record the last index of 0:00 of each quarter\n",
336+
" zeroindex = 0\n",
337+
" ends = []\n",
338+
" l = play['PCTIMESTRING'].to_list()\n",
339+
" for i in range (len(l)):\n",
340+
" if l[i] == '0:00':\n",
341+
" zeroindex = i\n",
342+
" else:\n",
343+
" if zeroindex != 0:\n",
344+
" ends.append(zeroindex)\n",
345+
" zeroindex=0\n",
346+
" ends.append(i)\n",
347+
" if len(ends) < 4:\n",
348+
" continue\n",
349+
" \n",
350+
" # 4 quarters\n",
351+
" q1 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=36)) for item in l[:ends[0]+1]]\n",
352+
" q2 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=24)) for item in l[ends[0]+1:ends[1]+1]]\n",
353+
" q3 = [(datetime.datetime.strptime(item, \"%M:%S\") + datetime.timedelta(minutes=12)) for item in l[ends[1]+1:ends[2]+1]]\n",
354+
" q4 = [datetime.datetime.strptime(item, \"%M:%S\") for item in l[ends[2]+1:ends[3]+1]]\n",
355+
" times = [(datetime.datetime(1900,1,1,0,48) - event).total_seconds() for event in q1+q2+q3+q4]\n",
356+
"\n",
357+
" timedf = pd.DataFrame([times,score]).transpose()\n",
358+
" tdata = []\n",
359+
" try:\n",
360+
" for i in range (360):\n",
361+
" tdata.append(timedf[timedf[0] <= 8*(i+1)].iloc[-1][1])\n",
362+
" except IndexError:\n",
363+
" continue\n",
364+
" tdf = pd.DataFrame(tdata).transpose()\n",
365+
" tdf.columns = [\"t\"+str(col) for col in tdf.columns]\n",
366+
" tdf.insert(0,'game_id',game)\n",
367+
" \n",
368+
" # overtime indicator\n",
369+
" if len(ends) > 4:\n",
370+
" tdf.insert(1,'overtime', 1)\n",
371+
" else:\n",
372+
" tdf.insert(1,'overtime', 0)\n",
373+
" \n",
374+
" # save to database\n",
375+
" tdf.to_sql('playbyplay', con=engine, if_exists='append', index=False)\n",
376+
" time.sleep(3)"
377+
]
378+
}
379+
],
380+
"metadata": {
381+
"kernelspec": {
382+
"display_name": "Python 3",
383+
"language": "python",
384+
"name": "python3"
385+
},
386+
"language_info": {
387+
"codemirror_mode": {
388+
"name": "ipython",
389+
"version": 3
390+
},
391+
"file_extension": ".py",
392+
"mimetype": "text/x-python",
393+
"name": "python",
394+
"nbconvert_exporter": "python",
395+
"pygments_lexer": "ipython3",
396+
"version": "3.7.11"
397+
}
398+
},
399+
"nbformat": 4,
400+
"nbformat_minor": 4
401+
}

0 commit comments

Comments
 (0)
Please sign in to comment.