-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpullingData.py
More file actions
163 lines (131 loc) · 7.61 KB
/
pullingData.py
File metadata and controls
163 lines (131 loc) · 7.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import pandas as pd
from nba_api.stats.endpoints import LeagueGameLog
import stats
import re
# method to calculate the expanding mean
def expanding_mean(group):
return group.shift(1).expanding().mean()
def main():
# Step 1: Pull historic data and calculate season stats
season_list = []
for i in range(2019, 2026):
games = LeagueGameLog(season=i, season_type_all_star='Regular Season').get_data_frames()[0]
games = games.merge(games, how='inner', on='GAME_ID', suffixes=[None, '_OPP'])
games = games[games["TEAM_ID"] != games["TEAM_ID_OPP"]].reset_index(drop=True)
games.drop(columns=['SEASON_ID', 'VIDEO_AVAILABLE', 'SEASON_ID_OPP', 'VIDEO_AVAILABLE_OPP', 'WL_OPP'], inplace=True)
# Advanced stats for team
games['PACE'] = stats.pace(games)
games['POSSESSIONS'] = stats.possessions(games)
games['OFF_RTG'] = stats.off_rating(games)
games['DEF_RTG'] = stats.def_rating(games)
games['NET_RTG'] = games['OFF_RTG'] - games['DEF_RTG']
games['EFG'] = stats.eff_fg_pct(games)
games['TOV_PCT'] = stats.tov_pct(games)
games['OREB_PCT'] = stats.oreb_pct(games)
games['DREB_PCT'] = stats.dreb_pct(games)
games['FT_PER_FGA'] = stats.ft_per_fga(games)
# Advanced stats for opposing team
games['PACE_OPP'] = stats.pace_OPP(games)
games['POSSESSIONS_OPP'] = stats.possessions_OPP(games)
games['OFF_RTG_OPP'] = stats.off_rating_OPP(games)
games['DEF_RTG_OPP'] = stats.def_rating_OPP(games)
games['NET_RTG_OPP'] = games['OFF_RTG_OPP'] - games['DEF_RTG_OPP']
games['EFG_OPP'] = stats.eff_fg_pct_OPP(games)
games['TOV_PCT_OPP'] = stats.tov_pct_OPP(games)
games['OREB_PCT_OPP'] = stats.oreb_pct_OPP(games)
games['DREB_PCT_OPP'] = stats.dreb_pct_OPP(games)
games['FT_PER_FGA_OPP'] = stats.ft_per_fga_OPP(games)
games['HOME'] = games['MATCHUP'].apply(lambda x: 0 if re.search(r'@', str(x)) else 1)
# name_cols is the list of metadata columns that we dont want to apply expanding means on
name_cols = ['GAME_DATE', 'GAME_DATE_OPP', 'MATCHUP', 'MATCHUP_OPP',
'TEAM_ABBREVIATION', 'TEAM_ABBREVIATION_OPP', 'TEAM_NAME',
'TEAM_NAME_OPP', 'WL', 'GAME_ID', 'MIN', 'MIN_OPP', 'HOME', 'TEAM_ID', 'TEAM_ID_OPP']
# game_stats are the remaining columns, or stats, that we wish to apply expanding mean on
game_stats = games.columns.difference(name_cols)
# get all the previous games for both the home team and the away team, and then calculate their average stats up until the current matchup
grouped_home = games.groupby('TEAM_ID', group_keys=False)[[col for col in game_stats if not col.endswith('_OPP')]].apply(expanding_mean)
grouped_opp = games.groupby('TEAM_ID_OPP', group_keys=False)[[col for col in game_stats if col.endswith('_OPP')]].apply(expanding_mean)
# create a new row for this matchup with the averaged stats and add to our set for the season
grouped = grouped_home.join(grouped_opp).join(games[name_cols]).loc[:, games.columns.tolist()]
season_list.append(grouped)
historic_df = pd.concat(season_list, ignore_index=True)
# Step 2: Load odds with upcoming games info
odds_df = pd.read_csv("data/todays_odds.csv") # Columns: Date, Matchup, Bookmaker, Team, Moneyline Odds, Opposing Odds, Estimated Win Percentage
odds_df['Date'] = pd.to_datetime(odds_df['Date']).dt.date
# Step 3: Build new rows for upcoming games with rolling averages for those teams
future_rows = []
max_game_date = historic_df['GAME_DATE'].max()
# print(teams_group.groups.keys())
for _, row in odds_df.iterrows():
matchup = row['Matchup']
game_date = row['Date']
pattern = r"(.+?)\s+vs\s+(.+)"
match = re.match(pattern, matchup)
if match:
home_team = match.group(1)
away_team = match.group(2)
home = row['Team'] == home_team
if home_team == 'Los Angeles Clippers':
home_team = 'LA Clippers'
if away_team == 'Los Angeles Clippers':
away_team = 'LA Clippers'
home_games = historic_df.groupby('TEAM_NAME').get_group(home_team)
away_games = historic_df.groupby('TEAM_NAME_OPP').get_group(away_team)
home_last_game = home_games[home_games['GAME_DATE'] <= max_game_date].sort_values('GAME_DATE').iloc[-1].copy()
away_last_game = away_games[away_games['GAME_DATE'] <= max_game_date].sort_values('GAME_DATE').iloc[-1].copy()
name_cols = ['GAME_DATE', 'GAME_DATE_OPP', 'MATCHUP', 'MATCHUP_OPP',
'TEAM_ABBREVIATION', 'TEAM_ABBREVIATION_OPP', 'TEAM_NAME',
'TEAM_NAME_OPP', 'WL', 'GAME_ID', 'MIN', 'MIN_OPP', 'HOME', 'TEAM_ID','TEAM_ID_OPP']
game_cols = games.columns.difference(name_cols)
new_row = {
'GAME_DATE': game_date,
'GAME_DATE_OPP': game_date,
'MATCHUP': matchup,
'MATCHUP_OPP': matchup,
'HOME': home
}
if home:
game_stats_home = home_last_game[[col for col in game_cols if not col.endswith('_OPP')]]
game_stats_opp = away_last_game[[col for col in game_cols if col.endswith('_OPP')]]
new_row.update({
'GAME_ID': home_last_game['GAME_ID'], # wrong but shouldnt matter
'TEAM_ID': home_last_game['TEAM_ID'],
'TEAM_ID_OPP': away_last_game['TEAM_ID_OPP'],
'TEAM_NAME': home_team,
'TEAM_NAME_OPP': away_team,
'TEAM_ABBREVIATION': home_last_game['TEAM_ABBREVIATION'],
'TEAM_ABBREVIATION_OPP': away_last_game['TEAM_ABBREVIATION_OPP'],
'MIN': home_last_game['MIN'],
'MIN_OPP': away_last_game['MIN_OPP'],
**game_stats_home,
**game_stats_opp,
})
else:
game_stats_home = away_last_game[[col for col in game_cols if col.endswith('_OPP')]]
game_stats_opp = home_last_game[[col for col in game_cols if not col.endswith('_OPP')]]
# For game_stats_home: remove "_OPP" from index labels
game_stats_home = game_stats_home.rename(lambda x: x.replace('_OPP', ''))
# For game_stats_opp: add "_OPP" suffix to index labels
game_stats_opp = game_stats_opp.rename(lambda x: x + '_OPP')
new_row.update({
'GAME_ID': away_last_game['GAME_ID'],
'TEAM_NAME': away_team,
'TEAM_NAME_OPP': home_team,
'TEAM_ID': away_last_game['TEAM_ID_OPP'],
'TEAM_ID_OPP': home_last_game['TEAM_ID'],
'TEAM_ABBREVIATION': away_last_game['TEAM_ABBREVIATION_OPP'],
'TEAM_ABBREVIATION_OPP': home_last_game['TEAM_ABBREVIATION'],
'MIN': away_last_game['MIN'],
'MIN_OPP': home_last_game['MIN_OPP'],
**game_stats_home,
**game_stats_opp,
})
future_rows.append(new_row)
upcoming_df = pd.DataFrame(future_rows)
# Step 4: Append upcoming rows to historic dataframe
full_df = pd.concat([historic_df, upcoming_df], ignore_index=True)
# Optionally save or return full dataset
full_df.to_csv("data/19_25_with_upcoming.csv", index=False)
print(f"Appended {len(upcoming_df)} upcoming game rows to historic data. Saved to data/19_25_with_upcoming.csv")
if __name__ == '__main__':
main()