diff --git a/NCAA March Madness/README.md b/NCAA March Madness/README.md deleted file mode 100644 index e09c3bf..0000000 --- a/NCAA March Madness/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# 2023 NCAA March Madness Predictions - -In this series of notebooks, we'll demonstrate how to set up a Snowflake environment, ingest data, conduct feature engineering, prepare and train a machine learning model, and ultimately, predict the outcomes of the 2023 NCAA March Madness tournament using [Snowpark for Python](https://www.snowflake.com/en/data-cloud/snowpark/). - -These examples showcase some of the work conducted for the [March Machine Learning Mania 2023](https://www.kaggle.com/competitions/march-machine-learning-mania-2023/data) Kaggle competition and are inspired by the article [Predicting the Unpredictable — March Madness using Snowpark and Hex](https://medium.com/snowflake/predicting-the-unpredictable-march-madness-using-snowpark-and-hex-f16dc4f57add) by [Chase Romano](https://medium.com/@chasea.romano) and [Tyler White](https://medium.com/@btylerwhite). - -### Setting it up. - -Clone the repo. If you're using macOS or Linux, execute the following commands: -``` -python3.8 -m venv venv -source venv/bin/activate -pip install -r NCAA\ March\ Madness/requirements.txt -``` - -## Solution Overview - -![image](https://user-images.githubusercontent.com/50381805/230445402-5e616b05-f87c-4df2-b2c4-c1e3ae647c1a.png) - -### Preparing the Environment - -First, we will create a dedicated database and a few schemas to keep our data organized within the database. - -### Data Ingestion - -Next, we will download the source data from the Kaggle competition and store it in the `RAW` schema within the `MARCH_MADNESS` database. - -### Feature Engineering - -In this step, we will aggregate season statistics, tournament statistics, and coaching tenures to use as features for training our machine learning model. - -### Model Preparation and Training - -We will combine the features with the target variable and prepare the dataset for model training. Then, we will train a machine learning model using the prepared data. - -### Model Inference - -Finally, we will use the trained model to make predictions for the 2023 NCAA March Madness tournament. diff --git a/NCAA March Madness/notebooks/1-environment-preparation.ipynb b/NCAA March Madness/notebooks/1-environment-preparation.ipynb deleted file mode 100644 index 8520033..0000000 --- a/NCAA March Madness/notebooks/1-environment-preparation.ipynb +++ /dev/null @@ -1,82 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from dotenv import load_dotenv\n", - "from snowflake.snowpark import Session\n", - "\n", - "load_dotenv(override=True)\n", - "\n", - "connection_params = {\n", - " \"account\": os.environ.get(\"SNOWFLAKE_ACCOUNT\"),\n", - " \"user\": os.environ.get(\"SNOWFLAKE_USER\"),\n", - " \"password\": os.environ.get(\"SNOWFLAKE_PASSWORD\"),\n", - " \"role\": \"SYSADMIN\",\n", - "}\n", - "\n", - "session = Session.builder.configs(connection_params).create()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Row(status='Stage area PYTHON_CODE successfully created.')]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.sql(\n", - " \"\"\"CREATE WAREHOUSE IF NOT EXISTS MARCH_MADNESS_WH \n", - " WITH WAREHOUSE_SIZE = 'XSMALL' \n", - " WAREHOUSE_TYPE = 'STANDARD' \n", - " AUTO_SUSPEND = 60 \n", - " AUTO_RESUME = TRUE \n", - " INITIALLY_SUSPENDED = TRUE;\"\"\"\n", - ").collect()\n", - "session.sql(\"CREATE DATABASE IF NOT EXISTS MARCH_MADNESS;\").collect()\n", - "session.sql(\"DROP SCHEMA IF EXISTS MARCH_MADNESS.PUBLIC;\").collect()\n", - "session.sql(\"CREATE SCHEMA IF NOT EXISTS MARCH_MADNESS.COMMON;\").collect()\n", - "session.sql(\"CREATE SCHEMA IF NOT EXISTS MARCH_MADNESS.RAW;\").collect()\n", - "session.sql(\"CREATE SCHEMA IF NOT EXISTS MARCH_MADNESS.FEATURES;\").collect()\n", - "session.sql(\"CREATE STAGE IF NOT EXISTS MARCH_MADNESS.COMMON.MODELS;\").collect()\n", - "session.sql(\"CREATE STAGE IF NOT EXISTS MARCH_MADNESS.COMMON.PYTHON_CODE;\").collect()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/NCAA March Madness/notebooks/2-pipeline.ipynb b/NCAA March Madness/notebooks/2-pipeline.ipynb deleted file mode 100644 index 689c787..0000000 --- a/NCAA March Madness/notebooks/2-pipeline.ipynb +++ /dev/null @@ -1,1008 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "import zipfile\n", - "\n", - "import cachetools\n", - "import joblib\n", - "import pandas as pd\n", - "import snowflake.snowpark.functions as F\n", - "import snowflake.snowpark.types as T\n", - "import xgboost as xgb\n", - "from dotenv import load_dotenv\n", - "from sklearn.metrics import accuracy_score, classification_report\n", - "from snowflake.snowpark import Session\n", - "\n", - "load_dotenv(override=True)\n", - "\n", - "connection_params = {\n", - " \"account\": os.environ.get(\"SNOWFLAKE_ACCOUNT\"),\n", - " \"user\": os.environ.get(\"SNOWFLAKE_USER\"),\n", - " \"password\": os.environ.get(\"SNOWFLAKE_PASSWORD\"),\n", - " \"role\": \"SYSADMIN\",\n", - " \"database\": \"MARCH_MADNESS\",\n", - " \"warehouse\": \"MARCH_MADNESS_WH\",\n", - " \"schema\": \"COMMON\",\n", - "}\n", - "\n", - "session = Session.builder.configs(connection_params).create()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Ingestion\n", - "\n", - "Load the contents of the CSV files into raw tables." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Check if the file exists, if it doesn't, we'll assume the data has already been loaded.\n", - "if os.path.exists(\"../data/march-machine-learning-mania-2023.zip\"):\n", - " # Open the zip file using the 'with' statement to ensure proper closing\n", - " with zipfile.ZipFile(\"../data/march-machine-learning-mania-2023.zip\") as zf:\n", - " # Iterate over the files in the zip file\n", - " for file in zf.filelist:\n", - " # Check if the file is a CSV file\n", - " if file.filename.endswith(\".csv\"):\n", - " # Open the CSV file within the zip file\n", - " with zf.open(file.filename) as z:\n", - " # Read the CSV file into a pandas DataFrame using the ISO-8859-1 encoding\n", - " df = pd.read_csv(z, encoding=\"iso-8859-1\")\n", - "\n", - " # Create the table name for the Snowflake schema using the CSV file name\n", - " table_name = (\n", - " f\"RAW.{file.filename.split('/')[-1].replace('.csv', '').upper()}\"\n", - " )\n", - "\n", - " # Convert the column names to uppercase\n", - " df.columns = [col.upper() for col in df.columns]\n", - "\n", - " # Save the pandas DataFrame as a table in Snowflake with the specified table name\n", - " # and overwrite the table if it already exists\n", - " session.create_dataframe(df).write.save_as_table(\n", - " table_name=table_name, mode=\"overwrite\"\n", - " )" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Feature Engineering\n", - "\n", - "Create features from the raw data for model preparation.\n", - "\n", - "These will be defined as stored procedures." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_season_averages(\n", - " session: Session, source_table: str, target_table: str\n", - ") -> str:\n", - " # Assign Snowflake DataFrame to season results table.\n", - " season_results = session.table(source_table)\n", - "\n", - " # Drop DAYNUM and WLOC as they aren't needed.\n", - " season_results = season_results.drop(\"DAYNUM\", \"WLOC\")\n", - "\n", - " # Drop any columns that start with an L.\n", - " w_season_results = season_results.select(\n", - " *[col for col in season_results.columns if not col.startswith(\"L\")]\n", - " )\n", - " w_season_results = w_season_results.select(\n", - " [\n", - " F.col(col).alias(col[1:]) if col.startswith(\"W\") else col\n", - " for col in w_season_results.columns\n", - " ]\n", - " )\n", - "\n", - " # Drop any columns that start with an L.\n", - " l_season_results = season_results.select(\n", - " *[col for col in season_results.columns if not col.startswith(\"W\")]\n", - " )\n", - " l_season_results = l_season_results.select(\n", - " [\n", - " F.col(col).alias(col[1:]) if col.startswith(\"L\") else col\n", - " for col in l_season_results.columns\n", - " ]\n", - " )\n", - "\n", - " # Union these dataframes for the entirety of the seasonal stats.\n", - " union_season_results = w_season_results.union(l_season_results)\n", - "\n", - " # Average all columns besides SEASON and TEAMID.\n", - " avg_union_season_results = union_season_results.group_by(\"SEASON\", \"TEAMID\").agg(\n", - " *[\n", - " F.avg(F.col(c)).alias(f\"AVG_{c}\")\n", - " for c in union_season_results.columns\n", - " if c not in [\"SEASON\", \"TEAMID\"]\n", - " ]\n", - " )\n", - "\n", - " avg_union_season_results.write.save_as_table(target_table, mode=\"overwrite\")\n", - "\n", - " return f\"Successfully created {target_table}.\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can test the function locally before we register it as a stored procedure." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Successfully created FEATURES.MAVGSEASONDETAILEDRESULTS.'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "prepare_season_averages(\n", - " session, \"RAW.MREGULARSEASONDETAILEDRESULTS\", \"FEATURES.MAVGSEASONDETAILEDRESULTS\"\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Register the stored procedure." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.sproc.register(\n", - " func=prepare_season_averages,\n", - " return_type=T.StringType(),\n", - " input_types=[T.StringType(), T.StringType()],\n", - " name=\"PREPARE_SEASON_AVERAGES\",\n", - " is_permanent=True,\n", - " stage_location=\"@COMMON.PYTHON_CODE\",\n", - " packages=[\"snowflake-snowpark-python\"],\n", - " replace=True,\n", - " source_code_display=True,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Call the stored procedure to average the Men's regular season detailed stats. " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Successfully created FEATURES.MAVGSEASONDETAILEDRESULTS.'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.call(\n", - " \"PREPARE_SEASON_AVERAGES\",\n", - " \"RAW.MREGULARSEASONDETAILEDRESULTS\",\n", - " \"FEATURES.MAVGSEASONDETAILEDRESULTS\",\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define a function to one-hot encode the conference data. The minimum season in the primary data is 2003, so we'll filter to only look at those years." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_ohe_conferences(\n", - " session: Session, source_table: str, target_table: str\n", - ") -> str:\n", - " # Assign Snowflake DataFrame to Results table.\n", - " conferences_df = session.table(source_table)\n", - "\n", - " # Filter to 2003 and later seasons.\n", - " conferences_df = conferences_df.filter(F.col(\"SEASON\") >= F.lit(2003))\n", - "\n", - " # Rename the 'CONFABBREV' column to 'CONFERENCE' and convert the data to a Pandas DataFrame\n", - " conferences_df = conferences_df.with_column_renamed(\n", - " \"CONFABBREV\", \"CONFERENCE\"\n", - " ).to_pandas()\n", - "\n", - " # One-hot encode the 'CONFERENCE' column using Pandas get_dummies function\n", - " one_hot_encoded_df = pd.get_dummies(conferences_df, columns=[\"CONFERENCE\"])\n", - "\n", - " # Uppercase all of these columns.\n", - " one_hot_encoded_df.columns = [col.upper() for col in one_hot_encoded_df.columns]\n", - "\n", - " # Convert the Pandas DataFrame back to a Snowflake DataFrame\n", - " one_hot_encoded_df = session.create_dataframe(one_hot_encoded_df)\n", - "\n", - " # Save the Snowflake DataFrame as a table in Snowflake with the specified table name\n", - " one_hot_encoded_df.write.save_as_table(target_table, mode=\"overwrite\")\n", - "\n", - " return f\"Successfully created {target_table}.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Register the stored procedure." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.sproc.register(\n", - " func=prepare_ohe_conferences,\n", - " return_type=T.StringType(),\n", - " input_types=[T.StringType(), T.StringType()],\n", - " name=\"PREPARE_OHE_CONFERENCES\",\n", - " is_permanent=True,\n", - " stage_location=\"@COMMON.PYTHON_CODE\",\n", - " packages=[\"pandas\", \"snowflake-snowpark-python\"],\n", - " replace=True,\n", - " source_code_display=True,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Call the stored procedure to build a table of one-hot encoded conference data." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Successfully created FEATURES.MTEAMCONFERENCESOHE.'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.call(\n", - " \"PREPARE_OHE_CONFERENCES\", \"RAW.MTEAMCONFERENCES\", \"FEATURES.MTEAMCONFERENCESOHE\"\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Preparation\n", - "\n", - "Bringing in the existing tournament results, regular season averages, and conference data." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_tourney_results(session: Session, target_table: str) -> str:\n", - " tourney_results = (\n", - " session.table(\"RAW.MNCAATOURNEYCOMPACTRESULTS\")\n", - " .filter(F.col(\"SEASON\") >= F.lit(2003))\n", - " .select(\"SEASON\", \"WTEAMID\", \"LTEAMID\")\n", - " .with_column_renamed(\"WTEAMID\", \"TEAMID_1\")\n", - " .with_column_renamed(\"LTEAMID\", \"TEAMID_2\")\n", - " )\n", - "\n", - " # Bring in the regular season averages as a Snowpark DataFrame.\n", - " season_df = session.table(\"FEATURES.MAVGSEASONDETAILEDRESULTS\")\n", - "\n", - " # Bring in the conference one-hot encoded data as a Snowpark DataFrame.\n", - " conferences_df = session.table(\"FEATURES.MTEAMCONFERENCESOHE\")\n", - "\n", - " # Create a new Snowpark DataFrame for the winning team's regular season averages.\n", - " wteamseasonavgs = season_df.select(\n", - " [\n", - " F.col(c).alias(f\"{c}_REGSEASON_1\") if c not in [\"SEASON\", \"TEAMID\"] else c\n", - " for c in season_df.columns\n", - " ]\n", - " ).with_column_renamed(\"TEAMID\", \"TEAMID_1\")\n", - "\n", - " # Create a new Snowpark DataFrame for the losing team's regular season averages.\n", - " lteamseasonavgs = season_df.select(\n", - " [\n", - " F.col(c).alias(f\"{c}_REGSEASON_2\") if c not in [\"SEASON\", \"TEAMID\"] else c\n", - " for c in season_df.columns\n", - " ]\n", - " ).with_column_renamed(\"TEAMID\", \"TEAMID_2\")\n", - "\n", - " # Create a new Snowpark DataFrame for the winning team's conference one-hot encoded data.\n", - " wconferences = conferences_df.select(\n", - " [\n", - " F.col(c).alias(f\"{c}_1\") if c != \"SEASON\" else c\n", - " for c in conferences_df.columns\n", - " ]\n", - " )\n", - "\n", - " # Create a new Snowpark DataFrame for the losing team's conference one-hot encoded data.\n", - " lconferences = conferences_df.select(\n", - " [\n", - " F.col(c).alias(f\"{c}_2\") if c != \"SEASON\" else c\n", - " for c in conferences_df.columns\n", - " ]\n", - " )\n", - "\n", - " # Join all of our dataframes together.\n", - " tourney_teams = (\n", - " tourney_results.natural_join(wteamseasonavgs, how=\"left\")\n", - " .natural_join(lteamseasonavgs, how=\"left\")\n", - " .natural_join(wconferences, how=\"left\")\n", - " .natural_join(lconferences, how=\"left\")\n", - " )\n", - "\n", - " # Create an indicator column for wins and losses.\n", - " wins = tourney_teams.with_column(\"WIN_INDICATOR\", F.lit(1))\n", - " losses = tourney_teams.with_column(\"WIN_INDICATOR\", F.lit(0))\n", - "\n", - " # Relabel our columns to swap the 1 and 2 suffixes.\n", - " col_relabels = {}\n", - " for col in losses.columns:\n", - " if col.endswith(\"_1\"):\n", - " col_relabels[col] = col.replace(\"_1\", \"_2\")\n", - " elif col.endswith(\"_2\"):\n", - " col_relabels[col] = col.replace(\"_2\", \"_1\")\n", - "\n", - " # Relabel the columns in the losses DataFrame.\n", - " losses = losses.select(\n", - " [\n", - " F.col(c).alias(col_relabels[c]) if c in col_relabels else c\n", - " for c in losses.columns\n", - " ]\n", - " )\n", - "\n", - " # Union the wins and losses DataFrames together.\n", - " union_df = wins.union_all_by_name(losses)\n", - "\n", - " # Save the Snowflake DataFrame as a table in Snowflake with the specified table name.\n", - " union_df.write.save_as_table(target_table, mode=\"overwrite\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Register the stored procedure." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.sproc.register(\n", - " func=prepare_tourney_results,\n", - " return_type=T.StringType(),\n", - " input_types=[T.StringType()],\n", - " name=\"PREPARE_TOURNEY_RESULTS\",\n", - " is_permanent=True,\n", - " stage_location=\"@COMMON.PYTHON_CODE\",\n", - " packages=[\"snowflake-snowpark-python\"],\n", - " replace=True,\n", - " source_code_display=True,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Call the stored procedure to build a table for model training." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "session.call(\"PREPARE_TOURNEY_RESULTS\", \"FEATURES.MFEATURESJOINED\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Training\n", - "\n", - "Bringing in the existing joined features table to train and store the model." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def train_model(session: Session, model_stage: str, model_file_name: str) -> dict:\n", - " # Bring in the data as a Pandas DataFrame.\n", - " union_df = session.table(\"FEATURES.MFEATURESJOINED\").to_pandas()\n", - "\n", - " # Split the data into training and testing sets.\n", - " train_df = union_df[union_df[\"SEASON\"] < 2020]\n", - " test_df = union_df[union_df[\"SEASON\"] > 2020]\n", - "\n", - " # Drop the columns we don't want to use in our model.\n", - " X_train = train_df.drop(\"WIN_INDICATOR\", axis=1)\n", - "\n", - " # Create a new DataFrame with just the WIN_INDICATOR column.\n", - " y_train = train_df[\"WIN_INDICATOR\"]\n", - "\n", - " # Drop the columns we don't want to use in our model.\n", - " X_test = test_df.drop(\"WIN_INDICATOR\", axis=1)\n", - "\n", - " # Create a new DataFrame with just the WIN_INDICATOR column.\n", - " y_test = test_df[\"WIN_INDICATOR\"]\n", - "\n", - " # Train the model.\n", - " model = xgb.XGBClassifier(n_estimators=2000)\n", - " model.fit(X_train, y_train)\n", - "\n", - " # Make predictions on the test set.\n", - " y_pred = model.predict(X_test)\n", - "\n", - " # Create the path in the event this doesn't exist. This will only be relevant to executing locally.\n", - " if not os.path.exists(\"/tmp\"):\n", - " os.mkdir(\"/tmp\")\n", - "\n", - " # Save the model to a file and upload it to the specified stage.\n", - " model_file = os.path.join(\"/tmp\", model_file_name)\n", - " joblib.dump(model, model_file)\n", - " session.file.put(model_file, model_stage, auto_compress=False, overwrite=True)\n", - "\n", - " return {\"Accuracy\": accuracy_score(y_test, y_pred)}" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can test this locally and see how well it did." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Accuracy': 0.6052631578947368}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_model(session, \"@COMMON.MODELS\", \"xgb_model.pkl\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will need to bring down local copies of the data to run the classification report and run the model locally. " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 0.61 0.59 0.60 133\n", - " 1 0.60 0.62 0.61 133\n", - "\n", - " accuracy 0.61 266\n", - " macro avg 0.61 0.61 0.61 266\n", - "weighted avg 0.61 0.61 0.61 266\n", - "\n" - ] - } - ], - "source": [ - "union_df = session.table(\"FEATURES.MFEATURESJOINED\").to_pandas()\n", - "train_df = union_df[union_df[\"SEASON\"] < 2020]\n", - "test_df = union_df[union_df[\"SEASON\"] > 2020]\n", - "X_train = train_df.drop(\"WIN_INDICATOR\", axis=1)\n", - "y_train = train_df[\"WIN_INDICATOR\"]\n", - "X_test = test_df.drop(\"WIN_INDICATOR\", axis=1)\n", - "y_test = test_df[\"WIN_INDICATOR\"]\n", - "\n", - "# Load the model from local storage to test how well it performs.\n", - "model = joblib.load(\"/tmp/xgb_model.pkl\")\n", - "\n", - "# Make predictions on the test set.\n", - "y_pred = model.predict(X_test)\n", - "\n", - "print(classification_report(y_test, y_pred))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see the model is persisted in Snowflake as the function performs a `PUT` operation. " - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "----------------------------------------------------------------------------------------------------\n", - "|\"name\" |\"size\" |\"md5\" |\"last_modified\" |\n", - "----------------------------------------------------------------------------------------------------\n", - "|models/xgb_model.pkl |2289888 |6d34bf466d225eaca5c829f254a843df |Thu, 6 Apr 2023 16:52:29 GMT |\n", - "----------------------------------------------------------------------------------------------------\n", - "\n" - ] - } - ], - "source": [ - "session.sql(\"LS @COMMON.MODELS\").show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Register the stored procedure." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment\n", - "The version of package xgboost in the local environment is 1.7.5, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.sproc.register(\n", - " func=train_model,\n", - " return_type=T.StringType(),\n", - " input_types=[T.StringType(), T.StringType()],\n", - " name=\"TRAIN_MODEL\",\n", - " is_permanent=True,\n", - " stage_location=\"@COMMON.PYTHON_CODE\",\n", - " packages=[\"snowflake-snowpark-python\", \"xgboost\"],\n", - " replace=True,\n", - " source_code_display=True,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Call the stored procedure to run in Snowflake." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"{'Accuracy': 0.6052631578947368}\"" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.call(\"TRAIN_MODEL\", \"@COMMON.MODELS\", \"xgb_model.pkl\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Predictions\n", - "\n", - "Now that our model is trained and persisted in Snowflake, let's see what it predicts for the 2023 tournament. We need to produce every possible game that could be played in the NCAA in 2023, leading us to 65,703 game probabilities. We can look at which teams played in 2023 to build this combination." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we will write a helper function to load the existing model in Snowflake. " - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "@cachetools.cached(cache={})\n", - "def load_model(file_name):\n", - " model_file_path = sys._xoptions.get(\"snowflake_import_directory\") + file_name\n", - " return joblib.load(model_file_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "def infer_model(session: Session, target_table: str) -> str:\n", - " # Bring in the data as a Pandas DataFrame.\n", - " m_2023_teams = (\n", - " session.table(\"RAW.MREGULARSEASONCOMPACTRESULTS\")\n", - " .filter(F.col(\"SEASON\") == 2023)\n", - " .select(\"WTEAMID\", \"LTEAMID\")\n", - " )\n", - "\n", - " # Create a DataFrame with all possible combinations of teams.\n", - " m_w_teams = m_2023_teams.select(F.col(\"WTEAMID\").alias(\"TEAMID\"))\n", - " m_l_teams = m_2023_teams.select(F.col(\"LTEAMID\").alias(\"TEAMID\"))\n", - " m_2023_distinct_teams = m_w_teams.union(m_l_teams)\n", - " m_2023_teams_cj = (\n", - " m_2023_distinct_teams.cross_join(m_2023_distinct_teams, lsuffix=\"_1\", rsuffix=\"_2\")\n", - " .filter(F.col(\"TEAMID_1\") < F.col(\"TEAMID_2\"))\n", - " .select(F.lit(2023).alias(\"SEASON\"), \"TEAMID_1\", \"TEAMID_2\")\n", - " )\n", - "\n", - " # Bring in the regular season averages as a Snowpark DataFrame.\n", - " season_df = session.table(\"FEATURES.MAVGSEASONDETAILEDRESULTS\")\n", - "\n", - " # Bring in the conference one-hot encoded data as a Snowpark DataFrame.\n", - " conferences_df = session.table(\"FEATURES.MTEAMCONFERENCESOHE\")\n", - "\n", - " # Create a new Snowpark DataFrame for the winning team's regular season averages.\n", - " wteamseasonavgs = season_df.select(\n", - " [\n", - " F.col(c).alias(f\"{c}_REGSEASON_1\") if c not in [\"SEASON\", \"TEAMID\"] else c\n", - " for c in season_df.columns\n", - " ]\n", - " ).with_column_renamed(\"TEAMID\", \"TEAMID_1\")\n", - "\n", - " # Create a new Snowpark DataFrame for the losing team's regular season averages.\n", - " lteamseasonavgs = season_df.select(\n", - " [\n", - " F.col(c).alias(f\"{c}_REGSEASON_2\") if c not in [\"SEASON\", \"TEAMID\"] else c\n", - " for c in season_df.columns\n", - " ]\n", - " ).with_column_renamed(\"TEAMID\", \"TEAMID_2\")\n", - "\n", - " # Create a new Snowpark DataFrame for the winning team's conference one-hot encoded data.\n", - " wconferences = conferences_df.select(\n", - " [F.col(c).alias(f\"{c}_1\") if c != \"SEASON\" else c for c in conferences_df.columns]\n", - " )\n", - "\n", - " # Create a new Snowpark DataFrame for the losing team's conference one-hot encoded data.\n", - " lconferences = conferences_df.select(\n", - " [F.col(c).alias(f\"{c}_2\") if c != \"SEASON\" else c for c in conferences_df.columns]\n", - " )\n", - "\n", - " # Join all of our dataframes together.\n", - " m_2023_combos = (\n", - " m_2023_teams_cj.natural_join(wteamseasonavgs, how=\"left\")\n", - " .natural_join(lteamseasonavgs, how=\"left\")\n", - " .natural_join(wconferences, how=\"left\")\n", - " .natural_join(lconferences, how=\"left\")\n", - " )\n", - "\n", - " m_2023_combos_pd = m_2023_combos.to_pandas()\n", - "\n", - " # Load the model from the specified stage. This is provided via the imports specification on the registration.\n", - " load_model(\"xgb_model.pkl\")\n", - "\n", - " # Make predictions on the set.\n", - " m_2023_combos_pd[\"PREDICTION\"] = model.predict_proba(m_2023_combos_pd)[:, 1]\n", - "\n", - " # Create a new DataFrame with only the columns we need.\n", - " submission_prep = m_2023_combos_pd.loc[:, [\"SEASON\", \"TEAMID_1\", \"TEAMID_2\", \"PREDICTION\"]]\n", - "\n", - " # Convert the DataFrame to a Snowpark DataFrame.\n", - " submission_prep = session.create_dataframe(submission_prep)\n", - "\n", - " # Save the predictions to a Snowflake table.\n", - " submission_prep.write.save_as_table(target_table, mode=\"overwrite\")\n", - "\n", - " return f\"Successfully created {target_table}.\"" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The version of package cachetools in the local environment is 5.3.0, which does not fit the criteria for the requirement cachetools. Your UDF might not work when the package version is different between the server and your local environment\n", - "The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment\n", - "The version of package xgboost in the local environment is 1.7.5, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.sproc.register(\n", - " func=infer_model,\n", - " return_type=T.StringType(),\n", - " input_types=[T.StringType()],\n", - " name=\"INFER_MODEL\",\n", - " is_permanent=True,\n", - " stage_location=\"@COMMON.PYTHON_CODE\",\n", - " imports=[\"@COMMON.MODELS/xgb_model.pkl\"],\n", - " packages=[\"cachetools\", \"snowflake-snowpark-python\", \"xgboost\"],\n", - " replace=True,\n", - " source_code_display=True,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Successfully created FEATURES.M2023PREDICTIONS.'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.call(\"INFER_MODEL\", \"FEATURES.M2023PREDICTIONS\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see how it did with Connecticut as the example." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-------------------------------------------------------------\n", - "|\"SEASON\" |\"TEAMNAME_1\" |\"TEAMNAME_2\" |\"PREDICTION_PCT\" |\n", - "-------------------------------------------------------------\n", - "|2023 |Abilene Chr |Connecticut |0.0079 |\n", - "|2023 |Alabama |Connecticut |4.0888 |\n", - "|2023 |Arizona |Connecticut |0.0777 |\n", - "|2023 |Arizona St |Connecticut |0.6766 |\n", - "|2023 |Arkansas |Connecticut |3.4728 |\n", - "|2023 |Belmont |Connecticut |6.146 |\n", - "|2023 |Boston Univ |Connecticut |0.0269 |\n", - "|2023 |Buffalo |Connecticut |0.0003 |\n", - "|2023 |Butler |Connecticut |4.9312 |\n", - "|2023 |BYU |Connecticut |0.0002 |\n", - "-------------------------------------------------------------\n", - "\n" - ] - } - ], - "source": [ - "session.sql(\"\"\"\n", - "SELECT P.SEASON, \n", - " M1.TEAMNAME AS TEAMNAME_1,\n", - " M2.TEAMNAME AS TEAMNAME_2,\n", - " ROUND(P.PREDICTION * 100, 4) AS PREDICTION_PCT\n", - "FROM MARCH_MADNESS.FEATURES.M2023PREDICTIONS AS P\n", - "INNER JOIN RAW.MTEAMS AS M1 ON P.TEAMID_1 = M1.TEAMID\n", - "INNER JOIN RAW.MTEAMS AS M2 ON P.TEAMID_2 = M2.TEAMID\n", - "WHERE TEAMNAME_1 = 'Connecticut' OR TEAMNAME_2 = 'Connecticut'\n", - "\"\"\").show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/NCAA March Madness/requirements.txt b/NCAA March Madness/requirements.txt deleted file mode 100644 index b225554..0000000 --- a/NCAA March Madness/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -cachetools -python-dotenv -scikit-learn -snowflake-snowpark-python[pandas] -xgboost