diff --git a/.github/workflows/workshop_unit_test.yml b/.github/workflows/workshop_unit_test.yml
index 3c1382c3..708eab4d 100644
--- a/.github/workflows/workshop_unit_test.yml
+++ b/.github/workflows/workshop_unit_test.yml
@@ -31,7 +31,7 @@ jobs:
     - name: Install AZ ML and tools
       run: |
         # SETUP line 34 to point to your own AML workspace
        az extension add -n ml -y --version 2.2.1
-        az configure --defaults group=azureml workspace=ws01ent location=westus2
+        az configure --defaults group=mlops-rg-910157 workspace=aml910157 location=eastus
     - name: Run Feature Engineering
       uses: ./.github/actions/aml-job-create
       with:
diff --git a/src/workshop/core/scoring/deployment.yml b/src/workshop/core/scoring/deployment.yml
index 29c3500c..8f8adb6f 100644
--- a/src/workshop/core/scoring/deployment.yml
+++ b/src/workshop/core/scoring/deployment.yml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
 name: green
-endpoint_name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
+endpoint_name: mlops-h1-endpoint-910157 #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
 model: azureml:nyc_fare_prediction:1
 code_configuration:
   code: ./
diff --git a/src/workshop/core/scoring/endpoint.yml b/src/workshop/core/scoring/endpoint.yml
index 611e0721..6dbd60b1 100644
--- a/src/workshop/core/scoring/endpoint.yml
+++ b/src/workshop/core/scoring/endpoint.yml
@@ -1,3 +1,3 @@
 $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
-name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
+name: mlops-h1-endpoint-910157 #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
 auth_mode: key
diff --git a/src/workshop/core/training/.amlignore b/src/workshop/core/training/.amlignore
new file mode 100644
index 00000000..0621f9fc
--- /dev/null
+++ b/src/workshop/core/training/.amlignore
@@ -0,0 +1,6 @@
+## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+.ipynb_aml_checkpoints/
+*.amltmp
+*.amltemp
\ No newline at end of file
diff --git a/src/workshop/core/training/.amlignore.amltmp b/src/workshop/core/training/.amlignore.amltmp
new file mode 100644
index 00000000..0621f9fc
--- /dev/null
+++ b/src/workshop/core/training/.amlignore.amltmp
@@ -0,0 +1,6 @@
+## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+.ipynb_aml_checkpoints/
+*.amltmp
+*.amltemp
\ No newline at end of file
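
Note (reviewer sketch, not part of the patch): the renamed endpoint.yml/deployment.yml above are normally applied with `az ml online-endpoint create -f endpoint.yml` followed by `az ml online-deployment create -f deployment.yml`, using the CLI defaults set in the workflow change. A minimal equivalent with the v2 Python SDK (azure-ai-ml) is sketched below; the subscription ID, scoring script name, environment, and instance settings are placeholders/assumptions not visible in this patch, while the resource group, workspace, endpoint, and model names come from the changes above.

    # Sketch only, with placeholder values wherever the patch does not show them.
    from azure.ai.ml import MLClient
    from azure.ai.ml.entities import (
        CodeConfiguration,
        ManagedOnlineDeployment,
        ManagedOnlineEndpoint,
    )
    from azure.identity import DefaultAzureCredential

    ml_client = MLClient(
        credential=DefaultAzureCredential(),
        subscription_id="<SUBSCRIPTION_ID>",    # placeholder
        resource_group_name="mlops-rg-910157",  # from the workflow change
        workspace_name="aml910157",             # from the workflow change
    )

    # Mirrors endpoint.yml: globally unique name, key auth.
    endpoint = ManagedOnlineEndpoint(name="mlops-h1-endpoint-910157", auth_mode="key")
    ml_client.online_endpoints.begin_create_or_update(endpoint).result()

    # Mirrors the visible part of deployment.yml: "green" serving nyc_fare_prediction:1.
    deployment = ManagedOnlineDeployment(
        name="green",
        endpoint_name="mlops-h1-endpoint-910157",
        model="azureml:nyc_fare_prediction:1",
        code_configuration=CodeConfiguration(code="./", scoring_script="score.py"),  # script name assumed
        environment="azureml:<ENVIRONMENT_NAME>@latest",  # not shown in this patch
        instance_type="Standard_DS3_v2",                  # not shown in this patch
        instance_count=1,                                 # not shown in this patch
    )
    ml_client.online_deployments.begin_create_or_update(deployment).result()
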
diff --git a/src/workshop/core/training/ml_training.py b/src/workshop/core/training/ml_training.py
index 6f59dcdd..2f9aaf62 100644
--- a/src/workshop/core/training/ml_training.py
+++ b/src/workshop/core/training/ml_training.py
@@ -43,7 +43,7 @@ def createClassModel(algo_name, catg, nums):
         #---------------------------------------------
         #setup: Update alpha value
         #---------------------------------------------
-        model = Ridge(alpha=100000) #setup
+        model = Ridge(alpha=100) #setup
     elif algo_name == 'random_forest':
         model = RandomForestRegressor()
     else:
diff --git a/src/workshop/core/training/ml_training.py.amltmp b/src/workshop/core/training/ml_training.py.amltmp
new file mode 100644
index 00000000..2f9aaf62
--- /dev/null
+++ b/src/workshop/core/training/ml_training.py.amltmp
@@ -0,0 +1,103 @@
+import pandas as pd
+import numpy as np
+import os
+import argparse
+import mlflow
+import mlflow.sklearn
+from azureml.core import Run, Dataset,Datastore, Workspace
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.impute import SimpleImputer
+from sklearn.compose import ColumnTransformer
+from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
+import joblib
+def parse_args():
+    # arg parser
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data, default to local folder")
+    parser.add_argument("--model_folder", type=str,default="data", help="Path of model ouput folder, default to local folder")
+    parser.add_argument("--input_file_name", type=str, default="final_df.parquet")
+    parser.add_argument("--run_mode", type=str, default="local")
+
+
+    # parse args
+    args = parser.parse_args()
+
+    # return args
+    return args
+
+
+def createClassModel(algo_name, catg, nums):
+    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])
+
+    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
+
+    preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])
+
+    if algo_name == 'linear_regression':
+        #---------------------------------------------
+        #setup: Update alpha value
+        #---------------------------------------------
+        model = Ridge(alpha=100) #setup
+    elif algo_name == 'random_forest':
+        model = RandomForestRegressor()
+    else:
+        pass
+
+    ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), ("model", model)])
+
+    return ModelPipeline
+
+def main(args):
+
+    # read in data
+    final_df = pd.read_parquet(os.path.join(args.prep_data,args.input_file_name))
+    catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"]
+    num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]
+    label = ["totalAmount"]
+    # make sure categorical columns are strings
+    final_df[catg_cols] = final_df[catg_cols].astype("str")
+
+    # split data
+    X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)
+
+    # test 2 algorithms
+    os.makedirs(args.model_folder, exist_ok=True)
+
+    algorithmname = "linear_regression"
+    fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline
+    fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine
+
+    y_pred = fitPipeline.predict(X_test) # score with fitted pipeline
+
+    # Evaluate
+    r2 = r2_score(y_test, y_pred)
+    mape = mean_absolute_percentage_error(y_test, y_pred)
+    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+
+
+    joblib.dump(fitPipeline,args.model_folder+"/"+algorithmname+".joblib")
+
+    print("Training finished!. Metrics:")
+    print(f"R2_{algorithmname}", r2)
+    print(f"MAPE_{algorithmname}", mape)
+    print(f"RMSE_{algorithmname}", rmse)
+    print("Model",args.model_folder+"/"+algorithmname+".joblib","saved!")
+
+    if args.run_mode == 'remote':
+        mlflow.log_metric(f"R2_{algorithmname}", r2)
+        mlflow.log_metric(f"MAPE_{algorithmname}", mape)
+        mlflow.log_metric(f"RMSE_{algorithmname}", rmse)
+        mlflow.sklearn.log_model(fitPipeline,f"{algorithmname}_model")
+
+# run script
+if __name__ == "__main__":
+    # parse args
+    args = parse_args()
+    # run main function
+    main(args)
\ No newline at end of file
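
Note (reviewer sketch, not part of the patch): the only functional change to ml_training.py is Ridge(alpha=100000) -> Ridge(alpha=100). The snippet below rebuilds the same ColumnTransformer + Ridge pipeline shape as createClassModel() on a small synthetic frame to show the effect: at alpha=100000 the L2 penalty dominates and the coefficients are shrunk toward zero, while alpha=100 is mild at this data scale. The column names and data here are invented for illustration.

    # Minimal sketch: same preprocessing/model structure, made-up data.
    import numpy as np
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.linear_model import Ridge
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        "vendorID": rng.choice(["1", "2"], size=200),      # categorical, as strings
        "tripDistance": rng.uniform(0.5, 20.0, size=200),  # numeric
    })
    y = 2.5 + 3.0 * df["tripDistance"] + rng.normal(0, 1.0, size=200)

    num_cols, catg_cols = ["tripDistance"], ["vendorID"]
    preprocess = ColumnTransformer(transformers=[
        ("num", SimpleImputer(strategy="constant", fill_value=0), num_cols),
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]), catg_cols),
    ])

    for alpha in (100000, 100):
        pipe = Pipeline(steps=[("preprocessor", preprocess), ("model", Ridge(alpha=alpha))])
        pipe.fit(df, y)
        # alpha=100000 shrinks the coefficients hard and R2 drops sharply;
        # alpha=100 barely regularizes at this scale and keeps the linear signal.
        print(alpha, round(pipe.score(df, y), 3))
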
final_df[catg_cols].astype("str") + + # split data + X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222) + + # test 2 algorithms + os.makedirs(args.model_folder, exist_ok=True) + + algorithmname = "linear_regression" + fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline + fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine + + y_pred = fitPipeline.predict(X_test) # score with fitted pipeline + + # Evaluate + r2 = r2_score(y_test, y_pred) + mape = mean_absolute_percentage_error(y_test, y_pred) + rmse = np.sqrt(mean_squared_error(y_test, y_pred)) + + + joblib.dump(fitPipeline,args.model_folder+"/"+algorithmname+".joblib") + + print("Training finished!. Metrics:") + print(f"R2_{algorithmname}", r2) + print(f"MAPE_{algorithmname}", mape) + print(f"RMSE_{algorithmname}", rmse) + print("Model",args.model_folder+"/"+algorithmname+".joblib","saved!") + + if args.run_mode == 'remote': + mlflow.log_metric(f"R2_{algorithmname}", r2) + mlflow.log_metric(f"MAPE_{algorithmname}", mape) + mlflow.log_metric(f"RMSE_{algorithmname}", rmse) + mlflow.sklearn.log_model(fitPipeline,f"{algorithmname}_model") + +# run script +if __name__ == "__main__": + # parse args + args = parse_args() + # run main function + main(args) \ No newline at end of file diff --git a/src/workshop/core/training/ml_training.py.save b/src/workshop/core/training/ml_training.py.save new file mode 100644 index 00000000..c85b8ad5 --- /dev/null +++ b/src/workshop/core/training/ml_training.py.save @@ -0,0 +1,103 @@ +import pandas as pd +import numpy as np +import os +import argparse +import mlflow +import mlflow.sklearn +from azureml.core import Run, Dataset,Datastore, Workspace +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.compose import ColumnTransformer +from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error +import joblib +def parse_args(): + # arg parser + parser = argparse.ArgumentParser() + + parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data, default to local folder") + parser.add_argument("--model_folder", type=str,default="data", help="Path of model ouput folder, default to local folder") + parser.add_argument("--input_file_name", type=str, default="final_df.parquet") + parser.add_argument("--run_mode", type=str, default="local") + + + # parse args + args = parser.parse_args() + + # return args + return args + + +def createClassModel(algo_name, catg, nums): + numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]) + + categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) + + preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)]) + + if algo_name == 'linear_regression': + #--------------------------------------------- + #setup: Update alpha value + #--------------------------------------------- + model = Ridge(alpha=100) #setup + elif algo_name == 'random_forest': + model = RandomForestRegressor() + else: + pass + + ModelPipeline = 
diff --git a/src/workshop/data/linear_regression.joblib b/src/workshop/data/linear_regression.joblib
index d6bd0590..8ec65470 100644
Binary files a/src/workshop/data/linear_regression.joblib and b/src/workshop/data/linear_regression.joblib differ
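
Note (hypothetical sketch, not part of the patch): deployment.yml points code_configuration at ./ in src/workshop/core/scoring, but the scoring script itself is not touched by this patch. For reference, a minimal score.py following the managed online endpoint init()/run() convention and loading the retrained joblib artifact might look like this; the model file name and request payload shape are assumptions.

    # Hypothetical score.py sketch; the repo's actual scoring script is not shown here.
    import json
    import os

    import joblib
    import pandas as pd

    model = None

    def init():
        # AZUREML_MODEL_DIR points at the root of the registered model
        # (azureml:nyc_fare_prediction:1); the file name below is an assumption.
        global model
        model_path = os.path.join(
            os.getenv("AZUREML_MODEL_DIR"), "linear_regression.joblib"
        )
        model = joblib.load(model_path)

    def run(raw_data):
        # Assumes the request body looks like {"data": [{...feature columns...}]}.
        records = json.loads(raw_data)["data"]
        predictions = model.predict(pd.DataFrame(records))
        return predictions.tolist()
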