Adding project source + data

alfozan · Oct 21, 2020 · 42271d7 · 42271d7
1 parent df37324
commit 42271d7
Show file tree

Hide file tree

Showing 9 changed files with 17,727 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.DS_Store
+mlruns/
+__pycache__/
+.idea/
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:3.7-slim
+
+# System packages: git and the Graphviz binaries.
+# "update" and "install" share one layer so a cached, stale package index is
+# never reused by a later install; the apt lists are removed afterwards to
+# keep the image small.
+RUN apt-get update \
+    && apt-get install -y git graphviz \
+    && rm -rf /var/lib/apt/lists/*
+
+# COPY is the recommended instruction for plain local files (ADD has extra
+# archive/URL semantics that are not needed here).
+COPY requirements.txt .
+
+# Python dependencies
+RUN pip install --upgrade pip --no-cache-dir
+RUN pip install -r requirements.txt --no-cache-dir
diff --git a/MLproject b/MLproject
@@ -0,0 +1,8 @@
+# MLflow project definition
+name: mlflow_example
+
+# Docker image used as the execution environment for project runs
+docker_env:
+  image: mlflow_example
+
+entry_points:
+  # "main" entry point: runs the training script
+  main:
+    command: "python train.py"
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
-# MLflow + GBRT (Gradient Boosted Regression Trees) Demo Notebook
+# MLflow Example Project + Notebook
 
-This notebook demonstrates an example of dataset preprocessing, ML model training and evaluation, model tuning using MLflow tracking and finally making predictions.
+This project and notebook demonstrate an example of dataset preprocessing, ML model training and evaluation, model tuning using MLflow tracking, and finally making predictions.
 
 with @PyDataRiyadh: https://twitter.com/PyDataRiyadh/status/1291043529146466304
diff --git a/data/hour.csv b/data/hour.csv
diff --git a/mlflow_model_driver.py b/mlflow_model_driver.py
@@ -0,0 +1,50 @@
+import os
+import subprocess
+import sys
+import warnings
+
+import mlflow
+
+warnings.filterwarnings('ignore')
+
+PROJECT_DIR = sys.path[0]
+os.chdir(PROJECT_DIR)
+
+experiment_name = 'rented_bikes'
+mlflow.set_experiment(experiment_name)
+
+PORT = 5001  # REST API serving port
+CONTAINER_NAME = "mlflow_example_model_serving"
+
+best_run_df = mlflow.search_runs(order_by=['metrics.RMSE_CV ASC'], max_results=1)
+if len(best_run_df.index) == 0:
+    raise Exception(f"Found no runs for experiment '{experiment_name}'")
+
+best_run = mlflow.get_run(best_run_df.at[0, 'run_id'])
+best_model_uri = f"{best_run.info.artifact_uri}/model"
+# best_model = mlflow.sklearn.load_model(best_model_uri)
+
+# print best run info
+print("Best run info:")
+print(f"Run id: {best_run.info.run_id}")
+print(f"Run parameters: {best_run.data.params}")
+print("Run score: RMSE_CV = {:.4f}".format(best_run.data.metrics['RMSE_CV']))
+print(f"Run model URI: {best_model_uri}")
+
+# remove current container if exists
+subprocess.run(f"docker rm --force {CONTAINER_NAME}", shell=True, check=False, stdout=subprocess.DEVNULL)
+
+# run mlflow model serving in a docker container
+docker_run_cmd = f"""
+docker run
+--name={CONTAINER_NAME}
+--volume={PROJECT_DIR}:{PROJECT_DIR}
+--publish {PORT}:{PORT}
+--interactive
+--rm
+mlflow_example
+mlflow models serve --model-uri {best_model_uri} --host 0.0.0.0 --port {PORT} --workers 2 --no-conda
+""".replace('\n', ' ').strip()
+print(f"Running command:\n{docker_run_cmd}")
+
+subprocess.run(docker_run_cmd, shell=True, check=True)
diff --git a/mlflow_project_driver.py b/mlflow_project_driver.py
@@ -0,0 +1,56 @@
+import itertools
+import os
+import subprocess
+import sys
+import warnings
+
+import mlflow
+
+warnings.filterwarnings('ignore')
+
+PROJECT_DIR = sys.path[0]
+os.chdir(PROJECT_DIR)
+
+experiment_name = 'rented_bikes'
+mlflow.set_experiment(experiment_name)
+
+# delete default experiment if exits
+if mlflow.get_experiment_by_name("Default").lifecycle_stage == 'active':
+    mlflow.delete_experiment("0")
+subprocess.run("mlflow gc", shell=True, check=False, stdout=subprocess.DEVNULL)
+
+# Model Hyper-parameters
+parameters = {
+    "learning_rate": [0.1, 0.05, 0.01],
+    "max_depth": [4, 5, 6],
+}
+
+# Tuning the hyper-parameters via grid search
+# generate parameters combinations
+params_keys = parameters.keys()
+params_values = [
+    parameters[key] if isinstance(parameters[key], list) else [parameters[key]]
+    for key in params_keys
+]
+runs_parameters = [
+    dict(zip(params_keys, combination)) for combination in itertools.product(*params_values)
+]
+
+# execute experiment runs in parallel in docker containers
+submitted_runs = []
+for run_parameters in runs_parameters:
+    submitted_runs.append(mlflow.projects.run(
+        uri='.',
+        backend='local',
+        parameters=run_parameters,
+        synchronous=False,
+        docker_args={"user": f"{os.getuid()}:{os.getgid()}"},
+    ))
+
+print(f"Submitted {len(submitted_runs)} runs. Waiting for them to finish...")
+
+# get runs status (blocking)
+runs_status = [run.wait() for run in submitted_runs]
+
+print(f"Experiment '{experiment_name}' finished!")
+print(f"{sum(runs_status)} runs succeeded out of {len(runs_status)} submitted")
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,8 @@
+mlflow>=1.11.0
+pandas>=1.1.2
+numpy>=1.18.5
+matplotlib>=3.2.2
+seaborn>=0.11.0
+scikit-learn>=0.22.2
+pydotplus>=2.0.2
+graphviz>=0.10.1
diff --git a/train.py b/train.py
@@ -0,0 +1,210 @@
+import argparse
+import os
+import warnings
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import mlflow
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from mlflow.models.signature import infer_signature
+from pydotplus import graph_from_dot_data
+from sklearn import tree
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.inspection import permutation_importance
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import KFold, cross_val_score, train_test_split
+
+# use the "fivethirtyeight" matplotlib style for the generated charts
+plt.style.use("fivethirtyeight")
+# silence all warnings
+warnings.filterwarnings('ignore')
+# seed NumPy's global random number generator
+np.random.seed(42)
+
+# create model_artifacts directory
+# (chart images are saved here and later uploaded with mlflow.log_artifacts)
+model_artifacts_dir = "/tmp/model_artifacts"
+Path(model_artifacts_dir).mkdir(exist_ok=True)
+
+
+# Evaluation Metrics
+def rmse(y, y_pred):
+    return np.sqrt(mean_squared_error(y, y_pred))
+
+
+def rmse_score(y, y_pred):
+    score = rmse(y, y_pred)
+    return score
+
+
+# Cross-validation RMSLE score
+def rmsle_cv(model, X_train, y_train):
+    kf = KFold(n_splits=3, shuffle=True, random_state=42).get_n_splits(X_train.values)
+    # Evaluate a score by cross-validation
+    rmse = np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
+    return rmse
+
+
+def rmse_cv_score(model, X_train, y_train):
+    score = rmsle_cv(model, X_train, y_train)
+    return score
+
+
+# Feature Importance
+def model_feature_importance(model):
+    feature_importance = pd.DataFrame(
+        model.feature_importances_,
+        index=X_train.columns,
+        columns=["Importance"],
+    )
+
+    # sort by importance
+    feature_importance.sort_values(by="Importance", ascending=False, inplace=True)
+
+    # plot
+    plt.figure(figsize=(12, 8))
+    sns.barplot(
+        data=feature_importance.reset_index(),
+        y="index",
+        x="Importance",
+    ).set_title("Feature Importance")
+    # save image
+    plt.savefig(f"{model_artifacts_dir}/feature_importance.png", bbox_inches='tight')
+
+
+def model_permutation_importance(model):
+    p_importance = permutation_importance(model, X_test, y_test, random_state=42, n_jobs=-1)
+
+    # sort by importance
+    sorted_idx = p_importance.importances_mean.argsort()[::-1]
+    p_importance = pd.DataFrame(
+        data=p_importance.importances[sorted_idx].T,
+        columns=X_train.columns[sorted_idx]
+    )
+
+    # plot
+    plt.figure(figsize=(12, 8))
+    sns.barplot(
+        data=p_importance,
+        orient="h"
+    ).set_title("Permutation Importance")
+
+    # save image
+    plt.savefig(f"{model_artifacts_dir}/permutation_importance.png", bbox_inches="tight")
+
+
+def model_tree_visualization(model):
+    # generate visualization
+    tree_dot_data = tree.export_graphviz(
+        decision_tree=model.estimators_[0, 0],  # Get the first tree,
+        label="all",
+        feature_names=X_train.columns,
+        filled=True,
+        rounded=True,
+        proportion=True,
+        impurity=False,
+        precision=1,
+    )
+
+    # save image
+    graph_from_dot_data(tree_dot_data).write_png(f"{model_artifacts_dir}/Decision_Tree_Visualization.png")
+
+
+# Read the data csv file (make sure you're running this from the root of MLflow!)
+data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/hour.csv")
+# load input data into pandas dataframe
+bike_sharing = pd.read_csv(data_path)
+
+# Data preprocessing
+# remove unused columns
+bike_sharing.drop(columns=["instant", "dteday", "registered", "casual"], inplace=True)
+
+# use better column names
+bike_sharing.rename(
+    columns={
+        "yr": "year",
+        "mnth": "month",
+        "hr": "hour_of_day",
+        "holiday": "is_holiday",
+        "workingday": "is_workingday",
+        "weathersit": "weather_situation",
+        "temp": "temperature",
+        "atemp": "feels_like_temperature",
+        "hum": "humidity",
+        "cnt": "rented_bikes",
+    },
+    inplace=True,
+)
+
+# Prepare training and test data sets
+
+# Split the dataset randomly into 70% for training and 30% for testing.
+X = bike_sharing.drop("rented_bikes", axis=1)
+y = bike_sharing.rented_bikes
+X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=42)
+
+# main entry point
+if __name__ == "__main__":
+    # parse run parameters
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--learning_rate', type=float, default=0.1)
+    parser.add_argument('--max_depth', type=int, default=3)
+    run_parameters = vars(parser.parse_args())
+
+    with mlflow.start_run():
+        run_id = mlflow.active_run().info.run_id
+        print(f"Run {run_id}:", f"Started with parameters {run_parameters}")
+        print(f"Run {run_id}:", f"Training samples: {X_train.size}, Test samples: {X_test.size}")
+
+        # create model instance: GBRT (Gradient Boosted Regression Tree) scikit-learn implementation
+        model = GradientBoostingRegressor(**run_parameters)
+
+        # Model Training
+        model.fit(X_train, y_train)
+        print(f"Run {run_id}:", "Training completed")
+
+        # get evaluations scores
+        score = rmse_score(y_test, model.predict(X_test))
+        score_cv = rmse_cv_score(model, X_train, y_train)
+        print(f"Run {run_id}:", "RMSE score: {:.4f}".format(score))
+        print(f"Run {run_id}:", "Cross-validation RMSE score: {:.4f} (std = {:.4f})".format(score_cv.mean(), score_cv.std()))
+
+        # generate charts
+        model_feature_importance(model)
+        plt.close()
+        model_permutation_importance(model)
+        plt.close()
+        model_tree_visualization(model)
+
+        # log estimator name
+        mlflow.set_tag("estimator_name", model.__class__.__name__)
+
+        # log input features
+        mlflow.set_tag("features", str(X_train.columns.values.tolist()))
+
+        # Log tracked parameters only
+        mlflow.log_params(run_parameters)
+
+        mlflow.log_metrics({
+            'RMSE_CV': score_cv.mean(),
+            'RMSE': score,
+        })
+
+        # log training loss
+        for s in model.train_score_:
+            mlflow.log_metric("Train Loss", s)
+
+        # get model signature
+        signature = infer_signature(model_input=X_train, model_output=model.predict(X_train))
+
+        # Save model to artifacts
+        mlflow.sklearn.log_model(model, "model", signature=signature)
+
+        # log charts
+        mlflow.log_artifacts(model_artifacts_dir)
+
+        # optional: auto-logging for scikit-learn estimators
+        # mlflow.sklearn.autolog()
+
+        # optional: log all model parameters
+        # mlflow.log_params(model.get_params())
+
+        print(f"Run {run_id}:", "Logging completed")