Skip to content

Commit

Permalink
Adding project source + data
Browse files Browse the repository at this point in the history
  • Loading branch information
alfozan committed Oct 21, 2020
1 parent df37324 commit 42271d7
Show file tree
Hide file tree
Showing 9 changed files with 17,727 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.DS_Store
mlruns/
__pycache__/
.idea/
9 changes: 9 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Image used both for MLflow project runs and for model serving.
FROM python:3.7-slim

# Install system packages in ONE layer: a separately cached `apt update` layer
# can go stale relative to the install step and make installs fail or pull
# outdated packages. `apt-get` is used because the `apt` front end is not
# intended for non-interactive scripts. The package index is removed
# afterwards to keep the image small.
RUN apt-get update \
    && apt-get install -y git graphviz \
    && rm -rf /var/lib/apt/lists/*

# COPY is sufficient for a plain local file (ADD adds URL/tar semantics).
COPY requirements.txt .

# Python dependencies; --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --upgrade pip --no-cache-dir \
    && pip install -r requirements.txt --no-cache-dir
8 changes: 8 additions & 0 deletions MLproject
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# MLflow project definition.
name: mlflow_example

# Runs execute inside the Docker image tagged `mlflow_example`.
docker_env:
  image: mlflow_example

entry_points:
  main:
    # NOTE(review): no `parameters` are declared here, yet mlflow_project_driver.py
    # passes learning_rate / max_depth and train.py parses them with argparse.
    # Presumably MLflow appends undeclared parameters as `--key value` — confirm.
    command: "python train.py"
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# MLflow + GBRT (Gradient Boosted Regression Trees) Demo Notebook
# MLflow Example Project + Notebook

This notebook demonstrates an example of dataset preprocessing, ML model training and evaluation, model tuning using MLflow tracking and finally making predictions.
This project and notebook demonstrate an example of dataset preprocessing, ML model training and evaluation, model tuning using MLflow tracking, and finally making predictions.

with @PyDataRiyadh: https://twitter.com/PyDataRiyadh/status/1291043529146466304
17,380 changes: 17,380 additions & 0 deletions data/hour.csv

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions mlflow_model_driver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import subprocess
import sys
import warnings

import mlflow

warnings.filterwarnings('ignore')

# Work from the directory that contains this script.
PROJECT_DIR = sys.path[0]
os.chdir(PROJECT_DIR)

experiment_name = 'rented_bikes'
mlflow.set_experiment(experiment_name)

PORT = 5001  # REST API serving port
CONTAINER_NAME = "mlflow_example_model_serving"

# Select the run with the lowest cross-validated RMSE.
best_run_df = mlflow.search_runs(order_by=['metrics.RMSE_CV ASC'], max_results=1)
if best_run_df.empty:
    raise RuntimeError(f"Found no runs for experiment '{experiment_name}'")

best_run = mlflow.get_run(best_run_df.at[0, 'run_id'])
best_model_uri = f"{best_run.info.artifact_uri}/model"

# print best run info
print("Best run info:")
print(f"Run id: {best_run.info.run_id}")
print(f"Run parameters: {best_run.data.params}")
print("Run score: RMSE_CV = {:.4f}".format(best_run.data.metrics['RMSE_CV']))
print(f"Run model URI: {best_model_uri}")

# Remove the current container if it exists (best effort: a non-zero exit is ignored).
# Arguments are passed as a list so no shell parsing is involved.
subprocess.run(
    ["docker", "rm", "--force", CONTAINER_NAME],
    check=False,
    stdout=subprocess.DEVNULL,
)

# Run MLflow model serving in a docker container.
# An argument list (instead of a shell string) keeps a project path or model URI
# that contains spaces or shell metacharacters from breaking the command.
docker_run_cmd = [
    "docker", "run",
    f"--name={CONTAINER_NAME}",
    # mount the project at the same path so the model URI resolves inside the container
    f"--volume={PROJECT_DIR}:{PROJECT_DIR}",
    "--publish", f"{PORT}:{PORT}",
    "--interactive",
    "--rm",
    "mlflow_example",
    "mlflow", "models", "serve",
    "--model-uri", best_model_uri,
    "--host", "0.0.0.0",
    "--port", str(PORT),
    "--workers", "2",
    "--no-conda",
]
print(f"Running command:\n{' '.join(docker_run_cmd)}")

# check=True: a failing `docker run` raises CalledProcessError.
subprocess.run(docker_run_cmd, check=True)
56 changes: 56 additions & 0 deletions mlflow_project_driver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import itertools
import os
import subprocess
import sys
import warnings

import mlflow

warnings.filterwarnings('ignore')

# Work from the directory that contains this script so `uri='.'` below
# points at the project.
PROJECT_DIR = sys.path[0]
os.chdir(PROJECT_DIR)

experiment_name = 'rented_bikes'
mlflow.set_experiment(experiment_name)

# Delete the default experiment if it exists.
# get_experiment_by_name may return None, so guard before touching attributes.
default_experiment = mlflow.get_experiment_by_name("Default")
if default_experiment is not None and default_experiment.lifecycle_stage == 'active':
    mlflow.delete_experiment(default_experiment.experiment_id)
    # best effort: the exit status of `mlflow gc` is ignored
    subprocess.run("mlflow gc", shell=True, check=False, stdout=subprocess.DEVNULL)

# Model Hyper-parameters
parameters = {
    "learning_rate": [0.1, 0.05, 0.01],
    "max_depth": [4, 5, 6],
}

# Tuning the hyper-parameters via grid search:
# build one {name: value} dict per point of the Cartesian product.
params_keys = list(parameters)
params_values = [
    # a scalar is treated as a single-option list
    value if isinstance(value, list) else [value]
    for value in parameters.values()
]
runs_parameters = [
    dict(zip(params_keys, combination))
    for combination in itertools.product(*params_values)
]

# Execute experiment runs in parallel in docker containers.
submitted_runs = []
for run_parameters in runs_parameters:
    submitted_runs.append(mlflow.projects.run(
        uri='.',
        backend='local',
        parameters=run_parameters,
        synchronous=False,  # return immediately so the runs overlap
        # run the container as the invoking user
        docker_args={"user": f"{os.getuid()}:{os.getgid()}"},
    ))

print(f"Submitted {len(submitted_runs)} runs. Waiting for them to finish...")

# get runs status (blocking); each entry is summed below as a success flag
runs_status = [run.wait() for run in submitted_runs]

print(f"Experiment '{experiment_name}' finished!")
print(f"{sum(runs_status)} runs succeeded out of {len(runs_status)} submitted")
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
mlflow>=1.11.0
pandas>=1.1.2
numpy>=1.18.5
matplotlib>=3.2.2
seaborn>=0.11.0
scikit-learn>=0.22.2
pydotplus>=2.0.2
graphviz>=0.10.1
210 changes: 210 additions & 0 deletions train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
import argparse
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from mlflow.models.signature import infer_signature
from pydotplus import graph_from_dot_data
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Global configuration applied once at import time: plot style, warning
# suppression, and a fixed NumPy seed.
plt.style.use("fivethirtyeight")
warnings.filterwarnings('ignore')
np.random.seed(42)

# Directory where the chart functions below write their PNG files; its contents
# are later uploaded with mlflow.log_artifacts in the main block.
model_artifacts_dir = "/tmp/model_artifacts"
Path(model_artifacts_dir).mkdir(exist_ok=True)


# Evaluation Metrics
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)


def rmse_score(y, y_pred):
    """Return the RMSE of ``y_pred`` against ``y`` (thin wrapper around ``rmse``)."""
    return rmse(y, y_pred)


# Cross-validation RMSE scores (despite the "rmsle" name, no log transform is applied)
def rmsle_cv(model, X_train, y_train):
    """Return the per-fold RMSE of ``model`` under 3-fold shuffled cross-validation.

    Args:
        model: estimator passed to ``cross_val_score``.
        X_train: training features; ``.values`` is passed to scikit-learn.
        y_train: training targets.

    Returns:
        Array with one RMSE value per fold.
    """
    # Pass the KFold splitter itself. Calling ``.get_n_splits(...)`` on it returns
    # the plain integer 3, which silently drops shuffle=True and random_state=42.
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    # The scorer yields negated MSE, so flip the sign before taking the root.
    neg_mse = cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)


def rmse_cv_score(model, X_train, y_train):
    """Return the cross-validation RMSE scores computed by ``rmsle_cv``."""
    return rmsle_cv(model, X_train, y_train)


# Feature Importance
def model_feature_importance(model):
    """Plot ``model.feature_importances_`` as a bar chart and save it as a PNG.

    Feature names come from the module-level ``X_train`` columns. The image is
    written to ``feature_importance.png`` inside ``model_artifacts_dir``.
    The figure is left open; the caller is responsible for closing it.
    """
    # one row per feature, most important first
    ranked = pd.DataFrame(
        model.feature_importances_,
        index=X_train.columns,
        columns=["Importance"],
    ).sort_values(by="Importance", ascending=False)

    # horizontal bars: feature name on the y axis, importance on the x axis
    plt.figure(figsize=(12, 8))
    axes = sns.barplot(data=ranked.reset_index(), y="index", x="Importance")
    axes.set_title("Feature Importance")

    output_file = f"{model_artifacts_dir}/feature_importance.png"
    plt.savefig(output_file, bbox_inches='tight')


def model_permutation_importance(model):
    """Plot permutation importances of ``model`` on the test split and save a PNG.

    Uses the module-level ``X_test`` / ``y_test`` for scoring and ``X_train``
    columns for feature names. The image is written to
    ``permutation_importance.png`` inside ``model_artifacts_dir``.
    The figure is left open; the caller is responsible for closing it.
    """
    result = permutation_importance(model, X_test, y_test, random_state=42, n_jobs=-1)

    # feature order: highest mean importance first
    order = result.importances_mean.argsort()[::-1]
    # transpose so each feature becomes a column of its repeated measurements
    importance_frame = pd.DataFrame(
        data=result.importances[order].T,
        columns=X_train.columns[order],
    )

    plt.figure(figsize=(12, 8))
    axes = sns.barplot(data=importance_frame, orient="h")
    axes.set_title("Permutation Importance")

    output_file = f"{model_artifacts_dir}/permutation_importance.png"
    plt.savefig(output_file, bbox_inches="tight")


def model_tree_visualization(model):
    """Render the first tree of the fitted ensemble to a PNG.

    The image is written to ``Decision_Tree_Visualization.png`` inside
    ``model_artifacts_dir``; feature names come from the module-level ``X_train``.
    """
    first_tree = model.estimators_[0, 0]

    # DOT source describing the tree
    dot_source = tree.export_graphviz(
        decision_tree=first_tree,
        label="all",
        feature_names=X_train.columns,
        filled=True,
        rounded=True,
        proportion=True,
        impurity=False,
        precision=1,
    )

    # convert DOT -> PNG on disk
    graph = graph_from_dot_data(dot_source)
    graph.write_png(f"{model_artifacts_dir}/Decision_Tree_Visualization.png")


# The dataset path is resolved relative to this file, not the working directory.
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/hour.csv")
# load input data into pandas dataframe
bike_sharing = pd.read_csv(data_path)

# Data preprocessing: drop the unused columns, then switch to descriptive names.
bike_sharing = bike_sharing.drop(
    columns=["instant", "dteday", "registered", "casual"]
).rename(
    columns={
        "yr": "year",
        "mnth": "month",
        "hr": "hour_of_day",
        "holiday": "is_holiday",
        "workingday": "is_workingday",
        "weathersit": "weather_situation",
        "temp": "temperature",
        "atemp": "feels_like_temperature",
        "hum": "humidity",
        "cnt": "rented_bikes",
    }
)

# Prepare training and test data sets

# Features are every column except the target; the target is "rented_bikes".
X = bike_sharing.drop(columns="rented_bikes")
y = bike_sharing["rented_bikes"]

# Random 70% / 30% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

# main entry point: train one GBRT model with the given hyper-parameters and
# log parameters, metrics, charts and the model itself to MLflow.
if __name__ == "__main__":
    # parse run parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.1)
    parser.add_argument('--max_depth', type=int, default=3)
    run_parameters = vars(parser.parse_args())

    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        print(f"Run {run_id}:", f"Started with parameters {run_parameters}")
        # len() counts rows; DataFrame.size would be rows * columns, not samples
        print(f"Run {run_id}:", f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

        # create model instance: GBRT (Gradient Boosted Regression Tree) scikit-learn implementation
        model = GradientBoostingRegressor(**run_parameters)

        # Model Training
        model.fit(X_train, y_train)
        print(f"Run {run_id}:", "Training completed")

        # get evaluation scores: hold-out RMSE and per-fold cross-validation RMSE
        score = rmse_score(y_test, model.predict(X_test))
        score_cv = rmse_cv_score(model, X_train, y_train)
        print(f"Run {run_id}:", "RMSE score: {:.4f}".format(score))
        print(f"Run {run_id}:", "Cross-validation RMSE score: {:.4f} (std = {:.4f})".format(score_cv.mean(), score_cv.std()))

        # generate charts (each plotting helper leaves its figure open, so close it here)
        model_feature_importance(model)
        plt.close()
        model_permutation_importance(model)
        plt.close()
        model_tree_visualization(model)

        # log estimator name
        mlflow.set_tag("estimator_name", model.__class__.__name__)

        # log input features
        mlflow.set_tag("features", str(X_train.columns.values.tolist()))

        # Log tracked parameters only
        mlflow.log_params(run_parameters)

        mlflow.log_metrics({
            'RMSE_CV': score_cv.mean(),
            'RMSE': score,
        })

        # log training loss, one point per boosting iteration; the explicit step
        # keeps the values ordered instead of all landing on step 0
        for step, train_loss in enumerate(model.train_score_):
            mlflow.log_metric("Train Loss", train_loss, step=step)

        # get model signature
        signature = infer_signature(model_input=X_train, model_output=model.predict(X_train))

        # Save model to artifacts
        mlflow.sklearn.log_model(model, "model", signature=signature)

        # log charts
        mlflow.log_artifacts(model_artifacts_dir)

        # optional: auto-logging for scikit-learn estimators
        # mlflow.sklearn.autolog()

        # optional: log all model parameters
        # mlflow.log_params(model.get_params())

        print(f"Run {run_id}:", "Logging completed")

0 comments on commit 42271d7

Please sign in to comment.