Commit
Showing 9 changed files with 17,727 additions and 2 deletions.
@@ -0,0 +1,4 @@
.DS_Store
mlruns/
__pycache__/
.idea/
@@ -0,0 +1,9 @@
FROM python:3.7-slim

ADD requirements.txt .

RUN apt update
RUN apt install git -y
RUN pip install --upgrade pip --no-cache-dir
RUN pip install -r requirements.txt --no-cache-dir
RUN apt install graphviz -y
@@ -0,0 +1,8 @@
name: mlflow_example

docker_env:
  image: mlflow_example

entry_points:
  main:
    command: "python train.py"
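The docker_env above expects a local image named mlflow_example, so the image has to be built from the Dockerfile in this commit before any project run can start. A minimal sketch (not part of the commit) of building the image and launching a single run from Python, assuming the tag mlflow_example and the --learning_rate/--max_depth arguments that train.py (shown later in this diff) parses:

import subprocess

import mlflow

# build the docker_env image declared in MLproject from the Dockerfile above
subprocess.run("docker build --tag mlflow_example .", shell=True, check=True)

# launch one project run; parameters not declared in MLproject are forwarded
# to the entry point command as --learning_rate / --max_depth arguments
submitted_run = mlflow.projects.run(
    uri=".",
    backend="local",
    parameters={"learning_rate": 0.05, "max_depth": 5},
    synchronous=True,
)
print(f"Run {submitted_run.run_id} finished")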
@@ -1,5 +1,5 @@
-# MLflow + GBRT (Gradient Boosted Regression Trees) Demo Notebook
+# MLflow Example Project + Notebook

-This notebook demonstrates an example of dataset preprocessing, ML model training and evaluation, model tuning using MLflow tracking and finally making predictions.
+This project and notebook demonstrate an example of dataset preprocessing, ML model training and evaluation, model tuning using MLflow tracking, and finally making predictions.

with @PyDataRiyadh: https://twitter.com/PyDataRiyadh/status/1291043529146466304
Large diffs are not rendered by default.
@@ -0,0 +1,50 @@
import os
import subprocess
import sys
import warnings

import mlflow

warnings.filterwarnings('ignore')

PROJECT_DIR = sys.path[0]
os.chdir(PROJECT_DIR)

experiment_name = 'rented_bikes'
mlflow.set_experiment(experiment_name)

PORT = 5001  # REST API serving port
CONTAINER_NAME = "mlflow_example_model_serving"

best_run_df = mlflow.search_runs(order_by=['metrics.RMSE_CV ASC'], max_results=1)
if len(best_run_df.index) == 0:
    raise Exception(f"Found no runs for experiment '{experiment_name}'")

best_run = mlflow.get_run(best_run_df.at[0, 'run_id'])
best_model_uri = f"{best_run.info.artifact_uri}/model"
# best_model = mlflow.sklearn.load_model(best_model_uri)

# print best run info
print("Best run info:")
print(f"Run id: {best_run.info.run_id}")
print(f"Run parameters: {best_run.data.params}")
print("Run score: RMSE_CV = {:.4f}".format(best_run.data.metrics['RMSE_CV']))
print(f"Run model URI: {best_model_uri}")

# remove current container if exists
subprocess.run(f"docker rm --force {CONTAINER_NAME}", shell=True, check=False, stdout=subprocess.DEVNULL)

# run mlflow model serving in a docker container
docker_run_cmd = f"""
docker run
--name={CONTAINER_NAME}
--volume={PROJECT_DIR}:{PROJECT_DIR}
--publish {PORT}:{PORT}
--interactive
--rm
mlflow_example
mlflow models serve --model-uri {best_model_uri} --host 0.0.0.0 --port {PORT} --workers 2 --no-conda
""".replace('\n', ' ').strip()
print(f"Running command:\n{docker_run_cmd}")

subprocess.run(docker_run_cmd, shell=True, check=True)
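Once the container above is serving, predictions can be requested over REST. A hypothetical client sketch (not part of the commit), assuming an MLflow 1.x scoring server on localhost:5001 that accepts split-oriented pandas JSON on /invocations; the column names mirror the renamed features in train.py and the sample values are purely illustrative:

import json
import urllib.request

# one illustrative row in pandas "split" orientation (columns + data)
payload = {
    "columns": [
        "season", "year", "month", "hour_of_day", "is_holiday", "weekday",
        "is_workingday", "weather_situation", "temperature",
        "feels_like_temperature", "humidity", "windspeed",
    ],
    "data": [[1, 0, 1, 10, 0, 6, 0, 1, 0.24, 0.29, 0.81, 0.0]],
}

request = urllib.request.Request(
    "http://localhost:5001/invocations",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as response:
    print(response.read().decode("utf-8"))  # predicted rented_bikes count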
@@ -0,0 +1,56 @@
import itertools
import os
import subprocess
import sys
import warnings

import mlflow

warnings.filterwarnings('ignore')

PROJECT_DIR = sys.path[0]
os.chdir(PROJECT_DIR)

experiment_name = 'rented_bikes'
mlflow.set_experiment(experiment_name)

# delete default experiment if exists
if mlflow.get_experiment_by_name("Default").lifecycle_stage == 'active':
    mlflow.delete_experiment("0")
    subprocess.run("mlflow gc", shell=True, check=False, stdout=subprocess.DEVNULL)

# Model Hyper-parameters
parameters = {
    "learning_rate": [0.1, 0.05, 0.01],
    "max_depth": [4, 5, 6],
}

# Tuning the hyper-parameters via grid search
# generate parameters combinations
params_keys = parameters.keys()
params_values = [
    parameters[key] if isinstance(parameters[key], list) else [parameters[key]]
    for key in params_keys
]
runs_parameters = [
    dict(zip(params_keys, combination)) for combination in itertools.product(*params_values)
]

# execute experiment runs in parallel in docker containers
submitted_runs = []
for run_parameters in runs_parameters:
    submitted_runs.append(mlflow.projects.run(
        uri='.',
        backend='local',
        parameters=run_parameters,
        synchronous=False,
        docker_args={"user": f"{os.getuid()}:{os.getgid()}"},
    ))

print(f"Submitted {len(submitted_runs)} runs. Waiting for them to finish...")

# get runs status (blocking)
runs_status = [run.wait() for run in submitted_runs]

print(f"Experiment '{experiment_name}' finished!")
print(f"{sum(runs_status)} runs succeeded out of {len(runs_status)} submitted")
@@ -0,0 +1,8 @@
mlflow>=1.11.0
pandas>=1.1.2
numpy>=1.18.5
matplotlib>=3.2.2
seaborn>=0.11.0
scikit-learn>=0.22.2
pydotplus>=2.0.2
graphviz>=0.10.1
@@ -0,0 +1,210 @@
import argparse
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from mlflow.models.signature import infer_signature
from pydotplus import graph_from_dot_data
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split

plt.style.use("fivethirtyeight")
warnings.filterwarnings('ignore')
np.random.seed(42)

# create model_artifacts directory
model_artifacts_dir = "/tmp/model_artifacts"
Path(model_artifacts_dir).mkdir(exist_ok=True)


# Evaluation Metrics
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))


def rmse_score(y, y_pred):
    score = rmse(y, y_pred)
    return score


# Cross-validation RMSLE score
def rmsle_cv(model, X_train, y_train):
    kf = KFold(n_splits=3, shuffle=True, random_state=42).get_n_splits(X_train.values)
    # Evaluate a score by cross-validation
    rmse = np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
    return rmse


def rmse_cv_score(model, X_train, y_train):
    score = rmsle_cv(model, X_train, y_train)
    return score


# Feature Importance
def model_feature_importance(model):
    feature_importance = pd.DataFrame(
        model.feature_importances_,
        index=X_train.columns,
        columns=["Importance"],
    )

    # sort by importance
    feature_importance.sort_values(by="Importance", ascending=False, inplace=True)

    # plot
    plt.figure(figsize=(12, 8))
    sns.barplot(
        data=feature_importance.reset_index(),
        y="index",
        x="Importance",
    ).set_title("Feature Importance")
    # save image
    plt.savefig(f"{model_artifacts_dir}/feature_importance.png", bbox_inches='tight')


def model_permutation_importance(model):
    p_importance = permutation_importance(model, X_test, y_test, random_state=42, n_jobs=-1)

    # sort by importance
    sorted_idx = p_importance.importances_mean.argsort()[::-1]
    p_importance = pd.DataFrame(
        data=p_importance.importances[sorted_idx].T,
        columns=X_train.columns[sorted_idx]
    )

    # plot
    plt.figure(figsize=(12, 8))
    sns.barplot(
        data=p_importance,
        orient="h"
    ).set_title("Permutation Importance")

    # save image
    plt.savefig(f"{model_artifacts_dir}/permutation_importance.png", bbox_inches="tight")


def model_tree_visualization(model):
    # generate visualization
    tree_dot_data = tree.export_graphviz(
        decision_tree=model.estimators_[0, 0],  # get the first tree
        label="all",
        feature_names=X_train.columns,
        filled=True,
        rounded=True,
        proportion=True,
        impurity=False,
        precision=1,
    )

    # save image
    graph_from_dot_data(tree_dot_data).write_png(f"{model_artifacts_dir}/Decision_Tree_Visualization.png")


# Read the data csv file (make sure you're running this from the root of MLflow!)
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/hour.csv")
# load input data into pandas dataframe
bike_sharing = pd.read_csv(data_path)

# Data preprocessing
# remove unused columns
bike_sharing.drop(columns=["instant", "dteday", "registered", "casual"], inplace=True)

# use better column names
bike_sharing.rename(
    columns={
        "yr": "year",
        "mnth": "month",
        "hr": "hour_of_day",
        "holiday": "is_holiday",
        "workingday": "is_workingday",
        "weathersit": "weather_situation",
        "temp": "temperature",
        "atemp": "feels_like_temperature",
        "hum": "humidity",
        "cnt": "rented_bikes",
    },
    inplace=True,
)

# Prepare training and test data sets

# Split the dataset randomly into 70% for training and 30% for testing.
X = bike_sharing.drop("rented_bikes", axis=1)
y = bike_sharing.rented_bikes
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=42)

# main entry point
if __name__ == "__main__":
    # parse run parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.1)
    parser.add_argument('--max_depth', type=int, default=3)
    run_parameters = vars(parser.parse_args())

    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        print(f"Run {run_id}:", f"Started with parameters {run_parameters}")
        print(f"Run {run_id}:", f"Training samples: {X_train.size}, Test samples: {X_test.size}")

        # create model instance: GBRT (Gradient Boosted Regression Tree) scikit-learn implementation
        model = GradientBoostingRegressor(**run_parameters)

        # Model Training
        model.fit(X_train, y_train)
        print(f"Run {run_id}:", "Training completed")

        # get evaluation scores
        score = rmse_score(y_test, model.predict(X_test))
        score_cv = rmse_cv_score(model, X_train, y_train)
        print(f"Run {run_id}:", "RMSE score: {:.4f}".format(score))
        print(f"Run {run_id}:", "Cross-validation RMSE score: {:.4f} (std = {:.4f})".format(score_cv.mean(), score_cv.std()))

        # generate charts
        model_feature_importance(model)
        plt.close()
        model_permutation_importance(model)
        plt.close()
        model_tree_visualization(model)

        # log estimator name
        mlflow.set_tag("estimator_name", model.__class__.__name__)

        # log input features
        mlflow.set_tag("features", str(X_train.columns.values.tolist()))

        # Log tracked parameters only
        mlflow.log_params(run_parameters)

        mlflow.log_metrics({
            'RMSE_CV': score_cv.mean(),
            'RMSE': score,
        })

        # log training loss
        for s in model.train_score_:
            mlflow.log_metric("Train Loss", s)

        # get model signature
        signature = infer_signature(model_input=X_train, model_output=model.predict(X_train))

        # Save model to artifacts
        mlflow.sklearn.log_model(model, "model", signature=signature)

        # log charts
        mlflow.log_artifacts(model_artifacts_dir)

        # optional: auto-logging for scikit-learn estimators
        # mlflow.sklearn.autolog()

        # optional: log all model parameters
        # mlflow.log_params(model.get_params())

        print(f"Run {run_id}:", "Logging completed")