diff --git a/Model Validation.ipynb b/Model Validation.ipynb index 8070c05..ab1d306 100644 --- a/Model Validation.ipynb +++ b/Model Validation.ipynb @@ -40,10 +40,12 @@ }, "outputs": [], "source": [ - "dbutils.widgets.text(\"source_catalog\", \"\", \"Source Catalog\")\n", - "dbutils.widgets.text(\"source_db\", \"\", \"Source Database\")\n", "dbutils.widgets.text(\"model_name\", \"\", \"Model Name\")\n", - "dbutils.widgets.text(\"validation_data\", \"\", \"validation data\")" + "dbutils.widgets.text(\"model_version\", \"\", \"Model Version\")\n", + "#dbutils.widgets.text(\"source_catalog\", \"\", \"Source Catalog\")\n", + "#dbutils.widgets.text(\"source_db\", \"\", \"Source Database\")\n", + "dbutils.widgets.text(\"validation_data\", \"\", \"validation data\")\n", + "dbutils.widgets.text(\"target_catalog\", \"\", \"Target Catalog\")" ] }, { @@ -64,10 +66,21 @@ }, "outputs": [], "source": [ - "catalog = dbutils.widgets.get(\"source_catalog\")\n", - "db = dbutils.widgets.get(\"source_db\")\n", - "model_name = dbutils.widgets.get(\"model_name\")\n", - "validation_data = dbutils.widgets.get(\"validation_data\")" + "source_full_model_name = dbutils.widgets.get(\"model_name\")\n", + "model_version = dbutils.widgets.get(\"model_version\")\n", + "#catalog = dbutils.widgets.get(\"source_catalog\")\n", + "#db = dbutils.widgets.get(\"source_db\")\n", + "validation_data = dbutils.widgets.get(\"validation_data\")\n", + "target_catalog = dbutils.widgets.get(\"target_catalog\")\n", + "\n", + "#Not going to use the inputs for catalog and db. 
Instead use model name and extract catalog and db\n", + "catalog, db, model_name = source_full_model_name.split('.', 2)\n", + "\n", + "catalog = catalog.strip()\n", + "db = db.strip()\n", + "model_name = model_name.strip()\n", + "\n", + "model_version = int(model_version)\n" ] }, { @@ -86,7 +99,7 @@ } }, "source": [ - "# Validate automl model before moving it to stage" + "# Validate automl model before moving it to target" ] }, { @@ -110,14 +123,12 @@ "# We are interested in validating the automl model in dev before propogating to stage\n", "import mlflow\n", "from mlflow.tracking import MlflowClient\n", - "model_alias = \"most_recent\"\n", - "full_model_name = f\"{catalog}.{db}.{model_name}\"\n", "\n", "client = MlflowClient()\n", - "model_details = client.get_model_version_by_alias(full_model_name, model_alias)\n", - "model_version = int(model_details.version)\n", + "#model_details = client.get_model_version_by_alias(full_model_name, model_alias)\n", + "#model_version = int(model_details.version)\n", "\n", - "print(f\"Validating {model_alias} model for {full_model_name} on model version {model_version}\")" + "print(f\"Validating model for {source_full_model_name} on model version {model_version}\")" ] }, { @@ -136,7 +147,7 @@ } }, "source": [ - "## Validate description" + "## Validate if model version description is populated" ] }, { @@ -158,17 +169,18 @@ "outputs": [], "source": [ "# If there's no description or an insufficient number of charaters, tag accordingly\n", + "model_details = client.get_model_version(name=source_full_model_name, version=model_version)\n", "if not model_details.description:\n", " has_description = False\n", - " print(\"Please add model description\")\n", + " print(\"Please add model description to the version\")\n", "elif not len(model_details.description) > 20:\n", " has_description = False\n", " print(\"Please add detailed model description (40 char min).\")\n", "else:\n", " has_description = True\n", "\n", - "print(f'Model 
{full_model_name} version {model_details.version} has description: {has_description}')\n", - "client.set_model_version_tag(name=full_model_name, version=str(model_details.version), key=\"has_description\", value=has_description)" + "print(f'Model {source_full_model_name} version {model_details.version} has description: {has_description}')\n", + "client.set_model_version_tag(name=source_full_model_name, version=str(model_details.version), key=\"has_description\", value=has_description)" ] }, { @@ -187,7 +199,7 @@ } }, "source": [ - "## Validate if champion model exist in dev." + "## Validate if champion model exist." ] }, { @@ -213,7 +225,7 @@ "\n", "try:\n", " #Compare the challenger smape score to the existing champion if it exists\n", - " champion_model = client.get_model_version_by_alias(full_model_name, \"Champion\")\n", + " champion_model = client.get_model_version_by_alias(source_full_model_name, \"Champion\")\n", " champion_smape = mlflow.get_run(champion_model.run_id).data.metrics['test_smape']\n", " print(f'Champion test_smape score: {champion_smape}. Challenger champion_smape score: {test_smape}.')\n", " metric_smape_passed = test_smape <= champion_smape\n", @@ -222,14 +234,14 @@ " metric_smape_passed = True\n", "\n", "if metric_smape_passed == True:\n", - " print(f'Model {full_model_name} version {model_details.version} metric_smape_passed: {metric_smape_passed}. Updating the model alias to champion')\n", - " client.set_model_version_tag(name=full_model_name, version=model_details.version, key=\"metric_smape_passed\", value=metric_smape_passed)\n", - " client.set_registered_model_alias(name=full_model_name, alias=\"Champion\", version=model_version)\n", - " client.delete_registered_model_alias(name=full_model_name, alias=model_alias)\n", + " print(f'Model {source_full_model_name} version {model_details.version} smape comparison with champion model passed. 
champion_smape_passed: {metric_smape_passed}.\")\n", + " client.set_model_version_tag(name=source_full_model_name, version=model_details.version, key=\"champion_smape_passed\", value=metric_smape_passed)\n", + " client.set_registered_model_alias(name=source_full_model_name, alias=\"Challenger\", version=model_version)\n", "else:\n", - " print(f'Model {full_model_name} version {model_details.version} metric_smape_passed: {metric_smape_passed}. No good model to proceed')\n", - " client.set_model_version_tag(name=full_model_name, version=model_details.version, key=\"metric_smape_passed\", value=metric_smape_passed)\n", - " dbutils.notebooks.exit(\"Model validation Completed\")\n" + " print(f'Model {source_full_model_name} version {model_details.version} champion_smape_passed: {metric_smape_passed}. Not a good model to proceed with')\n", + " client.set_model_version_tag(name=source_full_model_name, version=model_details.version, key=\"champion_smape_passed\", value=metric_smape_passed)\n", + " client.set_registered_model_alias(name=source_full_model_name, alias=\"Challenger\", version=model_version)\n", + " dbutils.notebook.exit(\"validation Completed. 
Model version is not fit for next stage\")\n" ] }, { @@ -248,7 +260,7 @@ } }, "source": [ - "## Validating model performance against stage dataset" + "## Validating model performance against validation dataset" ] }, { @@ -272,11 +284,9 @@ "import pyspark.sql.functions as F\n", "import mlflow\n", "#get our validation dataset:\n", - "validation_stage_df = spark.table(f\"mlops_stage.{db}.{validation_data}\").toPandas()\n", + "validation_data_df = spark.table(f\"{catalog}.{db}.{validation_data}\").toPandas()\n", "\n", - "#Call the model with the given alias and return the prediction\n", - "#model = mlflow.pyfunc.spark_udf(spark, model_uri=f\"models:/{catalog}.{db}.{model_name}@{model_alias}\")\n", - "requirements = mlflow.pyfunc.get_model_dependencies(model_uri=f\"models:/{catalog}.{db}.{model_name}@champion\")\n", + "requirements = mlflow.pyfunc.get_model_dependencies(model_uri=f\"models:/{catalog}.{db}.{model_name}@Challenger\")\n", "%pip install -r {requirements}" ] }, @@ -299,9 +309,9 @@ "outputs": [], "source": [ "import mlflow\n", - "model = mlflow.pyfunc.load_model(model_uri=f\"models:/{catalog}.{db}.{model_name}@champion\")\n", - "validation_stage_df['prediction'] = model.predict(validation_stage_df)\n", - "display(validation_stage_df)" + "model = mlflow.pyfunc.load_model(model_uri=f\"models:/{catalog}.{db}.{model_name}@Challenger\")\n", + "validation_data_df['prediction'] = model.predict(validation_data_df)\n", + "display(validation_data_df)" ] }, { @@ -342,8 +352,8 @@ " return np.mean(2 * np.abs(predicted - actual) / (np.abs(actual) + np.abs(predicted))) * 100\n", "\n", "\n", - "stage_smape_value = smape(validation_stage_df, 'actual', 'prediction')\n", - "print(f\"SMAPE value based on staging validation data: {stage_smape_value:.2f}\")" + "validation_smape_value = smape(validation_data_df, 'Year_1', 'prediction')\n", + "print(f\"SMAPE value based on staging validation data: {validation_smape_value:.2f}\")" ] }, { @@ -364,85 +374,33 @@ }, "outputs": [], 
"source": [ - "if stage_smape_value <= test_smape:\n", - " print(f\"Validation SMAPE value {stage_smape_value} is less than or equal to the test SMAPE value {test_smape}. Validation Passed\")\n", - " metric_smape_passed = stage_smape_value <= test_smape\n", + "if validation_smape_value <= test_smape:\n", + " print(f\"Validation SMAPE value {validation_smape_value} is less than or equal to the test SMAPE value {test_smape}. Validation Passed\")\n", + " validation_metric_smape_passed = validation_smape_value <= test_smape\n", + " print(f'Model {source_full_model_name} version {model_details.version} validation_metric_smape_passed: {validation_metric_smape_passed}')\n", + " client.set_model_version_tag(name=source_full_model_name, version=model_details.version, key=\"validation_metric_smape_passed\", value=validation_metric_smape_passed)\n", + " client.set_registered_model_alias(name=source_full_model_name, alias=\"Champion\", version=model_version)\n", "else:\n", - " print(f\"Validation SMAPE value {stage_smape_value} is greater than the test SMAPE value {test_smape}. 
Validation Failed\")\n", - " metric_smape_passed = stage_smape_value <= test_smape\n", - "\n", - "print(f'Model {full_model_name} version {model_details.version} metric_smape_passed: {metric_smape_passed}')\n", - "\n", - "client.set_model_version_tag(name=full_model_name, version=model_details.version, key=\"metric_smape_passed\", value=metric_smape_passed)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "0a4d9d48-8c02-496a-8151-42bd434456e9", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "%pip install --quiet mlflow==2.19" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "8fe79216-dcac-440d-8c56-9ce9066a2ab6", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "client = MlflowClient()\n", - "stage_model_details = client.copy_model_version(\n", - " f\"models:/{full_model_name}/{model_version}\",\n", - " f\"mlops_stage.{db}.{model_name}\",\n", - ")\n", - "stage_model_version = int(stage_model_details.version)\n", - "client.set_registered_model_alias(\n", - " name=f\"mlops_stage.{db}.{model_name}\", alias=\"Champion\", version=stage_model_version\n", - ")" + " print(f\"Validation SMAPE value {validation_smape_value} is greater than the test SMAPE value {test_smape}. Validation Failed\")\n", + " validation_metric_smape_passed = validation_smape_value <= test_smape\n", + " client.set_model_version_tag(name=source_full_model_name, version=model_details.version, key=\"validation_metric_smape_passed\", value=validation_metric_smape_passed)\n", + " dbutils.notebook.exit(\"validation Completed. 
Model version is not fit for next stage\")\n" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, - "nuid": "4389087a-1894-49d5-809e-fb579b2dae9c", + "nuid": "8a122443-b1e5-449a-b43b-a9952a0c1b20", "showTitle": false, "tableResultSettingsMap": {}, "title": "" } }, "source": [ - "## Validating model performance against production dataset" + "## Promoting model to next catalog" ] }, { @@ -455,133 +413,7 @@ "rowLimit": 10000 }, "inputWidgets": {}, - "nuid": "30a5e9c8-f404-4234-ba82-46ab4ca5a4f8", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "import pyspark.sql.functions as F\n", - "import mlflow\n", - "#get our validation dataset:\n", - "validation_prod_df = spark.table(f\"mlops_prod.{db}.{validation_data}\").toPandas()\n", - "\n", - "#Call the model with the given alias and return the prediction\n", - "#model = mlflow.pyfunc.spark_udf(spark, model_uri=f\"models:/{catalog}.{db}.{model_name}@{model_alias}\")\n", - "requirements = mlflow.pyfunc.get_model_dependencies(model_uri=f\"models:/mlops_stage.{db}.{model_name}@Champion\")\n", - "%pip install -r {requirements}" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "f56e0f99-8b57-4b06-8a17-3c063b803689", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "import mlflow\n", - "stage_model = mlflow.pyfunc.load_model(model_uri=f\"models:/mlops_stage.{db}.{model_name}@Champion\")\n", - "validation_prod_df['prediction'] = stage_model.predict(validation_prod_df)\n", - "display(validation_prod_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - 
"application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "605d3ce3-07f3-4400-99d9-d78df3d0de08", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "prod_smape_value = smape(validation_prod_df, 'actual', 'prediction')\n", - "print(f\"SMAPE value based on production validation data: {prod_smape_value:.2f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "2f467ab9-af8d-4f7b-b742-7693c1047269", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "if prod_smape_value <= test_smape:\n", - " print(f\"Validation SMAPE value {prod_smape_value} is less than or equal to the test SMAPE value {test_smape}. Validation Passed\")\n", - " metric_smape_passed = prod_smape_value <= test_smape\n", - "else:\n", - " print(f\"Validation SMAPE value {prod_smape_value} is greater than the test SMAPE value {test_smape}. 
Validation Failed\")\n", - " metric_smape_passed = prod_smape_value <= test_smape\n", - "\n", - "print(f'Model mlops_stage.{db}.{model_name} version {stage_model_details.version} metric_smape_passed: {metric_smape_passed}')\n", - "\n", - "client.set_model_version_tag(name=f\"mlops_stage.{db}.{model_name}\", version=stage_model_details.version, key=\"metric_smape_passed\", value=metric_smape_passed)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b361f531-5a2d-4cb6-a9d0-78e5d5db7c75", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "%pip install --quiet mlflow==2.19" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "3bcb5558-9a4f-4416-b6bf-8c95410bca6e", + "nuid": "8fe79216-dcac-440d-8c56-9ce9066a2ab6", "showTitle": false, "tableResultSettingsMap": {}, "title": "" @@ -589,15 +421,13 @@ }, "outputs": [], "source": [ - "client = MlflowClient()\n", - "prod_model_details = client.copy_model_version(\n", - " f\"models:/mlops_stage.{db}.{model_name}/{stage_model_details.version}\",\n", - " f\"mlops_prod.{db}.{model_name}\",\n", - ")\n", - "prod_model_version = int(prod_model_details.version)\n", - "client.set_registered_model_alias(\n", - " name=f\"mlops_prod.{db}.{model_name}\", alias=\"Champion\", version=prod_model_version\n", - ")" + "if validation_metric_smape_passed == True:\n", + " %pip install --quiet mlflow==2.19\n", + " client = MlflowClient()\n", + " if catalog != \"mlops_prod\":\n", + " target_model_details = client.copy_model_version(f\"models:/{source_full_model_name}/{model_version}\",f\"{target_catalog}.{db}.{model_name}\")\n", + " target_model_version = 
int(target_model_details.version)\n", + " client.set_registered_model_alias(name=f\"{target_catalog}.{db}.{model_name}\", alias=\"Challenger\", version=target_model_version)\n" ] } ], @@ -629,7 +459,7 @@ "notebookName": "Model Validation", "widgets": { "model_name": { - "currentValue": "interest_forecast_7_day", + "currentValue": "mlops_dev.interest_forecast.interest_forecast_7_day", "nuid": "c53b0431-b742-4139-8427-3564fdb0470e", "typedWidgetInfo": { "autoCreated": false, @@ -654,6 +484,32 @@ } } }, + "model_version": { + "currentValue": "1", + "nuid": "98234b38-d9b1-466e-9194-7b3296a3eadc", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "", + "label": "Model Version", + "name": "model_version", + "options": { + "widgetDisplayType": "Text", + "validationRegex": null + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "widgetType": "text", + "defaultValue": "", + "label": "Model Version", + "name": "model_version", + "options": { + "widgetType": "text", + "autoCreated": null, + "validationRegex": null + } + } + }, "source_catalog": { "currentValue": "mlops_dev", "nuid": "55c98b2f-810b-403a-bc58-2d89515f59f9", @@ -706,6 +562,32 @@ } } }, + "target_catalog": { + "currentValue": "mlops_stage", + "nuid": "8fc5db88-6ac7-48cf-af48-817beff2ffb0", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "", + "label": "Target Catalog", + "name": "target_catalog", + "options": { + "widgetDisplayType": "Text", + "validationRegex": null + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "widgetType": "text", + "defaultValue": "", + "label": "Target Catalog", + "name": "target_catalog", + "options": { + "widgetType": "text", + "autoCreated": null, + "validationRegex": null + } + } + }, "validation_data": { "currentValue": "validation_table", "nuid": "fe3dfd7f-6af3-4617-b1c8-9e71eb64365a",