diff --git a/.gitignore b/.gitignore index c2232377e..06c7d3dc3 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ images/ .idea/ .vscode/ .empty/ +.DS_Store/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/describe/README.md b/describe/README.md index f765a9666..621dbd1a3 100644 --- a/describe/README.md +++ b/describe/README.md @@ -1 +1,26 @@ -# describe \ No newline at end of file +# Describe + +Get the table's summary statistics and summary plots + +The functions will require the following parameters: + +```markdown + +:param context: the function context +:param table: MLRun input pointing to pandas dataframe (csv/parquet file path) +:param label_column: ground truth column label +:param class_labels: label for each class in tables and plots +:param plot_hist: (True) set this to False for large tables +:param plots_dest: destination folder of summary plots (relative to artifact_path) +:param update_dataset: when the table is a registered dataset update the charts in-place + +``` + +The function will output the following artifacts per column within the data frame (based on data types): + +1. histogram chart +2. violin chart +3. imbalance chart +4. correlation-matrix chart +5. correlation-matrix csv +6. 
imbalance-weights-vec csv \ No newline at end of file diff --git a/describe/describe.ipynb b/describe/describe.ipynb index d0718b12b..ac56d712e 100644 --- a/describe/describe.ipynb +++ b/describe/describe.ipynb @@ -20,13 +20,13 @@ "output_type": "stream", "text": [ "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/ml-models'\n" + "%nuclio: setting spec.image to 'mlrun/mlrun'\n" ] } ], "source": [ "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"mlrun/ml-models\"" + "%nuclio config spec.image = \"mlrun/mlrun\"" ] }, { @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -161,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -172,59 +172,75 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### mlconfig" + "### MLconfig" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-11-24 15:34:46,837 [warning] warning!, server (0.5.4-rc1) and client (0.5.4) ver dont match\n" + ] + }, + { + "data": { + "text/plain": [ + "'/User/functions-udpate/describe'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from mlrun import mlconf\n", - "import os\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "mlconf.artifact_path = mlconf.artifact_path or os.path.abspath('./')" + "import mlrun\n", + "mlrun.set_environment(api_path = 'http://mlrun-api:8080',\n", + " artifact_path = os.path.abspath('./'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### save" + "### Save" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 
2020-07-23 07:46:39,543 [info] function spec saved to path: function.yaml\n" + "> 2020-11-24 15:35:06,452 [info] function spec saved to path: function.yaml\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 18, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from mlrun import code_to_function \n", "# create job function object from notebook code\n", - "fn = code_to_function(\"describe\", handler=\"summarize\",\n", - " description=\"describe and visualizes dataset stats\",\n", - " categories=[\"analysis\"],\n", - " labels = {\"author\": \"yjb\"},\n", - " code_output='.')\n", + "fn = mlrun.code_to_function(\"describe\", handler=\"summarize\",\n", + " description=\"describe and visualizes dataset stats\",\n", + " categories=[\"analysis\"],\n", + " labels = {\"author\": \"yjb\"},\n", + " code_output='.')\n", "\n", "fn.export()" ] @@ -233,7 +249,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## tests" + "## Tests" ] }, { @@ -244,7 +260,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 9, @@ -253,8 +269,7 @@ } ], "source": [ - "from mlrun.platforms import auto_mount\n", - "fn.apply(auto_mount())" + "fn.apply(mlrun.platforms.auto_mount())" ] }, { @@ -263,48 +278,34 @@ "metadata": {}, "outputs": [], "source": [ - "from mlrun import NewTask, run_local\n", + "DATA_URL = 'https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv'\n", "\n", - "#DATA_URL = \"https://iguazio-sample-data.s3.amazonaws.com/datasets/classifier-data.csv\"\n", - "DATA_URL = 'https://iguazio-sample-data.s3.amazonaws.com/datasets/iris_dataset.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "task = NewTask(\n", - " name=\"tasks-describe\", \n", - " handler=summarize, \n", - " inputs={\"table\": DATA_URL}, params={'update_dataset': True, 'label_column': 'label'})" + "task = mlrun.NewTask(name=\"tasks-describe\", \n", + " handler=summarize, 
\n", + " inputs={\"table\": DATA_URL}, \n", + " params={'update_dataset': True, \n", + " 'label_column': 'label'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### run locally" + "### Run Locally" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2020-07-22 09:00:32,582 [debug] Validating field against patterns: {'field_name': 'run.metadata.name', 'field_value': 'tasks-describe', 'pattern': ['^.{0,63}$', '^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$']}\n", - "> 2020-07-22 09:00:32,598 [info] starting run tasks-describe uid=f30656601819462c892a9365dd175f72 -> http://mlrun-api:8080\n", - "> 2020-07-22 09:00:37,475 [debug] log artifact histograms at /User/functions/describe/plots/hist.html, size: 140127, db: N\n", - "> 2020-07-22 09:00:38,377 [debug] log artifact violin at /User/functions/describe/plots/violin.html, size: 54096, db: N\n", - "> 2020-07-22 09:00:38,680 [debug] log artifact imbalance at /User/functions/describe/plots/imbalance.html, size: 10045, db: Y\n", - "> 2020-07-22 09:00:38,697 [debug] log artifact imbalance-weights-vec at /User/functions/describe/plots/imbalance-weights-vec.csv, size: 65, db: N\n", - "> 2020-07-22 09:00:38,702 [debug] log artifact correlation-matrix at /User/functions/describe/plots/correlation-matrix.csv, size: 324, db: N\n", - "> 2020-07-22 09:00:38,877 [debug] log artifact correlation at /User/functions/describe/plots/corr.html, size: 12052, db: N\n" + "> 2020-11-24 15:35:06,489 [warning] warning!, server (0.5.4-rc1) and client (0.5.4) ver dont match\n", + "> 2020-11-24 15:35:06,489 [info] starting run tasks-describe uid=38d5c276628e46ff8634942b3585f636 DB=http://mlrun-api:8080\n", + "> 2020-11-24 15:35:06,538 [warning] warning!, server (0.5.4-rc1) and client (0.5.4) ver dont match\n" ] }, { @@ -312,13 +313,13 @@ "text/html": [ "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 28 13:54:47runningdescribe-spark-describe_spark
v3io_user=admin
kind=job
owner=admin
dataset
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run a9cc8a2b48ce42d180e490043091da52 --project default , !mlrun logs a9cc8a2b48ce42d180e490043091da52 --project default\n", + "> 2020-10-28 13:54:48,285 [info] run executed, status=running\n" + ] + } + ], + "source": [ + "run_res = fn.run(inputs={\"dataset\": \"iris_dataset.csv\"},\n", + " artifact_path=artifact_path, watch=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-10-28 13:55:36,021 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 28 13:54:55completeddescribe-spark-describe_spark
v3io_user=admin
kind=job
owner=admin
host=describe-spark-describe-spark-pxzxc
dataset
n=150
nvar=5
total_missing=0.0
memsize=0.0 YiB
recordsize=0.0 YiB
NUM=5
DATE=0
CONST=0
CAT=0
UNIQUE=0
CORR=0
REJECTED=0
summary_stats
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "run_res.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:root] *", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/describe_spark/function.yaml b/describe_spark/function.yaml new file mode 100644 index 000000000..98075a3cf --- /dev/null +++ b/describe_spark/function.yaml @@ -0,0 +1,332 @@ +kind: job +metadata: + name: describe-spark + tag: '' + hash: 242cd594bd1c4be61f4fe6a2ff5a8d2902d5b8ca + project: default +spec: + command: '' + args: [] + image: iguazio/shell:3.0_b5565_20201026062233_wsdf + env: + - name: V3IO_API + value: '' + - name: V3IO_USERNAME + value: '' + - name: V3IO_ACCESS_KEY + value: '' + - name: CURRENT_NODE_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: IGZ_DATA_CONFIG_FILE + value: /igz/java/conf/v3io.conf + default_handler: describe_spark + entry_points: + describe: + name: describe + doc: '' + parameters: + - name: df + default: '' + - name: bins + default: '' + - name: corr_reject + default: '' + - name: config + default: '' + outputs: + - default: '' + lineno: 38 + pretty_name: + name: pretty_name + doc: '' + parameters: + - name: x + default: '' + outputs: + - default: '' + lineno: 51 + corr_matrix: + name: corr_matrix + doc: '' + parameters: + - name: df + default: '' + - name: columns + default: null + outputs: + - default: '' + lineno: 58 + separate: + name: separate + doc: '' + parameters: + - name: l + default: '' + - name: 
n + default: '' + outputs: + - default: '' + lineno: 63 + create_hist_data: + name: create_hist_data + doc: '' + parameters: + - name: df + default: '' + - name: column + default: '' + - name: minim + default: '' + - name: maxim + default: '' + - name: bins + default: 10 + outputs: + - default: '' + lineno: 80 + create_all_conditions: + name: create_all_conditions + doc: 'Recursive function that exploits the + + ability to call the Spark SQL Column method + + .when() in a recursive way.' + parameters: + - name: current_col + default: '' + - name: column + default: '' + - name: left_edges + default: '' + - name: count + default: 1 + outputs: + - default: '' + lineno: 82 + describe_integer_1d: + name: describe_integer_1d + doc: '' + parameters: + - name: df + default: '' + - name: column + default: '' + - name: current_result + default: '' + - name: nrows + default: '' + outputs: + - default: '' + lineno: 134 + describe_float_1d: + name: describe_float_1d + doc: '' + parameters: + - name: df + default: '' + - name: column + default: '' + - name: current_result + default: '' + - name: nrows + default: '' + outputs: + - default: '' + lineno: 170 + describe_date_1d: + name: describe_date_1d + doc: '' + parameters: + - name: df + default: '' + - name: column + default: '' + outputs: + - default: '' + lineno: 204 + guess_json_type: + name: guess_json_type + doc: '' + parameters: + - name: string_value + default: '' + outputs: + - default: '' + lineno: 221 + describe_categorical_1d: + name: describe_categorical_1d + doc: '' + parameters: + - name: df + default: '' + - name: column + default: '' + outputs: + - default: '' + lineno: 229 + describe_constant_1d: + name: describe_constant_1d + doc: '' + parameters: + - name: df + default: '' + - name: column + default: '' + outputs: + - default: '' + lineno: 267 + describe_unique_1d: + name: describe_unique_1d + doc: '' + parameters: + - name: df + default: '' + - name: column + default: '' + outputs: + - default: '' + lineno: 
274 + describe_1d: + name: describe_1d + doc: '' + parameters: + - name: df + default: '' + - name: column + default: '' + - name: nrows + default: '' + - name: lookup_config + default: null + outputs: + - default: '' + lineno: 281 + gradient_format: + name: gradient_format + doc: '' + parameters: + - name: value + default: '' + - name: limit1 + default: '' + - name: limit2 + default: '' + - name: c1 + default: '' + - name: c2 + default: '' + outputs: + - default: '' + lineno: 396 + LerpColour: + name: LerpColour + doc: '' + parameters: + - name: c1 + default: '' + - name: c2 + default: '' + - name: t + default: '' + outputs: + - default: '' + lineno: 397 + fmt_color: + name: fmt_color + doc: '' + parameters: + - name: text + default: '' + - name: color + default: '' + outputs: + - default: '' + lineno: 403 + fmt_class: + name: fmt_class + doc: '' + parameters: + - name: text + default: '' + - name: cls + default: '' + outputs: + - default: '' + lineno: 407 + fmt_bytesize: + name: fmt_bytesize + doc: '' + parameters: + - name: num + default: '' + - name: suffix + default: B + outputs: + - default: '' + lineno: 411 + fmt_percent: + name: fmt_percent + doc: '' + parameters: + - name: v + default: '' + outputs: + - default: '' + lineno: 421 + fmt_varname: + name: fmt_varname + doc: '' + parameters: + - name: v + default: '' + outputs: + - default: '' + lineno: 424 + fmt_row_severity: + name: fmt_row_severity + doc: '' + parameters: + - name: v + default: '' + outputs: + - default: '' + lineno: 441 + fmt_skewness: + name: fmt_skewness + doc: '' + parameters: + - name: v + default: '' + outputs: + - default: '' + lineno: 447 + describe_spark: + name: describe_spark + doc: '' + parameters: + - name: context + type: MLClientCtx + default: '' + - name: dataset + type: DataItem + default: '' + - name: artifact_path + default: '' + - name: bins + type: int + default: 30 + - name: describe_extended + type: bool + default: true + outputs: + - default: '' + lineno: 463 + 
description: '' + image_pull_policy: IfNotPresent + build: + functionSourceCode: # Generated by nuclio.export.NuclioExporter

import mlrun
from mlrun.platforms.iguazio import mount_v3io, mount_v3iod
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx

import os
from subprocess import run
import pandas as pd
import numpy as np

from pyspark.sql.types import LongType
from pyspark.sql import SparkSession

import sys
import base64 as b64
import warnings
warnings.filterwarnings("ignore")

from itertools import product
import matplotlib

import numpy as np
import json
import pandas as pd
from matplotlib import pyplot as plt
from pkg_resources import resource_filename
import six
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql.functions import (abs as df_abs, col, count, countDistinct,
                                   max as df_max, mean, min as df_min,
                                   sum as df_sum, when
                                   )
from pyspark.sql.functions import variance, stddev, kurtosis, skewness


def describe(df, bins, corr_reject, config, **kwargs):
    """Profile a Spark DataFrame: table-level stats plus per-column stats.

    :param df:          pyspark.sql.DataFrame to profile
    :param bins:        number of histogram bins used for numeric columns
    :param corr_reject: when not None, the correlation matrix of the numeric
                        columns is computed (the value itself is not compared
                        against anything in this function)
    :param config:      mapping of column name -> options; a ``'lookup'`` entry
                        (with ``'object'`` and optional ``'col_name_in_db'``)
                        enables the lookup step for that column
    :param kwargs:      accepted and ignored

    :returns: tuple ``(table_stats, variable_stats, freq_dict)``: a dict of
              table-level values, a pandas DataFrame with one row per column,
              and a dict of column name -> value-counts Series

    :raises TypeError:           if ``df`` is not a Spark DataFrame
    :raises ValueError:          if ``df`` has no rows
    :raises NotImplementedError: if a column is of array / struct / map type
    """
    if not isinstance(df, SparkDataFrame):
        raise TypeError("df must be of type pyspark.sql.DataFrame")

    table_stats = {"n": df.count()}
    if table_stats["n"] == 0:
        raise ValueError("df cannot be empty")

    try:
        matplotlib.style.use("default")
    except Exception:
        # Best effort only: a failure to set the plotting style is ignored.
        pass

    def pretty_name(x):
        """Format a fraction as a percent label, e.g. 0.25 -> '25%'."""
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    def corr_matrix(df, columns=None):
        """Return a pandas DataFrame of pairwise ``df.corr`` values for ``columns``.

        Rows containing a null in any of the selected columns are dropped first.
        """
        if columns is None:
            columns = df.columns
        combinations = list(product(columns, columns))

        def separate(l, n):
            # Split the flat list of pairs into rows of n pairs each.
            for i in range(0, len(l), n):
                yield l[i:i+n]

        grouped = list(separate(combinations, len(columns)))
        df_cleaned = df.select(*columns).na.drop(how="any")

        # Extend every (a, b) pair in place to (a, b, corr(a, b)).
        for row in grouped:
            for idx, pair in enumerate(row):
                row[idx] = pair + (df_cleaned.corr(str(pair[0]), str(pair[1])),)

        df_pandas = pd.DataFrame(grouped).applymap(lambda x: x[2])
        df_pandas.columns = columns
        df_pandas.index = columns

        return df_pandas

    def create_hist_data(df, column, minim, maxim, bins=10):
        """Count the non-null values of ``column`` per equal-width bin.

        Returns a pandas DataFrame indexed by bin id (0..bins-1) with the
        columns ``bin_id``, ``count``, ``left_edge`` and ``width``.
        """

        def create_all_conditions(current_col, column, left_edges, count=1):
            """
            Recursive function that exploits the
            ability to call the Spark SQL Column method
            .when() in a recursive way.
            """
            left_edges = left_edges[:]
            if len(left_edges) == 0:
                return current_col
            if len(left_edges) == 1:
                # Last bin is closed on the right: everything >= its left edge.
                next_col = current_col.when(col(column) >= float(left_edges[0]), count)
                left_edges.pop(0)
                return create_all_conditions(next_col, column, left_edges[:], count+1)
            next_col = current_col.when((float(left_edges[0]) <= col(column))
                                        & (col(column) < float(left_edges[1])), count)
            left_edges.pop(0)
            return create_all_conditions(next_col, column, left_edges[:], count+1)

        num_range = maxim - minim
        bin_width = num_range / float(bins)
        left_edges = [minim]
        for _bin in range(bins):
            left_edges = left_edges + [left_edges[-1] + bin_width]
        # The loop above also appended the right edge of the last bin; drop it.
        left_edges.pop()
        expression_col = when((float(left_edges[0]) <= col(column))
                              & (col(column) < float(left_edges[1])), 0)
        left_edges_copy = left_edges[:]
        left_edges_copy.pop(0)
        bin_data = (df.select(col(column))
                    .na.drop()
                    .select(col(column),
                            create_all_conditions(expression_col,
                                                  column,
                                                  left_edges_copy
                                                 ).alias("bin_id")
                           )
                    .groupBy("bin_id").count()
                   ).toPandas()

        # Re-index so that bins without any value show up with a count of 0.
        bin_data.index = bin_data["bin_id"]
        new_index = list(range(bins))
        bin_data = bin_data.reindex(new_index)
        bin_data["bin_id"] = bin_data.index
        bin_data = bin_data.fillna(0)

        bin_data["left_edge"] = left_edges
        bin_data["width"] = bin_width

        return bin_data

    def describe_numeric_1d(df, column, current_result, nrows, percentile_fn):
        """Shared implementation of the integer and float column summaries.

        ``percentile_fn`` is the Spark SQL function name used for the
        quantiles ("percentile" or "percentile_approx").
        """
        stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                   df_min(col(column)).alias("min"),
                                                   df_max(col(column)).alias("max"),
                                                   variance(col(column)).alias("variance"),
                                                   kurtosis(col(column)).alias("kurtosis"),
                                                   stddev(col(column)).alias("std"),
                                                   skewness(col(column)).alias("skewness"),
                                                   df_sum(col(column)).alias("sum")
                                                   ).toPandas()

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column)
                                        .na.drop()
                                        .selectExpr("{fn}(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(fn=percentile_fn, col=column, n=x)).toPandas().iloc[:,0]
                                        )
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        # Sum of absolute deviations from the mean divided by the non-null count.
        stats["mad"] = (df.select(column)
                        .na.drop()
                        .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().iloc[0,0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column)==0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # The histogram is computed but is not part of the returned stats.
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)

        return stats

    def describe_integer_1d(df, column, current_result, nrows):
        """Summarize an integer column (quantiles via Spark SQL ``percentile``)."""
        return describe_numeric_1d(df, column, current_result, nrows, "percentile")

    def describe_float_1d(df, column, current_result, nrows):
        """Summarize a float column (quantiles via Spark SQL ``percentile_approx``)."""
        return describe_numeric_1d(df, column, current_result, nrows, "percentile_approx")

    def describe_date_1d(df, column):
        """Return min / max (and range when available) of a date or timestamp column."""
        stats_df = df.select(column).na.drop().agg(df_min(col(column)).alias("min"),
                                                   df_max(col(column)).alias("max")
                                                  ).toPandas()
        stats = stats_df.iloc[0].copy()
        stats.name = column

        if isinstance(stats["max"], pd.Timestamp):
            # Timestamps are stored as strings; no range is computed for them.
            stats = stats.astype(object)
            stats["max"] = str(stats["max"].to_pydatetime())
            stats["min"] = str(stats["min"].to_pydatetime())

        else:
            stats["range"] = stats["max"] - stats["min"]
        stats["type"] = "DATE"
        return stats

    def guess_json_type(string_value):
        """Return the Python type ``string_value`` parses to as JSON, or None."""
        try:
            obj = json.loads(string_value)
        except Exception:
            # Not valid JSON (or not a string at all).
            return None

        return type(obj)

    def describe_categorical_1d(df, column):
        """Summarize a categorical column: top value, frequency, top-50 counts."""
        value_counts = (df.select(column).na.drop()
                        .groupBy(column)
                        .agg(count(col(column)))
                        .orderBy("count({c})".format(c=column),ascending=False)
                       ).cache()

        stats = (value_counts
                 .limit(1)
                 .withColumnRenamed(column, "top")
                 .withColumnRenamed("count({c})".format(c=column), "freq")
                ).toPandas().iloc[0]

        top_50 = value_counts.limit(50).toPandas().sort_values("count({c})".format(c=column),
                                                               ascending=False)
        top_50_categories = top_50[column].values.tolist()

        others_count = pd.Series([df.select(column).na.drop()
                        .where(~(col(column).isin(*top_50_categories)))
                        .count()
                        ], index=["***Other Values***"])
        others_distinct_count = pd.Series([value_counts
                                .where(~(col(column).isin(*top_50_categories)))
                                .count()
                                ], index=["***Other Values Distinct Count***"])

        top = top_50.set_index(column)["count({c})".format(c=column)]
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        top = pd.concat([top, others_count, others_distinct_count])
        stats["value_counts"] = top
        stats["type"] = "CAT"
        value_counts.unpersist()
        unparsed_valid_jsons = df.select(column).na.drop().rdd.map(
            lambda x: guess_json_type(x[column])).filter(
            lambda x: x).distinct().collect()
        stats["unparsed_json_types"] = unparsed_valid_jsons
        return stats

    def describe_constant_1d(df, column):
        """Summarize a column that has at most one distinct value."""
        stats = pd.Series(['CONST'], index=['type'], name=column)
        stats["value_counts"] = (df.select(column)
                                 .na.drop()
                                 .limit(1)).toPandas().iloc[:,0].value_counts()
        return stats

    def describe_unique_1d(df, column):
        """Summarize a column whose values are all distinct (first 50 values)."""
        stats = pd.Series(['UNIQUE'], index=['type'], name=column)
        stats["value_counts"] = (df.select(column)
                                 .na.drop()
                                 .limit(50)).toPandas().iloc[:,0].value_counts()
        return stats

    def describe_1d(df, column, nrows, lookup_config=None):
        """Build the full stats Series for one column, dispatching on its type."""
        column_type = df.select(column).dtypes[0][1]
        # "struct" was misspelled "stuct", so struct columns were never rejected.
        if ("array" in column_type) or ("struct" in column_type) or ("map" in column_type):
            raise NotImplementedError("Column {c} is of type {t} and cannot be analyzed".format(c=column, t=column_type))

        distinct_count = df.select(column).agg(countDistinct(col(column)).alias("distinct_count")).toPandas()
        non_nan_count = df.select(column).na.drop().select(count(col(column)).alias("count")).toPandas()
        results_data = pd.concat([distinct_count, non_nan_count],axis=1)
        # Take the scalar explicitly; float(<Series>) is deprecated in pandas.
        n_non_null = float(results_data["count"].iloc[0])
        results_data["p_unique"] = results_data["distinct_count"] / n_non_null
        results_data["is_unique"] = results_data["distinct_count"] == nrows
        results_data["n_missing"] = nrows - results_data["count"]
        results_data["p_missing"] = results_data["n_missing"] / float(nrows)
        results_data["p_infinite"] = 0
        results_data["n_infinite"] = 0
        result = results_data.iloc[0].copy()
        result["memorysize"] = 0
        result.name = column

        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        if result["distinct_count"] <= 1:
            result = pd.concat([result, describe_constant_1d(df, column)])
        elif column_type in {"tinyint", "smallint", "int", "bigint"}:
            result = pd.concat([result, describe_integer_1d(df, column, result, nrows)])
        # NOTE(review): Spark usually reports decimals as "decimal(p,s)", so the
        # bare "decimal" entry may never match -- confirm before relying on it.
        elif column_type in {"float", "double", "decimal"}:
            result = pd.concat([result, describe_float_1d(df, column, result, nrows)])
        elif column_type in {"date", "timestamp"}:
            result = pd.concat([result, describe_date_1d(df, column)])
        elif result["is_unique"]:
            result = pd.concat([result, describe_unique_1d(df, column)])
        else:
            result = pd.concat([result, describe_categorical_1d(df, column)])
            if result["n_missing"] > 0:
                # Count the missing value as one more category.
                result["distinct_count"] = result["distinct_count"] + 1

        if (result["count"] > result["distinct_count"] > 1):
            try:
                result["mode"] = result["top"]
            except KeyError:
                result["mode"] = 0
        else:
            try:
                result["mode"] = result["value_counts"].index[0]
            except KeyError:
                result["mode"] = 0
            except IndexError:
                result["mode"] = "MISSING"

        if lookup_config:
            lookup_object = lookup_config['object']
            col_name_in_db = lookup_config['col_name_in_db'] if 'col_name_in_db' in lookup_config else None
            try:
                matched, unmatched = lookup_object.lookup(df.select(column), col_name_in_db)
                result['lookedup_values'] = str(matched.count()) + "/" + str(df.select(column).count())
            except Exception:
                # Any lookup failure is reported in the result instead of raised.
                result['lookedup_values'] = 'FAILED'
        else:
            result['lookedup_values'] = ''

        return result

    ldesc = {}
    for colum in df.columns:
        lookup_config = None
        if colum in config and 'lookup' in config[colum]:
            lookup_config = config[colum]['lookup']
        ldesc[colum] = describe_1d(df, colum, table_stats["n"], lookup_config=lookup_config)

    if corr_reject is not None:
        computable_corrs = [colum for colum in ldesc if ldesc[colum]["type"] in {"NUM"}]

        if len(computable_corrs) > 0:
            # The matrix is computed but no column is rejected based on it. The
            # former loop over it did nothing and relied on Series.iteritems,
            # which was removed in pandas 2.0.
            corr_matrix(df, columns=computable_corrs)

    variable_stats = pd.DataFrame(ldesc)

    table_stats["nvar"] = len(df.columns)
    table_stats["total_missing"] = float(variable_stats.loc["n_missing"].sum()) / (table_stats["n"] * table_stats["nvar"])
    memsize = 0
    table_stats['memsize'] = fmt_bytesize(memsize)
    table_stats['recordsize'] = fmt_bytesize(memsize / table_stats['n'])
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    # Collect the per-column value counts, skipping columns that have none.
    freq_dict = {}
    for var in variable_stats:
        column_stats = variable_stats[var]
        if "value_counts" in column_stats and column_stats["value_counts"] is not np.nan:
            freq_dict[var] = column_stats["value_counts"]
    try:
        variable_stats = variable_stats.drop("value_counts")
    except (ValueError, KeyError):
        pass

    return table_stats, variable_stats.T, freq_dict

import numpy as np
from pyspark.sql.functions import abs as absou

# Skewness threshold used by fmt_skewness: values below -SKEWNESS_CUTOFF or
# above +SKEWNESS_CUTOFF are labelled "alert".
SKEWNESS_CUTOFF = 20
# Key under which value_formatters stores the generic float formatter.
DEFAULT_FLOAT_FORMATTER = u'spark_df_profiling.__default_float_formatter'


def gradient_format(value, limit1, limit2, c1, c2):
    """Render *value* as a coloured span whose colour is interpolated between c1 and c2.

    :param value:  the number to display; also determines the colour
    :param limit1: value that maps to colour ``c1``
    :param limit2: value that maps to colour ``c2``
    :param c1:     (r, g, b) colour at ``limit1``
    :param c2:     (r, g, b) colour at ``limit2``
    :returns: the HTML produced by ``fmt_color`` for the value and the
              interpolated ``rgb(r, g, b)`` colour
    """
    # Relative position of value between the two limits (not clamped).
    position = (value - limit1) / (limit2 - limit1)
    # Linear interpolation of each of the three channels, truncated to int.
    rgb = tuple(
        int(c1[channel] + (c2[channel] - c1[channel]) * position)
        for channel in range(3)
    )
    return fmt_color(value, "rgb" + str(rgb))


def fmt_color(text, color):
    """Wrap *text* in an HTML span with an inline ``color`` style.

    :param text:  content of the span (converted with ``str``)
    :param color: value placed after ``color:`` in the style attribute
    :returns: the HTML span as a string
    """
    content = str(text)
    return f'<span style="color:{color}">{content}</span>'


def fmt_class(text, cls):
    """Wrap *text* in an HTML span carrying the given ``class`` attribute.

    :param text: content of the span (converted with ``str``)
    :param cls:  value of the ``class`` attribute
    :returns: the HTML span as a string
    """
    content = str(text)
    return f'<span class="{cls}">{content}</span>'


def fmt_bytesize(num, suffix='B'):
    """Format a byte count with binary (1024-based) unit prefixes.

    The magnitude is divided by 1024 until it drops below 1024 and is then
    rendered with one decimal place, e.g. ``1536 -> "1.5 KiB"``. Anything
    that is still >= 1024 after the ``Zi`` step is expressed in ``Yi``.

    :param num:    size to format; the sign is discarded, only the magnitude
                   is rendered
    :param suffix: unit suffix appended after the prefix
    :returns: the formatted size as a string
    """
    # Normalise the sign once, up front. The size check and the division must
    # run for every input, not only for negative ones.
    num = abs(num)
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if num < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)


def fmt_percent(v):
    """Render a fraction as a percentage with one decimal place (0.5 -> "50.0%")."""
    percentage = v * 100
    return f"{percentage:2.1f}%"

def fmt_varname(v):
    """Wrap a variable name in an HTML ``<code>`` element."""
    return f'<code>{v}</code>'


def _fmt_default_float(v):
    """Render a float with up to 5 significant digits, without trailing zeros."""
    text = str(float('{:.5g}'.format(v)))
    # Very large/small floats are rendered in exponent notation ("1e-10").
    # Stripping zeros there would eat exponent digits ("1e-10" -> "1e-1"),
    # so only the plain decimal form is trimmed.
    if 'e' in text:
        return text
    return text.rstrip('0').rstrip('.')


# Maps a statistic name to the callable that renders its value as a string.
# DEFAULT_FLOAT_FORMATTER holds the generic float formatter.
value_formatters = {
    'freq': (lambda v: gradient_format(v, 0, 62000, (30, 198, 244), (99, 200, 72))),
    'p_missing': fmt_percent,
    'p_infinite': fmt_percent,
    'p_unique': fmt_percent,
    'p_zeros': fmt_percent,
    'memorysize': fmt_bytesize,
    'total_missing': fmt_percent,
    DEFAULT_FLOAT_FORMATTER: _fmt_default_float,
    'correlation_var': lambda v: fmt_varname(v),
    # v is an iterable of types/classes; list their names.
    'unparsed_json_types': lambda v: ', '.join([s.__name__ for s in v]),
}

def fmt_row_severity(v):
    """Return the severity label for a statistic value.

    :param v: numeric value of the statistic
    :returns: ``"ignore"`` when the value is NaN or at most 0.01,
              ``"alert"`` otherwise
    """
    negligible = np.isnan(v) or v <= 0.01
    return "ignore" if negligible else "alert"

def fmt_skewness(v):
    """Return the severity label for a skewness value.

    :param v: skewness value
    :returns: ``"alert"`` when the value lies outside
              ``[-SKEWNESS_CUTOFF, SKEWNESS_CUTOFF]``, an empty string
              otherwise (including for NaN)
    """
    if np.isnan(v):
        return ""
    if -SKEWNESS_CUTOFF <= v <= SKEWNESS_CUTOFF:
        return ""
    return "alert"

# Maps a statistic name to the function that returns its severity label
# ("ignore" / "alert" / "").
row_formatters = dict.fromkeys(
    ('p_zeros', 'p_missing', 'p_infinite', 'n_duplicates'),
    fmt_row_severity,
)
row_formatters['skewness'] = fmt_skewness

# Module-level side effect: executed once when this module is loaded, before
# any handler is called. Runs the v3io Spark operator shell script with bash.
# NOTE(review): `run` is presumably subprocess.run — confirm against the file's
# imports. The return value is discarded, so a failing script goes unnoticed.
run(["/bin/bash", "/etc/config/v3io/v3io-spark-operator.sh"])

def describe_spark(context: MLClientCtx,
                   dataset: DataItem,
                   artifact_path,
                   bins: int=30,
                   describe_extended: bool=True):
    """Profile a CSV dataset with Spark and log the summary through the context.

    The CSV is read with a header row and schema inference. When
    ``describe_extended`` is set, this module's ``describe`` function produces
    table-level statistics, per-variable statistics and value frequencies.
    The per-variable statistics (joined with the frequencies when there are
    any) are logged as the ``summary_stats`` dataset and the table-level
    statistics are logged as run results. Otherwise the transposed output of
    Spark's ``DataFrame.describe()`` is logged as ``summary_stats``.

    The Spark session is always stopped, even when profiling or logging fails.

    :param context:           function context used for logging
    :param dataset:           input data item; its local path is read as CSV
    :param artifact_path:     accepted but not used by this function
    :param bins:              forwarded to ``describe``
    :param describe_extended: True for the extended profile, False for the
                              plain Spark ``describe()`` summary
    """
    location = dataset.local()

    spark = SparkSession.builder.appName("Spark job").getOrCreate()

    # try/finally so the session is released on every exit path.
    try:
        df = spark.read.csv(location, header=True, inferSchema=True)

        table = None
        if describe_extended:
            float_cols = [name for name, dtype in df.dtypes
                          if dtype.startswith(('float', 'double'))]

            # Fourth positional argument of describe(); an empty list as before.
            # NOTE(review): confirm what describe() expects in this position.
            kwargs = []
            table, variables, freq = describe(df, bins, float_cols, kwargs)

            tbl_1 = variables.reset_index()

            if freq:
                # One row per (column, value) pair, then collapse each column's
                # {value: count} dicts into a single tuple in 'Merged'.
                tbl_2 = pd.DataFrame.from_dict(freq, orient="index").sort_index().stack().reset_index()
                tbl_2.columns = ['col', 'key', 'val']
                tbl_2['Merged'] = [{key: val} for key, val in zip(tbl_2.key, tbl_2.val)]
                tbl_2 = tbl_2.groupby('col', as_index=False).agg(lambda x: tuple(x))[['col', 'Merged']]

                summary = pd.merge(tbl_1, tbl_2, how='left', left_on='index', right_on='col')
            else:
                summary = tbl_1
        else:
            # NOTE(review): after the transpose the original column names live
            # in the index, and the dataset is logged with index=False —
            # confirm they are not lost in the resulting CSV.
            summary = df.describe().toPandas().T

        context.log_dataset("summary_stats",
                            df=summary,
                            format="csv", index=False,
                            artifact_path=context.artifact_subpath('data'))

        # Table-level statistics exist only for the extended profile.
        if table is not None:
            context.log_results(table)
    finally:
        spark.stop()

 + commands: [] + code_origin: https://github.com/Idan707/functions.git#a68e6f7607e56573f329abc5b510bb24612d886e:.ipynb diff --git a/describe_spark/read_csv_spark.ipynb b/describe_spark/read_csv_spark.ipynb new file mode 100644 index 000000000..dd27375ec --- /dev/null +++ b/describe_spark/read_csv_spark.ipynb @@ -0,0 +1,421 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import and Config" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# nuclio: ignore\n", + "import nuclio" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "%nuclio: setting kind to 'job'\n", + "%nuclio: setting spec.image to 'iguazio/shell:3.0_b5565_20201026062233_wsdf'\n" + ] + } + ], + "source": [ + "%nuclio config kind = \"job\"\n", + "%nuclio config spec.image = \"iguazio/shell:3.0_b5565_20201026062233_wsdf\" # docker image available on idan707/spark_shell " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/local/bin/python\n", + "\n", + "import mlrun\n", + "from mlrun.platforms.iguazio import mount_v3io, mount_v3iod\n", + "from mlrun.datastore import DataItem\n", + "from mlrun.execution import MLClientCtx\n", + "\n", + "import os\n", + "#import spark_df_profiling\n", + "from subprocess import run\n", + "\n", + "from pyspark.sql import SparkSession\n", + "import pyspark.sql.functions as f\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build Simple Read CSV Function using Spark" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/local/bin/python\n", + "\n", + "run([\"/bin/bash\", \"/etc/config/v3io/v3io-spark-operator.sh\"])\n", + "\n", + "def describe_spark(context: MLClientCtx, \n", + " dataset: DataItem, \n", + " 
artifact_path):\n", + " \n", + " # get file location\n", + " location = dataset.local()\n", + " \n", + " # build spark session\n", + " spark = SparkSession.builder.appName(\"Spark job\").getOrCreate()\n", + " \n", + " # read csv\n", + " df = spark.read.csv(location, header=True, inferSchema= True)\n", + " \n", + " # show\n", + " df.show(5)\n", + " \n", + " # sample for logging\n", + " df_to_log = df.sample(False, 0.1).toPandas()\n", + " \n", + " # log final report\n", + " context.log_dataset(\"df_sample\", \n", + " df=df_to_log,\n", + " format=\"csv\", index=False,\n", + " artifact_path=context.artifact_subpath('data'))\n", + " \n", + " spark.stop()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# nuclio: end-code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save and Config" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "fn = mlrun.code_to_function(handler=\"describe_spark\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "fn.apply(mount_v3io())\n", + "fn.apply(mount_v3iod(namespace=\"default-tenant\", v3io_config_configmap=\"spark-operator-v3io-config\"))\n", + "fn.spec.image_pull_policy = \"IfNotPresent\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-10-28 11:26:16,525 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n" + ] + } + ], + "source": [ + "artifact_path = mlrun.set_environment(api_path = 'http://mlrun-api:8080',\n", + " artifact_path = os.path.abspath('./'))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-10-28 11:26:16,536 [warning] warning!, server (0.5.3-rc1) and 
client (0.5.2) ver dont match\n", + "> 2020-10-28 11:26:16,536 [info] starting run test-describe_spark uid=c90bf87b5c9641ca9bc17940e068ab38 -> http://mlrun-api:8080\n", + "> 2020-10-28 11:26:16,680 [info] Job is running in the background, pod: test-describe-spark-xlzj7\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 28 11:26:16runningtest-describe_spark
v3io_user=admin
kind=job
owner=admin
dataset
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run c90bf87b5c9641ca9bc17940e068ab38 --project default , !mlrun logs c90bf87b5c9641ca9bc17940e068ab38 --project default\n", + "> 2020-10-28 11:26:16,762 [info] run executed, status=running\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fn.run(inputs={\"dataset\": \"iris_dataset.csv\"},\n", + " artifact_path=artifact_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:root] *", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/model_server/README.md b/model_server/README.md index a5ccc81b9..1b382cb9a 100644 --- a/model_server/README.md +++ b/model_server/README.md @@ -1,6 +1,11 @@ # serving models -**`xgboost/xgb-serving.ipynb`** deploy an xgboost server model
+Model Serving provides a solution to host machine learning / deep learning (ML/DL) models as REST endpoints that are updated automatically, enabling data science teams to own the end-to-end lifecycle of a real-time machine learning model from training to production. -**`model_server.ipynb`** deploy any classifier model that has been pickled (cloudpickle)
- For demonstrations, see **[lightgbm-project](https://github.com/yjb-ds/lightgbm-project)**, **[demo-sklearn-project](https://github.com/yjb-ds/demo-sklearn-project)**, and **[demo-xgb-project](https://github.com/yjb-ds/demo-xgb-project/tree/functions)** \ No newline at end of file +**`model_server.ipynb`** deploy any classifier model that has been pickled (cloudpickle). + +For more demonstrations : + +1. **[lightgbm-project](https://github.com/yjb-ds/lightgbm-project)** +2. **[demo-sklearn-project](https://github.com/yjb-ds/demo-sklearn-project)** +3. **[demo-xgb-project](https://github.com/yjb-ds/demo-xgb-project/tree/functions)** \ No newline at end of file diff --git a/model_server/function.yaml b/model_server/function.yaml index 1c7739f92..8de42cf67 100644 --- a/model_server/function.yaml +++ b/model_server/function.yaml @@ -1,7 +1,7 @@ kind: remote metadata: name: model-server - hash: 8f9b901041ee2f8781fe86beb1c9486193a1f9ee + hash: a1dc5a391186ead91ff5cef97ba9ab277adb0ceb project: default labels: author: yaronh @@ -14,12 +14,11 @@ spec: args: [] image: '' description: generic sklearn model server + min_replicas: 1 max_replicas: 4 env: - name: MODEL_CLASS value: ClassifierModel - - name: ENABLE_EXPLAINER - value: 'False' config: spec.triggers.http: kind: http @@ -29,23 +28,23 @@ spec: annotations: {} base_spec: apiVersion: nuclio.io/v1 - kind: nuclio:serving + kind: serving metadata: annotations: - nuclio.io/generated_by: function generated from 30-08-2020 + nuclio.io/generated_by: function generated from 29-11-2020 by admin labels: {} name: model-server spec: build: - baseImage: mlrun/mlrun commands: - - python -m pip install numpy cloudpickle v3io sklearn - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKaW1wb3J0IG51bXB5IGFzIG5wCmZyb20gdHlwaW5nIGltcG9ydCBMaXN0CmZyb20gZGF0ZXRpbWUgaW1wb3J0IGRhdGV0aW1lCmltcG9ydCBtbHJ1bgoKY2xhc3MgQ2xhc3NpZmllck1vZGVsKG1scnVuLnJ1bnRpbWVzLk1MTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgIiIiTG9hZCBtb2RlbCBmcm9tIHN0b3JhZ2UuIiIiCiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCcucGtsJykKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZChvcGVuKG1vZGVsX2ZpbGUsICdyYicpKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuCiAgICAgICAgCiAgICAgICAgOnBhcmFtIGJvZHkgOiBBIGRpY3Qgb2Ygb2JzZXJ2YXRpb25zLCBlYWNoIG9mIHdoaWNoIGlzIGFuIDEtZGltZW5zaW9uYWwgZmVhdHVyZSB2ZWN0b3IuCiAgICAgICAgICAgIAogICAgICAgIFJldHVybnMgbW9kZWwgcHJlZGljdGlvbnMgYXMgYSBgTGlzdGAsIG9uZSBmb3IgZWFjaCByb3cgaW4gdGhlIGBib2R5YCBpbnB1dCBgTGlzdGAuCiAgICAgICAgIiIiCiAgICAgICAgdHJ5OgogICAgICAgICAgICBmZWF0cyA9IG5wLmFzYXJyYXkoYm9keVsnaW5zdGFuY2VzJ10pCiAgICAgICAgICAgIHJlc3VsdDogbnAubmRhcnJheSA9IHNlbGYubW9kZWwucHJlZGljdChmZWF0cykKICAgICAgICAgICAgcmVzcCA9IHJlc3VsdC50b2xpc3QoKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZToKICAgICAgICAgICAgcmFpc2UgRXhjZXB0aW9uKGYiRmFpbGVkIHRvIHByZWRpY3Qge2V9IikKICAgICAgICAKICAgICAgICByZXR1cm4gcmVzcAoKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZycpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= + - python -m pip install numpy cloudpickle v3io sklearn mlrun + functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG1scnVuCgpmcm9tIGNsb3VkcGlja2xlIGltcG9ydCBsb2FkCmZyb20gdHlwaW5nIGltcG9ydCBMaXN0CmZyb20gZGF0ZXRpbWUgaW1wb3J0IGRhdGV0aW1lCmZyb20gc2tsZWFybi5kYXRhc2V0cyBpbXBvcnQgbG9hZF9pcmlzCgppbXBvcnQgd2FybmluZ3MKd2FybmluZ3MuZmlsdGVyd2FybmluZ3MoJ2lnbm9yZScpCgppbXBvcnQgb3MKaW1wb3J0IG51bXB5IGFzIG5wCgpjbGFzcyBDbGFzc2lmaWVyTW9kZWwobWxydW4ucnVudGltZXMuTUxNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICAiIiJMb2FkIG1vZGVsIGZyb20gc3RvcmFnZS4iIiIKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoJy5wa2wnKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4obW9kZWxfZmlsZSwgJ3JiJykpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keTogZGljdCkgLT4gTGlzdDoKICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4KICAgICAgICAKICAgICAgICA6cGFyYW0gYm9keSA6IEEgZGljdCBvZiBvYnNlcnZhdGlvbnMsIGVhY2ggb2Ygd2hpY2ggaXMgYW4gMS1kaW1lbnNpb25hbCBmZWF0dXJlIHZlY3Rvci4KICAgICAgICAgICAgCiAgICAgICAgUmV0dXJucyBtb2RlbCBwcmVkaWN0aW9ucyBhcyBhIGBMaXN0YCwgb25lIGZvciBlYWNoIHJvdyBpbiB0aGUgYGJvZHlgIGlucHV0IGBMaXN0YC4KICAgICAgICAiIiIKICAgICAgICB0cnk6CiAgICAgICAgICAgIGZlYXRzID0gbnAuYXNhcnJheShib2R5WydpbnN0YW5jZXMnXSkKICAgICAgICAgICAgcmVzdWx0OiBucC5uZGFycmF5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzKQogICAgICAgICAgICByZXNwID0gcmVzdWx0LnRvbGlzdCgpCiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBlOgogICAgICAgICAgICByYWlzZSBFeGNlcHRpb24oZiJGYWlsZWQgdG8gcHJlZGljdCB7ZX0iKQogICAgICAgIAogICAgICAgIHJldHVybiByZXNwCgoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== noBaseImagesPull: true env: - name: MODEL_CLASS value: ClassifierModel handler: model_server:handler + image: mlrun/ml-models runtime: python:3.6 volumes: [] source: '' diff --git a/model_server/model_server.ipynb b/model_server/model_server.ipynb index c79e3b591..750fe3a99 100644 --- 
a/model_server/model_server.ipynb +++ b/model_server/model_server.ipynb @@ -9,43 +9,32 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# nuclio: ignore\n", - "import nuclio" + "import mlrun" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "%nuclio: setting kind to 'nuclio:serving'\n", + "%nuclio: setting kind to 'serving'\n", "%nuclio: setting 'MODEL_CLASS' environment variable\n", - "%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'\n" + "%nuclio: setting spec.image to 'mlrun/mlrun'\n" ] } ], "source": [ - "%nuclio config kind=\"nuclio:serving\"\n", + "%nuclio config kind=\"serving\"\n", "%nuclio env MODEL_CLASS=ClassifierModel\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install numpy cloudpickle v3io sklearn" + "%nuclio config spec.image = \"mlrun/mlrun\"" ] }, { @@ -54,20 +43,17 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "from cloudpickle import load\n", - "import numpy as np\n", "from typing import List\n", "from datetime import datetime\n", - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ + "from sklearn.datasets import load_iris\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import os\n", + "import numpy as np\n", + "\n", "class ClassifierModel(mlrun.runtimes.MLModelServer):\n", " def load(self):\n", " \"\"\"Load model from storage.\"\"\"\n", @@ -93,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -118,73 +104,47 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### test locally" + 
"### Test locally" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "import cloudpickle as cp\n", - "models_path = '/User/ml/demos/sklearn-pipe/models'\n", + "model = 'https://s3.wasabisys.com/iguazio/models/iris/model.pkl'\n", "\n", - "from sklearn.datasets import load_iris\n", "iris = load_iris()\n", "\n", "x = iris['data'].tolist()\n", "y = iris['target']\n", "\n", - "for model in os.listdir(models_path):\n", - " if model.endswith(\".pkl\"):\n", - " \n", - " my_server = ClassifierModel('classifier', model_dir=os.path.join(models_path, model))\n", - " my_server.load()\n", - "\n", - " a = my_server.predict({\"instances\": x})\n", + "my_server = ClassifierModel('classifier', model_dir=model)\n", + "my_server.load()\n", "\n", - " assert len(a)==150" + "a = my_server.predict({\"instances\": x})\n", + "assert len(a)==150" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## document and save" + "## Document and save" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-08-10 12:54:58,765 [info] function spec saved to path: function.yaml\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "from mlrun import new_model_server\n", - "fn = new_model_server('model-server', model_class='ClassifierModel')\n", + "fn = mlrun.new_model_server('model-server', model_class='ClassifierModel')\n", "fn.spec.description = \"generic sklearn model server\"\n", "fn.metadata.categories = ['serving', 'ml']\n", "fn.metadata.labels = {'author': 'yaronh', 'framework': 'sklearn'}\n", - "#print(fn.to_yaml())\n", - "fn.export()" + "#fn.export()" ] }, { @@ -196,26 +156,33 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2020-08-10 12:55:08,975 [info] deploy started\n", - "[nuclio] 2020-08-10 12:55:10,073 (info) Build complete\n", - "[nuclio] 2020-08-10 12:55:16,173 (info) Function deploy complete\n", - "[nuclio] 2020-08-10 12:55:16,181 done updating sk-project-sklearn-server, function address: 34.202.248.16:30045\n" + "> 2020-12-06 11:26:41,000 [info] Starting remote function deploy\n", + "2020-12-06 11:26:41 (info) Deploying function\n", + "2020-12-06 11:26:41 (info) Building\n", + "2020-12-06 11:26:41 (info) Staging files and preparing base images\n", + "2020-12-06 11:26:41 (info) Building processor image\n", + "2020-12-06 11:28:28 (info) Build complete\n", + "2020-12-06 11:28:34 (info) Function deploy complete\n", + "> 2020-12-06 11:28:35,076 [info] function deployed, address=default-tenant.app.yh210.iguazio-cd2.com:31804\n" ] } ], "source": [ - "from mlrun import mount_v3io\n", - "fn.apply(mount_v3io())\n", - "fn.set_envs({'SERVING_MODEL_iris_dataset_v1': models_path,\n", - " 'INFERENCE_STREAM': 'users/admin/tststream'})\n", - "#fn.verbose = True\n", + "import mlrun\n", + "user_name = os.getenv(\"V3IO_USER_NAME\")\n", + "artifact_path = mlrun.set_environment(api_path = 'http://mlrun-api:8080',\n", + " artifact_path = os.path.abspath('./'))\n", + "fn.apply(mlrun.mount_v3io())\n", + "fn.set_envs({'SERVING_MODEL_iris_dataset_v1': model,\n", + " 'INFERENCE_STREAM': 'users/{}/tststream'.format(user_name)})\n", + "\n", "address = fn.deploy(project='sk-project')" ] }, @@ -228,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -269,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.6" } }, "nbformat": 4, diff --git a/model_server_tester/README.md b/model_server_tester/README.md new file mode 100644 index 000000000..d04fe7d9e --- /dev/null +++ 
b/model_server_tester/README.md @@ -0,0 +1,14 @@ +# Live Model Server Testing + +Test your model server via HTTP calls + +```markdown + +:param table: csv/parquet table with test data +:param addr: function address/url +:param label_column: name of the label column in table +:param model: tested model name +:param match_err: raise error on validation (require proper test set) +:param rows: number of rows to use from test set + +``` diff --git a/model_server_tester/function.yaml b/model_server_tester/function.yaml index 483ec9759..24563f817 100644 --- a/model_server_tester/function.yaml +++ b/model_server_tester/function.yaml @@ -2,8 +2,8 @@ kind: job metadata: name: model-server-tester tag: '' - hash: 934f5a336bffc3204b47174b6c5d367a1e0e6267 - project: '' + hash: 23c722aee8394de7116a7665c69c815851c5cdbf + project: default labels: author: yaronh categories: @@ -21,12 +21,15 @@ spec: doc: Test a model server parameters: - name: context + default: '' - name: table type: DataItem doc: csv/parquet table with test data + default: '' - name: addr type: str doc: function address/url + default: '' - name: label_column type: str doc: name of the label column in table @@ -34,17 +37,20 @@ spec: - name: model type: str doc: 'tested model name ' + default: '' - name: match_err type: bool doc: raise error on validation (require proper test set) + default: false - name: rows type: int doc: number of rows to use from test set default: 20 - outputs: [] - lineno: 12 + outputs: + - default: '' + lineno: 14 description: test model servers build: - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlciBvbiAyMDIwLTA1LTA4IDIxOjM1CgppbXBvcnQgb3MKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcmVxdWVzdHMKaW1wb3J0IGpzb24KaW1wb3J0IG51bXB5IGFzIG5wCmZyb20gZGF0ZXRpbWUgaW1wb3J0IGRhdGV0aW1lCmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cyBpbXBvcnQgZ2V0X21vZGVsLCBDaGFydEFydGlmYWN0CgpkZWYgbW9kZWxfc2VydmVyX3Rlc3Rlcihjb250ZXh0LAogICAgICAgICAgICAgICAgICAgICAgICB0YWJsZTogRGF0YUl0ZW0sCiAgICAgICAgICAgICAgICAgICAgICAgIGFkZHI6IHN0ciwgCiAgICAgICAgICAgICAgICAgICAgICAgIGxhYmVsX2NvbHVtbjogc3RyID0gImxhYmVsIiwKICAgICAgICAgICAgICAgICAgICAgICAgbW9kZWw6IHN0ciA9ICcnLAogICAgICAgICAgICAgICAgICAgICAgICBtYXRjaF9lcnI6IGJvb2wgPSBGYWxzZSwKICAgICAgICAgICAgICAgICAgICAgICAgcm93czogaW50ID0gMjApOgogICAgIiIiIFRlc3QgYSBtb2RlbCBzZXJ2ZXIgCiAgICAKICAgIDpwYXJhbSB0YWJsZTogICAgICAgICBjc3YvcGFycXVldCB0YWJsZSB3aXRoIHRlc3QgZGF0YQogICAgOnBhcmFtIGFkZHI6ICAgICAgICAgIGZ1bmN0aW9uIGFkZHJlc3MvdXJsCiAgICA6cGFyYW0gbGFiZWxfY29sdW1uOiAgbmFtZSBvZiB0aGUgbGFiZWwgY29sdW1uIGluIHRhYmxlCiAgICA6cGFyYW0gbW9kZWw6ICAgICAgICAgdGVzdGVkIG1vZGVsIG5hbWUgCiAgICA6cGFyYW0gbWF0Y2hfZXJyOiAgICAgcmFpc2UgZXJyb3Igb24gdmFsaWRhdGlvbiAocmVxdWlyZSBwcm9wZXIgdGVzdCBzZXQpCiAgICA6cGFyYW0gcm93czogICAgICAgICAgbnVtYmVyIG9mIHJvd3MgdG8gdXNlIGZyb20gdGVzdCBzZXQKICAgICIiIgogICAgICAgIAogICAgdGFibGUgPSB0YWJsZS5hc19kZigpCgogICAgeV9saXN0ID0gdGFibGUucG9wKGxhYmVsX2NvbHVtbikudmFsdWVzLnRvbGlzdCgpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYndGVzdGluZyB3aXRoIGRhdGFzZXQgYWdhaW5zdCB7YWRkcn0sIG1vZGVsOiB7bW9kZWx9JykKICAgIGlmIHJvd3MgYW5kIHJvd3MgPCB0YWJsZS5zaGFwZVswXToKICAgICAgICB0YWJsZSA9IHRhYmxlLnNhbXBsZShyb3dzKQogICAgCiAgICBjb3VudCA9IGVycl9jb3VudCA9IG1hdGNoID0gMAogICAgdGltZXMgPSBbXQogICAgZm9yIHgsIHkgaW4gemlwKHRhYmxlLnZhbHVlcywgeV9saXN0KToKICAgICAgICBjb3VudCArPSAxCiAgICAgICAgZXZlbnRfZGF0YSA9IGpzb24uZHVtcHMoeyJpbnN0YW5jZXMiOlt4LnRvbGlzdCgpXX0pCiAgICAgICAgaGFkX2VyciA9IEZhbHNlCiAgICAgICAgdHJ5OgogICAgICAgICAgICBzdGFydCA9IGRhdGV0aW1lLm5vdygpCiAgICAgICAgICAgIHJlc3AgPSByZXF1ZXN0cy5wdXQoZid7YWRkcn0ve21vZGVsfS9wcmVkaWN0Jywg
anNvbj1ldmVudF9kYXRhKQogICAgICAgICAgICBpZiBub3QgcmVzcC5vazoKICAgICAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmVycm9yKGYnYmFkIGZ1bmN0aW9uIHJlc3AhIVxue3Jlc3AudGV4dH0nKQogICAgICAgICAgICAgICAgZXJyX2NvdW50ICs9IDEKICAgICAgICAgICAgICAgIGNvbnRpbnVlCiAgICAgICAgICAgIHRpbWVzLmFwcGVuZCgoZGF0ZXRpbWUubm93KCktc3RhcnQpLm1pY3Jvc2Vjb25kcykKICAgICAgICAgICAgICAgIAogICAgICAgIGV4Y2VwdCBPU0Vycm9yIGFzIGVycjoKICAgICAgICAgICAgY29udGV4dC5sb2dnZXIuZXJyb3IoZidlcnJvciBpbiByZXF1ZXN0LCBkYXRhOntldmVudF9kYXRhfSwgZXJyb3I6IHtlcnJ9JykKICAgICAgICAgICAgZXJyX2NvdW50ICs9IDEKICAgICAgICAgICAgY29udGludWUKICAgICAgICAgICAgCiAgICAgICAgeV9yZXNwID0gcmVzcC5qc29uKClbMF0KICAgICAgICBpZiB5ID09IHlfcmVzcDoKICAgICAgICAgICAgbWF0Y2ggKz0gMQogICAgICAgIAogICAgY29udGV4dC5sb2dfcmVzdWx0KCd0b3RhbF90ZXN0cycsIGNvdW50KQogICAgY29udGV4dC5sb2dfcmVzdWx0KCdlcnJvcnMnLCBlcnJfY291bnQpCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ21hdGNoJywgbWF0Y2gpCiAgICBpZiBjb3VudCAtIGVycl9jb3VudCA+IDA6CiAgICAgICAgdGltZXNfYXJyID0gbnAuYXJyYXkodGltZXMpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdhdmdfbGF0ZW5jeScsIGludChucC5tZWFuKHRpbWVzX2FycikpKQogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnbWluX2xhdGVuY3knLCBpbnQobnAuYW1pbih0aW1lc19hcnIpKSkKICAgICAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ21heF9sYXRlbmN5JywgaW50KG5wLmFtYXgodGltZXNfYXJyKSkpCiAgICAgICAgCiAgICAgICAgY2hhcnQgPSBDaGFydEFydGlmYWN0KCdsYXRlbmN5JywgaGVhZGVyPVsnVGVzdCcsICdMYXRlbmN5IChtaWNyb3NlYyknXSkKICAgICAgICBmb3IgaSBpbiByYW5nZShsZW4odGltZXMpKToKICAgICAgICAgICAgY2hhcnQuYWRkX3JvdyhbaSsxLCBpbnQodGltZXNbaV0pXSkKICAgICAgICBjb250ZXh0LmxvZ19hcnRpZmFjdChjaGFydCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYncnVuIHtjb3VudH0gdGVzdHMsIHtlcnJfY291bnR9IGVycm9ycyBhbmQge21hdGNofSBtYXRjaCBleHBlY3RlZCB2YWx1ZScpCiAgICAKICAgIGlmIGVycl9jb3VudDoKICAgICAgICByYWlzZSBWYWx1ZUVycm9yKGYnZmFpbGVkIG9uIHtlcnJfY291bnR9IHRlc3RzIG9mIHtjb3VudH0nKQogICAgCiAgICBpZiBtYXRjaF9lcnIgYW5kIG1hdGNoICE9IGNvdW50OgogICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZidvbmx5IHttYXRjaH0gcmVzdWx0cyBtYXRjaCBvdXQgb2Yge2NvdW50fScpCgo= + functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IHJlcXVlc3RzCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgbWxydW4KCmZyb20gZGF0ZXRpbWUgaW1wb3J0IGRhdGV0aW1lCmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cyBpbXBvcnQgZ2V0X21vZGVsLCBDaGFydEFydGlmYWN0CgpkZWYgbW9kZWxfc2VydmVyX3Rlc3Rlcihjb250ZXh0LAogICAgICAgICAgICAgICAgICAgICAgICB0YWJsZTogRGF0YUl0ZW0sCiAgICAgICAgICAgICAgICAgICAgICAgIGFkZHI6IHN0ciwgCiAgICAgICAgICAgICAgICAgICAgICAgIGxhYmVsX2NvbHVtbjogc3RyID0gImxhYmVsIiwKICAgICAgICAgICAgICAgICAgICAgICAgbW9kZWw6IHN0ciA9ICcnLAogICAgICAgICAgICAgICAgICAgICAgICBtYXRjaF9lcnI6IGJvb2wgPSBGYWxzZSwKICAgICAgICAgICAgICAgICAgICAgICAgcm93czogaW50ID0gMjApOgogICAgIiIiIFRlc3QgYSBtb2RlbCBzZXJ2ZXIgCiAgICAKICAgIDpwYXJhbSB0YWJsZTogICAgICAgICBjc3YvcGFycXVldCB0YWJsZSB3aXRoIHRlc3QgZGF0YQogICAgOnBhcmFtIGFkZHI6ICAgICAgICAgIGZ1bmN0aW9uIGFkZHJlc3MvdXJsCiAgICA6cGFyYW0gbGFiZWxfY29sdW1uOiAgbmFtZSBvZiB0aGUgbGFiZWwgY29sdW1uIGluIHRhYmxlCiAgICA6cGFyYW0gbW9kZWw6ICAgICAgICAgdGVzdGVkIG1vZGVsIG5hbWUgCiAgICA6cGFyYW0gbWF0Y2hfZXJyOiAgICAgcmFpc2UgZXJyb3Igb24gdmFsaWRhdGlvbiAocmVxdWlyZSBwcm9wZXIgdGVzdCBzZXQpCiAgICA6cGFyYW0gcm93czogICAgICAgICAgbnVtYmVyIG9mIHJvd3MgdG8gdXNlIGZyb20gdGVzdCBzZXQKICAgICIiIgogICAgICAgIAogICAgdGFibGUgPSB0YWJsZS5hc19kZigpCgogICAgeV9saXN0ID0gdGFibGUucG9wKGxhYmVsX2NvbHVtbikudmFsdWVzLnRvbGlzdCgpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYndGVzdGluZyB3aXRoIGRhdGFzZXQgYWdhaW5zdCB7YWRkcn0sIG1vZGVsOiB7bW9kZWx9JykKICAgIGlmIHJvd3MgYW5kIHJvd3MgPCB0YWJsZS5zaGFwZVswXToKICAgICAgICB0YWJsZSA9IHRhYmxlLnNhbXBsZShyb3dzKQogICAgCiAgICBjb3VudCA9IGVycl9jb3VudCA9IG1hdGNoID0gMAogICAgdGltZXMgPSBbXQogICAgZm9yIHgsIHkgaW4gemlwKHRhYmxlLnZhbHVlcywgeV9saXN0KToKICAgICAgICBjb3VudCArPSAxCiAgICAgICAgZXZlbnRfZGF0YSA9IGpzb24uZHVtcHMoeyJpbnN0YW5jZXMiOlt4LnRvbGlzdCgpXX0pCiAgICAgICAgaGFkX2VyciA9IEZhbHNlCiAgICAgICAgdHJ5OgogICAgICAgICAgICBzdGFydCA9IGRhdGV0aW1lLm5vdygpCiAgICAgICAgICAgIHJlc3AgPSByZXF1ZXN0cy5wdXQoZid7YWRkcn0ve21vZGVsfS9wcmVkaWN0JywganNvbj1l
dmVudF9kYXRhKQogICAgICAgICAgICBpZiBub3QgcmVzcC5vazoKICAgICAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmVycm9yKGYnYmFkIGZ1bmN0aW9uIHJlc3AhIVxue3Jlc3AudGV4dH0nKQogICAgICAgICAgICAgICAgZXJyX2NvdW50ICs9IDEKICAgICAgICAgICAgICAgIGNvbnRpbnVlCiAgICAgICAgICAgIHRpbWVzLmFwcGVuZCgoZGF0ZXRpbWUubm93KCktc3RhcnQpLm1pY3Jvc2Vjb25kcykKICAgICAgICAgICAgICAgIAogICAgICAgIGV4Y2VwdCBPU0Vycm9yIGFzIGVycjoKICAgICAgICAgICAgY29udGV4dC5sb2dnZXIuZXJyb3IoZidlcnJvciBpbiByZXF1ZXN0LCBkYXRhOntldmVudF9kYXRhfSwgZXJyb3I6IHtlcnJ9JykKICAgICAgICAgICAgZXJyX2NvdW50ICs9IDEKICAgICAgICAgICAgY29udGludWUKICAgICAgICAgICAgCiAgICAgICAgeV9yZXNwID0gcmVzcC5qc29uKClbMF0KICAgICAgICBpZiB5ID09IHlfcmVzcDoKICAgICAgICAgICAgbWF0Y2ggKz0gMQogICAgICAgIAogICAgY29udGV4dC5sb2dfcmVzdWx0KCd0b3RhbF90ZXN0cycsIGNvdW50KQogICAgY29udGV4dC5sb2dfcmVzdWx0KCdlcnJvcnMnLCBlcnJfY291bnQpCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ21hdGNoJywgbWF0Y2gpCiAgICBpZiBjb3VudCAtIGVycl9jb3VudCA+IDA6CiAgICAgICAgdGltZXNfYXJyID0gbnAuYXJyYXkodGltZXMpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdhdmdfbGF0ZW5jeScsIGludChucC5tZWFuKHRpbWVzX2FycikpKQogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnbWluX2xhdGVuY3knLCBpbnQobnAuYW1pbih0aW1lc19hcnIpKSkKICAgICAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ21heF9sYXRlbmN5JywgaW50KG5wLmFtYXgodGltZXNfYXJyKSkpCiAgICAgICAgCiAgICAgICAgY2hhcnQgPSBDaGFydEFydGlmYWN0KCdsYXRlbmN5JywgaGVhZGVyPVsnVGVzdCcsICdMYXRlbmN5IChtaWNyb3NlYyknXSkKICAgICAgICBmb3IgaSBpbiByYW5nZShsZW4odGltZXMpKToKICAgICAgICAgICAgY2hhcnQuYWRkX3JvdyhbaSsxLCBpbnQodGltZXNbaV0pXSkKICAgICAgICBjb250ZXh0LmxvZ19hcnRpZmFjdChjaGFydCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYncnVuIHtjb3VudH0gdGVzdHMsIHtlcnJfY291bnR9IGVycm9ycyBhbmQge21hdGNofSBtYXRjaCBleHBlY3RlZCB2YWx1ZScpCiAgICAKICAgIGlmIGVycl9jb3VudDoKICAgICAgICByYWlzZSBWYWx1ZUVycm9yKGYnZmFpbGVkIG9uIHtlcnJfY291bnR9IHRlc3RzIG9mIHtjb3VudH0nKQogICAgCiAgICBpZiBtYXRjaF9lcnIgYW5kIG1hdGNoICE9IGNvdW50OgogICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZidvbmx5IHttYXRjaH0gcmVzdWx0cyBtYXRjaCBvdXQgb2Yge2NvdW50fScpCgo= commands: [] - code_origin: 
https://github.com/mlrun/functions.git#544df038d917cad745e946cb64378582151527ee:model_server_tester.ipynb + code_origin: https://github.com/Idan707/functions.git#7175ca7249cf11e0e163b21cffacd692032ba0a5:model_server_tester.ipynb diff --git a/model_server_tester/model_server_tester.ipynb b/model_server_tester/model_server_tester.ipynb index a34ef3193..a2d284104 100644 --- a/model_server_tester/model_server_tester.ipynb +++ b/model_server_tester/model_server_tester.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -47,6 +47,7 @@ "import requests\n", "import json\n", "import numpy as np\n", + "\n", "from datetime import datetime\n", "from mlrun.datastore import DataItem\n", "from mlrun.artifacts import get_model, ChartArtifact\n", @@ -132,49 +133,105 @@ "# marks the end of a code section" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Deploy model server for testing" + ] + }, { "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-10-28 16:43:54,679 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n", + "> 2020-10-28 16:43:55,002 [info] deploy started\n", + "[nuclio] 2020-10-28 16:45:17,274 (info) Build complete\n", + "[nuclio] 2020-10-28 16:45:22,363 done updating default-model-server, function address: default-tenant.app.dsteam.iguazio-cd1.com:30150\n", + "> 2020-10-28 16:45:22,369 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n" + ] + } + ], "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", + "import mlrun\n", + "project_name = 
'sk-project'\n", + "MODEL_PATH = 'https://s3.wasabisys.com/iguazio/models/iris/model.pkl'\n", "\n", - "# specify artifacts target location\n", - "artifact_path = mlconf.artifact_path or path.abspath('./')\n", - "project_name = 'sk-project'" + "artifact_path = mlrun.set_environment(api_path = 'http://mlrun-api:8080',\n", + " artifact_path = os.path.abspath('./'))\n", + "\n", + "# import model server function from hub\n", + "fn = mlrun.import_function('hub://model_server')\n", + "fn.add_model(\"mymodel\", MODEL_PATH)\n", + "address = fn.deploy()" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[mlrun] 2020-04-06 23:26:00,746 starting run model_server_tester uid=b293663098374087b0bdd4d1e24bca86 -> http://10.196.88.27:80\n", - "[mlrun] 2020-04-06 23:26:00,915 testing with dataset against http://13.58.191.176:30115, model: iris_dataset_v1\n", - "[mlrun] 2020-04-06 23:26:01,137 run 10 tests, 0 errors and 1 match expected value\n", - "\n" + "> 2020-10-28 16:45:22,418 [info] deploy started\n", + "[nuclio] 2020-10-28 16:45:25,272 (info) Build complete\n", + "[nuclio] 2020-10-28 16:45:34,868 done updating default-model-server, function address: default-tenant.app.dsteam.iguazio-cd1.com:30150\n" + ] + } + ], + "source": [ + "user_name = os.getenv('V3IO_USERNAME')\n", + "\n", + "fn.apply(mlrun.mount_v3io())\n", + "fn.set_envs({'SERVING_MODEL_iris_dataset_v1': MODEL_PATH,\n", + " 'INFERENCE_STREAM': 'users/{}/tststream'.format(user_name)})\n", + "\n", + "address = fn.deploy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run model server tester locally" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-10-28 16:45:34,916 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n", + "> 2020-10-28 
16:45:34,916 [info] starting run model_server_tester uid=c84fdd4dfacd447dbe417d709f5983f0 -> http://mlrun-api:8080\n", + "> 2020-10-28 16:45:34,972 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n", + "> 2020-10-28 16:45:35,264 [info] testing with dataset against http://default-tenant.app.dsteam.iguazio-cd1.com:30150, model: mymodel\n", + "> 2020-10-28 16:45:35,967 [info] run 20 tests, 0 errors and 6 match expected value\n" ] }, { "data": { "text/html": [ - "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
sk-project0Oct 28 16:45:43runningmodel_server_tester
v3io_user=admin
kind=job
owner=admin
table
addr=http://default-tenant.app.dsteam.iguazio-cd1.com:30150
model=mymodel
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run 73cb7da1bfeb4b50afb29bce40a9c861 --project sk-project , !mlrun logs 73cb7da1bfeb4b50afb29bce40a9c861 --project sk-project\n", + "> 2020-10-28 16:45:43,790 [info] run executed, status=running\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_func.run(mlrun.NewTask(name='model_server_tester', \n", + " handler=model_server_tester, \n", + " params={'addr': address, 'model': 'mymodel'},\n", + " inputs={'table': DATA_PATH},\n", + " project=project_name, \n", + " artifact_path=os.path.join(artifact_path, 'data')))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -426,9 +744,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:root] *", "language": "python", - "name": "python3" + "name": "conda-root-py" }, "language_info": { "codemirror_mode": { @@ -440,7 +758,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.6" } }, "nbformat": 4, diff --git a/sklearn_classifier/README.md b/sklearn_classifier/README.md index 6aa7c6d4b..4fdee7ee2 100644 --- a/sklearn_classifier/README.md +++ b/sklearn_classifier/README.md @@ -1,5 +1,24 @@ -# training functions +# **Training Functions** -1. **`sklearn-classify`**
-train any sklearn classifier model - \ No newline at end of file +## `sklearn-classifier` + +Run any scikit-learn compatible classifier or list of classifiers + +### steps + +1. **generate a scikit-learn model configuration** using the `model_pkg_class` parameter + * input a package and class name, for example, `sklearn.linear_model.LogisticRegression` + * mlrun will find the class and instantiate a copy using default parameters + * You can modify both the model class instantiator and the fit methods (other functions could be similarly modified) +2. **get a sample of data** from a data source + * select all rows using -1 + * select a random sample of rows using a negative integer + * select consecutive rows using a positive integer +3. **split the data** into train, validation, and test sets + * the test set is saved as an artifact and never seen again until testing + * WIP: this will be parametrized to produce cross-validator splits (one way of performing CV) +4. **train the model** +5. **pickle / serialize the model** + * models can be pickled or saved as json +6. 
**evaluate the model** + * a custom evaluator can be provided, see function doc for details diff --git a/sklearn_classifier/function.yaml b/sklearn_classifier/function.yaml index 15dba88b3..6f521afab 100644 --- a/sklearn_classifier/function.yaml +++ b/sklearn_classifier/function.yaml @@ -2,8 +2,8 @@ kind: job metadata: name: sklearn-classifier tag: '' - hash: aa99bb25dd46d0b0a183541030e7850fa2b71873 - project: '' + hash: 9ad5f21bff75a52254d34af4b68bcc1afd8a8bc3 + project: default labels: author: yjb framework: sklearn @@ -28,13 +28,16 @@ spec: - name: context type: MLClientCtx doc: the function context + default: '' - name: model_pkg_class type: str doc: the model to train, e.g, "sklearn.neural_networks.MLPClassifier", or json model config + default: '' - name: dataset type: DataItem doc: ("data") name of raw data file + default: '' - name: label_column type: str doc: ground-truth (y) labels @@ -42,11 +45,12 @@ spec: - name: encode_cols type: List[str] doc: dictionary of names and prefixes for columns that are to hot be encoded. + default: [] - name: sample type: int doc: Selects the first n rows, or select a sample starting from the first. If negative <-1, select a random sample - default: <_ast.USub object at 0x7f768a309e10> + default: <_ast.USub object at 0x7f62bdf73410> - name: test_size type: float doc: (0.05) test set size @@ -55,16 +59,18 @@ spec: type: float doc: (0.75) Once the test set has been removed the training set gets this proportion. 
- default: 0.75 + default: 0.7 - name: test_set_key type: str doc: key of held out data in artifact store default: test_set - name: model_evaluator doc: (None) a custom model evaluator can be specified + default: null - name: models_dest type: str doc: ("") models subfolder on artifact path + default: '' - name: plots_dest type: str doc: plot subfolder on artifact path @@ -75,14 +81,16 @@ spec: default: parquet - name: model_pkg_file type: str + default: '' - name: random_state type: int doc: (1) sklearn rng seed default: 1 - outputs: [] - lineno: 28 + outputs: + - default: '' + lineno: 29 description: train any classifier using scikit-learn's API build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCndhcm5pbmdzLnNpbXBsZWZpbHRlcihhY3Rpb249Imlnbm9yZSIsIGNhdGVnb3J5PUZ1dHVyZVdhcm5pbmcpCgppbXBvcnQganNvbgppbXBvcnQgb3MKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIHNrbGVhcm4gaW1wb3J0IG1ldHJpY3MKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAoKZnJvbSBza2xlYXJuLnByZXByb2Nlc3NpbmcgaW1wb3J0IGxhYmVsX2JpbmFyaXplCmZyb20gc2tsZWFybi5tb2RlbF9zZWxlY3Rpb24gaW1wb3J0IHRyYWluX3Rlc3Rfc3BsaXQKZnJvbSBza2xlYXJuIGltcG9ydCBtZXRyaWNzCgpmcm9tIHR5cGluZyBpbXBvcnQgTGlzdApmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhc3RvcmUgaW1wb3J0IERhdGFJdGVtCmZyb20gbWxydW4uYXJ0aWZhY3RzIGltcG9ydCBQbG90QXJ0aWZhY3QKCmZyb20gbWxydW4ubWx1dGlscyBpbXBvcnQgKGdldF9zYW1wbGUsIGdldF9zcGxpdHMsCiAgICAgICAgICAgICAgICAgICAgIGdlbl9za2xlYXJuX21vZGVsLCBjcmVhdGVfY2xhc3MsIGV2YWxfbW9kZWxfdjIpCgpkZWYgdHJhaW5fbW9kZWwoCiAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgIG1vZGVsX3BrZ19jbGFzczogc3RyLAogICAgZGF0YXNldDogRGF0YUl0ZW0sCiAgICBsYWJlbF9jb2x1bW46IHN0ciA9ICJsYWJlbHMiLAogICAgZW5jb2RlX2NvbHM6IExpc3Rbc3RyXSA9IFtdLAogICAgc2FtcGxlOiBpbnQgPSAtMSwKICAgIHRlc3Rfc2l6ZTogZmxvYXQgPSAwLjMwLAogICAgdHJhaW5fdmFsX3NwbGl0OiBmbG9hdCA9IDAuNzUsCiAgICB0ZXN0X3NldF9rZXk6IHN0ciA9ICJ0ZXN0X3NldCIsCiAgICBtb2RlbF9ldmFsdWF0b3I
gPSBOb25lLAogICAgbW9kZWxzX2Rlc3Q6IHN0ciA9ICIiLAogICAgcGxvdHNfZGVzdDogc3RyID0gInBsb3RzIiwKICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICBtb2RlbF9wa2dfZmlsZTogc3RyID0gIiIsCiAgICByYW5kb21fc3RhdGU6IGludCA9IDEsCikgLT4gTm9uZToKICAgICIiInRyYWluIGEgY2xhc3NpZmllcgogICAgCiAgICBBbiBvcHRpb25hbCBjdXRvbSBtb2RlbCBldmFsdWF0b3IgY2FuIGJlIHN1cHBsaWVkIHRoYXQgc2hvdWxkIGhhdmUgdGhlIHNpZ25hdHVyZToKICAgIGBteV9jdXN0b21fZXZhbHVhdG9yKGNvbnRleHQsIHh2YWxpZCwgeXZhbGlkLCBtb2RlbClgIGFuZCByZXR1cm4gYSBkaWN0aW9uYXJ5IG9mIAogICAgc2NhbGFyICJyZXN1bHRzIiwgYSAicGxvdHMiIGtleXMgd2l0aCBhIGxpc3Qgb2YgUGxvdEFydGlmYWN0cywgYW5kIAogICAgYW5kICJ0YWJsZXMiIGtleSBjb250YWluaW5nIGEgcmV0dXJuZWQgbGlzdCBvZiBUYWJsZUFydGlmYWN0cy4KICAgIAogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsX3BrZ19jbGFzczogICB0aGUgbW9kZWwgdG8gdHJhaW4sIGUuZywgInNrbGVhcm4ubmV1cmFsX25ldHdvcmtzLk1MUENsYXNzaWZpZXIiLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgb3IganNvbiBtb2RlbCBjb25maWcKICAgIDpwYXJhbSBkYXRhc2V0OiAgICAgICAgICAgKCJkYXRhIikgbmFtZSBvZiByYXcgZGF0YSBmaWxlCiAgICA6cGFyYW0gbGFiZWxfY29sdW1uOiAgICAgIGdyb3VuZC10cnV0aCAoeSkgbGFiZWxzCiAgICA6cGFyYW0gZW5jb2RlX2NvbHM6ICAgICAgIGRpY3Rpb25hcnkgb2YgbmFtZXMgYW5kIHByZWZpeGVzIGZvciBjb2x1bW5zIHRoYXQgYXJlCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHRvIGhvdCBiZSBlbmNvZGVkLgogICAgOnBhcmFtIHNhbXBsZTogICAgICAgICAgICBTZWxlY3RzIHRoZSBmaXJzdCBuIHJvd3MsIG9yIHNlbGVjdCBhIHNhbXBsZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICBzdGFydGluZyBmcm9tIHRoZSBmaXJzdC4gSWYgbmVnYXRpdmUgPC0xLCBzZWxlY3QKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYSByYW5kb20gc2FtcGxlCiAgICA6cGFyYW0gdGVzdF9zaXplOiAgICAgICAgICgwLjA1KSB0ZXN0IHNldCBzaXplCiAgICA6cGFyYW0gdHJhaW5fdmFsX3NwbGl0OiAgICgwLjc1KSBPbmNlIHRoZSB0ZXN0IHNldCBoYXMgYmVlbiByZW1vdmVkIHRoZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0cmFpbmluZyBzZXQgZ2V0cyB0aGlzIHByb3BvcnRpb24uCiAgICA6cGFyYW0gdGVzdF9zZXRfa2V5OiAgICAgIGtleSBvZiBoZWxkIG91dCBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gbW9kZWxfZXZhbHVhdG9yOiAgIChOb25lKSBhIGN1c3RvbSBtb2RlbCBldmFsdWF0b3IgY2FuIGJlIHNwZWNpZmllZAo
gICAgOnBhcmFtIG1vZGVsc19kZXN0OiAgICAgICAoIiIpIG1vZGVscyBzdWJmb2xkZXIgb24gYXJ0aWZhY3QgcGF0aAogICAgOnBhcmFtIHBsb3RzX2Rlc3Q6ICAgICAgICBwbG90IHN1YmZvbGRlciBvbiBhcnRpZmFjdCBwYXRoCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgICAgICgicGFycXVldCIpIGZvcm1hdCBmb3IgdGVzdF9zZXRfa2V5IGhvbGQgb3V0IGRhdGEKICAgIDpwYXJhbSByYW5kb21fc3RhdGU6ICAgICAgKDEpIHNrbGVhcm4gcm5nIHNlZWQKCiAgICAiIiIKICAgIG1vZGVsc19kZXN0ID0gbW9kZWxzX2Rlc3Qgb3IgIm1vZGVsIgogICAgCiAgICByYXcsIGxhYmVscywgaGVhZGVyID0gZ2V0X3NhbXBsZShkYXRhc2V0LCBzYW1wbGUsIGxhYmVsX2NvbHVtbikKICAgIAogICAgaWYgZW5jb2RlX2NvbHM6CiAgICAgICAgcmF3ID0gcGQuZ2V0X2R1bW1pZXMocmF3LCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBjb2x1bW5zPWxpc3QoZW5jb2RlX2NvbHMua2V5cygpKSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcHJlZml4PWxpc3QoZW5jb2RlX2NvbHMudmFsdWVzKCkpLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkcm9wX2ZpcnN0PVRydWUpCiAgICAKICAgICh4dHJhaW4sIHl0cmFpbiksICh4dmFsaWQsIHl2YWxpZCksICh4dGVzdCwgeXRlc3QpID0gICAgICAgICBnZXRfc3BsaXRzKHJhdywgbGFiZWxzLCAzLCB0ZXN0X3NpemUsIDEtdHJhaW5fdmFsX3NwbGl0LCByYW5kb21fc3RhdGUpCiAgICAKICAgIGNvbnRleHQubG9nX2RhdGFzZXQodGVzdF9zZXRfa2V5LCAKICAgICAgICAgICAgICAgICAgICAgICAgZGY9cGQuY29uY2F0KFt4dGVzdCwgeXRlc3QudG9fZnJhbWUoKV0sIGF4aXM9MSksCiAgICAgICAgICAgICAgICAgICAgICAgIGZvcm1hdD1maWxlX2V4dCwgaW5kZXg9RmFsc2UsIAogICAgICAgICAgICAgICAgICAgICAgICBsYWJlbHM9eyJkYXRhLXR5cGUiOiAiaGVsZC1vdXQifSwKICAgICAgICAgICAgICAgICAgICAgICAgYXJ0aWZhY3RfcGF0aD1jb250ZXh0LmFydGlmYWN0X3N1YnBhdGgoJ2RhdGEnKSkKCiAgICBtb2RlbF9jb25maWcgPSBnZW5fc2tsZWFybl9tb2RlbChtb2RlbF9wa2dfY2xhc3MsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBjb250ZXh0LnBhcmFtZXRlcnMuaXRlbXMoKSkKCiAgICBtb2RlbF9jb25maWdbIkZJVCJdLnVwZGF0ZSh7IlgiOiB4dHJhaW4sCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgInkiOiB5dHJhaW4udmFsdWVzfSkKICAgIAogICAgQ2xhc3NpZmllckNsYXNzID0gY3JlYXRlX2NsYXNzKG1vZGVsX2NvbmZpZ1siTUVUQSJdWyJjbGFzcyJdKQogICAgCiAgICBtb2RlbCA9IENsYXNzaWZpZXJDbGFzcygqKm1vZGVsX2NvbmZpZ1siQ0xBU1MiXSkKICAgIAogICAgbW9kZWwuZml0KCoqbW9kZWxfY29uZmlnWyJGSVQiXSkKICAgIAogICAgYXJ0aWZhY3RfcGF0aCA9IGNvbnRleHQuYXJ0aWZhY3Rfc3VicGF
0aChtb2RlbHNfZGVzdCkKICAgIHBsb3RzX3BhdGggPSBjb250ZXh0LmFydGlmYWN0X3N1YnBhdGgobW9kZWxzX2Rlc3QsIHBsb3RzX2Rlc3QpCiAgICBpZiBtb2RlbF9ldmFsdWF0b3I6CiAgICAgICAgZXZhbF9tZXRyaWNzID0gbW9kZWxfZXZhbHVhdG9yKGNvbnRleHQsIHh2YWxpZCwgeXZhbGlkLCBtb2RlbCwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcGxvdHNfYXJ0aWZhY3RfcGF0aD1wbG90c19wYXRoKQogICAgZWxzZToKICAgICAgICBldmFsX21ldHJpY3MgPSBldmFsX21vZGVsX3YyKGNvbnRleHQsIHh2YWxpZCwgeXZhbGlkLCBtb2RlbCwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHBsb3RzX2FydGlmYWN0X3BhdGg9cGxvdHNfcGF0aCkKICAgICAgICAKICAgIGNvbnRleHQuc2V0X2xhYmVsKCdjbGFzcycsIG1vZGVsX3BrZ19jbGFzcykKICAgIGNvbnRleHQubG9nX21vZGVsKCJtb2RlbCIsIGJvZHk9ZHVtcHMobW9kZWwpLAogICAgICAgICAgICAgICAgICAgICAgYXJ0aWZhY3RfcGF0aD1hcnRpZmFjdF9wYXRoLAogICAgICAgICAgICAgICAgICAgICAgZXh0cmFfZGF0YT1ldmFsX21ldHJpY3MsIAogICAgICAgICAgICAgICAgICAgICAgbW9kZWxfZmlsZT0ibW9kZWwucGtsIiwKICAgICAgICAgICAgICAgICAgICAgIG1ldHJpY3M9Y29udGV4dC5yZXN1bHRzLAogICAgICAgICAgICAgICAgICAgICAgbGFiZWxzPXsiY2xhc3MiOiBtb2RlbF9wa2dfY2xhc3N9KQoK + functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCndhcm5pbmdzLnNpbXBsZWZpbHRlcihhY3Rpb249Imlnbm9yZSIsIGNhdGVnb3J5PUZ1dHVyZVdhcm5pbmcpCgppbXBvcnQganNvbgppbXBvcnQgb3MKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzLCBsb2FkLCBkdW1wCgpmcm9tIHNrbGVhcm4gaW1wb3J0IG1ldHJpY3MKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgbnVtcHkgYXMgbnAKaW1wb3J0IG1hdHBsb3RsaWIucHlwbG90IGFzIHBsdAoKZnJvbSBza2xlYXJuLnByZXByb2Nlc3NpbmcgaW1wb3J0IGxhYmVsX2JpbmFyaXplCmZyb20gc2tsZWFybi5tb2RlbF9zZWxlY3Rpb24gaW1wb3J0IHRyYWluX3Rlc3Rfc3BsaXQKZnJvbSBza2xlYXJuIGltcG9ydCBtZXRyaWNzCgpmcm9tIHR5cGluZyBpbXBvcnQgTGlzdApmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhc3RvcmUgaW1wb3J0IERhdGFJdGVtCmZyb20gbWxydW4uYXJ0aWZhY3RzIGltcG9ydCBQbG90QXJ0aWZhY3QKZnJvbSBtbHJ1bi5tbHV0aWxzIGltcG9ydCAoZ2V0X3NhbXBsZSwgZ2V0X3NwbGl0cywKICAgICAgICAgICAgICAgICAgICAgZ2VuX3NrbGVhcm5fbW9kZWwsIGNyZWF0ZV9jbGFzcywgZXZhbF9tb2RlbF92MikKCmltcG9ydCBtbHJ1bgoKZGVmIHRyYWluX21vZGVsKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbF9wa2dfY2xhc3M6IHN0ciwKICAgIGRhdGFzZXQ6IERhdGFJdGVtLAogICAgbGFiZWxfY29sdW1uOiBzdHIgPSAibGFiZWxzIiwKICAgIGVuY29kZV9jb2xzOiBMaXN0W3N0cl0gPSBbXSwKICAgIHNhbXBsZTogaW50ID0gLTEsCiAgICB0ZXN0X3NpemU6IGZsb2F0ID0gMC4zMCwKICAgIHRyYWluX3ZhbF9zcGxpdDogZmxvYXQgPSAwLjcwLAogICAgdGVzdF9zZXRfa2V5OiBzdHIgPSAidGVzdF9zZXQiLAogICAgbW9kZWxfZXZhbHVhdG9yID0gTm9uZSwKICAgIG1vZGVsc19kZXN0OiBzdHIgPSAiIiwKICAgIHBsb3RzX2Rlc3Q6IHN0ciA9ICJwbG90cyIsCiAgICBmaWxlX2V4dDogc3RyID0gInBhcnF1ZXQiLAogICAgbW9kZWxfcGtnX2ZpbGU6IHN0ciA9ICIiLAogICAgcmFuZG9tX3N0YXRlOiBpbnQgPSAxLAopIC0+IE5vbmU6CiAgICAiIiJ0cmFpbiBhIGNsYXNzaWZpZXIKICAgIAogICAgQW4gb3B0aW9uYWwgY3V0b20gbW9kZWwgZXZhbHVhdG9yIGNhbiBiZSBzdXBwbGllZCB0aGF0IHNob3VsZCBoYXZlIHRoZSBzaWduYXR1cmU6CiAgICBgbXlfY3VzdG9tX2V2YWx1YXRvcihjb250ZXh0LCB4dmFsaWQsIHl2YWxpZCwgbW9kZWwpYCBhbmQgcmV0dXJuIGEgZGljdGlvbmFyeSBvZiAKICAgIHNjYWxhciAicmVzdWx0cyIsIGEgInBsb3RzIiBrZXlzIHdpdGggYSBsaXN0IG9mIFBsb3RBcnRpZmFjdHMsIGFuZCAKICAgIGFuZCAidGFibGVzIiBrZXkgY29udGFpbmluZyBhIHJldHVybmVkIGxpc3Qgb2YgVGFibGVBcnRp
ZmFjdHMuCiAgICAKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBtb2RlbF9wa2dfY2xhc3M6ICAgdGhlIG1vZGVsIHRvIHRyYWluLCBlLmcsICJza2xlYXJuLm5ldXJhbF9uZXR3b3Jrcy5NTFBDbGFzc2lmaWVyIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIG9yIGpzb24gbW9kZWwgY29uZmlnCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgICAgICgiZGF0YSIpIG5hbWUgb2YgcmF3IGRhdGEgZmlsZQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogICAgICBncm91bmQtdHJ1dGggKHkpIGxhYmVscwogICAgOnBhcmFtIGVuY29kZV9jb2xzOiAgICAgICBkaWN0aW9uYXJ5IG9mIG5hbWVzIGFuZCBwcmVmaXhlcyBmb3IgY29sdW1ucyB0aGF0IGFyZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0byBob3QgYmUgZW5jb2RlZC4KICAgIDpwYXJhbSBzYW1wbGU6ICAgICAgICAgICAgU2VsZWN0cyB0aGUgZmlyc3QgbiByb3dzLCBvciBzZWxlY3QgYSBzYW1wbGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RhcnRpbmcgZnJvbSB0aGUgZmlyc3QuIElmIG5lZ2F0aXZlIDwtMSwgc2VsZWN0CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGEgcmFuZG9tIHNhbXBsZQogICAgOnBhcmFtIHRlc3Rfc2l6ZTogICAgICAgICAoMC4wNSkgdGVzdCBzZXQgc2l6ZQogICAgOnBhcmFtIHRyYWluX3ZhbF9zcGxpdDogICAoMC43NSkgT25jZSB0aGUgdGVzdCBzZXQgaGFzIGJlZW4gcmVtb3ZlZCB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdHJhaW5pbmcgc2V0IGdldHMgdGhpcyBwcm9wb3J0aW9uLgogICAgOnBhcmFtIHRlc3Rfc2V0X2tleTogICAgICBrZXkgb2YgaGVsZCBvdXQgZGF0YSBpbiBhcnRpZmFjdCBzdG9yZQogICAgOnBhcmFtIG1vZGVsX2V2YWx1YXRvcjogICAoTm9uZSkgYSBjdXN0b20gbW9kZWwgZXZhbHVhdG9yIGNhbiBiZSBzcGVjaWZpZWQKICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgICAgKCIiKSBtb2RlbHMgc3ViZm9sZGVyIG9uIGFydGlmYWN0IHBhdGgKICAgIDpwYXJhbSBwbG90c19kZXN0OiAgICAgICAgcGxvdCBzdWJmb2xkZXIgb24gYXJ0aWZhY3QgcGF0aAogICAgOnBhcmFtIGZpbGVfZXh0OiAgICAgICAgICAoInBhcnF1ZXQiKSBmb3JtYXQgZm9yIHRlc3Rfc2V0X2tleSBob2xkIG91dCBkYXRhCiAgICA6cGFyYW0gcmFuZG9tX3N0YXRlOiAgICAgICgxKSBza2xlYXJuIHJuZyBzZWVkCgogICAgIiIiCiAgICBtb2RlbHNfZGVzdCA9IG1vZGVsc19kZXN0IG9yICJtb2RlbCIKICAgIAogICAgcmF3LCBsYWJlbHMsIGhlYWRlciA9IGdldF9zYW1wbGUoZGF0YXNldCwgc2FtcGxlLCBsYWJlbF9jb2x1bW4pCiAgICAKICAgIGlmIGVuY29kZV9jb2xzOgogICAgICAgIHJhdyA9IHBkLmdldF9kdW1taWVzKHJhdywgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgY29sdW1ucz1saXN0KGVuY29kZV9jb2xzLmtl
eXMoKSksIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIHByZWZpeD1saXN0KGVuY29kZV9jb2xzLnZhbHVlcygpKSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZHJvcF9maXJzdD1UcnVlKQogICAgCiAgICAoeHRyYWluLCB5dHJhaW4pLCAoeHZhbGlkLCB5dmFsaWQpLCAoeHRlc3QsIHl0ZXN0KSA9ICAgICAgICAgZ2V0X3NwbGl0cyhyYXcsIGxhYmVscywgMywgdGVzdF9zaXplLCAxLXRyYWluX3ZhbF9zcGxpdCwgcmFuZG9tX3N0YXRlKQogICAgCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KHRlc3Rfc2V0X2tleSwgCiAgICAgICAgICAgICAgICAgICAgICAgIGRmPXBkLmNvbmNhdChbeHRlc3QsIHl0ZXN0LnRvX2ZyYW1lKCldLCBheGlzPTEpLAogICAgICAgICAgICAgICAgICAgICAgICBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlLCAKICAgICAgICAgICAgICAgICAgICAgICAgbGFiZWxzPXsiZGF0YS10eXBlIjogImhlbGQtb3V0In0sCiAgICAgICAgICAgICAgICAgICAgICAgIGFydGlmYWN0X3BhdGg9Y29udGV4dC5hcnRpZmFjdF9zdWJwYXRoKCdkYXRhJykpCgogICAgbW9kZWxfY29uZmlnID0gZ2VuX3NrbGVhcm5fbW9kZWwobW9kZWxfcGtnX2NsYXNzLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgY29udGV4dC5wYXJhbWV0ZXJzLml0ZW1zKCkpCgogICAgbW9kZWxfY29uZmlnWyJGSVQiXS51cGRhdGUoeyJYIjogeHRyYWluLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJ5IjogeXRyYWluLnZhbHVlc30pCiAgICAKICAgIENsYXNzaWZpZXJDbGFzcyA9IGNyZWF0ZV9jbGFzcyhtb2RlbF9jb25maWdbIk1FVEEiXVsiY2xhc3MiXSkKICAgIAogICAgbW9kZWwgPSBDbGFzc2lmaWVyQ2xhc3MoKiptb2RlbF9jb25maWdbIkNMQVNTIl0pCiAgICAKICAgIG1vZGVsLmZpdCgqKm1vZGVsX2NvbmZpZ1siRklUIl0pCiAgICAKICAgIGFydGlmYWN0X3BhdGggPSBjb250ZXh0LmFydGlmYWN0X3N1YnBhdGgobW9kZWxzX2Rlc3QpCiAgICBwbG90c19wYXRoID0gY29udGV4dC5hcnRpZmFjdF9zdWJwYXRoKG1vZGVsc19kZXN0LCBwbG90c19kZXN0KQogICAgaWYgbW9kZWxfZXZhbHVhdG9yOgogICAgICAgIGV2YWxfbWV0cmljcyA9IG1vZGVsX2V2YWx1YXRvcihjb250ZXh0LCB4dmFsaWQsIHl2YWxpZCwgbW9kZWwsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHBsb3RzX2FydGlmYWN0X3BhdGg9cGxvdHNfcGF0aCkKICAgIGVsc2U6CiAgICAgICAgZXZhbF9tZXRyaWNzID0gZXZhbF9tb2RlbF92Mihjb250ZXh0LCB4dmFsaWQsIHl2YWxpZCwgbW9kZWwsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBwbG90c19hcnRpZmFjdF9wYXRoPXBsb3RzX3BhdGgpCiAgICAgICAgCiAgICBjb250ZXh0LnNldF9sYWJlbCgnY2xhc3MnLCBtb2RlbF9wa2dfY2xhc3MpCiAgICBjb250ZXh0LmxvZ19tb2RlbCgibW9kZWwiLCBib2R5PWR1bXBzKG1vZGVs
KSwKICAgICAgICAgICAgICAgICAgICAgIGFydGlmYWN0X3BhdGg9YXJ0aWZhY3RfcGF0aCwKICAgICAgICAgICAgICAgICAgICAgIGV4dHJhX2RhdGE9ZXZhbF9tZXRyaWNzLCAKICAgICAgICAgICAgICAgICAgICAgIG1vZGVsX2ZpbGU9Im1vZGVsLnBrbCIsCiAgICAgICAgICAgICAgICAgICAgICBtZXRyaWNzPWNvbnRleHQucmVzdWx0cywKICAgICAgICAgICAgICAgICAgICAgIGxhYmVscz17ImNsYXNzIjogbW9kZWxfcGtnX2NsYXNzfSkKCg== commands: [] - code_origin: sklearn_classifier.ipynb + code_origin: https://github.com/Idan707/functions.git#156b4145b7fa1fdada432f00f0081ca5ccdf1b35:sklearn_classifier.ipynb diff --git a/sklearn_classifier/sample-configs/BayesianGaussianMixture.json b/sklearn_classifier/sample-configs/BayesianGaussianMixture.json deleted file mode 100644 index 37a953f72..000000000 --- a/sklearn_classifier/sample-configs/BayesianGaussianMixture.json +++ /dev/null @@ -1 +0,0 @@ -{"CLASS_PARAMS": {"self": 1, "n_components": "full", "covariance_type": 0.001, "tol": 1e-06, "reg_covar": 100, "max_iter": 1, "n_init": "kmeans", "init_params": "dirichlet_process", "weight_concentration_prior_type": null, "weight_concentration_prior": null, "mean_precision_prior": null, "mean_prior": null, "degrees_of_freedom_prior": null, "covariance_prior": null, "random_state": false, "warm_start": 0, "verbose": 10}} \ No newline at end of file diff --git a/sklearn_classifier/sample-configs/LGBMClassifier.json b/sklearn_classifier/sample-configs/LGBMClassifier.json deleted file mode 100644 index b8299edc3..000000000 --- a/sklearn_classifier/sample-configs/LGBMClassifier.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "CLASS" : { - "boosting_type" : "gbdt", - "num_leaves" : 300, - "max_depth" : 50, - "learning_rate" : 0.1, - "n_estimators" : 300, - "objective" : "binary", - "scale_pos_weight" : 1, - "min_split_gain" : 0.0, - "min_child_samples" : 20, - "subsample" : 1, - "colsample_bytree" : 1, - "reg_alpha" : 0, - "reg_lambda" : 1, - "n_jobs" : 16, - "silent" : true, - "importance_type" : "split", - "random_state" : 1}, - "FIT" : { - "verbose" : false - }, - "META" : { - 
"class" : "lightgbm.sklearn.LGBMClassifier", - "version" : "2.3.1" - } -} diff --git a/sklearn_classifier/sample-configs/LogisticRegression.json b/sklearn_classifier/sample-configs/LogisticRegression.json deleted file mode 100644 index f6b21c9fd..000000000 --- a/sklearn_classifier/sample-configs/LogisticRegression.json +++ /dev/null @@ -1 +0,0 @@ -{"CLASS": {"penalty": "l2", "dual": false, "tol": 0.0001, "C": 1.0, "fit_intercept": true, "intercept_scaling": 1, "class_weight": null, "random_state": null, "solver": "warn", "max_iter": 100, "multi_class": "warn", "verbose": 0, "warm_start": false, "n_jobs": null, "l1_ratio": null}, "FIT": {"X": null, "y": null, "sample_weight": null}, "META": {"sklearn_version": "0.21.3", "classifier": "sklearn.linear_model.logistic.LogisticRegression"}} \ No newline at end of file diff --git a/sklearn_classifier/sample-configs/XGBClassifier.json b/sklearn_classifier/sample-configs/XGBClassifier.json deleted file mode 100644 index 8dcf84b82..000000000 --- a/sklearn_classifier/sample-configs/XGBClassifier.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "CLASS" : { - "num_class" : 3, - "max_depth" : 50, - "learning_rate" : 0.1, - "verbosity" : 1, - "objective" : "multi:softmax", - "booster" : "gbtree", - "tree_method" : "hist", - "n_jobs" : 16, - "random_state" : 1, - "n_estimators" : 200, - "gamma" : null, - "min_child_weight" : 1, - "max_delta_step" : 0, - "subsample" : 1, - "reg_alpha" : 0, - "reg_lambda" : 1, - "scale_pos_weight" : 1, - "random_state" : 1}, - "FIT" : { - "verbose" : false}, - "META" : { - "class": "xgboost.sklearn.XGBClassifier", - "version" : "1.0.2" - } -} \ No newline at end of file diff --git a/sklearn_classifier/sklearn-classifier.py b/sklearn_classifier/sklearn-classifier.py index a787f4c5a..dc0b58de6 100644 --- a/sklearn_classifier/sklearn-classifier.py +++ b/sklearn_classifier/sklearn-classifier.py @@ -21,10 +21,11 @@ from mlrun.execution import MLClientCtx from mlrun.datastore import DataItem from mlrun.artifacts 
import PlotArtifact - from mlrun.mlutils import (get_sample, get_splits, gen_sklearn_model, create_class, eval_model_v2) +import mlrun + def train_model( context: MLClientCtx, model_pkg_class: str, @@ -33,7 +34,7 @@ def train_model( encode_cols: List[str] = [], sample: int = -1, test_size: float = 0.30, - train_val_split: float = 0.75, + train_val_split: float = 0.70, test_set_key: str = "test_set", model_evaluator = None, models_dest: str = "", diff --git a/sklearn_classifier/sklearn_classifier.ipynb b/sklearn_classifier/sklearn_classifier.ipynb index 0b75e92f2..933c347c1 100644 --- a/sklearn_classifier/sklearn_classifier.ipynb +++ b/sklearn_classifier/sklearn_classifier.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# generic scikit-learn classifier\n", + "# Generic scikit-learn classifier\n", "\n", "run any scikit-learn compatible classifier or list of classifiers" ] @@ -13,7 +13,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## steps\n", + "## Steps\n", "1. 
**generate a scikit-learn model configuration** using the `model_pkg_class` parameter\n", " * input a package and class name, for example, `sklearn.linear_model.LogisticRegression` \n", " * mlrun will find the class and instantiate a copy using default parameters \n", @@ -44,9 +44,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "%nuclio: setting kind to 'job'\n", + "%nuclio: setting spec.image to 'mlrun/ml-models'\n", + "%nuclio: setting spec.maxReplicas to 1\n" + ] + } + ], "source": [ "%nuclio config kind = \"job\"\n", "%nuclio config spec.image = \"mlrun/ml-models\"" @@ -54,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -79,10 +89,8 @@ "from mlrun.execution import MLClientCtx\n", "from mlrun.datastore import DataItem\n", "from mlrun.artifacts import PlotArtifact\n", - "\n", "from mlrun.mlutils import (get_sample, get_splits,\n", " gen_sklearn_model, create_class, eval_model_v2)\n", - "#from models import eval_class_model, log_model\n", "\n", "def train_model(\n", " context: MLClientCtx,\n", @@ -92,7 +100,7 @@ " encode_cols: List[str] = [],\n", " sample: int = -1,\n", " test_size: float = 0.30,\n", - " train_val_split: float = 0.75,\n", + " train_val_split: float = 0.70,\n", " test_set_key: str = \"test_set\",\n", " model_evaluator = None,\n", " models_dest: str = \"\",\n", @@ -191,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "pycharm": { "name": "#%%\n" @@ -210,14 +218,14 @@ } }, "source": [ - "### sklearn trainer setup\n", + "### Sklearn trainer setup\n", "\n", "the following task paramaters are common to all runs" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" @@ -230,7 +238,7 @@ " \"params\" : {\n", " 
\"sample\" : -1,\n", " \"test_size\" : 0.30,\n", - " \"train_val_split\" : 0.75,\n", + " \"train_val_split\" : 0.70,\n", " \"random_state\" : 1,\n", " \"n_jobs\" : -1,\n", " \"plots_dest\" : \"plots-p\",\n", @@ -251,7 +259,7 @@ } }, "source": [ - "### set model parameters and run locally\n", + "### Set model parameters and run locally\n", "\n", "* loop over a list of candidate models, update the task and run a local trainer for that model \n", "* optionally customize some parameters for each model\n", @@ -262,53 +270,41 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "from mlrun import mlconf\n", - "\n", - "DATA_REPO = \"https://raw.githubusercontent.com/yjb-ds/testdata/master/\" \n", - "\n", - "# choose a binary or multiclass dataset\n", - "#DATA_PATH = \"sklearn_classfier/iris_dataset.csv\" # MULTICLASS\n", - "DATA_PATH = \"data/clf-k4-m24-n10k-imb.csv\" # MULTICLASS\n", - "\n", - "DATA_URL = f\"{DATA_REPO}/{DATA_PATH}\"" + "DATA_URL = 'https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv'" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[mlrun] 2020-06-05 20:18:32,985 starting run sklearn_ensemble_RandomForestClassifier uid=ceaa30f9ef3f4f83bf50b0ce965e6d2f -> http://10.199.227.162:8080\n", - "[mlrun] 2020-06-05 20:18:34,831 log artifact test_set at /User/ml2/sklearn.ensemble.RandomForestClassifier/data/test_set.parquet, size: 701200, db: Y\n", - "[mlrun] 2020-06-05 20:18:35,955 log artifact confusion-matrix at /User/ml2/sklearn.ensemble.RandomForestClassifier/model/plots/confusion-matrix.html, size: 21829, db: N\n", - "[mlrun] 2020-06-05 20:18:36,216 log artifact feature-importances at /User/ml2/sklearn.ensemble.RandomForestClassifier/model/plots/feature-importances.html, size: 13789, db: N\n", - "[mlrun] 2020-06-05 20:18:36,324 log artifact precision-recall-multiclass 
at /User/ml2/sklearn.ensemble.RandomForestClassifier/model/plots/precision-recall-multiclass.html, size: 50133, db: N\n", - "[mlrun] 2020-06-05 20:18:36,456 log artifact roc-multiclass at /User/ml2/sklearn.ensemble.RandomForestClassifier/model/plots/roc-multiclass.html, size: 26213, db: N\n", - "[mlrun] 2020-06-05 20:18:36,528 log artifact model at /User/ml2/sklearn.ensemble.RandomForestClassifier/model/, size: 549340, db: Y\n", - "\n" + "> 2020-10-28 15:07:07,185 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n", + "> 2020-10-28 15:07:07,192 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n", + "> 2020-10-28 15:07:07,192 [info] starting run sklearn_ensemble_RandomForestClassifier uid=7828bb4ac6c142f1bb30e7e2d289d5fc -> http://mlrun-api:8080\n", + "> 2020-10-28 15:07:07,228 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n" ] }, { "data": { "text/html": [ - "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 28 15:07:28runningsklearn_ensemble_RandomForestClassifier
v3io_user=admin
kind=job
owner=admin
dataset
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=label
CLASS_max_depth=5
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run e10b0a9b81cb42b48116ba59a2851375 --project default , !mlrun logs e10b0a9b81cb42b48116ba59a2851375 --project default\n", + "> 2020-10-28 15:07:28,964 [info] run executed, status=running\n", + "> 2020-10-28 15:07:28,965 [info] starting run sklearn_linear_model_LogisticRegression uid=ccf66a211ed44789ae62bd9a71439543 -> http://mlrun-api:8080\n", + "> 2020-10-28 15:07:29,103 [info] Job is running in the background, pod: sklearn-linear-model-logisticregression-gdcrk\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 28 15:07:29runningsklearn_linear_model_LogisticRegression
v3io_user=admin
kind=job
owner=admin
dataset
model_pkg_class=sklearn.linear_model.LogisticRegression
label_column=label
CLASS_solver=liblinear
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run ccf66a211ed44789ae62bd9a71439543 --project default , !mlrun logs ccf66a211ed44789ae62bd9a71439543 --project default\n", + "> 2020-10-28 15:07:29,164 [info] run executed, status=running\n", + "> 2020-10-28 15:07:29,165 [info] starting run sklearn_ensemble_AdaBoostClassifier uid=4cbdeb89e6784f8b826102ed7a3bc24b -> http://mlrun-api:8080\n", + "> 2020-10-28 15:07:29,414 [info] Job is running in the background, pod: sklearn-ensemble-adaboostclassifier-h6svl\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 28 15:07:29runningsklearn_ensemble_AdaBoostClassifier
v3io_user=admin
kind=job
owner=admin
dataset
model_pkg_class=sklearn.ensemble.AdaBoostClassifier
label_column=label
CLASS_n_estimators=200
CLASS_learning_rate=0.01
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run 4cbdeb89e6784f8b826102ed7a3bc24b --project default , !mlrun logs 4cbdeb89e6784f8b826102ed7a3bc24b --project default\n", + "> 2020-10-28 15:07:29,496 [info] run executed, status=running\n" + ] + } + ], + "source": [ + "outputs = []\n", + "for model in models:\n", + " task_copy = task_params.copy()\n", + " task_copy.update(\n", + " {\n", + " \"params\":{ \"model_pkg_class\" : model,\n", + " \"label_column\" : \"label\"}\n", + " }\n", + " )\n", + " \n", + " # customize specific model parameters\n", + " if \"RandomForestClassifier\" in model:\n", + " task_copy[\"params\"].update({\"CLASS_max_depth\" : 5})\n", + "\n", + " if \"LogisticRegression\" in model:\n", + " task_copy[\"params\"].update({\"CLASS_solver\" : \"liblinear\"})\n", + " \n", + " if \"AdaBoostClassifier\" in model:\n", + " task_copy[\"params\"].update({\"CLASS_n_estimators\" : 200,\n", + " \"CLASS_learning_rate\" : 0.01\n", + " })\n", + " \n", + " name = model.replace('.', '_')\n", + " output = fn.run(mlrun.NewTask(**task_copy),\n", + " handler=train_model,\n", + " name=name,\n", + " inputs={\"dataset\" : DATA_URL}, \n", + " artifact_path=os.path.join(artifact_path, model))\n", + " \n", + " outputs.append({name: output.outputs})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1130,9 +1847,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/sklearn_classifier_dask/README.md b/sklearn_classifier_dask/README.md new file mode 100644 index 000000000..fde0b0808 --- /dev/null +++ b/sklearn_classifier_dask/README.md @@ -0,0 +1,49 @@ 
+# **Training Functions** + +## `sklearn-classifer with Dask` + +Run any scikit-learn compatible classifier or list of classifiers with Dask + +### steps + +1. **Generate a scikit-learn model configuration** using the `model_pkg_class` parameter + * input a package and class name, for example, `sklearn.linear_model.LogisticRegression` + * mlrun will find the class and instantiate a copy using default parameters + * You can modify both the model class and the fit methods +2. **Get a sample of data** from a data source + * select a random sample of rows using a negative integer + * select consecutive rows using a positive integer +3. **Split the data** into train, validation, and test sets + * the test set is saved as an artifact and never seen again until testing +4. **Train the model** +5. **pickle / serialize the model** + * models can be pickled or saved as json +6. **Evaluate the model** + * a custom evaluator can be provided, see function doc for details + + +Train a sklearn classifier with Dask + + :param context: Function context. + :param dataset: Raw data file. + :param model_pkg_class: Model to train, e.g, "sklearn.ensemble.RandomForestClassifier", + or json model config. + :param label_column: (label) Ground-truth y labels. + :param train_validation_size: (0.75) Train validation set proportion out of the full dataset. + :param sample: (1.0) Select sample from dataset (n-rows/% of total), randomzie rows as default. + :param models_dest: (models) Models subfolder on artifact path. + :param test_set_key: (test_set) Mlrun db key of held out data in artifact store. + :param plots_dest: (plots) Plot subfolder on artifact path. + :param dask_key: (dask key) Key of dataframe in dask client "datasets" attribute. + :param dask_persist: (False) Should the data be persisted (through the `client.persist`) + :param scheduler_key: (scheduler) Dask scheduler configuration, json also logged as an artifact. 
+ :param file_ext: (parquet) format for test_set_key hold out data + :param random_state: (42) sklearn seed + + +### TODO + +1. Add cross validation methods +2. Improve dask efficiency by calling dask data frame (not from pandas) +3. Log dataset artifact as dask data frame +4. Add values imputer (instead of drop na) diff --git a/sklearn_classifier_dask/function.yaml b/sklearn_classifier_dask/function.yaml new file mode 100644 index 000000000..64afd17d9 --- /dev/null +++ b/sklearn_classifier_dask/function.yaml @@ -0,0 +1,23 @@ +kind: dask +metadata: + name: dask_init + hash: b4b3af2251b76b5aea339535e1e9d5d71f83aae1 + project: default + categories: [] +spec: + command: '' + image: mlrun/ml-models + env: [] + resources: + limits: + memory: 8G + build: + commands: [] + description: '' + replicas: 5 + remote: true + service_type: NodePort + nthreads: 6 + min_replicas: 0 + max_replicas: 16 + scheduler_timeout: 60 minutes diff --git a/sklearn_classifier_dask/sklearn-classifier-dask.py b/sklearn_classifier_dask/sklearn-classifier-dask.py new file mode 100644 index 000000000..8a9f6b232 --- /dev/null +++ b/sklearn_classifier_dask/sklearn-classifier-dask.py @@ -0,0 +1,197 @@ +# Generated by nuclio.export.NuclioExporter + +import warnings +warnings.filterwarnings('ignore') + +import os +import joblib +import numpy as np +import pandas as pd +import sklearn +from cloudpickle import dumps, load, dump +from typing import List, Optional + +from dask.distributed import Client +from dask import dataframe as dd +from dask import array as da +from dask.delayed import delayed +from dask_ml import model_selection +from dask_ml import metrics +from dask_ml.preprocessing import StandardScaler, LabelEncoder + +from mlrun.execution import MLClientCtx +from mlrun.datastore import DataItem +from mlrun.artifacts import PlotArtifact +from mlrun.mlutils import (gen_sklearn_model, create_class) + +import matplotlib.pyplot as plt +from yellowbrick.classifier import ROCAUC, ClassificationReport, 
ConfusionMatrix +from yellowbrick.model_selection import FeatureImportances + +def train_model(context: MLClientCtx, + dataset: DataItem, + model_pkg_class: str, + label_column: str = "label", + train_validation_size: float = 0.75, + sample: float = 1.0, + models_dest: str = "models", + test_set_key: str = "test_set", + plots_dest: str = "plots", + dask_key: str = "dask_key", + dask_persist: bool = False, + scheduler_key: str = '', + file_ext: str = "parquet", + random_state: int = 42) -> None: + + """ + Train a sklearn classifier with Dask + + :param context: Function context. + :param dataset: Raw data file. + :param model_pkg_class: Model to train, e.g, "sklearn.ensemble.RandomForestClassifier", + or json model config. + :param label_column: (label) Ground-truth y labels. + :param train_validation_size: (0.75) Train validation set proportion out of the full dataset. + :param sample: (1.0) Select sample from dataset (n-rows/% of total), randomzie rows as default. + :param models_dest: (models) Models subfolder on artifact path. + :param test_set_key: (test_set) Mlrun db key of held out data in artifact store. + :param plots_dest: (plots) Plot subfolder on artifact path. + :param dask_key: (dask key) Key of dataframe in dask client "datasets" attribute. + :param dask_persist: (False) Should the data be persisted (through the `client.persist`) + :param scheduler_key: (scheduler) Dask scheduler configuration, json also logged as an artifact. 
+ :param file_ext: (parquet) format for test_set_key hold out data + :param random_state: (42) sklearn seed + """ + + if scheduler_key: + client = Client(scheduler_key) + + else: + client = Client() + + context.logger.info("Read Data") + df = dataset.as_df(df_module=dd) + + context.logger.info("Prep Data") + numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] + df = df.select_dtypes(include=numerics) + + if df.isna().any().any().compute() == True: + raise Exception('NAs valus found') + + df_header = df.columns + + df = df.sample(frac=sample).reset_index(drop=True) + encoder = LabelEncoder() + encoder = encoder.fit(df[label_column]) + X = df.drop(label_column, axis=1).to_dask_array(lengths=True) + y = encoder.transform(df[label_column]) + + classes = df[label_column].drop_duplicates() # no unique values in dask + classes = [str(i) for i in classes] + + context.logger.info("Split and Train") + X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=train_validation_size, + random_state=random_state) + + scaler = StandardScaler() + scaler = scaler.fit(X_train) + X_train_transformed = scaler.transform(X_train) + X_test_transformed = scaler.transform(X_test) + + model_config = gen_sklearn_model(model_pkg_class, + context.parameters.items()) + + model_config["FIT"].update({"X": X_train_transformed, + "y": y_train}) + + ClassifierClass = create_class(model_config["META"]["class"]) + + model = ClassifierClass(**model_config["CLASS"]) + + with joblib.parallel_backend("dask"): + + model = model.fit(**model_config["FIT"]) + + artifact_path = context.artifact_subpath(models_dest) + + plots_path = context.artifact_subpath(models_dest, plots_dest) + + context.logger.info("Evaluate") + extra_data_dict = {} + for report in (ROCAUC, ClassificationReport, ConfusionMatrix): + + report_name = str(report.__name__) + plt.cla() + plt.clf() + plt.close() + + viz = report(model, classes=classes, per_class=True, is_fitted=True) + 
viz.fit(X_train_transformed, y_train) # Fit the training data to the visualizer + viz.score(X_test_transformed, y_test.compute()) # Evaluate the model on the test data + + plot = context.log_artifact(PlotArtifact(report_name, + body=viz.fig, + title=report_name), + db_key=False) + extra_data_dict[str(report)] = plot + + if report_name == 'ROCAUC': + context.log_results({"micro": viz.roc_auc.get("micro"), + "macro": viz.roc_auc.get("macro")}) + + elif report_name == 'ClassificationReport': + for score_name in viz.scores_: + for score_class in viz.scores_[score_name]: + + context.log_results({score_name + "-" + score_class : + viz.scores_[score_name].get(score_class)}) + + + viz = FeatureImportances(model, classes=classes, per_class=True, + is_fitted=True, labels=df_header.delete(df_header.get_loc(label_column))) + viz.fit(X_train_transformed, y_train) + viz.score(X_test_transformed, y_test) + + plot = context.log_artifact(PlotArtifact("FeatureImportances", body=viz.fig, + title="FeatureImportances"), db_key=False) + extra_data_dict[str("FeatureImportances")] = plot + + plt.cla() + plt.clf() + plt.close() + + context.logger.info("Log artifacts") + artifact_path = context.artifact_subpath(models_dest) + + plots_path = context.artifact_subpath(models_dest, plots_dest) + + context.set_label('class', model_pkg_class) + + context.log_model("model", body=dumps(model), + artifact_path=artifact_path, + model_file="model.pkl", + extra_data=extra_data_dict, + metrics=context.results, + labels={"class": model_pkg_class}) + + context.log_artifact("standard_scaler", body=dumps(scaler), + artifact_path=artifact_path, + model_file="scaler.gz", + label="standard_scaler") + + context.log_artifact("label_encoder", body=dumps(encoder), + artifact_path=artifact_path, + model_file="encoder.gz", + label="label_encoder") + + df_to_save = delayed(np.column_stack)((X_test, y_test)).compute() + context.log_dataset(test_set_key, + df=pd.DataFrame(df_to_save, + columns=df_header), # improve log 
dataset ability + format=file_ext, index=False, + labels={"data-type": "held-out"}, + artifact_path=context.artifact_subpath('data')) + + context.logger.info("Done!") + diff --git a/sklearn_classifier_dask/sklearn_classifier_dask.ipynb b/sklearn_classifier_dask/sklearn_classifier_dask.ipynb new file mode 100644 index 000000000..a334248b5 --- /dev/null +++ b/sklearn_classifier_dask/sklearn_classifier_dask.ipynb @@ -0,0 +1,1261 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generic Scikit-Learn Classifier With Dask\n", + "\n", + "Run any scikit-learn compatible classifier or list of classifiers" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# nuclio: ignore\n", + "import nuclio" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "%nuclio: setting kind to 'job'\n", + "%nuclio: setting spec.image to 'mlrun/ml-models'\n" + ] + } + ], + "source": [ + "%nuclio config kind = \"job\"\n", + "%nuclio config spec.image = \"mlrun/ml-models\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import os\n", + "import joblib\n", + "import numpy as np\n", + "import pandas as pd\n", + "import sklearn\n", + "from cloudpickle import dumps, load, dump\n", + "from typing import List, Optional\n", + "\n", + "from dask.distributed import Client\n", + "from dask import dataframe as dd\n", + "from dask import array as da\n", + "from dask.delayed import delayed\n", + "from dask_ml import model_selection\n", + "from dask_ml import metrics\n", + "from dask_ml.preprocessing import StandardScaler, LabelEncoder\n", + "\n", + "from mlrun.execution import MLClientCtx\n", + "from mlrun.datastore import DataItem\n", + "from mlrun.artifacts import 
PlotArtifact\n", + "from mlrun.mlutils import (gen_sklearn_model, create_class)\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from yellowbrick.classifier import ROCAUC, ClassificationReport, ConfusionMatrix\n", + "from yellowbrick.model_selection import FeatureImportances" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def train_model(context: MLClientCtx,\n", + " dataset: DataItem,\n", + " model_pkg_class: str,\n", + " label_column: str = \"label\",\n", + " train_validation_size: float = 0.75,\n", + " sample: float = 1.0,\n", + " models_dest: str = \"models\",\n", + " test_set_key: str = \"test_set\",\n", + " plots_dest: str = \"plots\",\n", + " dask_key: str = \"dask_key\",\n", + " dask_persist: bool = False,\n", + " scheduler_key: str = '',\n", + " file_ext: str = \"parquet\",\n", + " random_state: int = 42) -> None:\n", + " \n", + " \"\"\"\n", + " Train a sklearn classifier with Dask\n", + " \n", + " :param context: Function context.\n", + " :param dataset: Raw data file.\n", + " :param model_pkg_class: Model to train, e.g, \"sklearn.ensemble.RandomForestClassifier\", \n", + " or json model config.\n", + " :param label_column: (label) Ground-truth y labels.\n", + " :param train_validation_size: (0.75) Train validation set proportion out of the full dataset.\n", + " :param sample: (1.0) Select sample from dataset (n-rows/% of total), randomzie rows as default.\n", + " :param models_dest: (models) Models subfolder on artifact path.\n", + " :param test_set_key: (test_set) Mlrun db key of held out data in artifact store.\n", + " :param plots_dest: (plots) Plot subfolder on artifact path.\n", + " :param dask_key: (dask key) Key of dataframe in dask client \"datasets\" attribute.\n", + " :param dask_persist: (False) Should the data be persisted (through the `client.persist`)\n", + " :param scheduler_key: (scheduler) Dask scheduler configuration, json also logged as an artifact.\n", + " :param 
file_ext: (parquet) format for test_set_key hold out data\n", + " :param random_state: (42) sklearn seed\n", + " \"\"\"\n", + " \n", + " # set up dask client \n", + " if scheduler_key:\n", + " client = Client(scheduler_key)\n", + " \n", + " else:\n", + " client = Client()\n", + "\n", + " context.logger.info(\"Read Data\")\n", + " # read data with dask and mlrun\n", + " df = dataset.as_df(df_module=dd) \n", + "\n", + " # take only numrical cols\n", + " context.logger.info(\"Prep Data\")\n", + " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", + " df = df.select_dtypes(include=numerics)\n", + " \n", + " # dropna\n", + " if df.isna().any().any().compute() == True:\n", + " raise Exception('NAs valus found')\n", + " \n", + " # save cols names\n", + " df_header = df.columns\n", + " \n", + " df = df.sample(frac=sample).reset_index(drop=True)\n", + " encoder = LabelEncoder()\n", + " encoder = encoder.fit(df[label_column])\n", + " X = df.drop(label_column, axis=1).to_dask_array(lengths=True)\n", + " y = encoder.transform(df[label_column])\n", + "\n", + " classes = df[label_column].drop_duplicates() # no unique values in dask\n", + " classes = [str(i) for i in classes]\n", + "\n", + " context.logger.info(\"Split and Train\")\n", + " X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=train_validation_size,\n", + " random_state=random_state)\n", + " \n", + " scaler = StandardScaler()\n", + " scaler = scaler.fit(X_train)\n", + " X_train_transformed = scaler.transform(X_train)\n", + " X_test_transformed = scaler.transform(X_test)\n", + " \n", + " model_config = gen_sklearn_model(model_pkg_class,\n", + " context.parameters.items())\n", + "\n", + " model_config[\"FIT\"].update({\"X\": X_train_transformed,\n", + " \"y\": y_train})\n", + " \n", + " ClassifierClass = create_class(model_config[\"META\"][\"class\"])\n", + " \n", + " model = ClassifierClass(**model_config[\"CLASS\"])\n", + " \n", + " # load and fit model\n", + 
" with joblib.parallel_backend(\"dask\"):\n", + " \n", + " # initialize classifier from sklearn\n", + " model = model.fit(**model_config[\"FIT\"])\n", + "\n", + " # log artifacts\n", + " artifact_path = context.artifact_subpath(models_dest)\n", + " \n", + " # log plots\n", + " plots_path = context.artifact_subpath(models_dest, plots_dest)\n", + "\n", + " # create reports\n", + " context.logger.info(\"Evaluate\")\n", + " extra_data_dict = {}\n", + " for report in (ROCAUC, ClassificationReport, ConfusionMatrix):\n", + " \n", + " report_name = str(report.__name__)\n", + " # clear output\n", + " plt.cla()\n", + " plt.clf()\n", + " plt.close()\n", + " \n", + " # genrate report\n", + " viz = report(model, classes=classes, per_class=True, is_fitted=True)\n", + " viz.fit(X_train_transformed, y_train) # Fit the training data to the visualizer\n", + " viz.score(X_test_transformed, y_test.compute()) # Evaluate the model on the test data\n", + " \n", + " # log reports\n", + " plot = context.log_artifact(PlotArtifact(report_name, \n", + " body=viz.fig,\n", + " title=report_name), \n", + " db_key=False)\n", + " extra_data_dict[str(report)] = plot\n", + " \n", + " # log results\n", + " if report_name == 'ROCAUC':\n", + " context.log_results({\"micro\": viz.roc_auc.get(\"micro\"),\n", + " \"macro\": viz.roc_auc.get(\"macro\")})\n", + " \n", + " elif report_name == 'ClassificationReport':\n", + " for score_name in viz.scores_:\n", + " for score_class in viz.scores_[score_name]:\n", + " \n", + " context.log_results({score_name + \"-\" + score_class : \n", + " viz.scores_[score_name].get(score_class)})\n", + " \n", + " #viz.show()\n", + " \n", + " # get feature importance\n", + " viz = FeatureImportances(model, classes=classes, per_class=True, \n", + " is_fitted=True, labels=df_header.delete(df_header.get_loc(label_column)))\n", + " viz.fit(X_train_transformed, y_train) \n", + " viz.score(X_test_transformed, y_test)\n", + " #viz.show()\n", + " \n", + " plot = 
context.log_artifact(PlotArtifact(\"FeatureImportances\", body=viz.fig, \n", + " title=\"FeatureImportances\"), db_key=False)\n", + " extra_data_dict[str(\"FeatureImportances\")] = plot\n", + " \n", + " # clear final output\n", + " plt.cla()\n", + " plt.clf()\n", + " plt.close()\n", + "\n", + " # log artifacts\n", + " context.logger.info(\"Log artifacts\")\n", + " artifact_path = context.artifact_subpath(models_dest)\n", + " \n", + " # log plots\n", + " plots_path = context.artifact_subpath(models_dest, plots_dest)\n", + " \n", + " # set label\n", + " context.set_label('class', model_pkg_class)\n", + " \n", + " # log models\n", + " context.log_model(\"model\", body=dumps(model),\n", + " artifact_path=artifact_path,\n", + " model_file=\"model.pkl\",\n", + " extra_data=extra_data_dict,\n", + " metrics=context.results,\n", + " labels={\"class\": model_pkg_class})\n", + " \n", + " # log scalers\n", + " context.log_artifact(\"standard_scaler\", body=dumps(scaler),\n", + " artifact_path=artifact_path,\n", + " model_file=\"scaler.gz\",\n", + " label=\"standard_scaler\")\n", + " \n", + " # log encoder\n", + " context.log_artifact(\"label_encoder\", body=dumps(encoder),\n", + " artifact_path=artifact_path,\n", + " model_file=\"encoder.gz\",\n", + " label=\"label_encoder\")\n", + " \n", + " # set aside some test data\n", + " df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()\n", + " context.log_dataset(test_set_key, \n", + " df=pd.DataFrame(df_to_save, \n", + " columns=df_header), # improve log dataset ability\n", + " format=file_ext, index=False, \n", + " labels={\"data-type\": \"held-out\"},\n", + " artifact_path=context.artifact_subpath('data'))\n", + " \n", + " context.logger.info(\"Done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# nuclio: end-code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save and Config" + ] + }, + { + "cell_type": "code", + 
"execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "skf = mlrun.code_to_function('sklearn-classifier-dask', kind='job', code_output=\".\") .apply(mlrun.mount_v3io())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set Environment" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-11-23 14:59:39,336 [warning] warning!, server (0.5.4-rc1) and client (0.5.4) ver dont match\n" + ] + } + ], + "source": [ + "artifact_path = mlrun.set_environment(api_path = 'http://mlrun-api:8080',\n", + " artifact_path = os.path.abspath('./'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Init Dask" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### init a dask cluster and set dask specs" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-11-23 14:59:40,443 [info] using in-cluster config.\n" + ] + } + ], + "source": [ + "dsf = mlrun.new_function('dask_init', kind='dask', image='mlrun/ml-models')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-11-23 14:59:41,313 [info] function spec saved to path: function.yaml\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dsf.spec.remote = True\n", + "dsf.spec.replicas = 5\n", + "dsf.spec.service_type = 'NodePort'\n", + "dsf.with_limits(mem=\"8G\")\n", + "dsf.spec.nthreads = 6\n", + "dsf.export(\"function.yaml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### mount v3io in for file system access" + ] + }, + { + "cell_type": 
"code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dsf.apply(mlrun.mount_v3io())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### init dask client \n", + "copy the scheduler address to **DASK_CLIENT** param in the following cell, this will make the function use the dask cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-11-23 15:00:15,924 [info] trying dask client at: tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786\n", + "> 2020-11-23 15:00:15,932 [info] using remote dask scheduler (mlrun-dask-init-4fdf1dc3-5) at: tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786\n" + ] + }, + { + "data": { + "text/html": [ + "dashboard link: default-tenant.app.dsteam.iguazio-cd1.com:32122" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 4
  • \n", + "
  • Cores: 24
  • \n", + "
  • Memory: 32.00 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dsf.client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_URL = '/User/iris.csv'\n", + "DASK_CLIENT = 'tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786'\n", + "\n", + "task_params = {\n", + " \"params\" : {\n", + " \"sample\" : 1,\n", + " \"train_val_split\" : 0.75,\n", + " \"random_state\" : 42,\n", + " \"n_jobs\" : -1,\n", + " \"plots_dest\" : \"plots-p\",\n", + " \"models_dest\" : 'sklearn-clfmodel'}}\n", + "\n", + "\n", + "models = [\n", + " \"sklearn.ensemble.RandomForestClassifier\",\n", + " \"sklearn.ensemble.AdaBoostClassifier\",\n", + " \"sklearn.linear_model.LogisticRegression\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test and Run" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-11-23 15:00:23,779 [warning] warning!, server (0.5.4-rc1) and client (0.5.4) ver dont match\n", + "> 2020-11-23 15:00:23,780 [info] starting run sklearn_ensemble_RandomForestClassifier uid=a78d70155eb54280a04f1d6c5b42f673 DB=http://mlrun-api:8080\n", + "> 2020-11-23 15:00:23,941 [info] Job is running in the background, pod: sklearn-ensemble-randomforestclassifier-cd6bk\n", + "> 2020-11-23 15:00:29,218 [info] Read Data\n", + "> 2020-11-23 15:00:29,236 [info] Prep Data\n", + "> 2020-11-23 15:00:29,665 [info] Split and Train\n", + "> 2020-11-23 15:00:32,016 [info] Evaluate\n", + "> 2020-11-23 15:00:33,768 [info] Log artifacts\n", + "> 2020-11-23 15:00:34,595 [info] Done!\n", + "> 2020-11-23 15:00:34,660 [info] run executed, status=completed\n", + "final state: completed\n" + ] + }, + { + "data": { + "text/html": 
[ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Nov 23 15:00:29completedsklearn_ensemble_RandomForestClassifier
v3io_user=admin
kind=job
owner=admin
host=sklearn-ensemble-randomforestclassifier-cd6bk
class=sklearn.ensemble.RandomForestClassifier
dataset
model_pkg_class=sklearn.ensemble.RandomForestClassifier
label_column=label
scheduler_key=tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786
CLASS_max_depth=5
micro=0.9941135734072022
macro=0.9942943331178625
precision-1=1.0
precision-2=0.8888888888888888
precision-0=0.9166666666666666
recall-1=1.0
recall-2=0.9411764705882353
recall-0=0.8461538461538461
f1-1=1.0
f1-2=0.9142857142857143
f1-0=0.8799999999999999
ROCAUC
ClassificationReport
ConfusionMatrix
FeatureImportances
model
standard_scaler
label_encoder
test_set
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run a78d70155eb54280a04f1d6c5b42f673 --project default , !mlrun logs a78d70155eb54280a04f1d6c5b42f673 --project default\n", + "> 2020-11-23 15:00:43,168 [info] run executed, status=completed\n", + "> 2020-11-23 15:00:43,169 [info] starting run sklearn_ensemble_AdaBoostClassifier uid=efac1f79e21f46259c423268d334ae6d DB=http://mlrun-api:8080\n", + "> 2020-11-23 15:00:43,335 [info] Job is running in the background, pod: sklearn-ensemble-adaboostclassifier-fd887\n", + "> 2020-11-23 15:00:48,569 [info] Read Data\n", + "> 2020-11-23 15:00:48,588 [info] Prep Data\n", + "> 2020-11-23 15:00:48,796 [info] Split and Train\n", + "> 2020-11-23 15:00:49,220 [info] Evaluate\n", + "> 2020-11-23 15:00:51,094 [info] Log artifacts\n", + "> 2020-11-23 15:00:51,533 [info] Done!\n", + "> 2020-11-23 15:00:51,581 [info] run executed, status=completed\n", + "final state: completed\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Nov 23 15:00:48completedsklearn_ensemble_AdaBoostClassifier
v3io_user=admin
kind=job
owner=admin
host=sklearn-ensemble-adaboostclassifier-fd887
class=sklearn.ensemble.AdaBoostClassifier
dataset
model_pkg_class=sklearn.ensemble.AdaBoostClassifier
label_column=label
scheduler_key=tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786
CLASS_n_estimators=200
CLASS_learning_rate=0.01
micro=0.9581024930747923
macro=0.9808974358974359
precision-0=1.0
precision-2=0.8
precision-1=0.9375
recall-0=1.0
recall-2=0.9230769230769231
recall-1=0.8333333333333334
f1-0=1.0
f1-2=0.8571428571428571
f1-1=0.8823529411764706
ROCAUC
ClassificationReport
ConfusionMatrix
FeatureImportances
model
standard_scaler
label_encoder
test_set
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run efac1f79e21f46259c423268d334ae6d --project default , !mlrun logs efac1f79e21f46259c423268d334ae6d --project default\n", + "> 2020-11-23 15:00:52,501 [info] run executed, status=completed\n", + "> 2020-11-23 15:00:52,502 [info] starting run sklearn_linear_model_LogisticRegression uid=e314ab6e5e8546afbf851765e47506b0 DB=http://mlrun-api:8080\n", + "> 2020-11-23 15:00:52,675 [info] Job is running in the background, pod: sklearn-linear-model-logisticregression-drxn4\n", + "> 2020-11-23 15:00:58,029 [info] Read Data\n", + "> 2020-11-23 15:00:58,045 [info] Prep Data\n", + "> 2020-11-23 15:00:58,232 [info] Split and Train\n", + "> 2020-11-23 15:00:58,420 [info] Evaluate\n", + "> 2020-11-23 15:01:00,116 [info] Log artifacts\n", + "> 2020-11-23 15:01:00,439 [info] Done!\n", + "> 2020-11-23 15:01:00,489 [info] run executed, status=completed\n", + "final state: completed\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Nov 23 15:00:57completedsklearn_linear_model_LogisticRegression
v3io_user=admin
kind=job
owner=admin
host=sklearn-linear-model-logisticregression-drxn4
class=sklearn.linear_model.LogisticRegression
dataset
model_pkg_class=sklearn.linear_model.LogisticRegression
label_column=label
scheduler_key=tcp://mlrun-dask-init-4fdf1dc3-5.default-tenant:8786
CLASS_solver=liblinear
micro=0.9854570637119113
macro=0.9832142857142858
precision-1=1.0
precision-0=0.8461538461538461
precision-2=0.8571428571428571
recall-1=1.0
recall-0=0.8461538461538461
recall-2=0.8571428571428571
f1-1=1.0
f1-0=0.8461538461538461
f1-2=0.8571428571428571
ROCAUC
ClassificationReport
ConfusionMatrix
FeatureImportances
model
standard_scaler
label_encoder
test_set
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run e314ab6e5e8546afbf851765e47506b0 --project default , !mlrun logs e314ab6e5e8546afbf851765e47506b0 --project default\n", + "> 2020-11-23 15:01:01,914 [info] run executed, status=completed\n" + ] + } + ], + "source": [ + "outputs = []\n", + "for model in models:\n", + " task_copy = task_params.copy()\n", + " task_copy.update(\n", + " {\n", + " \"params\":{ \"model_pkg_class\" : model,\n", + " \"label_column\" : \"label\",\n", + " \"scheduler_key\": DASK_CLIENT}\n", + " }\n", + " )\n", + " \n", + " # customize specific model parameters\n", + " if \"RandomForestClassifier\" in model:\n", + " task_copy[\"params\"].update({\"CLASS_max_depth\" : 5})\n", + "\n", + " if \"LogisticRegression\" in model:\n", + " task_copy[\"params\"].update({\"CLASS_solver\" : \"liblinear\"})\n", + " \n", + " if \"AdaBoostClassifier\" in model:\n", + " task_copy[\"params\"].update({\"CLASS_n_estimators\" : 200,\n", + " \"CLASS_learning_rate\" : 0.01\n", + " })\n", + " \n", + " name = model.replace('.', '_')\n", + " output = skf.run(mlrun.NewTask(**task_copy),\n", + " handler=train_model,\n", + " name=name,\n", + " inputs={\"dataset\" : DATA_URL},\n", + " artifact_path=os.path.join(artifact_path, model))\n", + " \n", + " outputs.append({name: output.outputs})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, 
+ "nbformat_minor": 4 +} diff --git a/test_classifier/README.md b/test_classifier/README.md index e69de29bb..6680e33bb 100644 --- a/test_classifier/README.md +++ b/test_classifier/README.md @@ -0,0 +1,22 @@ +# **Testing Functions** + +## `sklearn-classifer` + +Test one or more classifier models against held-out dataset +Using held-out test features, evaluates the performance of the estimated model +Can be part of a kubeflow pipeline as a test step that is run post EDA and +training/validation cycles. + +```markdown + +:param context: the function context +:param models_path: artifact models representing a file or a folder +:param test_set: test features and labels +:param label_column: column name for ground truth labels +:param score_method: for multiclass classification +:param plots_dest: dir for test plots +:param model_evaluator: NOT IMPLEMENTED: specific method to generate eval, passed in as string + or available in this folder +:param predictions_column: column name for the predictions column on the resulted artifact +:param model_update: (True) update model, when running as stand alone no need in update +``` \ No newline at end of file diff --git a/test_classifier/function.yaml b/test_classifier/function.yaml index 632bd284d..494dcdbdd 100644 --- a/test_classifier/function.yaml +++ b/test_classifier/function.yaml @@ -2,8 +2,8 @@ kind: job metadata: name: test-classifier tag: '' - hash: b3a28d41d4e9142cd7426ed970aa46237bb40728 - project: '' + hash: 7ede87ea7a064bd1d4b4771a9ebb517f08ba2cca + project: default labels: author: yjb framework: sklearn @@ -26,15 +26,19 @@ spec: parameters: - name: context doc: the function context + default: '' - name: models_path type: DataItem doc: artifact models representing a file or a folder + default: '' - name: test_set type: DataItem doc: test features and labels + default: '' - name: label_column type: str doc: column name for ground truth labels + default: '' - name: score_method type: str doc: for multiclass 
classification @@ -42,9 +46,11 @@ spec: - name: plots_dest type: str doc: dir for test plots + default: '' - name: model_evaluator doc: 'NOT IMPLEMENTED: specific method to generate eval, passed in as string or available in this folder' + default: null - name: default_model type: str default: model.pkl @@ -52,10 +58,14 @@ spec: type: str doc: column name for the predictions column on the resulted artifact default: yscore - outputs: [] - lineno: 14 + - name: model_update + doc: (True) update model, when running as stand alone no need in update + default: true + outputs: + - default: '' + lineno: 16 description: test a classifier using held-out or new data build: - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCndhcm5pbmdzLnNpbXBsZWZpbHRlcihhY3Rpb249Imlnbm9yZSIsIGNhdGVnb3J5PUZ1dHVyZVdhcm5pbmcpCgppbXBvcnQgb3MKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IGdldF9tb2RlbCwgdXBkYXRlX21vZGVsCmZyb20gbWxydW4ubWx1dGlscyBpbXBvcnQgZXZhbF9tb2RlbF92Mgpmcm9tIGNsb3VkcGlja2xlIGltcG9ydCBsb2FkCmZyb20gdXJsbGliLnJlcXVlc3QgaW1wb3J0IHVybG9wZW4KCmRlZiB0ZXN0X2NsYXNzaWZpZXIoCiAgICBjb250ZXh0LAogICAgbW9kZWxzX3BhdGg6IERhdGFJdGVtLCAKICAgIHRlc3Rfc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsX2NvbHVtbjogc3RyLAogICAgc2NvcmVfbWV0aG9kOiBzdHIgPSAnbWljcm8nLAogICAgcGxvdHNfZGVzdDogc3RyID0gIiIsCiAgICBtb2RlbF9ldmFsdWF0b3IgPSBOb25lLAogICAgZGVmYXVsdF9tb2RlbDogc3RyID0gIm1vZGVsLnBrbCIsCiAgICBwcmVkaWN0aW9uc19jb2x1bW46IHN0ciA9ICd5c2NvcmUnCikgLT4gTm9uZToKICAgICIiIlRlc3Qgb25lIG9yIG1vcmUgY2xhc3NpZmllciBtb2RlbHMgYWdhaW5zdCBoZWxkLW91dCBkYXRhc2V0CiAgICAKICAgIFVzaW5nIGhlbGQtb3V0IHRlc3QgZmVhdHVyZXMsIGV2YWx1YXRlcyB0aGUgcGVmb3JtYW5jZSBvZiB0aGUgZXN0aW1hdGVkIG1vZGVsCiAgICAKICAgIENhbiBiZSBwYXJ0IG9mIGEga3ViZWZsb3cgcGlwZWxpbmUgYXMgYSB0ZXN0IHN0ZXAgdGhhdCBpcyBydW4gcG9zdCBFREEgYW5kIAogICAgdHJhaW5pbmcvdmFsaWRhdGlvbiBjeWNsZXMKICAgIAogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBtb2RlbHNfcGF0
aDogICAgICAgIGFydGlmYWN0IG1vZGVscyByZXByZXNlbnRpbmcgYSBmaWxlIG9yIGEgZm9sZGVyCiAgICA6cGFyYW0gdGVzdF9zZXQ6ICAgICAgICAgICB0ZXN0IGZlYXR1cmVzIGFuZCBsYWJlbHMKICAgIDpwYXJhbSBsYWJlbF9jb2x1bW46ICAgICAgIGNvbHVtbiBuYW1lIGZvciBncm91bmQgdHJ1dGggbGFiZWxzCiAgICA6cGFyYW0gc2NvcmVfbWV0aG9kOiAgICAgICBmb3IgbXVsdGljbGFzcyBjbGFzc2lmaWNhdGlvbgogICAgOnBhcmFtIHBsb3RzX2Rlc3Q6ICAgICAgICAgZGlyIGZvciB0ZXN0IHBsb3RzCiAgICA6cGFyYW0gbW9kZWxfZXZhbHVhdG9yOiAgICBOT1QgSU1QTEVNRU5URUQ6IHNwZWNpZmljIG1ldGhvZCB0byBnZW5lcmF0ZSBldmFsLCBwYXNzZWQgaW4gYXMgc3RyaW5nCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBvciBhdmFpbGFibGUgaW4gdGhpcyBmb2xkZXIKICAgIDpwYXJhbSBwcmVkaWN0aW9uc19jb2x1bW46IGNvbHVtbiBuYW1lIGZvciB0aGUgcHJlZGljdGlvbnMgY29sdW1uIG9uIHRoZSByZXN1bHRlZCBhcnRpZmFjdAogICAgIiIiCiAgICB4dGVzdCA9IHRlc3Rfc2V0LmFzX2RmKCkKICAgIHl0ZXN0ID0geHRlc3QucG9wKGxhYmVsX2NvbHVtbikKICAgIAogICAgdHJ5OgogICAgICAgIG1vZGVsX2ZpbGUsIG1vZGVsX29iaiwgXyA9IGdldF9tb2RlbChtb2RlbHNfcGF0aCwgc3VmZml4PScucGtsJykKICAgICAgICBtb2RlbF9vYmogPSBsb2FkKG9wZW4obW9kZWxfZmlsZSwgInJiIikpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGE6CiAgICAgICAgcmFpc2UgRXhjZXB0aW9uKCJtb2RlbCBsb2NhdGlvbiBsaWtlbHkgc3BlY2lmaWVkIikKICAgIAogICAgZXh0cmFfZGF0YSA9IGV2YWxfbW9kZWxfdjIoY29udGV4dCwgeHRlc3QsIHl0ZXN0LnZhbHVlcywgbW9kZWxfb2JqKQogICAgaWYgbW9kZWxfb2JqOgogICAgICAgIHVwZGF0ZV9tb2RlbChtb2RlbHNfcGF0aCwgZXh0cmFfZGF0YT1leHRyYV9kYXRhLCAKICAgICAgICAgICAgICAgICAgICAgbWV0cmljcz1jb250ZXh0LnJlc3VsdHMsIGtleV9wcmVmaXg9J3ZhbGlkYXRpb24tJykKICAgIAogICAgeV9oYXQgPSBtb2RlbF9vYmoucHJlZGljdCh4dGVzdCkKICAgIGlmIHlfaGF0Lm5kaW0gPT0gMSBvciB5X2hhdC5zaGFwZVsxXSA9PSAxOgogICAgICAgIHNjb3JlX25hbWVzID0gW3ByZWRpY3Rpb25zX2NvbHVtbl0KICAgIGVsc2U6CiAgICAgICAgc2NvcmVfbmFtZXMgPSBbZiJ7cHJlZGljdGlvbnNfY29sdW1ufV8iICsgc3RyKHgpIGZvciB4IGluIHJhbmdlKHlfaGF0LnNoYXBlWzFdKV0KCiAgICBkZiA9IHBkLmNvbmNhdChbeHRlc3QsIHl0ZXN0LCBwZC5EYXRhRnJhbWUoeV9oYXQsIGNvbHVtbnM9c2NvcmVfbmFtZXMpXSwgYXhpcz0xKQogICAgY29udGV4dC5sb2dfZGF0YXNldCgidGVzdF9zZXRfcHJlZHMiLCBkZj1kZiwgZm9ybWF0PSJwYXJxdWV0IiwgaW5kZXg9RmFsc2UpCgo= + functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCndhcm5pbmdzLnNpbXBsZWZpbHRlcihhY3Rpb249Imlnbm9yZSIsIGNhdGVnb3J5PUZ1dHVyZVdhcm5pbmcpCgppbXBvcnQgb3MKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgbWxydW4KCmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cyBpbXBvcnQgZ2V0X21vZGVsLCB1cGRhdGVfbW9kZWwKZnJvbSBtbHJ1bi5tbHV0aWxzIGltcG9ydCBldmFsX21vZGVsX3YyCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGxvYWQKZnJvbSB1cmxsaWIucmVxdWVzdCBpbXBvcnQgdXJsb3BlbgoKZGVmIHRlc3RfY2xhc3NpZmllcigKICAgIGNvbnRleHQsCiAgICBtb2RlbHNfcGF0aDogRGF0YUl0ZW0sIAogICAgdGVzdF9zZXQ6IERhdGFJdGVtLAogICAgbGFiZWxfY29sdW1uOiBzdHIsCiAgICBzY29yZV9tZXRob2Q6IHN0ciA9ICdtaWNybycsCiAgICBwbG90c19kZXN0OiBzdHIgPSAiIiwKICAgIG1vZGVsX2V2YWx1YXRvciA9IE5vbmUsCiAgICBkZWZhdWx0X21vZGVsOiBzdHIgPSAibW9kZWwucGtsIiwKICAgIHByZWRpY3Rpb25zX2NvbHVtbjogc3RyID0gJ3lzY29yZScsCiAgICBtb2RlbF91cGRhdGUgPSBUcnVlCikgLT4gTm9uZToKICAgICIiIlRlc3Qgb25lIG9yIG1vcmUgY2xhc3NpZmllciBtb2RlbHMgYWdhaW5zdCBoZWxkLW91dCBkYXRhc2V0CiAgICAKICAgIFVzaW5nIGhlbGQtb3V0IHRlc3QgZmVhdHVyZXMsIGV2YWx1YXRlcyB0aGUgcGVmb3JtYW5jZSBvZiB0aGUgZXN0aW1hdGVkIG1vZGVsCiAgICAKICAgIENhbiBiZSBwYXJ0IG9mIGEga3ViZWZsb3cgcGlwZWxpbmUgYXMgYSB0ZXN0IHN0ZXAgdGhhdCBpcyBydW4gcG9zdCBFREEgYW5kIAogICAgdHJhaW5pbmcvdmFsaWRhdGlvbiBjeWNsZXMKICAgIAogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBtb2RlbHNfcGF0aDogICAgICAgIGFydGlmYWN0IG1vZGVscyByZXByZXNlbnRpbmcgYSBmaWxlIG9yIGEgZm9sZGVyCiAgICA6cGFyYW0gdGVzdF9zZXQ6ICAgICAgICAgICB0ZXN0IGZlYXR1cmVzIGFuZCBsYWJlbHMKICAgIDpwYXJhbSBsYWJlbF9jb2x1bW46ICAgICAgIGNvbHVtbiBuYW1lIGZvciBncm91bmQgdHJ1dGggbGFiZWxzCiAgICA6cGFyYW0gc2NvcmVfbWV0aG9kOiAgICAgICBmb3IgbXVsdGljbGFzcyBjbGFzc2lmaWNhdGlvbgogICAgOnBhcmFtIHBsb3RzX2Rlc3Q6ICAgICAgICAgZGlyIGZvciB0ZXN0IHBsb3RzCiAgICA6cGFyYW0gbW9kZWxfZXZhbHVhdG9yOiAgICBOT1QgSU1QTEVNRU5URUQ6IHNwZWNpZmljIG1ldGhvZCB0byBnZW5lcmF0ZSBldmFsLCBwYXNzZWQgaW4gYXMgc3RyaW5nCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBvciBhdmFpbGFibGUgaW4gdGhpcyBmb2xkZXIKICAgIDpwYXJhbSBwcmVkaWN0aW9uc19jb2x1
bW46IGNvbHVtbiBuYW1lIGZvciB0aGUgcHJlZGljdGlvbnMgY29sdW1uIG9uIHRoZSByZXN1bHRlZCBhcnRpZmFjdAogICAgOnBhcmFtIG1vZGVsX3VwZGF0ZTogICAgICAgKFRydWUpIHVwZGF0ZSBtb2RlbCwgd2hlbiBydW5uaW5nIGFzIHN0YW5kIGFsb25lIG5vIG5lZWQgaW4gdXBkYXRlCiAgICAiIiIKICAgIHh0ZXN0ID0gdGVzdF9zZXQuYXNfZGYoKQogICAgeXRlc3QgPSB4dGVzdC5wb3AobGFiZWxfY29sdW1uKQogICAgCiAgICB0cnk6CiAgICAgICAgbW9kZWxfZmlsZSwgbW9kZWxfb2JqLCBfID0gZ2V0X21vZGVsKG1vZGVsc19wYXRoLCBzdWZmaXg9Jy5wa2wnKQogICAgICAgIG1vZGVsX29iaiA9IGxvYWQob3Blbihtb2RlbF9maWxlLCAicmIiKSkKICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgYToKICAgICAgICByYWlzZSBFeGNlcHRpb24oIm1vZGVsIGxvY2F0aW9uIGxpa2VseSBzcGVjaWZpZWQiKQogICAgCiAgICBleHRyYV9kYXRhID0gZXZhbF9tb2RlbF92Mihjb250ZXh0LCB4dGVzdCwgeXRlc3QudmFsdWVzLCBtb2RlbF9vYmopCiAgICBpZiBtb2RlbF9vYmogYW5kIG1vZGVsX3VwZGF0ZSA9PSBUcnVlOgogICAgICAgIHVwZGF0ZV9tb2RlbChtb2RlbHNfcGF0aCwgZXh0cmFfZGF0YT1leHRyYV9kYXRhLCAKICAgICAgICAgICAgICAgICAgICAgbWV0cmljcz1jb250ZXh0LnJlc3VsdHMsIGtleV9wcmVmaXg9J3ZhbGlkYXRpb24tJykKICAgIAogICAgeV9oYXQgPSBtb2RlbF9vYmoucHJlZGljdCh4dGVzdCkKICAgIGlmIHlfaGF0Lm5kaW0gPT0gMSBvciB5X2hhdC5zaGFwZVsxXSA9PSAxOgogICAgICAgIHNjb3JlX25hbWVzID0gW3ByZWRpY3Rpb25zX2NvbHVtbl0KICAgIGVsc2U6CiAgICAgICAgc2NvcmVfbmFtZXMgPSBbZiJ7cHJlZGljdGlvbnNfY29sdW1ufV8iICsgc3RyKHgpIGZvciB4IGluIHJhbmdlKHlfaGF0LnNoYXBlWzFdKV0KCiAgICBkZiA9IHBkLmNvbmNhdChbeHRlc3QsIHl0ZXN0LCBwZC5EYXRhRnJhbWUoeV9oYXQsIGNvbHVtbnM9c2NvcmVfbmFtZXMpXSwgYXhpcz0xKQogICAgY29udGV4dC5sb2dfZGF0YXNldCgidGVzdF9zZXRfcHJlZHMiLCBkZj1kZiwgZm9ybWF0PSJwYXJxdWV0IiwgaW5kZXg9RmFsc2UpCgo= commands: [] - code_origin: https://github.com/mlrun/functions#c60a3607cf4805a927738969a8e4730c01e803d6:test_classifier.ipynb + code_origin: https://github.com/Idan707/functions.git#877277c6378d0bd61e1938e6c6c9bb9e51810fcb:test_classifier.ipynb diff --git a/test_classifier/test-classifier.py b/test_classifier/test-classifier.py index 511d29967..75af1012e 100644 --- a/test_classifier/test-classifier.py +++ b/test_classifier/test-classifier.py @@ -5,6 +5,8 @@ import os import pandas as pd +import mlrun + from 
mlrun.datastore import DataItem from mlrun.artifacts import get_model, update_model from mlrun.mlutils import eval_model_v2 @@ -20,7 +22,8 @@ def test_classifier( plots_dest: str = "", model_evaluator = None, default_model: str = "model.pkl", - predictions_column: str = 'yscore' + predictions_column: str = 'yscore', + model_update = True ) -> None: """Test one or more classifier models against held-out dataset @@ -38,6 +41,7 @@ def test_classifier( :param model_evaluator: NOT IMPLEMENTED: specific method to generate eval, passed in as string or available in this folder :param predictions_column: column name for the predictions column on the resulted artifact + :param model_update: (True) update model, when running as stand alone no need in update """ xtest = test_set.as_df() ytest = xtest.pop(label_column) @@ -49,7 +53,7 @@ def test_classifier( raise Exception("model location likely specified") extra_data = eval_model_v2(context, xtest, ytest.values, model_obj) - if model_obj: + if model_obj and model_update == True: update_model(models_path, extra_data=extra_data, metrics=context.results, key_prefix='validation-') diff --git a/test_classifier/test_classifier.ipynb b/test_classifier/test_classifier.ipynb index 6832924be..bc2711406 100644 --- a/test_classifier/test_classifier.ipynb +++ b/test_classifier/test_classifier.ipynb @@ -36,17 +36,11 @@ "outputs": [], "source": [ "import warnings\n", - "warnings.simplefilter(action=\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ + "warnings.filterwarnings('ignore')\n", + "\n", "import os\n", "import pandas as pd\n", + "\n", "from mlrun.datastore import DataItem\n", "from mlrun.artifacts import get_model, update_model\n", "from mlrun.mlutils import eval_model_v2\n", @@ -62,7 +56,8 @@ " plots_dest: str = \"\",\n", " model_evaluator = None,\n", " default_model: str = \"model.pkl\",\n", - " predictions_column: str = 'yscore'\n", + 
" predictions_column: str = 'yscore',\n", + " model_update = True\n", ") -> None:\n", " \"\"\"Test one or more classifier models against held-out dataset\n", " \n", @@ -80,6 +75,7 @@ " :param model_evaluator: NOT IMPLEMENTED: specific method to generate eval, passed in as string\n", " or available in this folder\n", " :param predictions_column: column name for the predictions column on the resulted artifact\n", + " :param model_update: (True) update model, when running as stand alone no need in update\n", " \"\"\"\n", " xtest = test_set.as_df()\n", " ytest = xtest.pop(label_column)\n", @@ -91,7 +87,7 @@ " raise Exception(\"model location likely specified\")\n", " \n", " extra_data = eval_model_v2(context, xtest, ytest.values, model_obj)\n", - " if model_obj:\n", + " if model_obj and model_update == True:\n", " update_model(models_path, extra_data=extra_data, \n", " metrics=context.results, key_prefix='validation-')\n", " \n", @@ -110,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -125,24 +121,30 @@ } }, "source": [ - "### mlconfig" + "### MLconfig" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-10-28 15:23:07,168 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n" + ] + } + ], "source": [ - "from mlrun import mlconf\n", - "import os\n", - "\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", - "artifact_path = mlconf.artifact_path or os.path.abspath('./')" + "import mlrun\n", + "artifact_path = mlrun.set_environment(api_path = 'http://mlrun-api:8080',\n", + " artifact_path = os.path.abspath('./'))" ] }, { @@ -153,12 +155,12 @@ } }, "source": [ - "### save" + "### Save" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "metadata": { 
"pycharm": { "name": "#%%\n" @@ -169,28 +171,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "[mlrun] 2020-07-13 16:17:02,696 function spec saved to path: function.yaml\n" + "> 2020-10-28 15:23:15,013 [info] function spec saved to path: function.yaml\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 12, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from mlrun import code_to_function \n", "# create job function object from notebook code\n", - "fn = code_to_function(\"test_classifier\", handler=\"test_classifier\",\n", - " description=\"test a classifier using held-out or new data\",\n", - " categories=[\"ml\", \"test\"],\n", - " labels = {\"author\": \"yjb\", \"framework\": \"sklearn\"},\n", - " code_output='.')\n", + "fn = mlrun.code_to_function(\"test_classifier\", \n", + " handler=\"test_classifier\",\n", + " description=\"test a classifier using held-out or new data\",\n", + " categories=[\"ml\", \"test\"],\n", + " labels = {\"author\": \"yjb\", \"framework\": \"sklearn\"},\n", + " code_output='.')\n", "fn.export()" ] }, @@ -202,12 +204,12 @@ } }, "source": [ - "## tests" + "## Tests" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "pycharm": { "name": "#%%\n" @@ -217,29 +219,16 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from mlrun import mount_v3io\n", - "fn.apply(mount_v3io())" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "task_params = {\n", - " \"name\" : \"tasks test classifier\",\n", - " \"params\": {\n", - " \"label_column\" : \"labels\"}}" + "fn.apply(mlrun.platforms.auto_mount())" ] }, { @@ -250,7 +239,7 @@ } }, "source": [ - "### run locally" + "### Run Locally" ] }, { @@ -259,9 +248,8 @@ "metadata": {}, "outputs": [], "source": [ - "TEST_REPO = 
\"https://raw.githubusercontent.com/yjb-ds/testdata/master\"\n", - "DATA_PATH = \"/User/ml2/test_set.parquet\"\n", - "MODELS_PATH = \"/User/artifacts/multi-models/sklearn.linear_model.LogisticRegression/model.pkl\"" + "DATA_PATH = 'https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv'\n", + "MODEL_PATH = 'https://s3.wasabisys.com/iguazio/models/iris/model.pkl'" ] }, { @@ -277,28 +265,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "[mlrun] 2020-06-05 23:36:35,094 starting run tasks test classifier uid=9958837817a048dea70fe0b6780484c2 -> http://10.199.227.162:8080\n", - "[mlrun] 2020-06-05 23:36:36,015 log artifact confusion-matrix at /User/ml2/test/plots/confusion-matrix.html, size: 20273, db: N\n", - "[mlrun] 2020-06-05 23:36:36,273 log artifact feature-importances at /User/ml2/test/plots/feature-importances.html, size: 11857, db: N\n", - "[mlrun] 2020-06-05 23:36:36,393 log artifact precision-recall-multiclass at /User/ml2/test/plots/precision-recall-multiclass.html, size: 55889, db: N\n", - "[mlrun] 2020-06-05 23:36:36,523 log artifact roc-multiclass at /User/ml2/test/plots/roc-multiclass.html, size: 34633, db: N\n", - "[mlrun] 2020-06-05 23:36:36,928 log artifact test_set_preds at /User/ml2/test/test_set_preds.parquet, size: 702584, db: Y\n", - "\n" + "> 2020-10-28 15:23:15,049 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n", + "> 2020-10-28 15:23:15,049 [info] starting run mlrun-ffeba8-test_classifier uid=40514aecc0d64b8ebf1f2efb32198484 -> http://mlrun-api:8080\n", + "> 2020-10-28 15:23:15,093 [warning] warning!, server (0.5.3-rc1) and client (0.5.2) ver dont match\n" ] }, { "data": { "text/html": [ - "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Oct 28 15:23:19runningtest-classifier-test_classifier
v3io_user=admin
kind=job
owner=admin
test_set
models_path
label_column=label
model_update=False
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run 11372c9755964a5bb76599f4821a3bd1 --project default , !mlrun logs 11372c9755964a5bb76599f4821a3bd1 --project default\n", + "> 2020-10-28 15:23:19,488 [info] run executed, status=running\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fn.run(mlrun.NewTask(params= {'label_column':'label',\n", + " 'model_update': False}), #Change to True when you have a old model metadata to update\n", + " handler=test_classifier,\n", + " inputs={\"test_set\": DATA_PATH,\n", + " \"models_path\": MODEL_PATH})" + ] } ], "metadata": { @@ -566,7 +789,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.6" } }, "nbformat": 4, diff --git a/tf2_serving/tf2_serving.ipynb b/tf2_serving/tf2_serving.ipynb index 126f17502..5142fd379 100644 --- a/tf2_serving/tf2_serving.ipynb +++ b/tf2_serving/tf2_serving.ipynb @@ -558,7 +558,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.6" } }, "nbformat": 4, diff --git a/v2_model_server/function.yaml b/v2_model_server/function.yaml index a7149b32b..441b5b656 100644 --- a/v2_model_server/function.yaml +++ b/v2_model_server/function.yaml @@ -2,7 +2,7 @@ kind: serving metadata: name: v2-model-server tag: '' - hash: 22485e4f7ad229768915af4ef48b058d6af4476e + hash: 06e79919a7cdc95ceeaf430fb10fc935a932e2f0 project: default labels: author: yaronh @@ -23,7 +23,7 @@ spec: default: '' outputs: - default: '' - lineno: 10 + lineno: 14 predict: name: predict doc: Generate model predictions from sample. 
@@ -36,7 +36,7 @@ spec: outputs: - default: '' type: List - lineno: 15 + lineno: 19 init_context: name: init_context doc: '' @@ -45,7 +45,7 @@ spec: default: '' outputs: - default: '' - lineno: 23 + lineno: 27 handler: name: handler doc: '' @@ -56,7 +56,7 @@ spec: default: '' outputs: - default: '' - lineno: 26 + lineno: 30 description: generic sklearn model server min_replicas: 1 max_replicas: 4 @@ -66,15 +66,14 @@ spec: kind: Function metadata: annotations: - nuclio.io/generated_by: function generated from 12-10-2020 by admin + nuclio.io/generated_by: function generated from 06-12-2020 by admin labels: {} name: v2-model-server spec: build: - baseImage: mlrun/mlrun - commands: - - python -m pip install numpy cloudpickle v3io sklearn - functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG1scnVuCgpmcm9tIGNsb3VkcGlja2xlIGltcG9ydCBsb2FkCmltcG9ydCBudW1weSBhcyBucApmcm9tIHR5cGluZyBpbXBvcnQgTGlzdAoKY2xhc3MgQ2xhc3NpZmllck1vZGVsKG1scnVuLnNlcnZpbmcuVjJNb2RlbFNlcnZlcik6CiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICAiIiJsb2FkIGFuZCBpbml0aWFsaXplIHRoZSBtb2RlbCBhbmQvb3Igb3RoZXIgZWxlbWVudHMiIiIKICAgICAgICBtb2RlbF9maWxlLCBleHRyYV9kYXRhID0gc2VsZi5nZXRfbW9kZWwoJy5wa2wnKQogICAgICAgIHNlbGYubW9kZWwgPSBsb2FkKG9wZW4obW9kZWxfZmlsZSwgJ3JiJykpCgogICAgZGVmIHByZWRpY3Qoc2VsZiwgYm9keTogZGljdCkgLT4gTGlzdDoKICAgICAgICAiIiJHZW5lcmF0ZSBtb2RlbCBwcmVkaWN0aW9ucyBmcm9tIHNhbXBsZS4iIiIKICAgICAgICBmZWF0cyA9IG5wLmFzYXJyYXkoYm9keVsnaW5wdXRzJ10pCiAgICAgICAgcmVzdWx0OiBucC5uZGFycmF5ID0gc2VsZi5tb2RlbC5wcmVkaWN0KGZlYXRzKQogICAgICAgIHJldHVybiByZXN1bHQudG9saXN0KCkKCgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK + baseImage: mlrun/ml-models + commands: [] + functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG1scnVuCgpmcm9tIGNsb3VkcGlja2xlIGltcG9ydCBsb2FkCmZyb20gdHlwaW5nIGltcG9ydCBMaXN0CmZyb20gc2tsZWFybi5kYXRhc2V0cyBpbXBvcnQgbG9hZF9pcmlzCmltcG9ydCBudW1weSBhcyBucAoKaW1wb3J0IHdhcm5pbmdzIAp3YXJuaW5ncy5maWx0ZXJ3YXJuaW5ncygnaWdub3JlJykKCmNsYXNzIENsYXNzaWZpZXJNb2RlbChtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIpOgogICAgZGVmIGxvYWQoc2VsZik6CiAgICAgICAgIiIibG9hZCBhbmQgaW5pdGlhbGl6ZSB0aGUgbW9kZWwgYW5kL29yIG90aGVyIGVsZW1lbnRzIiIiCiAgICAgICAgbW9kZWxfZmlsZSwgZXh0cmFfZGF0YSA9IHNlbGYuZ2V0X21vZGVsKCcucGtsJykKICAgICAgICBzZWxmLm1vZGVsID0gbG9hZChvcGVuKG1vZGVsX2ZpbGUsICdyYicpKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgZmVhdHMgPSBucC5hc2FycmF5KGJvZHlbJ2lucHV0cyddKQogICAgICAgIHJlc3VsdDogbnAubmRhcnJheSA9IHNlbGYubW9kZWwucHJlZGljdChmZWF0cykKICAgICAgICByZXR1cm4gcmVzdWx0LnRvbGlzdCgpCgoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== noBaseImagesPull: true env: [] handler: v2_model_server:handler @@ -82,4 +81,6 @@ spec: volumes: [] source: '' function_kind: serving_v2 + graph: + kind: router default_class: ClassifierModel diff --git a/v2_model_server/v2-model-server.py b/v2_model_server/v2-model-server.py index 2fb44b742..c5bd59db3 100644 --- a/v2_model_server/v2-model-server.py +++ b/v2_model_server/v2-model-server.py @@ -3,8 +3,12 @@ import mlrun from cloudpickle import load -import numpy as np from typing import List +from sklearn.datasets import load_iris +import numpy as np + +import warnings +warnings.filterwarnings('ignore') class ClassifierModel(mlrun.serving.V2ModelServer): def load(self): diff --git a/v2_model_server/v2_model_server.ipynb b/v2_model_server/v2_model_server.ipynb index 
0bd7fcbe3..da88b89a4 100644 --- a/v2_model_server/v2_model_server.ipynb +++ b/v2_model_server/v2_model_server.ipynb @@ -11,15 +11,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-12 14:52:57,535 [warning] Failed resolving version info. Ignoring and using defaults\n" - ] - } - ], + "outputs": [], "source": [ "import mlrun" ] @@ -47,17 +39,7 @@ ], "source": [ "%nuclio config kind=\"serving\"\n", - "%nuclio config spec.build.baseImage = \"mlrun/mlrun\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "%%nuclio cmd -c\n", - "python -m pip install numpy cloudpickle v3io sklearn" + "%nuclio config spec.build.baseImage=\"mlrun/mlrun\"" ] }, { @@ -69,18 +51,22 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from cloudpickle import load\n", + "from typing import List\n", + "from sklearn.datasets import load_iris\n", "import numpy as np\n", - "from typing import List" + "\n", + "import warnings \n", + "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -99,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -113,22 +99,6 @@ "# Convert to function object" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The sklearn-project generated one or more models that will be deployed in the server project `sklearn-servers`" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "models_path = 'https://s3.wasabisys.com/iguazio/models/iris/model.pkl'" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -138,32 +108,35 @@ }, { "cell_type": "code", - "execution_count": 16, + 
"execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2020-10-12 14:47:46,331 [info] function spec saved to path: function.yaml\n" + "> 2020-12-06 11:49:27,049 [info] function spec saved to path: function.yaml\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 16, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "fn = mlrun.code_to_function('v2-model-server', description=\"generic sklearn model server\",\n", - " categories=['serving', 'ml'],\n", - " labels={'author': 'yaronh', 'framework': 'sklearn'},\n", - " code_output='.')\n", + "import mlrun\n", + "fn = mlrun.code_to_function('v2-model-server', \n", + " description=\"generic sklearn model server\",\n", + " categories=['serving', 'ml'],\n", + " labels={'author': 'yaronh', 'framework': 'sklearn'},\n", + " code_output='.')\n", + "\n", "fn.spec.default_class = 'ClassifierModel'\n", "#print(fn.to_yaml())\n", "fn.export()" @@ -178,10 +151,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ + "models_path = 'https://s3.wasabisys.com/iguazio/models/iris/model.pkl'\n", + "mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'\n", "fn.add_model('mymodel', model_path=models_path)\n", "#fn.verbose = True" ] @@ -195,9 +170,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'routes': }\n", + "{'model_path': 'https://s3.wasabisys.com/iguazio/models/iris/model.pkl'}\n", + "> 2020-12-06 11:49:27,287 [info] model mymodel was loaded\n", + "> 2020-12-06 11:49:27,288 [info] Loaded ['mymodel']\n" + ] + } + ], "source": [ "# create an emulator (mock server) from the function configuration)\n", "server = fn.to_mock_server(globals())" @@ -212,35 +198,26 @@ }, { "cell_type": "code", - 
"execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import load_iris\n", "iris = load_iris()\n", "x = iris['data'].tolist()" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2020-10-12 14:44:19,297 [debug] router run model mymodel, op=infer\n", - "> 2020-10-12 14:44:19,297 [debug] router run model mymodel, op=infer\n" - ] - }, { "data": { "text/plain": [ "dict_keys(['id', 'model_name', 'outputs'])" ] }, - "execution_count": 13, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -259,33 +236,56 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fn.apply(mlrun.mount_v3io())" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2020-10-12 14:44:20,662 [info] deploy started\n", - "[nuclio] 2020-10-12 14:44:21,772 (info) Build complete\n", - "[nuclio] 2020-10-12 14:44:29,852 (info) Function deploy complete\n", - "[nuclio] 2020-10-12 14:44:29,859 done updating v2-srv-v2-model-server, function address: 3.128.234.166:30830\n" + "> 2020-12-06 11:49:27,353 [info] Starting remote function deploy\n", + "2020-12-06 11:49:27 (info) Deploying function\n", + "2020-12-06 11:49:27 (info) Building\n", + "2020-12-06 11:49:27 (info) Staging files and preparing base images\n", + "2020-12-06 11:49:27 (info) Building processor image\n", + "2020-12-06 11:53:30 (info) Build complete\n", + "2020-12-06 11:53:36 (info) Function deploy complete\n", + "> 2020-12-06 11:53:36,887 [info] function deployed, address=default-tenant.app.yh210.iguazio-cd2.com:31544\n" ] }, { "data": { 
"text/plain": [ - "'http://3.128.234.166:30830'" + "'http://default-tenant.app.yh210.iguazio-cd2.com:31544'" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "fn.apply(mlrun.mount_v3io())\n", - "fn.deploy(project='v2-srv')" + "fn.deploy()" ] }, { @@ -297,18 +297,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': 'bd688fa9-38cc-4a5d-95e8-1445bdd1520a',\n", + "{'id': '85877b5c-28a5-40dd-98d2-9c4e234ada57',\n", " 'model_name': 'mymodel',\n", " 'outputs': [0, 2]}" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -328,9 +328,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:root] *", "language": "python", - "name": "python3" + "name": "conda-root-py" }, "language_info": { "codemirror_mode": { diff --git a/v2_model_tester/function.yaml b/v2_model_tester/function.yaml index 5a7b9d7be..7bbb6b1ad 100644 --- a/v2_model_tester/function.yaml +++ b/v2_model_tester/function.yaml @@ -2,7 +2,7 @@ kind: job metadata: name: v2-model-tester tag: '' - hash: bbffca7f8decc17c1de599015984a548bd60702e + hash: 989fa7ebf36b83e40fbb657b708a894cbbda3a81 project: default labels: author: yaronh @@ -48,9 +48,9 @@ spec: default: 20 outputs: - default: '' - lineno: 12 + lineno: 13 description: test v2 model servers build: - functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IHJlcXVlc3RzCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucApmcm9tIGRhdGV0aW1lIGltcG9ydCBkYXRldGltZQpmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IENoYXJ0QXJ0aWZhY3QKCmRlZiBtb2RlbF9zZXJ2ZXJfdGVzdGVyKGNvbnRleHQsCiAgICAgICAgICAgICAgICAgICAgICAgIHRhYmxlOiBEYXRhSXRlbSwKICAgICAgICAgICAgICAgICAgICAgICAgYWRkcjogc3RyLCAKICAgICAgICAgICAgICAgICAgICAgICAgbGFiZWxfY29sdW1uOiBzdHIgPSAibGFiZWwiLAogICAgICAgICAgICAgICAgICAgICAgICBtb2RlbDogc3RyID0gJycsCiAgICAgICAgICAgICAgICAgICAgICAgIG1hdGNoX2VycjogYm9vbCA9IEZhbHNlLAogICAgICAgICAgICAgICAgICAgICAgICByb3dzOiBpbnQgPSAyMCk6CiAgICAiIiIgVGVzdCBhIG1vZGVsIHNlcnZlciAKICAgIAogICAgOnBhcmFtIHRhYmxlOiAgICAgICAgIGNzdi9wYXJxdWV0IHRhYmxlIHdpdGggdGVzdCBkYXRhCiAgICA6cGFyYW0gYWRkcjogICAgICAgICAgZnVuY3Rpb24gYWRkcmVzcy91cmwKICAgIDpwYXJhbSBsYWJlbF9jb2x1bW46ICBuYW1lIG9mIHRoZSBsYWJlbCBjb2x1bW4gaW4gdGFibGUKICAgIDpwYXJhbSBtb2RlbDogICAgICAgICB0ZXN0ZWQgbW9kZWwgbmFtZSAKICAgIDpwYXJhbSBtYXRjaF9lcnI6ICAgICByYWlzZSBlcnJvciBvbiB2YWxpZGF0aW9uIChyZXF1aXJlIHByb3BlciB0ZXN0IHNldCkKICAgIDpwYXJhbSByb3dzOiAgICAgICAgICBudW1iZXIgb2Ygcm93cyB0byB1c2UgZnJvbSB0ZXN0IHNldAogICAgIiIiCiAgICAgICAgCiAgICB0YWJsZSA9IHRhYmxlLmFzX2RmKCkKCiAgICB5X2xpc3QgPSB0YWJsZS5wb3AobGFiZWxfY29sdW1uKS52YWx1ZXMudG9saXN0KCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZid0ZXN0aW5nIHdpdGggZGF0YXNldCBhZ2FpbnN0IHthZGRyfSwgbW9kZWw6IHttb2RlbH0nKQogICAgaWYgcm93cyBhbmQgcm93cyA8IHRhYmxlLnNoYXBlWzBdOgogICAgICAgIHRhYmxlID0gdGFibGUuc2FtcGxlKHJvd3MpCiAgICAKICAgIGNvdW50ID0gZXJyX2NvdW50ID0gbWF0Y2ggPSAwCiAgICB0aW1lcyA9IFtdCiAgICBmb3IgeCwgeSBpbiB6aXAodGFibGUudmFsdWVzLCB5X2xpc3QpOgogICAgICAgIGNvdW50ICs9IDEKICAgICAgICBldmVudF9kYXRhID0ganNvbi5kdW1wcyh7ImlucHV0cyI6W3gudG9saXN0KCldfSkKICAgICAgICBoYWRfZXJyID0gRmFsc2UKICAgICAgICB0cnk6CiAgICAgICAgICAgIHN0YXJ0ID0gZGF0ZXRpbWUubm93KCkKICAgICAgICAgICAgcmVzcCA9IHJlcXVlc3RzLnB1dChmJ3thZGRyfS92Mi9tb2RlbHMve21vZGVsfS9pbmZlcicsIGpzb249ZXZlbnRfZGF0YSkKICAgICAgICAg
ICAgaWYgbm90IHJlc3Aub2s6CiAgICAgICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5lcnJvcihmJ2JhZCBmdW5jdGlvbiByZXNwISFcbntyZXNwLnRleHR9JykKICAgICAgICAgICAgICAgIGVycl9jb3VudCArPSAxCiAgICAgICAgICAgICAgICBjb250aW51ZQogICAgICAgICAgICB0aW1lcy5hcHBlbmQoKGRhdGV0aW1lLm5vdygpLXN0YXJ0KS5taWNyb3NlY29uZHMpCiAgICAgICAgICAgICAgICAKICAgICAgICBleGNlcHQgT1NFcnJvciBhcyBlcnI6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmVycm9yKGYnZXJyb3IgaW4gcmVxdWVzdCwgZGF0YTp7ZXZlbnRfZGF0YX0sIGVycm9yOiB7ZXJyfScpCiAgICAgICAgICAgIGVycl9jb3VudCArPSAxCiAgICAgICAgICAgIGNvbnRpbnVlCiAgICAgICAgCiAgICAgICAgcmVzcF9kYXRhID0gcmVzcC5qc29uKCkKICAgICAgICBwcmludChyZXNwX2RhdGEpCiAgICAgICAgeV9yZXNwID0gcmVzcF9kYXRhWydvdXRwdXRzJ11bMF0KICAgICAgICBpZiB5ID09IHlfcmVzcDoKICAgICAgICAgICAgbWF0Y2ggKz0gMQogICAgICAgIAogICAgY29udGV4dC5sb2dfcmVzdWx0KCd0b3RhbF90ZXN0cycsIGNvdW50KQogICAgY29udGV4dC5sb2dfcmVzdWx0KCdlcnJvcnMnLCBlcnJfY291bnQpCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ21hdGNoJywgbWF0Y2gpCiAgICBpZiBjb3VudCAtIGVycl9jb3VudCA+IDA6CiAgICAgICAgdGltZXNfYXJyID0gbnAuYXJyYXkodGltZXMpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdhdmdfbGF0ZW5jeScsIGludChucC5tZWFuKHRpbWVzX2FycikpKQogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnbWluX2xhdGVuY3knLCBpbnQobnAuYW1pbih0aW1lc19hcnIpKSkKICAgICAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ21heF9sYXRlbmN5JywgaW50KG5wLmFtYXgodGltZXNfYXJyKSkpCiAgICAgICAgCiAgICAgICAgY2hhcnQgPSBDaGFydEFydGlmYWN0KCdsYXRlbmN5JywgaGVhZGVyPVsnVGVzdCcsICdMYXRlbmN5IChtaWNyb3NlYyknXSkKICAgICAgICBmb3IgaSBpbiByYW5nZShsZW4odGltZXMpKToKICAgICAgICAgICAgY2hhcnQuYWRkX3JvdyhbaSsxLCBpbnQodGltZXNbaV0pXSkKICAgICAgICBjb250ZXh0LmxvZ19hcnRpZmFjdChjaGFydCkKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYncnVuIHtjb3VudH0gdGVzdHMsIHtlcnJfY291bnR9IGVycm9ycyBhbmQge21hdGNofSBtYXRjaCBleHBlY3RlZCB2YWx1ZScpCiAgICAKICAgIGlmIGVycl9jb3VudDoKICAgICAgICByYWlzZSBWYWx1ZUVycm9yKGYnZmFpbGVkIG9uIHtlcnJfY291bnR9IHRlc3RzIG9mIHtjb3VudH0nKQogICAgCiAgICBpZiBtYXRjaF9lcnIgYW5kIG1hdGNoICE9IGNvdW50OgogICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZidvbmx5IHttYXRjaH0gcmVzdWx0cyBtYXRjaCBvdXQgb2Yge2NvdW50fScpCgo= + functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IHJlcXVlc3RzCmltcG9ydCBqc29uCmltcG9ydCBudW1weSBhcyBucApmcm9tIGRhdGV0aW1lIGltcG9ydCBkYXRldGltZQpmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KZnJvbSBtbHJ1bi5hcnRpZmFjdHMgaW1wb3J0IENoYXJ0QXJ0aWZhY3QKaW1wb3J0IG1scnVuCgpkZWYgbW9kZWxfc2VydmVyX3Rlc3Rlcihjb250ZXh0LAogICAgICAgICAgICAgICAgICAgICAgICB0YWJsZTogRGF0YUl0ZW0sCiAgICAgICAgICAgICAgICAgICAgICAgIGFkZHI6IHN0ciwgCiAgICAgICAgICAgICAgICAgICAgICAgIGxhYmVsX2NvbHVtbjogc3RyID0gImxhYmVsIiwKICAgICAgICAgICAgICAgICAgICAgICAgbW9kZWw6IHN0ciA9ICcnLAogICAgICAgICAgICAgICAgICAgICAgICBtYXRjaF9lcnI6IGJvb2wgPSBGYWxzZSwKICAgICAgICAgICAgICAgICAgICAgICAgcm93czogaW50ID0gMjApOgogICAgIiIiIFRlc3QgYSBtb2RlbCBzZXJ2ZXIgCiAgICAKICAgIDpwYXJhbSB0YWJsZTogICAgICAgICBjc3YvcGFycXVldCB0YWJsZSB3aXRoIHRlc3QgZGF0YQogICAgOnBhcmFtIGFkZHI6ICAgICAgICAgIGZ1bmN0aW9uIGFkZHJlc3MvdXJsCiAgICA6cGFyYW0gbGFiZWxfY29sdW1uOiAgbmFtZSBvZiB0aGUgbGFiZWwgY29sdW1uIGluIHRhYmxlCiAgICA6cGFyYW0gbW9kZWw6ICAgICAgICAgdGVzdGVkIG1vZGVsIG5hbWUgCiAgICA6cGFyYW0gbWF0Y2hfZXJyOiAgICAgcmFpc2UgZXJyb3Igb24gdmFsaWRhdGlvbiAocmVxdWlyZSBwcm9wZXIgdGVzdCBzZXQpCiAgICA6cGFyYW0gcm93czogICAgICAgICAgbnVtYmVyIG9mIHJvd3MgdG8gdXNlIGZyb20gdGVzdCBzZXQKICAgICIiIgogICAgICAgIAogICAgdGFibGUgPSB0YWJsZS5hc19kZigpCgogICAgeV9saXN0ID0gdGFibGUucG9wKGxhYmVsX2NvbHVtbikudmFsdWVzLnRvbGlzdCgpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYndGVzdGluZyB3aXRoIGRhdGFzZXQgYWdhaW5zdCB7YWRkcn0sIG1vZGVsOiB7bW9kZWx9JykKICAgIGlmIHJvd3MgYW5kIHJvd3MgPCB0YWJsZS5zaGFwZVswXToKICAgICAgICB0YWJsZSA9IHRhYmxlLnNhbXBsZShyb3dzKQogICAgCiAgICBjb3VudCA9IGVycl9jb3VudCA9IG1hdGNoID0gMAogICAgdGltZXMgPSBbXQogICAgZm9yIHgsIHkgaW4gemlwKHRhYmxlLnZhbHVlcywgeV9saXN0KToKICAgICAgICBjb3VudCArPSAxCiAgICAgICAgZXZlbnRfZGF0YSA9IGpzb24uZHVtcHMoeyJpbnB1dHMiOlt4LnRvbGlzdCgpXX0pCiAgICAgICAgaGFkX2VyciA9IEZhbHNlCiAgICAgICAgdHJ5OgogICAgICAgICAgICBzdGFydCA9IGRhdGV0aW1lLm5vdygpCiAgICAgICAgICAgIHJlc3AgPSByZXF1ZXN0cy5wdXQoZid7YWRkcn0vdjIvbW9kZWxzL3ttb2RlbH0vaW5mZXInLCBqc29uPWV2ZW50X2Rh
dGEpCiAgICAgICAgICAgIGlmIG5vdCByZXNwLm9rOgogICAgICAgICAgICAgICAgY29udGV4dC5sb2dnZXIuZXJyb3IoZidiYWQgZnVuY3Rpb24gcmVzcCEhXG57cmVzcC50ZXh0fScpCiAgICAgICAgICAgICAgICBlcnJfY291bnQgKz0gMQogICAgICAgICAgICAgICAgY29udGludWUKICAgICAgICAgICAgdGltZXMuYXBwZW5kKChkYXRldGltZS5ub3coKS1zdGFydCkubWljcm9zZWNvbmRzKQogICAgICAgICAgICAgICAgCiAgICAgICAgZXhjZXB0IE9TRXJyb3IgYXMgZXJyOgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5lcnJvcihmJ2Vycm9yIGluIHJlcXVlc3QsIGRhdGE6e2V2ZW50X2RhdGF9LCBlcnJvcjoge2Vycn0nKQogICAgICAgICAgICBlcnJfY291bnQgKz0gMQogICAgICAgICAgICBjb250aW51ZQogICAgICAgIAogICAgICAgIHJlc3BfZGF0YSA9IHJlc3AuanNvbigpCiAgICAgICAgcHJpbnQocmVzcF9kYXRhKQogICAgICAgIHlfcmVzcCA9IHJlc3BfZGF0YVsnb3V0cHV0cyddWzBdCiAgICAgICAgaWYgeSA9PSB5X3Jlc3A6CiAgICAgICAgICAgIG1hdGNoICs9IDEKICAgICAgICAKICAgIGNvbnRleHQubG9nX3Jlc3VsdCgndG90YWxfdGVzdHMnLCBjb3VudCkKICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnZXJyb3JzJywgZXJyX2NvdW50KQogICAgY29udGV4dC5sb2dfcmVzdWx0KCdtYXRjaCcsIG1hdGNoKQogICAgaWYgY291bnQgLSBlcnJfY291bnQgPiAwOgogICAgICAgIHRpbWVzX2FyciA9IG5wLmFycmF5KHRpbWVzKQogICAgICAgIGNvbnRleHQubG9nX3Jlc3VsdCgnYXZnX2xhdGVuY3knLCBpbnQobnAubWVhbih0aW1lc19hcnIpKSkKICAgICAgICBjb250ZXh0LmxvZ19yZXN1bHQoJ21pbl9sYXRlbmN5JywgaW50KG5wLmFtaW4odGltZXNfYXJyKSkpCiAgICAgICAgY29udGV4dC5sb2dfcmVzdWx0KCdtYXhfbGF0ZW5jeScsIGludChucC5hbWF4KHRpbWVzX2FycikpKQogICAgICAgIAogICAgICAgIGNoYXJ0ID0gQ2hhcnRBcnRpZmFjdCgnbGF0ZW5jeScsIGhlYWRlcj1bJ1Rlc3QnLCAnTGF0ZW5jeSAobWljcm9zZWMpJ10pCiAgICAgICAgZm9yIGkgaW4gcmFuZ2UobGVuKHRpbWVzKSk6CiAgICAgICAgICAgIGNoYXJ0LmFkZF9yb3coW2krMSwgaW50KHRpbWVzW2ldKV0pCiAgICAgICAgY29udGV4dC5sb2dfYXJ0aWZhY3QoY2hhcnQpCgogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmJ3J1biB7Y291bnR9IHRlc3RzLCB7ZXJyX2NvdW50fSBlcnJvcnMgYW5kIHttYXRjaH0gbWF0Y2ggZXhwZWN0ZWQgdmFsdWUnKQogICAgCiAgICBpZiBlcnJfY291bnQ6CiAgICAgICAgcmFpc2UgVmFsdWVFcnJvcihmJ2ZhaWxlZCBvbiB7ZXJyX2NvdW50fSB0ZXN0cyBvZiB7Y291bnR9JykKICAgIAogICAgaWYgbWF0Y2hfZXJyIGFuZCBtYXRjaCAhPSBjb3VudDoKICAgICAgICByYWlzZSBWYWx1ZUVycm9yKGYnb25seSB7bWF0Y2h9IHJlc3VsdHMgbWF0Y2ggb3V0IG9mIHtjb3VudH0nKQoK commands: [] - 
code_origin: https://github.com/mlrun/functions.git#0afac753c28f1c4126b841ebea14219700bc9635:v2_model_tester.ipynb + code_origin: https://github.com/Idan707/functions.git#a0e559d5ebff00e1c9b41307200258b507a8201b:v2_model_tester.ipynb diff --git a/v2_model_tester/v2_model_tester.ipynb b/v2_model_tester/v2_model_tester.ipynb index 992970197..ee14e4d73 100644 --- a/v2_model_tester/v2_model_tester.ipynb +++ b/v2_model_tester/v2_model_tester.ipynb @@ -28,7 +28,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "%nuclio: setting spec.image to 'mlrun/mlrun'\n" + "%nuclio: setting spec.image to 'mlrun/mlrun'\n", + "%nuclio: setting spec.maxReplicas to 1\n" ] } ], @@ -45,7 +46,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2020-10-12 14:57:23,715 [warning] Failed resolving version info. Ignoring and using defaults\n" + "> 2020-10-28 17:05:57,889 [warning] Failed resolving version info. Ignoring and using defaults\n" ] } ], @@ -142,53 +143,86 @@ "# marks the end of a code section" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Deploy model server for testing" + ] + }, { "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-10-28 17:05:57,914 [warning] warning!, server (0.5.3-rc1) and client (unstable) ver dont match\n", + "> 2020-10-28 17:05:58,324 [info] deploy started\n", + "[nuclio] 2020-10-28 17:06:08,637 (info) Build complete\n", + "[nuclio] 2020-10-28 17:06:11,672 done updating default-v2-model-server, function address: default-tenant.app.dsteam.iguazio-cd1.com:30984\n", + "> 2020-10-28 17:06:11,679 [warning] warning!, server (0.5.3-rc1) and client (unstable) ver dont match\n" + ] + } + ], "source": [ - "from os import path\n", - "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io\n", - "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", + "import mlrun\n", + "\n", + 
"project_name = 'sk-project'\n", + "DATA_PATH = 'https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv'\n", + "MODEL_PATH = 'https://s3.wasabisys.com/iguazio/models/iris/model.pkl'\n", "\n", - "# specify artifacts target location\n", - "artifact_path = mlconf.artifact_path or path.abspath('./')\n", - "project_name = 'sk-project'" + "artifact_path = mlrun.set_environment(api_path = 'http://mlrun-api:8080',\n", + " artifact_path = os.path.abspath('./'))\n", + "\n", + "fn = mlrun.import_function('hub://v2_model_server')\n", + "fn.add_model('mymodel', model_path=MODEL_PATH)\n", + "address = fn.deploy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run model server tester locally" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2020-10-12 15:02:30,095 [info] starting run model_server_tester uid=e99145f5d1c2477397cda3bae2724743 -> http://mlrun-api:8080\n", - "> 2020-10-12 15:02:30,305 [info] testing with dataset against http://3.128.234.166:30913, model: iris_dataset_v1\n", - "{'id': '3ea79e20-c4ea-445f-8f78-64e052a92cfd', 'model_name': 'iris_dataset_v1', 'outputs': [0]}\n", - "{'id': 'd2e58ccf-8c79-4e83-9ac1-78b53faf85e4', 'model_name': 'iris_dataset_v1', 'outputs': [0]}\n", - "{'id': 'd6f221c3-b7ec-46ff-b5ef-e26e08338ef1', 'model_name': 'iris_dataset_v1', 'outputs': [1]}\n", - "{'id': '8a3f1460-8192-499e-8ba5-75f28a558125', 'model_name': 'iris_dataset_v1', 'outputs': [0]}\n", - "{'id': '9f15a3e0-b53e-4689-bba6-b489c79f24b6', 'model_name': 'iris_dataset_v1', 'outputs': [0]}\n", - "{'id': 'abdd1480-cdad-429d-b466-73c957aa0f90', 'model_name': 'iris_dataset_v1', 'outputs': [2]}\n", - "{'id': 'b4fb0135-e455-46db-93e5-c4d7cb95e8ab', 'model_name': 'iris_dataset_v1', 'outputs': [2]}\n", - "{'id': 'ac111d8e-79ab-4699-a2a3-b266c2921d82', 'model_name': 'iris_dataset_v1', 'outputs': [2]}\n", - "{'id': 
'f881fbab-7575-44a1-987b-7470544ccf55', 'model_name': 'iris_dataset_v1', 'outputs': [0]}\n", - "{'id': '592bf961-65e5-429a-8e92-64bc1220b724', 'model_name': 'iris_dataset_v1', 'outputs': [0]}\n", - "{'id': 'a3627ceb-6651-4a67-a145-64c7594b10b4', 'model_name': 'iris_dataset_v1', 'outputs': [1]}\n", - "{'id': '82d42332-263a-4754-8da2-7d8fcce933f7', 'model_name': 'iris_dataset_v1', 'outputs': [1]}\n", - "{'id': '4ff18f8d-ffd0-488b-99b2-a0293952a342', 'model_name': 'iris_dataset_v1', 'outputs': [2]}\n", - "{'id': '4f9262d1-ea82-4cf8-bfb0-9ee9312f8a39', 'model_name': 'iris_dataset_v1', 'outputs': [0]}\n", - "{'id': '1b1d356b-296e-4382-9fe5-17789a950131', 'model_name': 'iris_dataset_v1', 'outputs': [1]}\n", - "{'id': 'a5d9df09-d4ff-4dc8-835f-7c9eb0e789bb', 'model_name': 'iris_dataset_v1', 'outputs': [0]}\n", - "{'id': '987fc45c-788f-456b-822d-d8a2e4391d84', 'model_name': 'iris_dataset_v1', 'outputs': [2]}\n", - "{'id': '765390a6-afdf-48c4-aa52-12ec50dd4fe2', 'model_name': 'iris_dataset_v1', 'outputs': [2]}\n", - "{'id': '79df5728-f025-4d21-bf7c-a5ea46390e83', 'model_name': 'iris_dataset_v1', 'outputs': [2]}\n", - "{'id': '6ea455e4-f5be-4afb-9c01-08bcadcc54d4', 'model_name': 'iris_dataset_v1', 'outputs': [1]}\n", - "> 2020-10-12 15:02:30,710 [info] run 20 tests, 0 errors and 8 match expected value\n" + "> 2020-10-28 17:06:11,735 [warning] warning!, server (0.5.3-rc1) and client (unstable) ver dont match\n", + "> 2020-10-28 17:06:11,736 [info] starting run model_server_tester uid=cdab1ab0ee78491aa7112199edd13eee -> http://mlrun-api:8080\n", + "> 2020-10-28 17:06:11,774 [warning] warning!, server (0.5.3-rc1) and client (unstable) ver dont match\n", + "> 2020-10-28 17:06:11,926 [info] testing with dataset against http://default-tenant.app.dsteam.iguazio-cd1.com:30984, model: mymodel\n", + "{'id': 'ff2b8b28-a577-41f7-9903-10d30b5e5ce1', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': 'de0ebcde-584b-47a1-a3a2-58b71d5dfd6e', 'model_name': 'mymodel', 'outputs': [2]}\n", + 
"{'id': 'b5b48278-1923-4b22-afa0-125d168cf8f2', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': 'adc5fa66-d375-4284-956d-258cfccf7ffd', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': '34d7aec0-8c00-4a46-beff-3d3f230f1eb6', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': '3b54ba8d-2dd9-4f1b-b047-d7dd0795d700', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': 'c1dc99d8-1ee8-49e1-bb29-6d358c6fe4df', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': 'cfb68667-c73e-4099-bc86-0621ee5f5b74', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': '4a6ce9d1-5554-4b86-81d4-27f9c2f3d1f4', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': 'e9b92268-1e8a-4027-b22c-1796548c85d3', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': '24856808-39f9-479f-9351-e9a53606b774', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': '4d02d624-123c-492e-a2c5-31fb5666b641', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': '73f24c9c-8b91-49e1-b489-9ce130987923', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': '9814eb33-0dd1-4a77-a88f-bbff7f4d0b6d', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': '125107eb-021f-4bd0-bf49-1ec9eb9c6271', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': 'a1110605-7f83-4944-aec1-f5c0ebe6df11', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': '54683000-a57b-4359-861a-12a07c721e80', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': 'f0966c56-3184-4ad6-897e-05ad3142078c', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': 'b12cf6f0-bf3c-4047-a214-6f7a87fe4bf8', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': 'f8f17071-833f-425f-843b-f83cddbc74a5', 'model_name': 'mymodel', 'outputs': [1]}\n", + "> 2020-10-28 17:06:12,640 [info] run 20 tests, 0 errors and 6 match expected value\n" ] }, { @@ -362,26 +396,26 @@ " \n", " \n", " sk-project\n", - " \n", + " \n", " 0\n", - " Oct 12 15:02:30\n", + " Oct 28 17:06:11\n", " completed\n", " model_server_tester\n", - "
v3io_user=admin
kind=handler
owner=admin
host=jupyter-58d8fdb6fc-nmqbq
\n", + "
v3io_user=admin
kind=handler
owner=admin
host=jupyter-d87678b84-n4lcf
\n", "
table
\n", - "
addr=http://3.128.234.166:30913
model=iris_dataset_v1
\n", - "
total_tests=20
errors=0
match=8
avg_latency=17454
min_latency=12366
max_latency=105585
\n", - "
latency
\n", + "
addr=http://default-tenant.app.dsteam.iguazio-cd1.com:30984
model=mymodel
\n", + "
total_tests=20
errors=0
match=6
avg_latency=32244
min_latency=24955
max_latency=116585
\n", + "
latency
\n", " \n", " \n", "\n", "\n", - "
\n", + "
\n", "
\n", - " Title\n", - " ×\n", + " Title\n", + " ×\n", "
\n", - " \n", + " \n", "
\n", "
\n" ], @@ -397,19 +431,24 @@ "output_type": "stream", "text": [ "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run e99145f5d1c2477397cda3bae2724743 --project sk-project , !mlrun logs e99145f5d1c2477397cda3bae2724743 --project sk-project\n", - "> 2020-10-12 15:02:30,772 [info] run executed, status=completed\n" + "!mlrun get run cdab1ab0ee78491aa7112199edd13eee --project sk-project , !mlrun logs cdab1ab0ee78491aa7112199edd13eee --project sk-project\n", + "> 2020-10-28 17:06:12,724 [info] run executed, status=completed\n" ] } ], "source": [ - "# run the function locally (parameters must be set !!)\n", - "addr = 'http://3.128.234.166:30913'\n", - "data_path = 'https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv'\n", - "gen = run_local(name='model_server_tester', handler=model_server_tester, \n", - " params={'addr': addr, 'model': 'iris_dataset_v1'},\n", - " inputs={'table': data_path},\n", - " project=project_name, artifact_path=path.join(artifact_path, 'data')) " + "gen = mlrun.run_local(name='model_server_tester', handler=model_server_tester, \n", + " params={'addr': address, 'model': 'mymodel'},\n", + " inputs={'table': DATA_PATH},\n", + " project=project_name, \n", + " artifact_path=os.path.join(artifact_path, 'data')) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save" ] }, { @@ -421,13 +460,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2020-10-07 09:53:23,062 [info] function spec saved to path: function.yaml\n" + "> 2020-10-28 17:06:21,163 [info] function spec saved to path: function.yaml\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 7, @@ -436,29 +475,298 @@ } ], "source": [ - "from mlrun import code_to_function\n", - "test_func = code_to_function(name='v2_model_tester', kind='job', handler=\"model_server_tester\",\n", - " description=\"test v2 model servers\",\n", - " categories=[\"ml\", \"test\"],\n", - " labels={\"author\": \"yaronh\"},\n", - " 
code_output='.')\n", + "test_func = mlrun.code_to_function(name='v2_model_tester', \n", + " kind='job', \n", + " handler=\"model_server_tester\",\n", + " description=\"test v2 model servers\",\n", + " categories=[\"ml\", \"test\"],\n", + " labels={\"author\": \"yaronh\"},\n", + " code_output='.')\n", "\n", "test_func.export('function.yaml')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run remotely" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2020-10-28 17:06:21,179 [warning] warning!, server (0.5.3-rc1) and client (unstable) ver dont match\n", + "> 2020-10-28 17:06:21,180 [info] starting run model_server_tester uid=daaed7fcf79242e2b9e0c3a5148c64d1 -> http://mlrun-api:8080\n", + "> 2020-10-28 17:06:21,307 [info] Job is running in the background, pod: model-server-tester-k2s98\n", + "> 2020-10-28 17:06:24,160 [info] testing with dataset against http://default-tenant.app.dsteam.iguazio-cd1.com:30984, model: mymodel\n", + "{'id': 'b918f1e7-e376-490a-91a9-d603a8d05c48', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': 'f73b94f2-0191-497b-9a0d-b6e31f53753d', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': '113a1873-f82e-4c1e-8eb0-e8f93035c841', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': 'bf9ccb23-0d77-4f12-b00b-13ed6770c062', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': 'acab9cdd-5c10-415d-9658-002b38c8a350', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': 'a8f39660-e9c5-4d64-bdde-d0427fbadf0d', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': '6d76b01b-aa4a-47de-b0e1-79075981abca', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': '1e2ba61f-da83-42ae-a5eb-0b0be0a1a89b', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': 'f1c7a6a7-e706-44fb-b838-f7ceeac2862f', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': 
'a4889639-520c-48b6-bf68-8f4b09442429', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': '0b1968ff-0a1c-4c6c-8aed-74916d3e3496', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': '51ba3eee-c377-4cd7-8373-52b77d5f03a4', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': 'd7033e1e-fc77-47e0-bc96-6d3964ab0911', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': '42125174-8eb0-4619-b253-f62c0cac6eb8', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': '84584afd-5d35-4e92-bb0b-b72745e89c42', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': '0be7b84c-5c75-4b5b-b524-5c158ab1cfe7', 'model_name': 'mymodel', 'outputs': [2]}\n", + "{'id': '8b350f47-5d1b-4e69-af8f-245beb87782c', 'model_name': 'mymodel', 'outputs': [0]}\n", + "{'id': 'e0a59fde-7fd9-43ec-8794-d90174b12b65', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': '5f787830-a716-42fe-b57d-049299f77cc0', 'model_name': 'mymodel', 'outputs': [1]}\n", + "{'id': '58603b5b-5c06-4889-924d-5073f64fde06', 'model_name': 'mymodel', 'outputs': [2]}\n", + "> 2020-10-28 17:06:24,742 [info] run 20 tests, 0 errors and 5 match expected value\n", + "> 2020-10-28 17:06:24,766 [info] run executed, status=completed\n", + "final state: completed\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
sk-project0Oct 28 17:06:24completedmodel_server_tester
v3io_user=admin
kind=job
owner=admin
host=model-server-tester-k2s98
table
addr=http://default-tenant.app.dsteam.iguazio-cd1.com:30984
model=mymodel
total_tests=20
errors=0
match=5
avg_latency=26616
min_latency=25060
max_latency=31181
latency
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run daaed7fcf79242e2b9e0c3a5148c64d1 --project sk-project , !mlrun logs daaed7fcf79242e2b9e0c3a5148c64d1 --project sk-project\n", + "> 2020-10-28 17:06:27,426 [info] run executed, status=completed\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_func.run(mlrun.NewTask(name='model_server_tester', \n", + " handler=model_server_tester, \n", + " params={'addr': address, 'model': 'mymodel'},\n", + " inputs={'table': DATA_PATH},\n", + " project=project_name, \n", + " artifact_path=os.path.join(artifact_path, 'data')))" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:root] *", "language": "python", - "name": "python3" + "name": "conda-root-py" }, "language_info": { "codemirror_mode": {