48 changes: 47 additions & 1 deletion marvin_python_toolbox/management/engine.py
@@ -320,6 +320,51 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data
server.stop(0)


@cli.command('engine-dockerbuild', help='Builds a docker image containing the engine. Requires docker service running in the host machine.')
@click.option(
    '--type',
    '-t',
    type=click.Choice(['spark', 'base']),
    default='spark',
    help='What image type to build. Example: marvin with spark.',
)
@click.option('--tag', '-tg', default='marvinai/marvin', help='Image tag to be used.')
@click.option('--version', '-v', help="Image version to be used.")
def build_docker(type, tag, version):
    buildTypes = {
        "spark": {
            "folder": "marvin-spark-docker"
        },
        "base": {
            "folder": "marvin-base-docker"
        }
    }
    if version is None:
        version = VERSION
    print("Will generate a package with the engine in order to build the docker image.")
    command_tar = ['tar', '-cf', 'engine.tar', '.']
    run_command(command_tar, "Failed to generate tar file.")

    docker_folder = buildTypes[type]["folder"]
    print("Will move the package to the docker folder.")
    command_mv = ['mv', 'engine.tar', 'docker/{0}/'.format(docker_folder)]
    run_command(command_mv, "Failed to move the package to docker folder.")

    print("Building docker image.")
    tag = "{0}-{1}".format(tag, type)
    command = ['docker', 'build', '-t', '{0}:{1}'.format(tag, version), 'docker/{0}/'.format(docker_folder)]
    run_command(command, "Failed to build docker image.")

    print("Successfully built docker image with tag {0}. To start the engine-httpserver with docker, run: 'docker run -it {0}:{1}'.".format(tag, version))


def run_command(command, error_message="A failure occurred."):
    try:
        # Treat a non-zero exit status as a failure so error_message is actually reported.
        exit_code = subprocess.Popen(command, env=os.environ).wait()
        if exit_code != 0:
            raise RuntimeError(error_message)
    except Exception:
        logger.exception(error_message)
        sys.exit(1)

TEMPLATE_BASES = {
    'python-engine': os.path.join(os.path.dirname(__file__), 'templates', 'python-engine')
}
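Note: a minimal usage sketch of the new command, assuming the toolbox exposes the `marvin` CLI entry point and that it is invoked from a directory containing both the engine sources and the `docker/` folder (the tar is created from the current directory):

```bash
# Hypothetical invocation; builds marvinai/marvin-spark:0.0.1 via the new command.
marvin engine-dockerbuild --type spark --tag marvinai/marvin --version 0.0.1
```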
@@ -336,7 +381,6 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data

_orig_type = type


@cli.command('engine-generateenv', help='Generate a new marvin engine environment and install default requirements.')
@click.argument('engine-path', type=click.Path(exists=True))
def generate_env(engine_path):
@@ -544,6 +588,7 @@ def _call_git_init(dest):
default='all',
type=click.Choice(['all', 'acquisitor', 'tpreparator', 'trainer', 'evaluator', 'ppreparator', 'predictor']),
help='Marvin engine action name')
@click.option('--model-protocol', '-mp', help='Model protocol to be loaded. Useful for loading a previous trained model.', type=click.Path(exists=True))
@click.option('--initial-dataset', '-id', help='Initial dataset file path', type=click.Path(exists=True))
@click.option('--dataset', '-d', help='Dataset file path', type=click.Path(exists=True))
@click.option('--model', '-m', help='Engine model file path', type=click.Path(exists=True))
@@ -584,6 +629,7 @@ def engine_httpserver(ctx, action, params_file, initial_dataset, dataset,
'-DmarvinConfig.engineHome={}'.format(ctx.obj['config']['inidir']),
'-DmarvinConfig.ipAddress={}'.format(http_host),
'-DmarvinConfig.port={}'.format(http_port),
'-DmarvinConfig.modelProtocol={}'.format(model_protocol),
'-jar',
executor_path])
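Note: the new `--model-protocol` option is forwarded to the executor JVM as `-DmarvinConfig.modelProtocol`. A hypothetical invocation (the protocol path is illustrative and must exist, since the option is declared with `click.Path(exists=True)`):

```bash
# Illustrative only: serve predictions from a previously trained model protocol.
marvin engine-httpserver --action all --model-protocol /opt/marvin/data/my_engine.protocol
```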

@@ -0,0 +1,31 @@
FROM debian:jessie

RUN echo "deb http://http.debian.net/debian jessie-backports main" >> /etc/apt/sources.list \
&& apt-get update \
&& apt-get install -y software-properties-common curl wget \
&& apt-get install -yt jessie-backports openjdk-8-jdk \
&& apt-get install -y git \
&& apt-get -qy install python2.7-dev python-pip ipython libsasl2-dev gcc \
&& apt-get -qy install libffi-dev \
&& apt-get -qy install libssl-dev \
&& apt-get -qy install libxml2-dev libxslt1-dev \
&& apt-get -qy install libpng12-dev libfreetype6-dev \
&& apt-get -qy install python-tk

#Engines will run using the user marvin
RUN useradd --create-home -s /bin/bash -G sudo marvin

##Install virtualenv & update pip
ENV WORKON_HOME /home/marvin/.virtualenvs
RUN pip install -q virtualenvwrapper \
&& echo 'source /usr/local/bin/virtualenvwrapper.sh' >> /home/marvin/.profile \
&& mkdir -p /opt/marvin/data \
&& pip install --upgrade pip

#Take ownership of needed folders
RUN chown -R marvin:marvin /opt

ENV MARVIN_DATA_PATH /opt/marvin/data

USER marvin
WORKDIR /home/marvin
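Note: this base image corresponds to the `base` build type above. A build sketch, assuming the Dockerfile lives in `docker/marvin-base-docker/` as referenced by `buildTypes` (version illustrative):

```bash
# Tag follows the {tag}-{type}:{version} scheme used by engine-dockerbuild.
docker build -t marvinai/marvin-base:0.0.1 docker/marvin-base-docker/
```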
@@ -0,0 +1,29 @@
FROM marvinai/marvin-base

USER root

ADD http://archive.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.6.tgz /opt/

#Unpack tgzs
RUN ls -l /opt \
&& mv /opt/spark-2.1.1-bin-hadoop2.6 /opt/spark

#Add configuration files
ADD spark-conf/* /opt/spark/conf/

ADD engine.tar /opt/engine

COPY virtualenv_entrypoint.sh /opt/engine

RUN chown marvin:marvin -R /opt/engine

USER marvin

ENV SPARK_HOME /opt/spark
ENV HADOOP_CONF_DIR /opt/spark/conf
ENV MARVIN_HOME /opt/engine

RUN cd /opt/engine \
&& bash -c 'source /usr/local/bin/virtualenvwrapper.sh && mkvirtualenv engine-env && setvirtualenvproject && make marvin'

ENTRYPOINT "/opt/engine/virtualenv_entrypoint.sh"
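Note: this image builds `FROM marvinai/marvin-base`, so the base image must be available locally (or pullable) before the Spark image is built, and `engine.tar` must already sit in the build context, which is what `engine-dockerbuild` automates. A hypothetical run, matching the hint printed by `build_docker`:

```bash
# Illustrative tag/version; starts the engine entrypoint script inside the container.
docker run -it marvinai/marvin-spark:0.0.1
```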
@@ -0,0 +1,9 @@
#!/bin/bash
if [ -z "$1" ]
then
echo "You must specify the version of the image being built"
exit 1
fi
docker build -t registry.b2w.io/b2wdigital/predictionio-b2w:"$1" .
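Note: the script takes the image version as its single argument; a usage sketch (version illustrative):

```bash
./build.sh 0.0.1
```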


@@ -0,0 +1,31 @@
<?xml version="1.0"?>

<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<allocations>
<pool name="production">
<schedulingMode>FAIR</schedulingMode>
<weight>1</weight>
<minShare>2</minShare>
</pool>
<pool name="test">
<schedulingMode>FIFO</schedulingMode>
<weight>2</weight>
<minShare>3</minShare>
</pool>
</allocations>
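Note: Spark only reads this allocation file when fair scheduling is enabled. A hedged sketch using the standard `spark.scheduler.mode` and `spark.scheduler.allocation.file` properties (the application script name is illustrative); jobs are then routed to a pool at runtime with `sc.setLocalProperty("spark.scheduler.pool", "production")`:

```bash
# Illustrative submit; loads the "production" and "test" pools defined above.
/opt/spark/bin/spark-submit \
  --conf spark.scheduler.mode=FAIR \
  --conf spark.scheduler.allocation.file=/opt/spark/conf/fairscheduler.xml \
  my_app.py
```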
@@ -0,0 +1,40 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=WARN

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
@@ -0,0 +1,170 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# syntax: [instance].sink|source.[name].[options]=[value]

# This file configures Spark's internal metrics system. The metrics system is
# divided into instances which correspond to internal components.
# Each instance can be configured to report its metrics to one or more sinks.
# Accepted values for [instance] are "master", "worker", "executor", "driver",
# and "applications". A wildcard "*" can be used as an instance name, in
# which case all instances will inherit the supplied property.
#
# Within an instance, a "source" specifies a particular set of grouped metrics.
# there are two kinds of sources:
# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will
# collect a Spark component's internal state. Each instance is paired with a
# Spark source that is added automatically.
# 2. Common sources, like JvmSource, which will collect low level state.
# These can be added through configuration options and are then loaded
# using reflection.
#
# A "sink" specifies where metrics are delivered to. Each instance can be
# assigned one or more sinks.
#
# The sink|source field specifies whether the property relates to a sink or
# source.
#
# The [name] field specifies the name of source or sink.
#
# The [options] field is the specific property of this source or sink. The
# source or sink is responsible for parsing this property.
#
# Notes:
# 1. To add a new sink, set the "class" option to a fully qualified class
# name (see examples below).
# 2. Some sinks involve a polling period. The minimum allowed polling period
# is 1 second.
# 3. Wildcard properties can be overridden by more specific properties.
# For example, master.sink.console.period takes precedence over
# *.sink.console.period.
# 4. A metrics specific configuration
# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be
# added to Java properties using -Dspark.metrics.conf=xxx if you want to
# customize metrics system. You can also put the file in ${SPARK_HOME}/conf
# and it will be loaded automatically.
# 5. The MetricsServlet sink is added by default as a sink in the master,
# worker and driver, and you can send HTTP requests to the "/metrics/json"
# endpoint to get a snapshot of all the registered metrics in JSON format.
# For master, requests to the "/metrics/master/json" and
# "/metrics/applications/json" endpoints can be sent separately to get
# metrics snapshots of the master instance and applications. This
# MetricsServlet does not have to be configured.

## List of available common sources and their properties.

# org.apache.spark.metrics.source.JvmSource
# Note: Currently, JvmSource is the only available common source.
# It can be added to an instance by setting the "class" option to its
# fully qualified class name (see examples below).

## List of available sinks and their properties.

# org.apache.spark.metrics.sink.ConsoleSink
# Name: Default: Description:
# period 10 Poll period
# unit seconds Unit of the poll period

# org.apache.spark.metrics.sink.CSVSink
# Name: Default: Description:
# period 10 Poll period
# unit seconds Unit of the poll period
# directory /tmp Where to store CSV files

# org.apache.spark.metrics.sink.GangliaSink
# Name: Default: Description:
# host NONE Hostname or multicast group of the Ganglia server,
# must be set
# port NONE Port of the Ganglia server(s), must be set
# period 10 Poll period
# unit seconds Unit of the poll period
# ttl 1 TTL of messages sent by Ganglia
# dmax 0 Lifetime in seconds of metrics (0 never expired)
# mode multicast Ganglia network mode ('unicast' or 'multicast')

# org.apache.spark.metrics.sink.JmxSink

# org.apache.spark.metrics.sink.MetricsServlet
# Name: Default: Description:
# path VARIES* Path prefix from the web server root
# sample false Whether to show entire set of samples for histograms
# ('false' or 'true')
#
# * Default path is /metrics/json for all instances except the master. The
# master has two paths:
# /metrics/applications/json # App information
# /metrics/master/json # Master information

# org.apache.spark.metrics.sink.GraphiteSink
# Name: Default: Description:
# host NONE Hostname of the Graphite server, must be set
# port NONE Port of the Graphite server, must be set
# period 10 Poll period
# unit seconds Unit of the poll period
# prefix EMPTY STRING Prefix to prepend to every metric's name
# protocol tcp Protocol ("tcp" or "udp") to use

## Examples
# Enable JmxSink for all instances by class name
#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink

# Enable ConsoleSink for all instances by class name
#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink

# Polling period for the ConsoleSink
#*.sink.console.period=10
# Unit of the polling period for the ConsoleSink
#*.sink.console.unit=seconds

# Polling period for the ConsoleSink specific for the master instance
#master.sink.console.period=15
# Unit of the polling period for the ConsoleSink specific for the master
# instance
#master.sink.console.unit=seconds

# Enable CsvSink for all instances by class name
#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink

# Polling period for the CsvSink
#*.sink.csv.period=1
# Unit of the polling period for the CsvSink
#*.sink.csv.unit=minutes

# Polling directory for CsvSink
#*.sink.csv.directory=/tmp/

# Polling period for the CsvSink specific for the worker instance
#worker.sink.csv.period=10
# Unit of the polling period for the CsvSink specific for the worker instance
#worker.sink.csv.unit=minutes

# Enable Slf4jSink for all instances by class name
#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink

# Polling period for the Slf4JSink
#*.sink.slf4j.period=1
# Unit of the polling period for the Slf4jSink
#*.sink.slf4j.unit=minutes

# Enable JvmSource for instance master, worker, driver and executor
#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource

#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource

#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource

#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource