GetStarted_python

Python for CaffeOnSpark

Set up your Python environment if it is not already installed

# Install prefix for the private Python 2.7.10 build.
export IPYTHON_ROOT=~/Python2.7.10 #Change this directory to install elsewhere.
# Download the Python 2.7.10 source tarball, unpack it, and delete the archive.
curl -O https://www.python.org/ftp/python/2.7.10/Python-2.7.10.tgz
tar -xvf Python-2.7.10.tgz
rm Python-2.7.10.tgz
# Configure, build, and install from the source tree into ${IPYTHON_ROOT}.
pushd Python-2.7.10 >/dev/null
./configure --prefix="${IPYTHON_ROOT}"
make
make install
popd >/dev/null
# The unpacked source tree is removed once the install step has run.
rm -rf Python-2.7.10
# Fetch get-pip.py and run it with the freshly installed interpreter.
pushd "${IPYTHON_ROOT}" >/dev/null
curl -O https://bootstrap.pypa.io/get-pip.py
bin/python get-pip.py
rm get-pip.py
# Install the Python packages, one pip invocation per package.
bin/pip install "ipython[notebook]"
bin/pip install numpy
bin/pip install matplotlib
bin/pip install pandas
# NOTE(review): "p4j" may be a typo for "py4j" — confirm the intended package.
bin/pip install p4j
# Archive the contents of ${IPYTHON_ROOT} as Python.zip inside that directory.
zip -r Python.zip *
popd >/dev/null
# Change to the directory that contains the Python installation.
pushd "${IPYTHON_ROOT}/.." >/dev/null
# Put the new interpreter first on PATH. ${PATH} is deliberately left
# unescaped: an escaped \${PATH} would store the literal text "${PATH}" and
# discard every directory that is currently on the search path.
export PATH="${IPYTHON_ROOT}/bin:${PATH}"

Submit Python Script

# Environment for submitting the CaffeOnSpark Python example with spark-submit.
# CAFFE_ON_SPARK, SPARK_HOME, MASTER_URL, TOTAL_CORES and LD_LIBRARY_PATH are
# read below and are expected to be set already.
export IPYTHON_ROOT=~/Python2.7.10
export PYSPARK_PYTHON="${IPYTHON_ROOT}/bin/python"
export PYTHONPATH="${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip:${SPARK_HOME}/python/lib/pyspark.zip"
export PATH="${SPARK_HOME}/bin:${IPYTHON_ROOT}/bin/:${PATH}"
# Work from the data directory: the script path given to spark-submit below is
# relative to it, and the Python API zip is unpacked here.
pushd "${CAFFE_ON_SPARK}/data/"
unzip "${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip"
# Every expansion is quoted so a path containing spaces stays a single
# argument and an unset variable cannot swallow the option that follows it.
spark-submit --master "${MASTER_URL}" \
  --driver-library-path "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar" \
  --driver-class-path "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar" \
  --conf "spark.cores.max=${TOTAL_CORES}" \
  --conf "spark.driver.extraLibraryPath=${LD_LIBRARY_PATH}" \
  --conf "spark.executorEnv.LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" \
  --py-files "${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip" \
  --files "${CAFFE_ON_SPARK}/data/caffe/_caffe.so,${CAFFE_ON_SPARK}/data/caffe/lenet_memory_solver.prototxt,${CAFFE_ON_SPARK}/data/caffe/lenet_memory_train_test.prototxt" \
  --jars "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar" \
  --conf spark.pythonargs="-conf lenet_memory_solver.prototxt -model file:///tmp/lenet.model -features accuracy,ip1,ip2 -label label -output file:///tmp/output -devices 1 -outputFormat json" \
  examples/MultiClassLogisticRegression.py

Launch Python Interactive Shell

# Environment for an interactive pyspark session with the CaffeOnSpark Python
# API available. CAFFE_ON_SPARK, SPARK_HOME, MASTER_URL, TOTAL_CORES and
# LD_LIBRARY_PATH are read below and are expected to be set already.
export IPYTHON_ROOT=~/Python2.7.10
export PYSPARK_PYTHON="${IPYTHON_ROOT}/bin/python"
export PYTHONPATH="${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip:${SPARK_HOME}/python/lib/pyspark.zip"
export PATH="${SPARK_HOME}/bin:${IPYTHON_ROOT}/bin/:${PATH}"
# Unpack the Python API zip into the data directory and start the shell there.
pushd "${CAFFE_ON_SPARK}/data/"
unzip "${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip"
# IPYTHON=1 is set only for this pyspark invocation. Every expansion is quoted
# so a path containing spaces stays a single argument.
IPYTHON=1 pyspark --master "${MASTER_URL}" \
  --driver-library-path "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar" \
  --driver-class-path "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar" \
  --conf "spark.cores.max=${TOTAL_CORES}" \
  --conf "spark.driver.extraLibraryPath=${LD_LIBRARY_PATH}" \
  --conf "spark.executorEnv.LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" \
  --py-files "${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip" \
  --files "${CAFFE_ON_SPARK}/data/caffe/_caffe.so,${CAFFE_ON_SPARK}/data/caffe/lenet_memory_solver.prototxt,${CAFFE_ON_SPARK}/data/caffe/lenet_memory_train_test.prototxt" \
  --jars "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar"

Run examples

You can also run a simple logistic regression from the shell prompt, as shown below.

# Meant to be pasted into the pyspark shell; `sc` and `sqlContext` are not
# defined here and are expected to be provided by that shell.
from com.yahoo.ml.caffe.RegisterContext import registerContext
from com.yahoo.ml.caffe.RegisterContext import registerSQLContext
# The classes used below are not defined by the snippet itself, so they are
# imported explicitly; without these lines the first use raises NameError.
from com.yahoo.ml.caffe.CaffeOnSpark import CaffeOnSpark
from com.yahoo.ml.caffe.Config import Config
from com.yahoo.ml.caffe.DataSource import DataSource
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
registerContext(sc)
registerSQLContext(sqlContext)
cos=CaffeOnSpark(sc,sqlContext)
cfg=Config(sc)
# NOTE(review): Python does not expand '~' in string literals; if this path is
# not resolved downstream, replace it with an absolute path — confirm.
cfg.protoFile='~/CaffeOnSpark/data/lenet_dataframe_solver.prototxt'
cfg.modelPath = 'file:/tmp/lenet.model'
cfg.label = 'label'
cfg.outputPath = 'file:outputlenet'
cfg.devices = 1
cfg.outputFormat = 'json'
cfg.isTest = True
cfg.clusterSize = 1
# NOTE(review): the second argument presumably selects the training (True) or
# test (False) data layer — confirm against DataSource.getSource.
dl_train_source = DataSource(sc).getSource(cfg,True)
#Train
cos.train(dl_train_source)
lr_raw_source = DataSource(sc).getSource(cfg,False)
#Extract features
extracted_df = cos.features(lr_raw_source)
# Do multiclass LogisticRegression
# Each row becomes a LabeledPoint: the first element of `label` is the class,
# and the `ip1` column is the dense feature vector.
# NOTE(review): if this runs on Spark 2.x, DataFrame has no .map and
# extracted_df.rdd.map would be needed — confirm the target Spark version.
data = extracted_df.map(lambda row: LabeledPoint(row.label[0], Vectors.dense(row.ip1)))
lr = LogisticRegressionWithLBFGS.train(data, numClasses=10, iterations=10)
predictions = lr.predict(data.map(lambda pt : pt.features))

IPythonNotebook

Generate data for demo notebook

This step is required only if you want to run the sample notebook shipped with the code at ${CAFFE_ON_SPARK}/data/examples/DLDemo.ipynb. If you do not want to run the demo notebook, skip to the next section.

# Remove any previous output before regenerating it. ${CAFFE_ON_SPARK:?} makes
# the command fail with an error instead of targeting /data/... when the
# variable is unset or empty.
rm -rf -- "${CAFFE_ON_SPARK:?}/data/mnist_train_dataframe"
# Run the LMDB2DataFrame tool on mnist_train_lmdb and write the result in
# parquet format to mnist_train_dataframe. Expansions are quoted so paths
# containing spaces stay single arguments.
spark-submit --master "${MASTER_URL}" \
             --conf "spark.cores.max=${TOTAL_CORES}" \
             --conf "spark.driver.extraLibraryPath=${LD_LIBRARY_PATH}" \
             --conf "spark.executorEnv.LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" \
             --class com.yahoo.ml.caffe.tools.LMDB2DataFrame \
             "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar" \
             -imageRoot "file:${CAFFE_ON_SPARK}/data/mnist_train_lmdb" \
             -lmdb_partitions 10 \
             -outputFormat parquet \
             -output "file:${CAFFE_ON_SPARK}/data/mnist_train_dataframe"


# Remove any previous output before regenerating it. ${CAFFE_ON_SPARK:?} makes
# the command fail with an error instead of targeting /data/... when the
# variable is unset or empty.
rm -rf -- "${CAFFE_ON_SPARK:?}/data/mnist_test_dataframe"
# Run the LMDB2DataFrame tool on mnist_test_lmdb and write the result in
# parquet format to mnist_test_dataframe. Expansions are quoted so paths
# containing spaces stay single arguments.
spark-submit --master "${MASTER_URL}" \
             --conf "spark.cores.max=${TOTAL_CORES}" \
             --conf "spark.driver.extraLibraryPath=${LD_LIBRARY_PATH}" \
             --conf "spark.executorEnv.LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" \
             --class com.yahoo.ml.caffe.tools.LMDB2DataFrame \
             "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar" \
             -imageRoot "file:${CAFFE_ON_SPARK}/data/mnist_test_lmdb" \
             -lmdb_partitions 10 \
             -outputFormat parquet \
             -output "file:${CAFFE_ON_SPARK}/data/mnist_test_dataframe"

Make sure that your ${CAFFE_ON_SPARK}/data/caffe/lenet_dataframe_train_test.prototxt is updated to point to ${CAFFE_ON_SPARK}/data/mnist_train_dataframe for training and ${CAFFE_ON_SPARK}/data/mnist_test_dataframe for testing.

Launch IPythonNotebook

# Options handed to IPython: start the notebook server without opening a
# browser, bound to this machine's host name. $(...) replaces the legacy
# backtick form of command substitution.
export IPYTHON_OPTS="notebook --no-browser --ip=$(hostname)"
# CAFFE_ON_SPARK, SPARK_HOME, MASTER_URL, TOTAL_CORES and LD_LIBRARY_PATH are
# read below and are expected to be set already.
export IPYTHON_ROOT=~/Python2.7.10
export PYSPARK_PYTHON="${IPYTHON_ROOT}/bin/python"
export PYTHONPATH="${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip:${SPARK_HOME}/python/lib/pyspark.zip"
export PATH="${SPARK_HOME}/bin:${IPYTHON_ROOT}/bin/:${PATH}"
# Unpack the Python API zip into the data directory and launch from there.
pushd "${CAFFE_ON_SPARK}/data/"
unzip "${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip"
# IPYTHON=1 is set only for this pyspark invocation. Every expansion is quoted
# so a path containing spaces stays a single argument.
IPYTHON=1 pyspark --master "${MASTER_URL}" \
  --driver-library-path "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar" \
  --driver-class-path "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar" \
  --conf "spark.driver.extraLibraryPath=${LD_LIBRARY_PATH}" \
  --conf "spark.executorEnv.LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" \
  --py-files "${CAFFE_ON_SPARK}/caffe-grid/target/caffeonsparkpythonapi.zip" \
  --conf "spark.cores.max=${TOTAL_CORES}" \
  --files "${CAFFE_ON_SPARK}/data/caffe/_caffe.so,${CAFFE_ON_SPARK}/data/caffe/lenet_dataframe_solver.prototxt,${CAFFE_ON_SPARK}/data/caffe/lenet_dataframe_train_test.prototxt" \
  --jars "${CAFFE_ON_SPARK}/caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar"

When you run the above, the console will print a URL, which you should open in your browser. There, click examples/DLDemo.ipynb. When executing the notebook, replace the paths of files such as lenet_memory_solver.prototxt and mnist_dataframe_test with the full paths of those files on your system.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly