diff --git a/Spark-IPython4-32bit/README.md b/Spark-IPython4-32bit/README.md
new file mode 100644
index 0000000..782fdbe
--- /dev/null
+++ b/Spark-IPython4-32bit/README.md
@@ -0,0 +1,45 @@
+# Spark-IPython-32bit
+
+Set up a single-node Vagrant VM with Spark, IPython, and matplotlib, based on the ubuntu/trusty32 box
+
+### Content
+
+Vagrantfile - VM definition, port forwarding, and provisioning hook
+apache-mirror-selector.py - Script to help select an Apache mirror to download from
+ipython-pyspark.py - IPython notebook config and launch script
+provision_spark_node.sh - Vagrant provisioning script
+
+### Prerequisites
+
+Vagrant http://docs.vagrantup.com/v2/installation/index.html
+VirtualBox https://www.virtualbox.org/wiki/Downloads as a provider
+
+### Preparation
+
+ - Go to the local directory (where the Vagrantfile is) and run `vagrant up`
+ - Vagrant then prepares the VM: roughly 2 minutes to download the core VM (aka "box") and another 4 minutes for the remaining downloads and provisioning. An Internet connection is required.
+ - The Spark distribution is downloaded automatically during the provisioning phase
+ - The IPython notebook is downloaded and configured during provisioning, and it is launched with PySpark as the very last step
+ - To connect to the IPython notebook, use http://localhost:1088. To see the [SparkContext web UI](https://spark.apache.org/docs/latest/monitoring.html), use http://localhost:4040. Port forwarding is configured in the Vagrantfile.
+ - If needed, use `vagrant ssh` to connect to the VM
+
+### Start/Stop
+
+#### IPython
+
+`vagrant ssh`
+
+stop: Ctrl-C to break
+start:
+`$ sudo su -`
+`$ ./ipython-pyspark.py`
+
+Connect to [http://localhost:1088](http://localhost:1088)
+
+#### Spark
+
+Connect to [http://localhost:4040](http://localhost:4040) for the Spark UI (Driver)
+
+### Data transfer
+
+Vagrant supports a "mapped directory": the host directory containing the Vagrantfile is mapped to `/vagrant` in the VM, so any file placed there can be accessed from within the VM (use `vagrant ssh` to connect)
diff --git a/Spark-IPython4-32bit/Vagrantfile b/Spark-IPython4-32bit/Vagrantfile
new file mode 100644
index 0000000..3c62ad7
--- /dev/null
+++ b/Spark-IPython4-32bit/Vagrantfile
@@ -0,0 +1,24 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+
+# Configuration parameters
+boxRam = 2048   # RAM in MB
+boxCpus = 2     # Number of CPU cores
+
+ipythonPort = 1088   # IPython port to forward (also set in the IPython notebook config)
+
+Vagrant.configure(2) do |config|
+  config.vm.define "sparkvm" do |master|
+    master.vm.box = "ubuntu/trusty32"
+    master.vm.network :forwarded_port, host: ipythonPort, guest: ipythonPort  # IPython port (set in notebook config)
+    master.vm.network :forwarded_port, host: 4040, guest: 4040                # Spark UI (Driver)
+    master.vm.hostname = "sparkvm"
+
+    master.vm.provider :virtualbox do |v|
+      v.name = master.vm.hostname.to_s
+      v.customize ["modifyvm", :id, "--memory", "#{boxRam}"]
+      v.customize ["modifyvm", :id, "--cpus", "#{boxCpus}"]
+    end
+    master.vm.provision :shell, :path => "provision_spark_node.sh"
+  end
+end
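Once the notebook is reachable on the forwarded port, a cell like the following can confirm that Spark and inline matplotlib are wired up correctly. This is a minimal sketch, assuming the PySpark driver has created the usual `sc` SparkContext in the notebook session:

```python
# Paste into a notebook cell at http://localhost:1088.
# `sc` is the SparkContext created automatically by the PySpark driver.
import matplotlib.pyplot as plt

print(sc.version)                      # Spark version, e.g. 1.3.1
rdd = sc.parallelize(range(1000))
print(rdd.map(lambda x: x * 2).sum())  # 999000 if the local executors are healthy

# A trivial plot to confirm the 'inline' matplotlib backend is active
plt.hist(rdd.collect(), bins=20)
plt.show()
```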
diff --git a/Spark-IPython4-32bit/apache-mirror-selector.py b/Spark-IPython4-32bit/apache-mirror-selector.py
new file mode 100644
index 0000000..70aef8f
--- /dev/null
+++ b/Spark-IPython4-32bit/apache-mirror-selector.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+# https://github.com/y-higuchi/apache-mirror-selector
+
+import sys, argparse
+from urllib2 import urlopen
+from json import loads
+
+class UsageOnErrorParser(argparse.ArgumentParser):
+    def error(self, message):
+        sys.stderr.write('argument error: %s\n' % message)
+        self.print_help()
+        sys.exit(2)
+
+parser = UsageOnErrorParser(description='Print preferred Apache mirror URL.')
+parser.add_argument('url', type=str, help='Apache mirror selector url.')
+
+args = parser.parse_args()
+
+jsonurl = args.url + '&asjson=1'
+
+body = urlopen(jsonurl).read().decode('utf-8')
+mirrors = loads(body)
+print(mirrors['preferred'] + mirrors['path_info'])
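The script relies on `closer.cgi` returning JSON with a `preferred` mirror base URL and a `path_info` field when `&asjson=1` is appended; it prints their concatenation as a direct download URL. A more defensive variant with a timeout and a fallback could look like the sketch below; the fallback to archive.apache.org is an assumption, not part of the original script:

```python
#!/usr/bin/env python
# Sketch: mirror lookup with a timeout and a fallback (Python 2, like the
# original). The archive.apache.org fallback URL is an assumption.
import sys
from json import loads
from urllib2 import urlopen, URLError

SELECTOR = 'http://www.apache.org/dyn/closer.cgi?path=spark/spark-1.3.1/spark-1.3.1-bin-hadoop2.6.tgz'
FALLBACK = 'https://archive.apache.org/dist/spark/spark-1.3.1/spark-1.3.1-bin-hadoop2.6.tgz'

try:
    mirrors = loads(urlopen(SELECTOR + '&asjson=1', timeout=10).read().decode('utf-8'))
    print(mirrors['preferred'] + mirrors['path_info'])
except (URLError, KeyError, ValueError):
    sys.stderr.write('mirror lookup failed, using the archive instead\n')
    print(FALLBACK)
```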
"c.IPKernelApp.matplotlib = 'inline'") + kernel_file = open(ipython_kernel_config_path, 'w') + kernel_file.write(new_kernel_config) + kernel_file.close() + os.chmod(ipython_kernel_config_path, 0600) + + if not os.path.exists(ipython_notebook_config_path) or 'open_browser = False' not in open(ipython_notebook_config_path).read(): + print 'Writing IPython Notebook config\n' + config_file = open(ipython_notebook_config_path, 'w') + config_file.write(ipython_notebook_config_template.format(ip = ip, port = port)) + config_file.close() + os.chmod(ipython_notebook_config_path, 0600) + + print 'Launching PySpark with IPython Notebook\n' + cmd = 'pyspark %s' % pyspark_submit_args + os.system(cmd) + sys.exit(0) +except KeyboardInterrupt: + print 'Aborted\n' + sys.exit(1) diff --git a/Spark-IPython4-32bit/provision_spark_node.sh b/Spark-IPython4-32bit/provision_spark_node.sh new file mode 100644 index 0000000..9f4f6d1 --- /dev/null +++ b/Spark-IPython4-32bit/provision_spark_node.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +echo "== start vm provisioning $(date +'%Y/%m/%d %H:%M:%S')" +STARTTIME=$(date +%s) + +sudo apt-get update && sudo apt-get -y upgrade + +# OpenJDK 1.7.0_79 +sudo apt-get -y install openjdk-7-jre-headless + +# Set JAVA_HOME +java -version +echo '' >> /etc/profile +echo '# set JAVA_HOME' >> /etc/profile +echo 'export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-i386' >> /etc/profile +export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-i386 +echo "JAVA_HOME=${JAVA_HOME}" + +# Spark +pushd ~ +echo "Getting Spark..." +cp /vagrant/apache-mirror-selector.py ./ +chmod 700 apache-mirror-selector.py +wget -q `./apache-mirror-selector.py http://www.apache.org/dyn/closer.cgi?path=spark/spark-1.3.1/spark-1.3.1-bin-hadoop2.6.tgz` +sudo cp ./spark-1.3.1-bin-hadoop2.6.tgz /opt +pushd /opt +sudo tar -xzf spark-* +sudo rm -f spark-*.tgz +cd spark-* +SPARKHOME=$(pwd) +echo '' >> /etc/profile +echo '# set SPARK_HOME and PATH' >> /etc/profile +echo "export SPARK_HOME=${SPARKHOME}" >> /etc/profile +echo 'export PATH=$JAVA_HOME/bin:$SPARK_HOME/bin:$PATH' >> /etc/profile +export SPARK_HOME=$SPARKHOME +export PATH=$JAVA_HOME/bin:$SPARK_HOME/bin:$PATH +echo "SPARK_HOME=${SPARK_HOME}" +popd +rm -f apache-mirror-selector.py +popd + +sudo apt-get -y install pkg-config +sudo apt-get -y install python-pip + +# matplotlib +# required to get freetype, png +sudo apt-get -y install python-matplotlib + +# IPython notebook +sudo apt-get -y install libzmq-dev +# required to get pyzmq +sudo apt-get -y install python-dev +sudo python -m pip install "ipython[notebook]" --upgrade +IPYTHONVER=`ipython -V` +echo "IPython version ${IPYTHONVER}" + +# Start IPython notebook +cd ~ +cp /vagrant/ipython-pyspark.py ~/ +~/ipython-pyspark.py + +echo "== end vm provisioning $(date +'%Y/%m/%d %H:%M:%S')" +echo "== $(($(date +%s) - $STARTTIME)) seconds"