45 changes: 45 additions & 0 deletions Spark-IPython4-32bit/README.md
@@ -0,0 +1,45 @@
# Spark-IPython-32bit

Set up a single-node Spark environment with IPython and matplotlib in an ubuntu/trusty32 Vagrant VM

### Content

- `Vagrantfile` - Vagrant configuration
- `apache-mirror-selector.py` - script to help select an Apache mirror to download from
- `ipython-pyspark.py` - IPython notebook config and launch script
- `provision_spark_node.sh` - Vagrant provisioning script

### Prereq

- Vagrant - http://docs.vagrantup.com/v2/installation/index.html
- VirtualBox - https://www.virtualbox.org/wiki/Downloads (as the provider)

### Preparation

- Go to the local directory containing the `Vagrantfile` and run `vagrant up`
- Vagrant then prepares the VM - roughly 2 min to download the core VM (aka the "box") and another 4 min for the remaining downloads and provisioning - an Internet connection is required to fetch content from the various sources
- The Spark distribution is downloaded automatically during the provisioning phase
- The IPython notebook is downloaded and configured during provisioning, and launched with PySpark as the very last step
- To connect to the IPython notebook, use http://localhost:1088. To see the [SparkContext web UI](https://spark.apache.org/docs/latest/monitoring.html), use http://localhost:4040. Port forwarding is configured in the `Vagrantfile`.
- If needed, use `vagrant ssh` to connect to the VM; a quick smoke test of the Spark install is sketched below this list
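
Once provisioning finishes, one way to confirm the Spark install is healthy is to run one of the bundled examples from inside the VM (a minimal sketch; it assumes the `SPARK_HOME` and `PATH` exports written to `/etc/profile` by the provisioning script are in effect, e.g. after a fresh login, and that the Python examples are present in the distribution):

```sh
# Inside the VM (vagrant ssh), run the bundled Pi estimator on 2 local cores;
# SPARK_HOME is set by provision_spark_node.sh via /etc/profile
spark-submit --master "local[2]" "$SPARK_HOME/examples/src/main/python/pi.py" 10
```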

### Start/Stop

#### IPython

`vagrant ssh`

- stop: press `Ctrl-C` to break
- start:
  - `$ sudo su -`
  - `$ ./ipython-pyspark.py`

Connect to [http://localhost:1088](http://localhost:1088)

#### Spark

Connect to [http://localhost:4040](http://localhost:4040) for the Spark UI (Driver). Note that this UI is only available while a SparkContext is running, i.e. while the PySpark notebook session is up.

### Data transfer

Vagrant supports a "mapped directory": the local directory on the host containing the `Vagrantfile` is mapped to `/vagrant` in the VM. Any file placed there can be accessed from within the VM (use `vagrant ssh` to connect).
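
A short illustration (a sketch; `notes.txt` is a hypothetical file name):

```sh
# On the host, drop a file next to the Vagrantfile:
echo "hello from the host" > notes.txt

# Inside the VM (after vagrant ssh), the same file is visible:
cat /vagrant/notes.txt
```
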
24 changes: 24 additions & 0 deletions Spark-IPython4-32bit/Vagrantfile
@@ -0,0 +1,24 @@
# -*- mode: ruby -*-
# vi: set ft=ruby :

# Configuration parameters
boxRam = 2048 # RAM in MB
boxCpus = 2 # Number of CPU cores

ipythonPort = 1088 # IPython port to forward (also set in the IPython notebook config)

Vagrant.configure(2) do |config|
config.vm.define "sparkvm" do |master|
master.vm.box = "ubuntu/trusty32"
master.vm.network :forwarded_port, host: ipythonPort, guest: ipythonPort # IPython port (set in notebook config)
master.vm.network :forwarded_port, host: 4040, guest: 4040 # Spark UI (Driver)
master.vm.hostname = "sparkvm"

master.vm.provider :virtualbox do |v|
v.name = master.vm.hostname.to_s
v.customize ["modifyvm", :id, "--memory", "#{boxRam}"]
v.customize ["modifyvm", :id, "--cpus", "#{boxCpus}"]
end
master.vm.provision :shell, :path => "provision_spark_node.sh"
end
end
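
If `boxRam`, `boxCpus`, or `ipythonPort` are changed after the VM has been created, the new values take effect the next time the VM is restarted through Vagrant (a note based on standard Vagrant behavior; provisioning does not re-run unless explicitly requested):

```sh
# Apply Vagrantfile changes (RAM, CPUs, ports) to an existing VM:
vagrant reload sparkvm

# Re-run the provisioning script too, if needed:
vagrant reload sparkvm --provision
```
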
24 changes: 24 additions & 0 deletions Spark-IPython4-32bit/apache-mirror-selector.py
@@ -0,0 +1,24 @@
#! /usr/bin/env python

# https://github.com/y-higuchi/apache-mirror-selector

import sys, argparse
from urllib2 import urlopen
from json import loads

class UsageOnErrorParser(argparse.ArgumentParser):
def error(self, message):
sys.stderr.write('argument error: %s\n' % message)
self.print_help()
sys.exit(2)

parser = UsageOnErrorParser(description='Print preferred Apache mirror URL.')
parser.add_argument('url', type=str, help='Apache mirror selector url.')

args = parser.parse_args()

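# Note: the script relies on closer.cgi returning a JSON document (including
# the preferred mirror and the requested path) when asjson=1 is appended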
jsonurl = args.url + '&asjson=1'

body = urlopen(jsonurl).read().decode('utf-8')
mirrors = loads(body)
print(mirrors['preferred'] + mirrors['path_info'])
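
A sample invocation of the mirror selector (using the same URL the provisioning script passes; the mirror host in the output will vary):

```sh
./apache-mirror-selector.py 'http://www.apache.org/dyn/closer.cgi?path=spark/spark-1.3.1/spark-1.3.1-bin-hadoop2.6.tgz'
# prints something like:
#   http://some-mirror.example.org/spark/spark-1.3.1/spark-1.3.1-bin-hadoop2.6.tgz
```
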
106 changes: 106 additions & 0 deletions Spark-IPython4-32bit/ipython-pyspark.py
@@ -0,0 +1,106 @@
#!/usr/bin/env python

# https://github.com/felixcheung/vagrant-projects

import getpass
import glob
import inspect
import os
import platform
import re
import subprocess
import sys
import time

#-----------------------
# PySpark
#

master = 'local[*]'

num_executors = 12 # e.g. 24 on a larger host
executor_cores = 2
executor_memory = '1g' # e.g. '10g' on a larger host

pyspark_submit_args = os.getenv('PYSPARK_SUBMIT_ARGS', None)
if not pyspark_submit_args:
pyspark_submit_args = '--num-executors %d --executor-cores %d --executor-memory %s' % (num_executors, executor_cores, executor_memory)
pyspark_submit_args = '--master %s %s' % (master, pyspark_submit_args)

if not os.getenv('PYSPARK_PYTHON', None):
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = 'ipython' # PySpark Driver (i.e. IPython)
profile_name = 'pyspark'

# IPython 4 no longer uses profiles for the notebook, so pass the config file explicitly
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'notebook --config=~/.ipython/profile_%s/ipython_notebook_config.py' % profile_name

#-----------------------
# IPython Notebook
#

ipython_notebook_config_template = '''c = get_config()
c.NotebookApp.ip = '{ip}'
c.NotebookApp.port = {port}
c.NotebookApp.open_browser = False
'''

pyspark_setup_template = '''import os
if not os.getenv('PYSPARK_SUBMIT_ARGS', None):
raise ValueError('PYSPARK_SUBMIT_ARGS environment variable is not set')

spark_home = os.getenv('SPARK_HOME', None)
if not spark_home:
raise ValueError('SPARK_HOME environment variable is not set')
'''

ip = '*' # Warning: this is potentially insecure
port = 1088

#-----------------------
# Create profile and start
#

try:
ipython_profile_path = os.popen('ipython locate').read().rstrip('\n') + '/profile_%s' % profile_name
setup_py_path = ipython_profile_path + '/startup/00-pyspark-setup.py'
ipython_notebook_config_path = ipython_profile_path + '/ipython_notebook_config.py'
ipython_kernel_config_path = ipython_profile_path + '/ipython_kernel_config.py'

if not os.path.exists(ipython_profile_path):
print 'Creating IPython Notebook profile\n'
cmd = 'ipython profile create %s' % profile_name
os.system(cmd)
print '\n'

if not os.path.exists(setup_py_path):
print 'Writing PySpark setup\n'
setup_file = open(setup_py_path, 'w')
setup_file.write(pyspark_setup_template)
setup_file.close()
os.chmod(setup_py_path, 0600)

# matplotlib inline
kernel_config = open(ipython_kernel_config_path).read()
if "c.IPKernelApp.matplotlib = 'inline'" not in kernel_config:
print 'Writing IPython kernel config\n'
new_kernel_config = kernel_config.replace('# c.IPKernelApp.matplotlib = None', "c.IPKernelApp.matplotlib = 'inline'")
kernel_file = open(ipython_kernel_config_path, 'w')
kernel_file.write(new_kernel_config)
kernel_file.close()
os.chmod(ipython_kernel_config_path, 0600)

if not os.path.exists(ipython_notebook_config_path) or 'open_browser = False' not in open(ipython_notebook_config_path).read():
print 'Writing IPython Notebook config\n'
config_file = open(ipython_notebook_config_path, 'w')
config_file.write(ipython_notebook_config_template.format(ip = ip, port = port))
config_file.close()
os.chmod(ipython_notebook_config_path, 0600)

print 'Launching PySpark with IPython Notebook\n'
cmd = 'pyspark %s' % pyspark_submit_args
os.system(cmd)
sys.exit(0)
except KeyboardInterrupt:
print 'Aborted\n'
sys.exit(1)
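
Since the script only builds `PYSPARK_SUBMIT_ARGS` when the variable is not already set, resource sizing can be overridden from the environment without editing the file (values below are illustrative; note that `--master` must then be supplied by hand, as the script does not prepend it to a pre-set value):

```sh
# Launch the notebook with custom Spark sizing (example values):
export PYSPARK_SUBMIT_ARGS='--master local[2] --num-executors 4 --executor-cores 1 --executor-memory 512m'
./ipython-pyspark.py
```
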
63 changes: 63 additions & 0 deletions Spark-IPython4-32bit/provision_spark_node.sh
@@ -0,0 +1,63 @@
#!/bin/bash

echo "== start vm provisioning $(date +'%Y/%m/%d %H:%M:%S')"
STARTTIME=$(date +%s)

sudo apt-get update && sudo apt-get -y upgrade

# OpenJDK 1.7.0_79
sudo apt-get -y install openjdk-7-jre-headless

# Set JAVA_HOME
java -version
echo '' >> /etc/profile
echo '# set JAVA_HOME' >> /etc/profile
echo 'export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-i386' >> /etc/profile
export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-i386
echo "JAVA_HOME=${JAVA_HOME}"

# Spark
pushd ~
echo "Getting Spark..."
cp /vagrant/apache-mirror-selector.py ./
chmod 700 apache-mirror-selector.py
wget -q `./apache-mirror-selector.py http://www.apache.org/dyn/closer.cgi?path=spark/spark-1.3.1/spark-1.3.1-bin-hadoop2.6.tgz`
sudo cp ./spark-1.3.1-bin-hadoop2.6.tgz /opt
pushd /opt
sudo tar -xzf spark-*
sudo rm -f spark-*.tgz
cd spark-*
SPARKHOME=$(pwd)
echo '' >> /etc/profile
echo '# set SPARK_HOME and PATH' >> /etc/profile
echo "export SPARK_HOME=${SPARKHOME}" >> /etc/profile
echo 'export PATH=$JAVA_HOME/bin:$SPARK_HOME/bin:$PATH' >> /etc/profile
export SPARK_HOME=$SPARKHOME
export PATH=$JAVA_HOME/bin:$SPARK_HOME/bin:$PATH
echo "SPARK_HOME=${SPARK_HOME}"
popd
rm -f apache-mirror-selector.py
popd

sudo apt-get -y install pkg-config
sudo apt-get -y install python-pip

# matplotlib
# required to get freetype, png
sudo apt-get -y install python-matplotlib

# IPython notebook
sudo apt-get -y install libzmq-dev
# required to get pyzmq
sudo apt-get -y install python-dev
sudo python -m pip install "ipython[notebook]" --upgrade
IPYTHONVER=`ipython -V`
echo "IPython version ${IPYTHONVER}"

# Start IPython notebook
cd ~
cp /vagrant/ipython-pyspark.py ~/
chmod 700 ~/ipython-pyspark.py # make sure the copy is executable before running it
~/ipython-pyspark.py

echo "== end vm provisioning $(date +'%Y/%m/%d %H:%M:%S')"
echo "== $(($(date +%s) - $STARTTIME)) seconds"