Commit 57d1ad7

Working with Spark 3/Java 11/Python 3.8
1 parent 0723af2 commit 57d1ad7

6 files changed (+18, -13 lines)

Dockerfile-loader (+3, -3)

@@ -1,4 +1,4 @@
-FROM python:3.6-stretch
+FROM python:3.8-buster
 MAINTAINER TweetSets <[email protected]>

 ADD requirements.txt /opt/tweetsets/
@@ -7,7 +7,7 @@ RUN pip install -r requirements.txt
 RUN grep elasticsearch-dsl requirements.txt | xargs pip install -t dependencies

 RUN apt-get update && \
-    apt-get install -y openjdk-8-jre-headless \
+    apt-get install -y openjdk-11-jre-headless \
     ca-certificates-java \
     zip -y

@@ -22,7 +22,7 @@ ADD spark_utils.py /opt/tweetsets/
 ADD tweetsets_schema.json /opt/tweetsets/
 ADD tweetsets_sql_exp.sql /opt/tweetsets
 ADD setup.py /opt/tweetsets/
-ADD elasticsearch-hadoop-7.9.2.jar /opt/tweetsets/elasticsearch-hadoop.jar
+ADD elasticsearch-spark-30_2.12-7.13.4.jar /opt/tweetsets/elasticsearch-hadoop.jar
 ADD tweetset_cli.py /opt/tweetsets/

 RUN python setup.py bdist_egg
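The jar swap above is required because the elasticsearch-hadoop 7.9.2 connector predates Spark 3; elasticsearch-spark-30_2.12 is the build compiled against Spark 3 and Scala 2.12. A minimal sketch of attaching the connector at its renamed container path, assuming the loader wires it in through spark.jars (it may equally use spark-submit --jars; this wiring is an assumption, not code from this commit):

# Hypothetical illustration: attach the Spark 3 / Scala 2.12 connector
# build at the path the Dockerfile ADDs it to.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName('TweetSets')
         .config('spark.jars', '/opt/tweetsets/elasticsearch-hadoop.jar')
         .getOrCreate())

Renaming the jar to elasticsearch-hadoop.jar inside the image keeps downstream references stable across connector upgrades.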

Dockerfile-spark (+7, -8)

@@ -1,4 +1,4 @@
-FROM python:3.6-stretch
+FROM python:3.8-buster
 MAINTAINER TweetSets <[email protected]>

 # Based on https://hub.docker.com/r/gettyimages/spark/~/dockerfile/
@@ -24,15 +24,14 @@ ENV PYTHONIOENCODING UTF-8
 ENV PIP_DISABLE_PIP_VERSION_CHECK 1

 # JAVA
-ARG JAVA_MAJOR_VERSION=8
-ARG JAVA_UPDATE_VERSION=131
+ARG JAVA_MAJOR_VERSION=11
+ARG JAVA_UPDATE_VERSION=11+9
 ARG JAVA_BUILD_NUMBER=11
-ENV JAVA_HOME /usr/jdk1.${JAVA_MAJOR_VERSION}.0_${JAVA_UPDATE_VERSION}
+ENV JAVA_HOME /usr/jdk-${JAVA_MAJOR_VERSION}.0.${JAVA_UPDATE_VERSION}

 ENV PATH $PATH:$JAVA_HOME/bin
 RUN curl -sL --retry 3 --insecure \
-    --header "Cookie: oraclelicense=accept-securebackup-cookie;" \
-    "http://download.oracle.com/otn-pub/java/jdk/${JAVA_MAJOR_VERSION}u${JAVA_UPDATE_VERSION}-b${JAVA_BUILD_NUMBER}/d54c1d3a095b4ff2b6607d096fa80163/server-jre-${JAVA_MAJOR_VERSION}u${JAVA_UPDATE_VERSION}-linux-x64.tar.gz" \
+    "https://github.com/AdoptOpenJDK/openjdk11-binaries/releases/download/jdk-11.0.11+9/OpenJDK11U-jdk_x64_linux_hotspot_11.0.11_9.tar.gz" \
     | gunzip \
     | tar x -C /usr/ \
     && ln -s $JAVA_HOME /usr/java \
@@ -51,13 +50,13 @@ RUN curl -sL --retry 3 \
     && chown -R root:root $HADOOP_HOME

 # SPARK
-ENV SPARK_VERSION 2.4.7
+ENV SPARK_VERSION 3.1.2
 ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-without-hadoop
 ENV SPARK_HOME /usr/spark-${SPARK_VERSION}
 ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop/*:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/tools/lib/*"
 ENV PATH $PATH:${SPARK_HOME}/bin
 RUN curl -sL --retry 3 \
-    "https://archive.apache.org/dist/spark/spark-2.4.7/spark-2.4.7-bin-without-hadoop.tgz" \
+    "https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-without-hadoop.tgz" \
     | gunzip \
     | tar x -C /usr/ \
     && mv /usr/$SPARK_PACKAGE $SPARK_HOME \
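With both the JDK and Spark swapped out, a short PySpark session inside the rebuilt image can confirm the pairing. A sanity-check sketch (not part of the commit):

# Verify the image runs Spark 3.1.2 on a Java 11 runtime.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('version-check').getOrCreate()
print(spark.version)  # expect 3.1.2
# The JVM's own report of its version, reached via the py4j gateway
print(spark.sparkContext._jvm.java.lang.System.getProperty('java.version'))
spark.stop()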
Binary file (2 MB) not shown.

requirements.txt (+1, -1)

@@ -25,4 +25,4 @@ six==1.10.0
 vine==1.3.0
 Werkzeug==1.0.1
 twarc==1.12.1
-pyspark==2.4.7
+pyspark==3.1.2

spark_utils.py (+2, -1)

@@ -16,7 +16,8 @@ def make_spark_df(spark, schema, sql, path_to_dataset, dataset_id):
     # Read JSON files as Spark DataFrame
     df = spark.read.schema(schema).json(path_to_dataset)
     # Add the full Tweet JSON as a separate field
-    df = df.withColumn("tweet", F.to_json(F.struct([df[x] for x in df.columns])))
+    # Option for Spark v3 to write null fields as nulls (not skip)
+    df = df.withColumn("tweet", F.to_json(F.struct([df[x] for x in df.columns]), {'ignoreNullFields': 'false'}))
     df.createOrReplaceTempView("tweets")
     # Apply SQL transform
     df = spark.sql(sql)
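Spark 3 changed the default for to_json: null fields inside a struct are dropped from the output rather than serialized as explicit nulls, so without this option the stored tweet documents would silently lose fields. A minimal sketch of the difference (illustration only, not code from the repository):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName('to-json-demo').getOrCreate()
df = spark.createDataFrame([(1, None)], 'id INT, text STRING')

# Spark 3 default: the null field is skipped
print(df.select(F.to_json(F.struct('id', 'text'))).first()[0])  # {"id":1}
# With the option, nulls are written out, matching Spark 2 behavior
print(df.select(F.to_json(F.struct('id', 'text'),
                          {'ignoreNullFields': 'false'})).first()[0])
# {"id":1,"text":null}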

tweetset_loader.py (+5)

@@ -312,6 +312,11 @@ def shard_count(tweet_count, store_tweet=True):
     tweet_index.create()

     spark = SparkSession.builder.appName('TweetSets').getOrCreate()
+    # Make Spark v3 use the v2 time parser
+    # TODO: update the Spark SQL code to use the new time parser
+    spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
+    # Set UTC as the time zone
+    spark.conf.set('spark.sql.session.timeZone', 'UTC')
     try:
         es_conf = {"es.nodes": os.environ.get('ES_HOST', 'elasticsearch'),
                    "es.port": "9200",
