tutorial/pyspark-examples/rdds/pyspark-session-2019-10-09.txt

/spark-2.4.4 $ ./bin/pyspark
Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
[Clang 6.0 (clang-600.0.57)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
19/10/09 18:57:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
SparkSession available as 'spark'.
>>>
>>>
>>>
>>>
>>> numbers = [1, 2, 3, 1, 2, 3, 4, 4, 5, 6]
>>> numbers
[1, 2, 3, 1, 2, 3, 4, 4, 5, 6]
>>> rdd = spark.sparkContext.parallelize(numbers)
>>> rdd.collect()
[1, 2, 3, 1, 2, 3, 4, 4, 5, 6]
>>> rdd.count()
10
>>> rdd2 = rdd.filter(lambda x : x > 3)
>>> rdd2.collect()
[4, 4, 5, 6]
>>>
>>>
>>> def custom_filter(x):
...     if x > 3:
...        return True
...     else:
...        return False
... ^D
>>>
>>> x = custom_filter(10)
>>> x
True
>>> x = custom_filter(2)
>>> x
False
>>> rdd3 = rdd.filter(custom_filter)
>>> rdd3.collect()
[4, 4, 5, 6]
>>> rdd2.collect()
[4, 4, 5, 6]
>>>
>>>
>>> data = [('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2),('B', 7)]
>>> data
[('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)]
>>>
>>> rdd = spark.sparkContext.parallelize(data)
>>> rdd.collect()
[('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)]
>>>
>>>
>>>
>>>
>>> total = rdd.reduceByKey(lambda x, y: x+y)
>>> total.collect()
[('B', 9), ('A', 14)]
>>>
>>>
>>> rdd.collect()
[('A', 2), ('A', 3), ('A', 4), ('A', 5), ('B', 2), ('B', 7)]
>>> grouped = rdd.groupByKey()
>>> grouped.collect()
[
 ('B', <pyspark.resultiterable.ResultIterable object at 0x114ef18d0>), 
 ('A', <pyspark.resultiterable.ResultIterable object at 0x114ef19e8>)
]
>>> grouped.map(lambda x: (x[0], list(x[1]))).collect()
[('B', [2, 7]), ('A', [2, 3, 4, 5])]
>>> total2 = grouped.map(lambda x: (x[0], sum(x[1])))
>>> total2.collect()
[('B', 9), ('A', 14)]
>>>

>>>
>>> spark
<pyspark.sql.session.SparkSession object at 0x11848eda0>
>>> numbers = [-1, 2, 3, -55, 88, 99, -99, 66, 777]
>>> numbers
[-1, 2, 3, -55, 88, 99, -99, 66, 777]
>>> rdd = spark.sparkContext.parallelize(numbers)
>>> rdd.collect()
[-1, 2, 3, -55, 88, 99, -99, 66, 777]
>>>
>>> positives = rdd.filter(lambda x : x > 0)
>>> positives.collect()
[2, 3, 88, 99, 66, 777]
>>>
>>> negatives = rdd.filter(lambda x : x < 0)
>>> negatives.collect()
[-1, -55, -99]
>>> def keep_positives(n):
...     if (n > 0):
...        return True
...     else:
...        return False
... ^D
>>>
>>> a = keep_positives(100)
>>> a
True
>>> a = keep_positives(-9)
>>> a
False
>>> pos2 = rdd.filter(keep_positives)
>>> pos2.collect()
[2, 3, 88, 99, 66, 777]
>>> pos2222 = pos2.filter(lambda x : True)
>>> pos2222.collect()
[2, 3, 88, 99, 66, 777]
>>>
>>>
>>> pairs = [('A', 2), ('A', 3), ('A', 4),('A', 5), ('A', 6), ('B', 10), ('B', 2)]
>>> pairs
[('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)]
>>>
>>>
>>> rdd = spark.sparkContext.parallelize(pairs)
>>> rdd.collect()
[('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)]
>>> totals = rdd.reduceByKey(lambda a, b : a+b)
>>> result = totals.collect()
>>> result
[('B', 12), ('A', 20)]
>>>
>>>
>>> rdd.collect()
[('A', 2), ('A', 3), ('A', 4), ('A', 5), ('A', 6), ('B', 10), ('B', 2)]
>>> grouped = rdd.groupByKey()
>>> grouped.collect()
[
 ('B', <pyspark.resultiterable.ResultIterable object at 0x1184f3b70>), 
 ('A', <pyspark.resultiterable.ResultIterable object at 0x1184f3c88>)
]
>>>
>>> grouped.map(lambda x: (x[0], list(x[1]))).collect()
[('B', [10, 2]), ('A', [2, 3, 4, 5, 6])]
>>>
>>> sum2 = grouped.map(lambda x: (x[0], sum(x[1])))
>>> sum2.collect()
[('B', 12), ('A', 20)]
>>>