-
Notifications
You must be signed in to change notification settings - Fork 475
/
Copy pathpyspark-session-2019-04-16.txt
executable file
·98 lines (92 loc) · 3.37 KB
/
pyspark-session-2019-04-16.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ pwd
/Users/mparsian/spark-2.4.0
mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ls -l
-rw-r--r--@ 1 mparsian 897801646 21357 Oct 28 23:36 LICENSE
-rw-r--r--@ 1 mparsian 897801646 42919 Oct 28 23:36 NOTICE
drwxr-xr-x@ 3 mparsian 897801646 96 Oct 28 23:36 R
-rw-r--r--@ 1 mparsian 897801646 3952 Oct 28 23:36 README.md
-rw-r--r--@ 1 mparsian 897801646 156 Oct 28 23:36 RELEASE
drwxr-xr-x@ 29 mparsian 897801646 928 Oct 28 23:36 bin
drwxr-xr-x@ 9 mparsian 897801646 288 Oct 28 23:36 conf
drwxr-xr-x@ 5 mparsian 897801646 160 Oct 28 23:36 data
drwxr-xr-x@ 4 mparsian 897801646 128 Oct 28 23:36 examples
drwxr-xr-x@ 227 mparsian 897801646 7264 Oct 28 23:36 jars
drwxr-xr-x@ 4 mparsian 897801646 128 Oct 28 23:36 kubernetes
drwxr-xr-x@ 48 mparsian 897801646 1536 Oct 28 23:36 licenses
drwxr-xr-x 16 mparsian 897801646 512 Mar 25 12:29 logs
drwxr-xr-x@ 19 mparsian 897801646 608 Oct 28 23:36 python
drwxr-xr-x@ 24 mparsian 897801646 768 Oct 28 23:36 sbin
drwxr-xr-x 2 mparsian 897801646 64 Jan 8 03:00 work
drwxr-xr-x@ 3 mparsian 897801646 96 Oct 28 23:36 yarn
mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ls -l bin
total 224
-rwxr-xr-x@ 1 mparsian 897801646 1089 Oct 28 23:36 beeline
-rw-r--r--@ 1 mparsian 897801646 1064 Oct 28 23:36 beeline.cmd
-rwxr-xr-x@ 1 mparsian 897801646 5427 Oct 28 23:36 docker-image-tool.sh
-rwxr-xr-x@ 1 mparsian 897801646 1933 Oct 28 23:36 find-spark-home
-rw-r--r--@ 1 mparsian 897801646 2681 Oct 28 23:36 find-spark-home.cmd
-rw-r--r--@ 1 mparsian 897801646 1892 Oct 28 23:36 load-spark-env.cmd
-rw-r--r--@ 1 mparsian 897801646 2025 Oct 28 23:36 load-spark-env.sh
-rwxr-xr-x@ 1 mparsian 897801646 2987 Oct 28 23:36 pyspark
...
mparsian@Mahmouds-MacBook ~/spark-2.4.0 $ ./bin/pyspark
Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
[Clang 6.0 (clang-600.0.57)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.0
      /_/
Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
SparkSession available as 'spark'.
>>> spark
<pyspark.sql.session.SparkSession object at 0x111140860>
>>>
>>>
>>>
>>> data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
>>> data
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
>>>
>>> rdd = spark.sparkContext.parallelize(data)
>>> rdd.count()
12
>>> rdd.collect()
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
>>> rdd.getNumPartitions()
8
>>> rdd2 = spark.sparkContext.parallelize(data, 3)
>>> rdd2.collect()
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
>>> rdd2.getNumPartitions()
3
>>> rdd3 = rdd.map(lambda x : x+100)
>>> rdd3.collect()
[101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112]
>>>
>>> def myfun(x):
...     return x+100
...
>>>
>>>
>>> y = myfun(4)
>>> y
104
>>> z = myfun(60)
>>> z
160
>>> rdd4 = rdd.map(myfun)
>>> rdd4.collect()
[101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112]
>>> rdd5 = rdd.map(lambda x: (x, 1))
>>> rdd5.collect()
[(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]
>>> rdd2.collect()
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
>>> N = rdd.reduce(lambda x, y: x+y)
>>> N
78
>>> exit()