Skip to content

Commit 9b5711f

Browse files
added DataFrames Tutorials
1 parent a6081a3 commit 9b5711f

File tree

3 files changed

+761
-0
lines changed

3 files changed

+761
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
# Spark DataFrame Tutorial:
2+
# Creating Dataframes from Python Collections
3+
4+
Author: Mahmoud Parsian
5+
6+
Date: July 17, 2022
7+
8+
---------------------------
9+
10+
Tutorial Description:
11+
12+
This is a basic Tutorial on Spark
13+
DataFrames using PySpark. It shows
14+
how to create Spark DataFrames from
15+
Python collections (lists of tuples).
16+
17+
---------------------------
18+
19+
1. Operating system command prompt begins with `$`
20+
2. Operating system comments begin with `$#`
21+
3. PySpark shell comments begin with `>>>#`
22+
4. PySpark shell commands begin with `>>>`
23+
24+
---------------------------
25+
26+
# Invoke PySpark Shell
27+
28+
Note that /Users/mparsian/spark-3.3.0 is my
29+
installed Spark directory (you need to change this accordingly)
30+
31+
32+
$ cd /Users/mparsian/spark-3.3.0
33+
$ ./bin/pyspark
34+
>>>
35+
Welcome to
36+
____ __
37+
/ __/__ ___ _____/ /__
38+
_\ \/ _ \/ _ `/ __/ '_/
39+
/__ / .__/\_,_/_/ /_/\_\ version 3.3.0
40+
/_/
41+
42+
43+
>>># spark is a SparkSession object created by PySpark shell
44+
>>># let's check spark
45+
>>> spark
46+
<pyspark.sql.session.SparkSession object at 0x10c85a710>
47+
48+
>>> spark.version
49+
'3.3.0'
50+
51+
# Create a Python collection as `data`
52+
>>> # create a Python collection as data
53+
>>> data = [
54+
55+
('alex', 20, 12000),
56+
('jane', 30, 45000),
57+
('rafa', 40, 56000),
58+
('ted', 30, 145000),
59+
('xo2', 10, 1332000),
60+
('mary', 44, 555000)
61+
]
62+
63+
>>> # examine/display data
64+
>>> data
65+
[
66+
('alex', 20, 12000),
67+
('jane', 30, 45000),
68+
('rafa', 40, 56000),
69+
('ted', 30, 145000),
70+
('xo2', 10, 1332000),
71+
('mary', 44, 555000)
72+
]
73+
74+
# Create a DataFrame and perform some queries
75+
76+
>>># define column names
77+
>>> column_names = ['name', 'age', 'salary']
78+
79+
>>> # examine/display column_names
80+
>>> column_names
81+
['name', 'age', 'salary']
82+
83+
>>> # create a DataFrame as df from Python collection
84+
>>> df = spark.createDataFrame(data, column_names)
85+
>>>
86+
>>> # inspect created DataFrame
87+
>>> df
88+
DataFrame[name: string, age: bigint, salary: bigint]
89+
90+
>>> # inspect created DataFrame's Schema
91+
>>> df.printSchema()
92+
root
93+
|-- name: string (nullable = true)
94+
|-- age: long (nullable = true)
95+
|-- salary: long (nullable = true)
96+
97+
>>> # display the first 20 rows of a DataFrame
98+
>>> df.show()
99+
+----+---+-------+
100+
|name|age| salary|
101+
+----+---+-------+
102+
|alex| 20| 12000|
103+
|jane| 30| 45000|
104+
|rafa| 40| 56000|
105+
| ted| 30| 145000|
106+
| xo2| 10|1332000|
107+
|mary| 44| 555000|
108+
+----+---+-------+
109+
110+
>>> # count the number of rows
111+
>>> df.count()
112+
6
113+
114+
# Register your DataFrame as a Table
115+
>>> # Creates or replaces a local temporary view with this DataFrame
116+
>>> df.createOrReplaceTempView("people")
117+
118+
119+
# Run SQL queries using defined Table
120+
121+
>>> df2 = spark.sql("select * from people where salary > 67000")
122+
>>> df2.show()
123+
+----+---+-------+
124+
|name|age| salary|
125+
+----+---+-------+
126+
| ted| 30| 145000|
127+
| xo2| 10|1332000|
128+
|mary| 44| 555000|
129+
+----+---+-------+
130+
131+
>>> df3 = spark.sql("select * from people where salary > 67000 and age > 11")
132+
>>> df3.show()
133+
+----+---+------+
134+
|name|age|salary|
135+
+----+---+------+
136+
| ted| 30|145000|
137+
|mary| 44|555000|
138+
+----+---+------+
139+
140+
141+
>>> df4 = spark.sql("select * from people")
142+
>>> df4.show()
143+
+----+---+-------+
144+
|name|age| salary|
145+
+----+---+-------+
146+
|alex| 20| 12000|
147+
|jane| 30| 45000|
148+
|rafa| 40| 56000|
149+
| ted| 30| 145000|
150+
| xo2| 10|1332000|
151+
|mary| 44| 555000|
152+
+----+---+-------+
153+
154+
>>> # cross join: or cartesian product
155+
>>> cart = spark.sql("select * from people p1, people p2")
156+
>>> cart.show()
157+
+----+---+------+----+---+-------+
158+
|name|age|salary|name|age| salary|
159+
+----+---+------+----+---+-------+
160+
|alex| 20| 12000|alex| 20| 12000|
161+
|alex| 20| 12000|jane| 30| 45000|
162+
|alex| 20| 12000|rafa| 40| 56000|
163+
|alex| 20| 12000| ted| 30| 145000|
164+
|alex| 20| 12000| xo2| 10|1332000|
165+
|alex| 20| 12000|mary| 44| 555000|
166+
|jane| 30| 45000|alex| 20| 12000|
167+
|jane| 30| 45000|jane| 30| 45000|
168+
|jane| 30| 45000|rafa| 40| 56000|
169+
|jane| 30| 45000| ted| 30| 145000|
170+
|jane| 30| 45000| xo2| 10|1332000|
171+
|jane| 30| 45000|mary| 44| 555000|
172+
|rafa| 40| 56000|alex| 20| 12000|
173+
|rafa| 40| 56000|jane| 30| 45000|
174+
|rafa| 40| 56000|rafa| 40| 56000|
175+
|rafa| 40| 56000| ted| 30| 145000|
176+
|rafa| 40| 56000| xo2| 10|1332000|
177+
|rafa| 40| 56000|mary| 44| 555000|
178+
| ted| 30|145000|alex| 20| 12000|
179+
| ted| 30|145000|jane| 30| 45000|
180+
+----+---+------+----+---+-------+
181+
only showing top 20 rows
182+
183+
>>> cart
184+
DataFrame[name: string,
185+
age: bigint,
186+
salary: bigint,
187+
name: string,
188+
age: bigint,
189+
salary: bigint]
190+
>>>
191+
192+
>>> # cross join: or cartesian product
193+
>>> cart2 = spark.sql("select p1.name as name, p2.age as age, p1.salary as salary, p2.name as name2, p2.age as age2, p2.salary as salary2 from people p1, people p2")
194+
>>> cart2.show()
195+
+----+---+------+-----+----+-------+
196+
|name|age|salary|name2|age2|salary2|
197+
+----+---+------+-----+----+-------+
198+
|alex| 20| 12000| alex| 20| 12000|
199+
|alex| 30| 12000| jane| 30| 45000|
200+
|alex| 40| 12000| rafa| 40| 56000|
201+
|alex| 30| 12000| ted| 30| 145000|
202+
|alex| 10| 12000| xo2| 10|1332000|
203+
|alex| 44| 12000| mary| 44| 555000|
204+
|jane| 20| 45000| alex| 20| 12000|
205+
|jane| 30| 45000| jane| 30| 45000|
206+
|jane| 40| 45000| rafa| 40| 56000|
207+
|jane| 30| 45000| ted| 30| 145000|
208+
|jane| 10| 45000| xo2| 10|1332000|
209+
|jane| 44| 45000| mary| 44| 555000|
210+
|rafa| 20| 56000| alex| 20| 12000|
211+
|rafa| 30| 56000| jane| 30| 45000|
212+
|rafa| 40| 56000| rafa| 40| 56000|
213+
|rafa| 30| 56000| ted| 30| 145000|
214+
|rafa| 10| 56000| xo2| 10|1332000|
215+
|rafa| 44| 56000| mary| 44| 555000|
216+
| ted| 20|145000| alex| 20| 12000|
217+
| ted| 30|145000| jane| 30| 45000|
218+
+----+---+------+-----+----+-------+
219+
only showing top 20 rows
220+
221+
>>>
222+
>>> cart2
223+
DataFrame[name: string, age: bigint, salary: bigint, name2: string, age2: bigint, salary2: bigint]

0 commit comments

Comments
 (0)