Skip to content

Commit 5110cb1

Browse files
added explode_arrays_into_rows
1 parent 47df926 commit 5110cb1

File tree

4 files changed

+77
-0
lines changed

4 files changed

+77
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
PySpark – Split an array column into rows
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/Users/mparsian/spark-3.2.1/bin/spark-submit explode_arrays_into_rows.py
2+
3+
root
4+
|-- name: string (nullable = true)
5+
|-- age: string (nullable = true)
6+
|-- languages: array (nullable = true)
7+
| |-- element: string (containsNull = true)
8+
9+
+-----+---+--------------------+
10+
| name|age| languages|
11+
+-----+---+--------------------+
12+
| Rafa| 20| [SQL, NoSQL]|
13+
| Alex| 21| [Ada, SQL, Java]|
14+
| Jane| 22|[Fortran, Cobol, ...|
15+
|Maria| 23| []|
16+
+-----+---+--------------------+
17+
18+
root
19+
|-- name: string (nullable = true)
20+
|-- age: string (nullable = true)
21+
|-- col: string (nullable = true)
22+
23+
+----+---+-------+
24+
|name|age| col|
25+
+----+---+-------+
26+
|Rafa| 20| SQL|
27+
|Rafa| 20| NoSQL|
28+
|Alex| 21| Ada|
29+
|Alex| 21| SQL|
30+
|Alex| 21| Java|
31+
|Jane| 22|Fortran|
32+
|Jane| 22| Cobol|
33+
|Jane| 22| R|
34+
|Jane| 22| C++|
35+
+----+---+-------+
36+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from pyspark.sql import SparkSession
2+
from pyspark.sql.functions import *
3+
4+
#-------------------------------------------------
5+
# PySpark – Split an array column into rows
6+
#-------------------------------------------------
7+
8+
# Obtain the active SparkSession, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Input rows: (name, age, languages). `languages` is a list of strings and
# is empty for the last row.
sample_data = [
    ("Rafa", "20", ["SQL", "NoSQL"]),
    ("Alex", "21", ["Ada", "SQL", "Java"]),
    ("Jane", "22", ["Fortran", "Cobol", "R", "C++"]),
    ("Maria", "23", []),
]

# Column names, in the same order as the tuple fields above.
column_names = ["name", "age", "languages"]

# Build the source DataFrame, then print its schema and contents.
df = spark.createDataFrame(data=sample_data, schema=column_names)
df.printSchema()
df.show()

# Turn every element of the `languages` array into its own row, repeating
# `name` and `age` alongside it. The generated column gets the default
# name `col`; a row whose array is empty contributes no output rows.
df2 = df.select("name", "age", explode("languages"))

# Print the schema and contents of the exploded DataFrame.
df2.printSchema()
df2.show()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
PySpark – Split an array column into rows

0 commit comments

Comments
 (0)