added explode_arrays_into_rows

mahmoudparsian · mahmoudparsian · commit 5110cb11d5ea · 2022-05-18T20:35:14.000-07:00
diff --git a/code/bonus_chapters/dataframes/explode_arrays_into_rows/python/README.md b/code/bonus_chapters/dataframes/explode_arrays_into_rows/python/README.md
@@ -0,0 +1 @@
+PySpark – Split an array column into rows
diff --git a/code/bonus_chapters/dataframes/explode_arrays_into_rows/python/explode_arrays_into_rows.log b/code/bonus_chapters/dataframes/explode_arrays_into_rows/python/explode_arrays_into_rows.log
@@ -0,0 +1,36 @@
+/Users/mparsian/spark-3.2.1/bin/spark-submit explode_arrays_into_rows.py
+
+root
+ |-- name: string (nullable = true)
+ |-- age: string (nullable = true)
+ |-- languages: array (nullable = true)
+ |    |-- element: string (containsNull = true)
+
++-----+---+--------------------+
+| name|age|           languages|
++-----+---+--------------------+
+| Rafa| 20|        [SQL, NoSQL]|
+| Alex| 21|    [Ada, SQL, Java]|
+| Jane| 22|[Fortran, Cobol, ...|
+|Maria| 23|                  []|
++-----+---+--------------------+
+
+root
+ |-- name: string (nullable = true)
+ |-- age: string (nullable = true)
+ |-- col: string (nullable = true)
+
++----+---+-------+
+|name|age|    col|
++----+---+-------+
+|Rafa| 20|    SQL|
+|Rafa| 20|  NoSQL|
+|Alex| 21|    Ada|
+|Alex| 21|    SQL|
+|Alex| 21|   Java|
+|Jane| 22|Fortran|
+|Jane| 22|  Cobol|
+|Jane| 22|      R|
+|Jane| 22|    C++|
++----+---+-------+
+
diff --git a/code/bonus_chapters/dataframes/explode_arrays_into_rows/python/explode_arrays_into_rows.py b/code/bonus_chapters/dataframes/explode_arrays_into_rows/python/explode_arrays_into_rows.py
@@ -0,0 +1,39 @@
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import *
+
+#-------------------------------------------------
+# PySpark – Explode an array column into rows
+#-------------------------------------------------
+  
+# create (or reuse) the SparkSession for this script
+spark=SparkSession.builder.getOrCreate()
+  
+# sample rows as (name, age, languages) tuples;
+# age is a string and languages is a list of
+# strings (note that Maria's list is empty)
+sample_data = [('Rafa',  '20', ['SQL','NoSQL']),
+        ('Alex',  '21', ['Ada','SQL', 'Java']),
+        ('Jane',  '22', ['Fortran', 'Cobol', 'R', 'C++']),
+        ('Maria', '23', [])]
+  
+# column names, in the same order as the tuple fields
+column_names = ['name', 'age', 'languages']
+  
+# build the dataframe; no schema is passed, so types are inferred
+df = spark.createDataFrame(sample_data, column_names)
+  
+# print the schema: languages is an array of strings
+df.printSchema()
+  
+# show all four rows, including Maria's empty array
+df.show()
+
+# explode() emits one output row per array element, so a
+# row whose array is empty (Maria) yields no output rows
+df2 = df.select(df.name, df.age, explode(df.languages))
+  
+# print the schema of df2; the exploded column is named "col"
+df2.printSchema()
+  
+# show df2: one (name, age, col) row per language
+df2.show()
diff --git a/code/bonus_chapters/dataframes/explode_arrays_into_rows/scala/README.md b/code/bonus_chapters/dataframes/explode_arrays_into_rows/scala/README.md
@@ -0,0 +1 @@
+Spark (Scala) – Split an array column into rows

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Spark (Scala) – Split an array column into rows`