
Commit 5318a17

added UDF
1 parent c9180e3 commit 5318a17

6 files changed: +111 -1 lines changed

code/bonus_chapters/README.md

+1 -1
@@ -14,7 +14,7 @@ so I added the following bonus chapters online.
 | [K-mers](./k-mers/) | K-mers for DNA Sequences |
 | [Correlation](./correlation/) | All vs. All Correlation |
 | [`mapPartitions()` Transformation](./mappartitions/) | `mapPartitions()` Complete Example |
-
+| [`UDF`](./UDF/) | User-Defined Function Example |
 
 -----
 

+1
@@ -0,0 +1 @@
Demo Spark's UDF (user-defined-function)
+39
@@ -0,0 +1,39 @@
% export SPARK_HOME=/home/mparsian/spark-3.2.0
% $SPARK_HOME/bin/spark-submit dataframe_UDF_example.py

+---+------------+
|ID |Name        |
+---+------------+
|100|john jones  |
|200|tracey smith|
|300|amy sanders |
|400|null        |
+---+------------+

+---+------------+
|ID |Name        |
+---+------------+
|100|John Jones  |
|200|Tracey Smith|
|300|Amy Sanders |
|400|null        |
+---+------------+

+---+------------+------------+
|ID |Name        |Upper Name  |
+---+------------+------------+
|100|john jones  |JOHN JONES  |
|200|tracey smith|TRACEY SMITH|
|300|amy sanders |AMY SANDERS |
|400|null        |null        |
+---+------------+------------+

+---+------------+
|ID |Name        |
+---+------------+
|100|John Jones  |
|200|Tracey Smith|
|300|Amy Sanders |
|400|null        |
+---+------------+
+65
@@ -0,0 +1,65 @@
# import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

#--------------------------------------------------
# Demo concept of Spark UDF (user-defined-function)
#--------------------------------------------------
# @author: Mahmoud Parsian
#--------------------------------------------------
def convert_case(name):
    # capitalize the first letter of every word in name
    if name is None: return None
    if len(name) < 1: return ""
    result_string = ""
    arr = name.split(" ")
    for x in arr:
        result_string += x[0:1].upper() + x[1:] + " "
    #end-for
    return result_string.strip()
#end-def
#--------------------------------------------------
def to_upper_case(name):
    # convert the entire name to upper case
    if name is None: return None
    if len(name) < 1: return ""
    return name.upper()
#end-def
#--------------------------------------------------
#
# create a SparkSession object
spark = SparkSession.builder.appName('UDF-Learning').getOrCreate()

# define column names for a DataFrame
column_names = ["ID", "Name"]

# define some rows for a DataFrame
some_data = [("100", "john jones"),
             ("200", "tracey smith"),
             ("300", "amy sanders"),
             ("400", None)]

# create a DataFrame
df = spark.createDataFrame(data=some_data, schema=column_names)

# display the content of the DataFrame for testing/debugging
df.show(truncate=False)

# convert the convert_case() function to a UDF
# (with no explicit return type, udf() defaults to StringType)
convert_case_udf = udf(lambda p: convert_case(p))

# use the UDF in a select statement
df.select(col("ID"), convert_case_udf(col("Name")).alias("Name")).show(truncate=False)

# create a UDF with an explicit return type
upper_case_udf = udf(lambda p: to_upper_case(p), StringType())

# apply a UDF using withColumn()
df.withColumn("Upper Name", upper_case_udf(col("Name"))).show(truncate=False)

# register a UDF and use it in Spark SQL
spark.udf.register("convert_UDF", convert_case, StringType())
df.createOrReplaceTempView("NAME_TABLE")
spark.sql("select ID, convert_UDF(Name) as Name from NAME_TABLE").show(truncate=False)
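As an aside (not part of this commit), `pyspark.sql.functions.udf` can also be applied as a decorator, which avoids the lambda wrappers used above. A minimal sketch of the upper-case UDF in that style, assuming the same `df` and imports as in the script, might look like:

~~~python
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def to_upper_case_udf(name):
    # preserve None (displayed as null), upper-case everything else
    return None if name is None else name.upper()

# hypothetical usage against the same df as above:
# df.withColumn("Upper Name", to_upper_case_udf(col("Name"))).show(truncate=False)
~~~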
+1
@@ -0,0 +1 @@
Demo Spark's UDF (user-defined-function)

code/bonus_chapters/mappartitions/README.md

+4
@@ -121,7 +121,9 @@ Now create a source `RDD[Integer]` and then apply `mapPartitions()`:
 ~~~python
 >>> # spark : SparkSession object
 >>> data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10]
+>>> # rdd : RDD[integer]
 >>> rdd = spark.sparkContext.parallelize(data, 3)
+>>> # mapped : RDD[(integer, integer, integer)] : RDD[(count, min, max)]
 >>> mapped = rdd.mapPartitions(min_max)
 >>> mapped.collect()
 [(3, 3, 20), (3, 2, 5), (4, 2, 20)]
@@ -144,7 +146,9 @@ Note that you may perform final reduction by `RDD.reduce()` as well:
 ~~~python
 >>> # spark : SparkSession object
 >>> data = [10, 20, 3, 4, 5, 2, 2, 20, 20, 10]
+>>> # rdd : RDD[integer]
 >>> rdd = spark.sparkContext.parallelize(data, 3)
+>>> # mapped : RDD[(integer, integer, integer)] : RDD[(count, min, max)]
 >>> mapped = rdd.mapPartitions(min_max)
 >>> mapped.collect()
 [(3, 3, 20), (3, 2, 5), (4, 2, 20)]
