This repository has been archived by the owner on Aug 18, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 78
/
Copy pathwordCount.py
executable file
·56 lines (48 loc) · 2.15 KB
/
wordCount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import sys
import re
from pyspark import SparkContext,SparkConf
def splitDocument(document):
"""Returns a list of all words in the document"""
return re.findall(r"\w+", document[1])
def toPairs(word):
""" Creates `(key, value)` pairs where the word is the key and 1 is the value """
return (word, 1)
def sumCounts(a, b):
""" Add up the values for each word, resulting in a count of occurences """
return a + b
def wordCount(file_name, output="spark-wc-out-wordCount"):
sc = SparkContext("local[8]", "WordCount", conf=SparkConf().set("spark.hadoop.validateOutputSpecs", "false"))
""" Reads in a sequence file FILE_NAME to be manipulated """
file = sc.sequenceFile(file_name)
"""
- Explanation:
-
- `flatMap` takes in a function that will take one input and outputs 0 or
- more items. All returned results are combined into a single list of
- items that future functions are run on. We use this function to
- transform our document into a list of words.
-
- `map` takes in a function take in one item, perform an action on it, and
- return the result. When called on a list, it applies the function to
- each item in the list. We use this function transform our words into
- `(key, value)` pairs, with the key being the word and the value being
- the number of times it occurs.
-
- `reduceByKey` groups a list of `(key, value)` pairs by keys and runs a
- function on each key which takes two values and returns a single value
- (i.e. "reducing" them two inputs into one). It will be called
- iteratively on each key until only a single value remains for that key.
- We use this function to sum the number of times a word occurs.
"""
counts = file.flatMap(splitDocument) \
.map(toPairs) \
.reduceByKey(sumCounts)
""" Takes the dataset stored in counts and writes everything out to OUTPUT """
counts.coalesce(1).saveAsTextFile(output)
""" Do not worry about this """
if __name__ == "__main__":
argv = sys.argv
if len(argv) == 2:
wordCount(argv[1])
else:
wordCount(argv[1], argv[2])