This repository has been archived by the owner on Aug 18, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 78
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Cynthia Zhong
committed
Aug 2, 2020
1 parent
289483e
commit e99fbdc
Showing
11 changed files
with
301,219 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Build file for CS61C MapReduce Lab | ||
# You should not need to edit this file. | ||
|
||
# This file requires GNU make and depends on paths on instruction machines. | ||
|
||
#### | ||
|
||
# Default target: nothing needs to be built for this lab.
all:

# Convert a raw input file into sequence files under seqFiles/.
# Usage: make generate-input myinput=<path-to-input>
generate-input:
	hadoop jar textImporter/wc.jar Importer $(myinput) seqFiles/

# Remove every Spark output directory produced by the lab scripts.
clean:
	rm -rf spark-wc-out*

# None of these targets produce a file named after themselves.
.PHONY: clean all generate-input
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import sys | ||
import re | ||
|
||
from pyspark import SparkContext,SparkConf | ||
|
||
def flatMapFunc(document):
    """Split one document into the list of words it contains.

    document[0] is the document ID (distinct for each document)
    document[1] is a string of all text in that document
    You will need to modify this code.

    Returns every run of word characters found in document[1], in order of
    appearance and with duplicates kept. The document ID is not used yet.
    """
    return re.findall(r"\w+", document[1])
|
||
def mapFunc(arg):
    """Pair the given item with an initial count of 1.

    Returns the tuple `(arg, 1)`.
    You may need to modify this code.
    """
    return (arg, 1)
|
||
def reduceFunc(arg1, arg2):
    """Combine two values that share a key by adding them.

    Returns `arg1 + arg2`.
    You may need to modify this code.
    """
    return arg1 + arg2
|
||
def createIndices(file_name, output="spark-wc-out-createIndices"):
    """Run the flatMap -> map -> reduceByKey pipeline and save the result.

    file_name is the path of the sequence file to read.
    output is the path the text output is written to.
    """
    # Disabling output-spec validation lets the job write to a path that
    # already exists instead of raising.
    conf = SparkConf().set("spark.hadoop.validateOutputSpecs", "false")
    sc = SparkContext("local[8]", "CreateIndices", conf=conf)
    try:
        documents = sc.sequenceFile(file_name)

        indices = documents.flatMap(flatMapFunc) \
                           .map(mapFunc) \
                           .reduceByKey(reduceFunc)

        # coalesce(1) reduces the result to a single partition before saving.
        indices.coalesce(1).saveAsTextFile(output)
    finally:
        # Always release the context so a later job in the same process can
        # create its own.
        sc.stop()
|
||
""" Do not worry about this """ | ||
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) < 2:
        # Without an input path there is nothing to run; exit with a usage
        # message instead of an IndexError traceback.
        sys.exit("Usage: %s <input-sequence-file> [output-dir]" % argv[0])
    # argv[1] is the input; argv[2], when given, overrides the default output.
    createIndices(*argv[1:3])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import sys | ||
import re | ||
|
||
from pyspark import SparkContext,SparkConf | ||
|
||
def splitDocument(document):
    """Return a list of all words in the document.

    document[1] is the document text; every run of word characters in it is
    returned, in order of appearance and with duplicates kept.
    """
    return re.findall(r"\w+", document[1])
|
||
def toPairs(word):
    """Create a `(key, value)` pair where the word is the key and 1 is the value."""
    return (word, 1)
|
||
def sumCounts(a, b):
    """Add up the values for each word, resulting in a count of occurrences."""
    return a + b
|
||
""" TODO: Add functions here to determine the most popular word | ||
Note that Map/flatMap style functions take in one argument while Reduce functions take in two | ||
""" | ||
|
||
def mostPopular(file_name, output="spark-wc-out-mostPopular"):
    """Count word occurrences in a sequence file and write them out.

    file_name is the path of the sequence file to read.
    output is the path the text output is written to.
    """
    # Disabling output-spec validation lets the job write to a path that
    # already exists instead of raising.
    conf = SparkConf().set("spark.hadoop.validateOutputSpecs", "false")
    sc = SparkContext("local[8]", "WordCount", conf=conf)
    try:
        # Reads in a sequence file FILE_NAME to be manipulated
        documents = sc.sequenceFile(file_name)

        counts = documents.flatMap(splitDocument) \
                          .map(toPairs) \
                          .reduceByKey(sumCounts)
        # TODO: add appropriate extra transformations here

        # Takes the dataset stored in counts and writes everything out to OUTPUT
        counts.coalesce(1).saveAsTextFile(output)
    finally:
        # Always release the context so a later job in the same process can
        # create its own.
        sc.stop()
|
||
""" Do not worry about this """ | ||
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) < 2:
        # Without an input path there is nothing to run; exit with a usage
        # message instead of an IndexError traceback.
        sys.exit("Usage: %s <input-sequence-file> [output-dir]" % argv[0])
    # argv[1] is the input; argv[2], when given, overrides the default output.
    mostPopular(*argv[1:3])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import sys | ||
import re | ||
|
||
from pyspark import SparkContext,SparkConf | ||
|
||
def flatMapFunc(document):
    """Split one document into the list of words it contains.

    Before we returned a list of words and used the map and reduce functions to
    determine how many times each word occurred (regardless of document ID).
    Now we want to know how many different documents contain the word.
    This function should create a list of items which we will then run various
    transformations on to eventually create a count of documents per word.
    document[0] is the document ID (distinct for each document)
    document[1] is a string of all text in that document
    You will need to modify this code.

    As written, it returns every run of word characters found in document[1],
    with duplicates kept; the document ID is not used yet.
    """
    return re.findall(r"\w+", document[1])
|
||
def mapFunc(arg):
    """Create a `(key, value)` pair.

    Returns the tuple `(arg, 1)`.
    You may need to modify this code.
    """
    return (arg, 1)
|
||
def reduceFunc(arg1, arg2):
    """Sum counts.

    Returns `arg1 + arg2`.
    You may need to modify this code.
    """
    return arg1 + arg2
|
||
def perWordDocumentCount(file_name, output="spark-wc-out-perWordDocumentCount"):
    """Run the per-word counting pipeline over a sequence file and save it.

    file_name is the path of the sequence file to read.
    output is the path the text output is written to.
    """
    # Disabling output-spec validation lets the job write to a path that
    # already exists instead of raising.
    conf = SparkConf().set("spark.hadoop.validateOutputSpecs", "false")
    sc = SparkContext("local[8]", "PerWordDocumentCount", conf=conf)
    try:
        documents = sc.sequenceFile(file_name)

        # You will need to add, remove, and/or modify function calls here.
        # The function `distinct()` may be helpful...
        # Be sure that your output ends up in alphabetical order.
        counts = documents.flatMap(flatMapFunc) \
                          .map(mapFunc) \
                          .reduceByKey(reduceFunc)

        # coalesce(1) reduces the result to a single partition before saving.
        counts.coalesce(1).saveAsTextFile(output)
    finally:
        # Always release the context so a later job in the same process can
        # create its own.
        sc.stop()
|
||
""" Do not worry about this """ | ||
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) < 2:
        # Without an input path there is nothing to run; exit with a usage
        # message instead of an IndexError traceback.
        sys.exit("Usage: %s <input-sequence-file> [output-dir]" % argv[0])
    # argv[1] is the input; argv[2], when given, overrides the default output.
    perWordDocumentCount(*argv[1:3])
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
Amendment I | ||
|
||
Congress shall make no law respecting an establishment of religion, or prohibiting the free exercise thereof; or abridging the freedom of speech, or of the press; or the right of the people peaceably to assemble, and to petition the Government for a redress of grievances. | ||
|
||
---END.OF.DOCUMENT--- | ||
|
||
Amendment II | ||
|
||
A well regulated Militia, being necessary to the security of a free State, the right of the people to keep and bear Arms, shall not be infringed. | ||
|
||
---END.OF.DOCUMENT--- | ||
|
||
Amendment III | ||
|
||
No Soldier shall, in time of peace be quartered in any house, without the consent of the Owner, nor in time of war, but in a manner to be prescribed by law. | ||
|
||
---END.OF.DOCUMENT--- | ||
|
||
Amendment IV | ||
|
||
The right of the people to be secure in their persons, houses, papers, and effects, against unreasonable searches and seizures, shall not be violated, and no Warrants shall issue, but upon probable cause, supported by Oath or affirmation, and particularly describing the place to be searched, and the persons or things to be seized. | ||
|
||
---END.OF.DOCUMENT--- | ||
|
||
Amendment V | ||
|
||
No person shall be held to answer for a capital, or otherwise infamous crime, unless on a presentment or indictment of a Grand Jury, except in cases arising in the land or naval forces, or in the Militia, when in actual service in time of War or public danger; nor shall any person be subject for the same offence to be twice put in jeopardy of life or limb; nor shall be compelled in any criminal case to be a witness against himself, nor be deprived of life, liberty, or property, without due process of law; nor shall private property be taken for public use, without just compensation. | ||
|
||
---END.OF.DOCUMENT--- | ||
|
||
Amendment VI | ||
|
||
In all criminal prosecutions, the accused shall enjoy the right to a speedy and public trial, by an impartial jury of the State and district wherein the crime shall have been committed, which district shall have been previously ascertained by law, and to be informed of the nature and cause of the accusation; to be confronted with the witnesses against him; to have compulsory process for obtaining witnesses in his favor, and to have the Assistance of Counsel for his defence. | ||
|
||
---END.OF.DOCUMENT--- | ||
|
||
Amendment VII | ||
|
||
In Suits at common law, where the value in controversy shall exceed twenty dollars, the right of trial by jury shall be preserved, and no fact tried by a jury, shall be otherwise re-examined in any Court of the United States, than according to the rules of the common law. | ||
|
||
---END.OF.DOCUMENT--- | ||
|
||
Amendment VIII | ||
|
||
Excessive bail shall not be required, nor excessive fines imposed, nor cruel and unusual punishments inflicted. | ||
|
||
---END.OF.DOCUMENT--- | ||
|
||
Amendment IX | ||
|
||
The enumeration in the Constitution, of certain rights, shall not be construed to deny or disparage others retained by the people. | ||
|
||
---END.OF.DOCUMENT--- | ||
|
||
Amendment X | ||
|
||
The powers not delegated to the United States by the Constitution, nor prohibited by it to the States, are reserved to the States respectively, or to the people. |
Oops, something went wrong.