
Commit bdedc07

first commit
0 parents  commit bdedc07

11 files changed: +361 -0 lines changed

README

Whitespace-only changes.

bigram_finder.py

Lines changed: 18 additions & 0 deletions
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

text = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
of the Eastern Conference. Founded in 1946, the team is currently owned by
Boston Basketball Partners LLC. The Celtics play their home games at the TD Garden,
which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL.

The Celtics have dominated the league during the late 50's and through the mid 80's,
with the help of many Hall of Famers which include Bill Russell, Bob Cousy, John Havlicek,
Larry Bird and legendary Celtics coach Red Auerbach,
combined for a 795 - 397 record that helped the Celtics win 16 Championships.
'''

# Build a collocation finder over the tokens, considering pairs within a window of 5 words
bigram_finder = BigramCollocationFinder.from_words(word_tokenize(text), 5)

# Rank candidate bigrams by the chi-squared association measure and print the top 100
print(bigram_finder.nbest(BigramAssocMeasures.chi_sq, 100))
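
Chi-squared tends to over-rank bigrams that occur only once; a minimal sketch of filtering by frequency first, assuming the bigram_finder built above:

# Ignore bigrams seen fewer than 2 times, then re-rank (here by PMI)
bigram_finder.apply_freq_filter(2)
print(bigram_finder.nbest(BigramAssocMeasures.pmi, 10))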

named_ent_chunker.py

Lines changed: 39 additions & 0 deletions
from nltk import ne_chunk, pos_tag
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer

'''
Required NLTK data downloads:
import nltk
nltk.download('words')
nltk.download('punkt')
nltk.download('maxent_treebank_pos_tagger')
nltk.download('maxent_ne_chunker')
'''

TreeBankTokenizer = TreebankWordTokenizer()
PunktTokenizer = PunktSentenceTokenizer()

text = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
of the Eastern Conference. Founded in 1946, the team is currently owned by
Boston Basketball Partners LLC. The Celtics play their home games at the TD Garden,
which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL.

The Celtics have dominated the league during the late 50's and through the mid 80's,
with the help of many Hall of Famers which include Bill Russell, Bob Cousy, John Havlicek,
Larry Bird and legendary Celtics coach Red Auerbach,
combined for a 795 - 397 record that helped the Celtics win 16 Championships.
'''

# Sentence-split, word-tokenize, POS-tag, then chunk named entities per sentence
sentences = PunktTokenizer.tokenize(text)
tokens = [TreeBankTokenizer.tokenize(sentence) for sentence in sentences]
tagged = [pos_tag(token) for token in tokens]
chunked = [ne_chunk(taggedToken) for taggedToken in tagged]

# Render a few of the resulting trees in a Tk window
chunked[0].draw()
chunked[-1].draw()
chunked[-3].draw()

print(chunked)
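
The draw() calls need a display; a minimal sketch (assuming the chunked trees built above, and the chunker's usual PERSON/ORGANIZATION/GPE labels) that collects entities as strings instead:

for tree in chunked:
    for subtree in tree.subtrees():
        if subtree.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
            entity = ' '.join(word for word, tag in subtree.leaves())
            print(subtree.label(), '->', entity)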

nltkText.py

Lines changed: 10 additions & 0 deletions
from nltk.corpus import brown
from nltk import Text

# Wrap the 'humor' section of the Brown corpus in a Text object for quick exploration
brown_words = brown.words(categories='humor')
brownText = Text(brown_words)

brownText.collocations()                                        # frequent word pairings
print(brownText.count("car"))                                   # raw frequency of a word
brownText.concordance("oil")                                    # occurrences in context
brownText.dispersion_plot(['car', 'document', 'funny', 'oil'])  # positions across the text
brownText.similar('humor')                                      # distributionally similar words
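
A minimal sketch of the same kind of counting via a frequency distribution, assuming the brown_words list loaded above:

from nltk import FreqDist
fdist = FreqDist(brown_words)
print(fdist.most_common(15))  # the 15 most frequent tokens
print(fdist['car'])           # same number as brownText.count("car")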

nounphrase_chunker.py

Lines changed: 43 additions & 0 deletions
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import word_tokenize
from nltk import pos_tag

text = '''
Jack and Jill went up the hill to fetch a pail of water
'''
tokens = pos_tag(word_tokenize(text))

# Chunk everything, chink (remove) verbs/prepositions, then split determiner+noun runs
chunk = ChunkRule("<.*>+", "Chunk all the text")
chink = ChinkRule(r"<VBD|IN|\.>", "Leave verbs and prepositions out of chunks")
split = SplitRule("<DT><NN>", "<DT><NN>", "Split between determiner+noun sequences")

chunker = RegexpChunkParser([chunk, chink, split], chunk_label='NP')
chunked = chunker.parse(tokens)
chunked.draw()


# ANOTHER WAY TO DO THIS USING THE "REGEXP PARSER"
'''
import nltk
from nltk import pos_tag, word_tokenize

text = """
Jack and Jill went up the hill to fetch a pail of water. Jack fell down and broke his crown and Jill came tumbling after.
"""
tagged_tokens = pos_tag(word_tokenize(text))

# {...} chunks everything; }...{ chinks past-tense verbs and prepositions back out
grammar = """
NP:
    {<.*>+}
    }<VBD|IN>+{
"""
chunk = nltk.RegexpParser(grammar)
tree = chunk.parse(tagged_tokens)
tree.draw()
'''
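
For machines without a display, a minimal sketch (assuming the chunked tree built above) that prints each noun phrase instead of drawing the tree:

for subtree in chunked.subtrees():
    if subtree.label() == 'NP':
        print(' '.join(word for word, tag in subtree.leaves()))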

poem.py

Lines changed: 112 additions & 0 deletions
from nltk_contrib.readability.textanalyzer import syllables_en
from nltk.corpus import cmudict, wordnet as wn
import nltk
import re


# Requires the nltk_contrib package, available from its Google Code project page
textchunk = '''
One of the very difficult parts of the decision I made on the
financial crisis was to use hardworking people's money to
help prevent there to be a crisis.

I want to share with you an interesting program for two reasons, one,
it's interesting, and two, my wife thought of it or
has actually been involved with it; she didn't think of it.
But she thought of it for this speech.

I'm telling you there's an enemy that would like to attack America, Americans, again.
There just is.
That's the reality of the world.
And I wish him all the very best.

This is my maiden voyage.
My first speech since I was the president of the United States and I couldn't think of a better
place to give it than Calgary, Canada.
'''

textchunk += '''
They want to deliver vast amounts of information over the Internet.
And again, the Internet is not something that you just dump something on.
It's not a big truck. It's a series of tubes. And if you don't understand,
those tubes can be filled and if they are filled, when you put your message in,
it gets in line and it's going to be delayed by anyone that puts into that tube
enormous amounts of material, enormous amounts of material
'''

poem = ''
wordmap = []  # a list of (word, syllable_count) tuples
words = nltk.word_tokenize(textchunk)
for word in words:
    syls = syllables_en.count(word)
    wordmap.append((word, syls))


def findSyllableWord(word, syllableSize):
    # Look through WordNet for a synonym of `word` with the requested syllable count;
    # fall back to the word itself and its own count if no synonym fits
    synsets = wn.synsets(word)
    for syns in synsets:
        lemmas = syns.lemma_names()
        for wordstring in lemmas:
            if syllables_en.count(wordstring) == syllableSize and wordstring != word:
                return {'word': wordstring, 'syllable': syllableSize}
    return {'word': word, 'syllable': syllables_en.count(word)}


# Pack the word stream into 5-7-5 syllable haiku lines; when a word would
# overflow a line, swap in a synonym that fits the remaining syllables exactly
lineNo = 1
tally = 0
for wordtoAdd, s in wordmap:
    if lineNo == 1:
        if tally < 5:
            if tally + s > 5 and wordtoAdd.isalpha():
                num = 5 - tally
                similarterm = findSyllableWord(wordtoAdd, num)
                wordtoAdd = similarterm['word']
                s = similarterm['syllable']
            tally += s
            poem += wordtoAdd + " "
        else:
            poem += " ---" + str(tally) + "\n"
            if wordtoAdd.isalpha():
                poem += wordtoAdd + " "
            tally = s
            lineNo = 2
    elif lineNo == 2:
        if tally < 7:
            if tally + s > 7 and wordtoAdd.isalpha():
                num = 7 - tally
                similarterm = findSyllableWord(wordtoAdd, num)
                wordtoAdd = similarterm['word']
                s = similarterm['syllable']
            tally += s
            poem += wordtoAdd + " "
        else:
            poem += " ---" + str(tally) + "\n"
            if wordtoAdd.isalpha():
                poem += wordtoAdd + " "
            tally = s
            lineNo = 3
    elif lineNo == 3:
        if tally < 5:
            if tally + s > 5 and wordtoAdd.isalpha():
                num = 5 - tally
                similarterm = findSyllableWord(wordtoAdd, num)
                wordtoAdd = similarterm['word']
                s = similarterm['syllable']
            tally += s
            poem += wordtoAdd + " "
        else:
            poem += " ---" + str(tally) + "\n\n"
            if wordtoAdd.isalpha():
                poem += wordtoAdd + " "
            tally = s
            lineNo = 1

print(poem)
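
nltk_contrib is an old add-on and can be hard to install today; a minimal sketch of a stand-in syllable counter built on the cmudict corpus imported above (the vowel-group fallback for out-of-vocabulary words is a rough heuristic, an assumption of this sketch):

import re
from nltk.corpus import cmudict

pron = cmudict.dict()

def count_syllables(word):
    word = word.lower().strip()
    if word in pron:
        # phonemes ending in a digit are stress-marked vowel nuclei
        return sum(ph[-1].isdigit() for ph in pron[word][0])
    # fallback: count vowel groups
    return max(1, len(re.findall(r'[aeiouy]+', word)))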

similarity.py

Lines changed: 35 additions & 0 deletions
#!/usr/bin/env python
from nltk.corpus import wordnet as wn

Aword = 'language'
Bword = 'barrier'

synsetsA = wn.synsets(Aword)
synsetsB = wn.synsets(Bword)

similars = []

# Score every sense of Aword against every sense of Bword
for sseta in synsetsA:
    for ssetb in synsetsB:
        path_similarity = sseta.path_similarity(ssetb)
        wup_similarity = sseta.wup_similarity(ssetb)

        if path_similarity is not None:
            similars.append({
                'path': path_similarity,
                'wup': wup_similarity,
                'wordA': sseta,
                'wordB': ssetb,
                'wordA_definition': sseta.definition(),
                'wordB_definition': ssetb.definition()
            })

# Most similar sense pairs first
similars = sorted(similars, key=lambda item: item['path'], reverse=True)

for item in similars:
    print(item['wordA'], "\n", item['wordA_definition'])
    print(item['wordB'], "\n", item['wordB_definition'])
    print('Path similarity - ', item['path'], "\n")
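
A minimal sketch (assuming the sorted similars list built above) that reports only the best-matching sense pair rather than the full ranking:

if similars:
    best = similars[0]
    print('Best match:', best['wordA'].name(), '<->', best['wordB'].name())
    print('path =', best['path'], ', wup =', best['wup'])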

spamdetector.py

Lines changed: 54 additions & 0 deletions
from nltk import word_tokenize, WordNetLemmatizer, NaiveBayesClassifier, classify, MaxentClassifier
from nltk.corpus import stopwords
import random
import os, glob
import logging


commonwords = stopwords.words('english')
wordlemmatizer = WordNetLemmatizer()


def email_features(sent):
    # Bag-of-words features: every lemmatized, non-stopword token maps to True
    features = {}
    wordtokens = [wordlemmatizer.lemmatize(word.lower()) for word in word_tokenize(sent)]
    for word in wordtokens:
        if word not in commonwords:
            features[word] = True
            logging.info(word)
    return features


hamtexts = []
spamtexts = []

for infile in glob.glob(os.path.join('ham/', '*.txt')):
    with open(infile, "r") as text_file:
        hamtexts.append(text_file.read())

for infile in glob.glob(os.path.join('spam/', '*.txt')):
    with open(infile, "r") as text_file:
        spamtexts.append(text_file.read())


# Label the emails, shuffle them, and extract feature sets
mixedemails = ([(email, 'spam') for email in spamtexts] + [(email, 'ham') for email in hamtexts])

random.shuffle(mixedemails)
featuresets = [(email_features(n), g) for (n, g) in mixedemails]

# Hold out 35% of the data for testing
size = int(len(featuresets) * 0.35)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = NaiveBayesClassifier.train(train_set)
# classifier = MaxentClassifier.train(train_set, 'Powell', 3)


print('accuracy: ', classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(30)
print('labels:', classifier.labels())

while True:
    featset = email_features(input("Enter text to classify: "))
    print(classifier.classify(featset))
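
Retraining on every run is wasteful; a minimal sketch (assuming the classifier trained above, and a hypothetical spam_classifier.pickle path) of persisting the model with pickle:

import pickle

with open('spam_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

with open('spam_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)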

timit_corpus.py

Lines changed: 7 additions & 0 deletions
import nltk

# Phone-level transcription for one utterance in the TIMIT sample corpus
print(nltk.corpus.timit.phones('dr1-fvmh0/sa1'))

transcriptDict = nltk.corpus.timit.transcription_dict()

print(transcriptDict)
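
A minimal sketch of exploring the rest of the TIMIT sample, assuming the reader's standard utteranceids() and words() methods:

# List a few utterance ids, then the word-level transcription of one of them
print(nltk.corpus.timit.utteranceids()[:5])
print(nltk.corpus.timit.words('dr1-fvmh0/sa1'))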

tokens.py

Lines changed: 39 additions & 0 deletions
import nltk


rawtext = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
of the Eastern Conference. Founded in 1946, the team is currently owned by
Boston Basketball Partners LLC. The Celtics play their home games at the TD Garden,
which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL.

The Celtics have dominated the league during the late 50's and through the mid 80's,
with the help of many Hall of Famers which include Bill Russell, Bob Cousy, John Havlicek,
Larry Bird and legendary Celtics coach Red Auerbach,
combined for a 795 - 397 record that helped the Celtics win 16 Championships.
'''
rawtext = rawtext.lower()


# Wrap the tokenized text in a Text object for quick exploration
text = nltk.Text(nltk.word_tokenize(rawtext))


# Frequency of any given word
print('-- Counting')
print(text.count("boston"))

# Concordance: every occurrence of a word, with surrounding context
print('-- Concordance')
text.concordance("basketball")

# Distributionally similar words
print('-- similarity')
text.similar("game")

print('-- contexts')
text.common_contexts(["celtics", "win", "boston", "basketball"])

# Lexical dispersion
text.dispersion_plot(["celtics", "boston", "basketball"])
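
A minimal sketch of the same counting via a frequency distribution, assuming the rawtext defined above:

fdist = nltk.FreqDist(nltk.word_tokenize(rawtext))
print(fdist.most_common(10))
print(fdist['boston'])  # matches text.count("boston")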

wordnet.py

Lines changed: 4 additions & 0 deletions
from nltk.corpus import wordnet as wn

# Walk one hypernym path for the first sense of 'robot', from the
# root of the hierarchy (entity) down to the synset itself
for hypernym in wn.synsets('robot')[0].hypernym_paths()[0]:
    print(hypernym.lemma_names())
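
A minimal sketch going the other direction, listing direct hyponyms (more specific terms) of the same synset:

for hyponym in wn.synsets('robot')[0].hyponyms():
    print(hyponym.name(), '-', hyponym.definition())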
