
Commit bdedc07

first commit
0 parents  commit bdedc07

11 files changed: +361 -0 lines changed

README

Whitespace-only changes.

bigram_finder.py

Lines changed: 18 additions & 0 deletions
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

text = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
of the Eastern Conference. Founded in 1946, the team is currently owned by
Boston Basketball Partners LLC. The Celtics play their home games at the TD Garden,
which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL.

The Celtics have dominated the league during the late 50's and through the mid 80's,
with the help of many Hall of Famers which include Bill Russell, Bob Cousy, John Havlicek,
Larry Bird and legendary Celtics coach Red Auerbach,
combined for a 795 - 397 record that helped the Celtics win 16 Championships.
'''

# Build a collocation finder over the tokens, considering pairs within a window of 5 words
bigram_finder = BigramCollocationFinder.from_words(word_tokenize(text), 5)

# Rank candidate bigrams by the chi-squared association measure and print the top 100
print(bigram_finder.nbest(BigramAssocMeasures.chi_sq, 100))
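
Chi-squared tends to over-rank bigrams that occur only once; a minimal sketch of filtering by frequency first, assuming the bigram_finder built above:

# Ignore bigrams seen fewer than 2 times, then re-rank (here by PMI)
bigram_finder.apply_freq_filter(2)
print(bigram_finder.nbest(BigramAssocMeasures.pmi, 10))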

named_ent_chunker.py

Lines changed: 39 additions & 0 deletions
from nltk import ne_chunk, pos_tag
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer

'''
Required NLTK data downloads:
import nltk
nltk.download('words')
nltk.download('punkt')
nltk.download('maxent_treebank_pos_tagger')
nltk.download('maxent_ne_chunker')
'''

TreeBankTokenizer = TreebankWordTokenizer()
PunktTokenizer = PunktSentenceTokenizer()

text = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
of the Eastern Conference. Founded in 1946, the team is currently owned by
Boston Basketball Partners LLC. The Celtics play their home games at the TD Garden,
which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL.

The Celtics have dominated the league during the late 50's and through the mid 80's,
with the help of many Hall of Famers which include Bill Russell, Bob Cousy, John Havlicek,
Larry Bird and legendary Celtics coach Red Auerbach,
combined for a 795 - 397 record that helped the Celtics win 16 Championships.
'''

# Sentence-split, word-tokenize, POS-tag, then chunk named entities per sentence
sentences = PunktTokenizer.tokenize(text)
tokens = [TreeBankTokenizer.tokenize(sentence) for sentence in sentences]
tagged = [pos_tag(token) for token in tokens]
chunked = [ne_chunk(taggedToken) for taggedToken in tagged]

# Render a few of the resulting trees in a Tk window
chunked[0].draw()
chunked[-1].draw()
chunked[-3].draw()

print(chunked)
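
The draw() calls need a display; a minimal sketch (assuming the chunked trees built above, and the chunker's usual PERSON/ORGANIZATION/GPE labels) that collects entities as strings instead:

for tree in chunked:
    for subtree in tree.subtrees():
        if subtree.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
            entity = ' '.join(word for word, tag in subtree.leaves())
            print(subtree.label(), '->', entity)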

nltkText.py

Lines changed: 10 additions & 0 deletions
from nltk.corpus import brown
from nltk import Text

# Wrap the 'humor' section of the Brown corpus in a Text object for quick exploration
brown_words = brown.words(categories='humor')
brownText = Text(brown_words)

brownText.collocations()                                        # frequent word pairings
print(brownText.count("car"))                                   # raw frequency of a word
brownText.concordance("oil")                                    # occurrences in context
brownText.dispersion_plot(['car', 'document', 'funny', 'oil'])  # positions across the text
brownText.similar('humor')                                      # distributionally similar words
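
A minimal sketch of the same kind of counting via a frequency distribution, assuming the brown_words list loaded above:

from nltk import FreqDist
fdist = FreqDist(brown_words)
print(fdist.most_common(15))  # the 15 most frequent tokens
print(fdist['car'])           # same number as brownText.count("car")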

nounphrase_chunker.py

Lines changed: 43 additions & 0 deletions
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import word_tokenize
from nltk import pos_tag

text = '''
Jack and Jill went up the hill to fetch a pail of water
'''
tokens = pos_tag(word_tokenize(text))

# Chunk everything, chink (remove) verbs/prepositions, then split determiner+noun runs
chunk = ChunkRule("<.*>+", "Chunk all the text")
chink = ChinkRule(r"<VBD|IN|\.>", "Leave verbs and prepositions out of chunks")
split = SplitRule("<DT><NN>", "<DT><NN>", "Split between determiner+noun sequences")

chunker = RegexpChunkParser([chunk, chink, split], chunk_label='NP')
chunked = chunker.parse(tokens)
chunked.draw()


# ANOTHER WAY TO DO THIS USING THE "REGEXP PARSER"
'''
import nltk
from nltk import pos_tag, word_tokenize

text = """
Jack and Jill went up the hill to fetch a pail of water. Jack fell down and broke his crown and Jill came tumbling after.
"""
tagged_tokens = pos_tag(word_tokenize(text))

# {...} chunks everything; }...{ chinks past-tense verbs and prepositions back out
grammar = """
NP:
    {<.*>+}
    }<VBD|IN>+{
"""
chunk = nltk.RegexpParser(grammar)
tree = chunk.parse(tagged_tokens)
tree.draw()
'''
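
For machines without a display, a minimal sketch (assuming the chunked tree built above) that prints each noun phrase instead of drawing the tree:

for subtree in chunked.subtrees():
    if subtree.label() == 'NP':
        print(' '.join(word for word, tag in subtree.leaves()))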

poem.py

Lines changed: 112 additions & 0 deletions
from nltk_contrib.readability.textanalyzer import syllables_en
from nltk.corpus import cmudict, wordnet as wn
import nltk
import re


# Requires the nltk_contrib package, available from its Google Code project page
textchunk = '''
One of the very difficult parts of the decision I made on the
financial crisis was to use hardworking people's money to
help prevent there to be a crisis.

I want to share with you an interesting program for two reasons, one,
it's interesting, and two, my wife thought of it or
has actually been involved with it; she didn't think of it.
But she thought of it for this speech.

I'm telling you there's an enemy that would like to attack America, Americans, again.
There just is.
That's the reality of the world.
And I wish him all the very best.

This is my maiden voyage.
My first speech since I was the president of the United States and I couldn't think of a better
place to give it than Calgary, Canada.
'''

textchunk += '''
They want to deliver vast amounts of information over the Internet.
And again, the Internet is not something that you just dump something on.
It's not a big truck. It's a series of tubes. And if you don't understand,
those tubes can be filled and if they are filled, when you put your message in,
it gets in line and it's going to be delayed by anyone that puts into that tube
enormous amounts of material, enormous amounts of material
'''

poem = ''
wordmap = []  # a list of (word, syllable_count) tuples
words = nltk.word_tokenize(textchunk)
for word in words:
    syls = syllables_en.count(word)
    wordmap.append((word, syls))


def findSyllableWord(word, syllableSize):
    # Look through WordNet for a synonym of `word` with the requested syllable count;
    # fall back to the word itself and its own count if no synonym fits
    synsets = wn.synsets(word)
    for syns in synsets:
        lemmas = syns.lemma_names()
        for wordstring in lemmas:
            if syllables_en.count(wordstring) == syllableSize and wordstring != word:
                return {'word': wordstring, 'syllable': syllableSize}
    return {'word': word, 'syllable': syllables_en.count(word)}


# Pack the word stream into 5-7-5 syllable haiku lines; when a word would
# overflow a line, swap in a synonym that fits the remaining syllables exactly
lineNo = 1
tally = 0
for wordtoAdd, s in wordmap:
    if lineNo == 1:
        if tally < 5:
            if tally + s > 5 and wordtoAdd.isalpha():
                num = 5 - tally
                similarterm = findSyllableWord(wordtoAdd, num)
                wordtoAdd = similarterm['word']
                s = similarterm['syllable']
            tally += s
            poem += wordtoAdd + " "
        else:
            poem += " ---" + str(tally) + "\n"
            if wordtoAdd.isalpha():
                poem += wordtoAdd + " "
            tally = s
            lineNo = 2
    elif lineNo == 2:
        if tally < 7:
            if tally + s > 7 and wordtoAdd.isalpha():
                num = 7 - tally
                similarterm = findSyllableWord(wordtoAdd, num)
                wordtoAdd = similarterm['word']
                s = similarterm['syllable']
            tally += s
            poem += wordtoAdd + " "
        else:
            poem += " ---" + str(tally) + "\n"
            if wordtoAdd.isalpha():
                poem += wordtoAdd + " "
            tally = s
            lineNo = 3
    elif lineNo == 3:
        if tally < 5:
            if tally + s > 5 and wordtoAdd.isalpha():
                num = 5 - tally
                similarterm = findSyllableWord(wordtoAdd, num)
                wordtoAdd = similarterm['word']
                s = similarterm['syllable']
            tally += s
            poem += wordtoAdd + " "
        else:
            poem += " ---" + str(tally) + "\n\n"
            if wordtoAdd.isalpha():
                poem += wordtoAdd + " "
            tally = s
            lineNo = 1

print(poem)
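
nltk_contrib is an old add-on and can be hard to install today; a minimal sketch of a stand-in syllable counter built on the cmudict corpus imported above (the vowel-group fallback for out-of-vocabulary words is a rough heuristic, an assumption of this sketch):

import re
from nltk.corpus import cmudict

pron = cmudict.dict()

def count_syllables(word):
    word = word.lower().strip()
    if word in pron:
        # phonemes ending in a digit are stress-marked vowel nuclei
        return sum(ph[-1].isdigit() for ph in pron[word][0])
    # fallback: count vowel groups
    return max(1, len(re.findall(r'[aeiouy]+', word)))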

similarity.py

Lines changed: 35 additions & 0 deletions
#!/usr/bin/env python
from nltk.corpus import wordnet as wn

Aword = 'language'
Bword = 'barrier'

synsetsA = wn.synsets(Aword)
synsetsB = wn.synsets(Bword)

similars = []

# Score every sense of Aword against every sense of Bword
for sseta in synsetsA:
    for ssetb in synsetsB:
        path_similarity = sseta.path_similarity(ssetb)
        wup_similarity = sseta.wup_similarity(ssetb)

        if path_similarity is not None:
            similars.append({
                'path': path_similarity,
                'wup': wup_similarity,
                'wordA': sseta,
                'wordB': ssetb,
                'wordA_definition': sseta.definition(),
                'wordB_definition': ssetb.definition()
            })

# Most similar sense pairs first
similars = sorted(similars, key=lambda item: item['path'], reverse=True)

for item in similars:
    print(item['wordA'], "\n", item['wordA_definition'])
    print(item['wordB'], "\n", item['wordB_definition'])
    print('Path similarity - ', item['path'], "\n")
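
A minimal sketch (assuming the sorted similars list built above) that reports only the best-matching sense pair rather than the full ranking:

if similars:
    best = similars[0]
    print('Best match:', best['wordA'].name(), '<->', best['wordB'].name())
    print('path =', best['path'], ', wup =', best['wup'])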

spamdetector.py

Lines changed: 54 additions & 0 deletions
from nltk import word_tokenize, WordNetLemmatizer, NaiveBayesClassifier, classify, MaxentClassifier
from nltk.corpus import stopwords
import random
import os, glob
import logging


commonwords = stopwords.words('english')
wordlemmatizer = WordNetLemmatizer()


def email_features(sent):
    # Bag-of-words features: every lemmatized, non-stopword token maps to True
    features = {}
    wordtokens = [wordlemmatizer.lemmatize(word.lower()) for word in word_tokenize(sent)]
    for word in wordtokens:
        if word not in commonwords:
            features[word] = True
            logging.info(word)
    return features


hamtexts = []
spamtexts = []

for infile in glob.glob(os.path.join('ham/', '*.txt')):
    with open(infile, "r") as text_file:
        hamtexts.append(text_file.read())

for infile in glob.glob(os.path.join('spam/', '*.txt')):
    with open(infile, "r") as text_file:
        spamtexts.append(text_file.read())


# Label the emails, shuffle them, and extract feature sets
mixedemails = ([(email, 'spam') for email in spamtexts] + [(email, 'ham') for email in hamtexts])

random.shuffle(mixedemails)
featuresets = [(email_features(n), g) for (n, g) in mixedemails]

# Hold out 35% of the data for testing
size = int(len(featuresets) * 0.35)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = NaiveBayesClassifier.train(train_set)
# classifier = MaxentClassifier.train(train_set, 'Powell', 3)


print('accuracy: ', classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(30)
print('labels:', classifier.labels())

while True:
    featset = email_features(input("Enter text to classify: "))
    print(classifier.classify(featset))
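
Retraining on every run is wasteful; a minimal sketch (assuming the classifier trained above, and a hypothetical spam_classifier.pickle path) of persisting the model with pickle:

import pickle

with open('spam_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

with open('spam_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)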

timit_corpus.py

Lines changed: 7 additions & 0 deletions
import nltk

# Phone-level transcription for one utterance in the TIMIT sample corpus
print(nltk.corpus.timit.phones('dr1-fvmh0/sa1'))

transcriptDict = nltk.corpus.timit.transcription_dict()

print(transcriptDict)
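
A minimal sketch of exploring the rest of the TIMIT sample, assuming the reader's standard utteranceids() and words() methods:

# List a few utterance ids, then the word-level transcription of one of them
print(nltk.corpus.timit.utteranceids()[:5])
print(nltk.corpus.timit.words('dr1-fvmh0/sa1'))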

tokens.py

Lines changed: 39 additions & 0 deletions
import nltk


rawtext = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
of the Eastern Conference. Founded in 1946, the team is currently owned by
Boston Basketball Partners LLC. The Celtics play their home games at the TD Garden,
which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL.

The Celtics have dominated the league during the late 50's and through the mid 80's,
with the help of many Hall of Famers which include Bill Russell, Bob Cousy, John Havlicek,
Larry Bird and legendary Celtics coach Red Auerbach,
combined for a 795 - 397 record that helped the Celtics win 16 Championships.
'''
rawtext = rawtext.lower()


# Wrap the tokenized text in a Text object for quick exploration
text = nltk.Text(nltk.word_tokenize(rawtext))


# Frequency of any given word
print('-- Counting')
print(text.count("boston"))

# Concordance: every occurrence of a word, with surrounding context
print('-- Concordance')
text.concordance("basketball")

# Distributionally similar words
print('-- similarity')
text.similar("game")

print('-- contexts')
text.common_contexts(["celtics", "win", "boston", "basketball"])

# Lexical dispersion
text.dispersion_plot(["celtics", "boston", "basketball"])
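
A minimal sketch of the same counting via a frequency distribution, assuming the rawtext defined above:

fdist = nltk.FreqDist(nltk.word_tokenize(rawtext))
print(fdist.most_common(10))
print(fdist['boston'])  # matches text.count("boston")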

wordnet.py

Lines changed: 4 additions & 0 deletions
from nltk.corpus import wordnet as wn

# Walk one hypernym path for the first sense of 'robot', from the
# root of the hierarchy (entity) down to the synset itself
for hypernym in wn.synsets('robot')[0].hypernym_paths()[0]:
    print(hypernym.lemma_names())
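
A minimal sketch going the other direction, listing direct hyponyms (more specific terms) of the same synset:

for hyponym in wn.synsets('robot')[0].hyponyms():
    print(hyponym.name(), '-', hyponym.definition())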
