Skip to content

Commit a65cf0d

Browse files
committed
pushing scripts and data
1 parent fc3d3f3 commit a65cf0d

File tree

4 files changed

+202
-0
lines changed

4 files changed

+202
-0
lines changed

gender_predictor.py

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
from nltk import NaiveBayesClassifier, classify
2+
import name_loader
3+
import random
4+
5+
6+
class genderPredictor():
    """Naive Bayes classifier that labels a first name as 'M' or 'F'.

    Training data comes from ``name_loader.getNameList()``; the classifier
    itself is an NLTK ``NaiveBayesClassifier`` stored on ``self.classifier``
    once ``train()`` (or ``trainAndTest()``) has been called.
    """

    def getFeatures(self):
        """Build the labelled feature set for every loaded name.

        Returns:
            A list of ``(features, label)`` pairs, all male names first
            (label ``'M'``) followed by all female names (label ``'F'``).
            Each ``features`` dict holds the suffix features from
            ``_nameFeatures`` plus ``'male_prob'`` and ``'female_prob'``.
        """
        maleNames, femaleNames = self._loadNames()

        # NOTE(review): male_prob/female_prob are computed from the same
        # counts carried in the name tuples, while classify() only supplies
        # the suffix features. The accuracy reported by trainAndTest() may
        # therefore be optimistic -- confirm this is intended.
        featureset = list()
        for label, nameTuples in (('M', maleNames), ('F', femaleNames)):
            for nameTuple in nameTuples:
                features = self._nameFeatures(nameTuple[0])
                male_prob, female_prob = self._getProbDistr(nameTuple)
                features['male_prob'] = male_prob
                features['female_prob'] = female_prob
                featureset.append((features, label))

        return featureset

    def trainAndTest(self, trainingPercent=0.80):
        """Shuffle the feature set, train on one slice and score the rest.

        Args:
            trainingPercent: Fraction (0..1) of the shuffled feature set
                used for training; the remainder is the test set.

        Returns:
            The accuracy returned by ``test()`` on the held-out slice.
        """
        featureset = self.getFeatures()
        random.shuffle(featureset)

        name_count = len(featureset)
        cut_point = int(name_count * trainingPercent)

        train_set = featureset[:cut_point]
        test_set = featureset[cut_point:]

        self.train(train_set)
        return self.test(test_set)

    def classify(self, name):
        """Return the predicted label for ``name``.

        Requires ``train()`` to have been called first; otherwise
        ``self.classifier`` does not exist and AttributeError is raised.
        """
        feats = self._nameFeatures(name)
        return self.classifier.classify(feats)

    def train(self, train_set):
        """Train a Naive Bayes classifier on ``train_set`` and return it."""
        self.classifier = NaiveBayesClassifier.train(train_set)
        return self.classifier

    def test(self, test_set):
        """Return the trained classifier's accuracy on ``test_set``."""
        return classify.accuracy(self.classifier, test_set)

    def _getProbDistr(self, nameTuple):
        """Return ``(male_prob, female_prob)`` for a ``(name, male, female)`` tuple.

        The male share of the total count is used, except that the extremes
        1.0 and 0.0 are replaced by 0.99 and 0.01 so neither probability is
        ever exactly zero or one.

        Raises:
            ZeroDivisionError: If both counts are zero.
        """
        male_count = nameTuple[1]
        female_count = nameTuple[2]

        male_prob = (male_count * 1.0) / (male_count + female_count)
        if male_prob == 1.0:
            male_prob = 0.99
        elif male_prob == 0.0:
            male_prob = 0.01

        female_prob = 1.0 - male_prob
        return (male_prob, female_prob)

    def getMostInformativeFeatures(self, n=5):
        """Return the classifier's ``n`` most informative features."""
        return self.classifier.most_informative_features(n)

    def _loadNames(self):
        """Return the ``(maleNames, femaleNames)`` pair from ``name_loader``."""
        return name_loader.getNameList()

    def _nameFeatures(self, name):
        """Return the suffix-based feature dict for ``name``.

        The name is upper-cased first. An empty name yields empty-string
        suffixes and ``last_is_vowel`` False instead of raising IndexError.
        """
        name = name.upper()
        # Slicing (rather than indexing) keeps an empty name from raising.
        last_letter = name[-1:]
        return {
            'last_letter': last_letter,
            'last_two': name[-2:],
            'last_three': name[-3:],
            # '' in 'AEIOUY' is True in Python, so require a real character.
            'last_is_vowel': bool(last_letter) and last_letter in 'AEIOUY',
        }
80+
81+
82+
def main():
    """Train the predictor, print its accuracy, then classify names typed in.

    The interactive loop ends when the user enters ``quit`` or closes stdin.
    """
    gp = genderPredictor()
    accuracy = gp.trainAndTest()
    print('Accuracy: %f' % accuracy)
    print('Most Informative Features')
    for feat in gp.getMostInformativeFeatures(10):
        print('\t%s = %s' % feat)

    while True:
        try:
            name = input('Enter name to classify: ').strip()
        except EOFError:
            # stdin closed (e.g. Ctrl-D or piped input exhausted).
            break
        if name == 'quit':
            # Exit before classifying, so the sentinel itself is not scored.
            break
        if not name:
            # Nothing to classify; ask again instead of crashing.
            continue
        print('\n%s is classified as %s' % (name, gp.classify(name)))


if __name__ == "__main__":
    main()

name_loader.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import os
2+
import csv
3+
import pickle
4+
5+
6+
def getNameList():
    """Return ``(maleNames, femaleNames)``, using a pickle cache when present.

    If ``nepali_names.pickle`` does not exist in the working directory, the
    names are extracted with ``extractNamesDict()``, split by majority
    gender, and written to that pickle. Otherwise the pickle is loaded.

    Returns:
        A 2-tuple of lists. Each list element is a
        ``(name, male_count, female_count)`` tuple; a name lands in the male
        list when its male count is strictly larger, in the female list when
        its female count is strictly larger, and is dropped on a tie.
    """
    if not os.path.exists('nepali_names.pickle'):
        print('nepali_names.pickle does not exist, generating')

        print('Extracting names from nepali_names.csv')
        namesDict = extractNamesDict()

        maleNames = list()
        femaleNames = list()

        print('Sorting Names')
        for name in namesDict:
            counts = namesDict[name]
            entry = (name, counts[0], counts[1])
            if counts[0] > counts[1]:
                maleNames.append(entry)
            elif counts[1] > counts[0]:
                femaleNames.append(entry)
            # Equal counts: the name is ambiguous and is left out of both.

        names = (maleNames, femaleNames)

        print('Saving nepali_names.pickle')
        # "with" guarantees the handle is closed even if dump() raises.
        with open('nepali_names.pickle', 'wb') as fw:
            pickle.dump(names, fw, -1)
        print('Saved nepali_names.pickle')
    else:
        print('nepali_names.pickle exists, loading data')
        # NOTE: unpickling can execute arbitrary code; only load a cache
        # file that this program itself produced.
        with open('nepali_names.pickle', 'rb') as f:
            names = pickle.load(f)
        print('nepali_names.pickle loaded')

    print('%d male names loaded, %d female names loaded' % (len(names[0]), len(names[1])))

    return names
41+
42+
43+
def extractNamesDict():
    """Read ``nepali_names.csv`` and total the per-gender counts for each name.

    Each non-blank row is expected to be ``name,gender,count`` with gender
    ``M`` or ``F``. Names are upper-cased, so rows differing only in case
    are merged. Blank lines are skipped.

    Returns:
        A dict mapping upper-cased name to ``[male_count, female_count]``.

    Raises:
        KeyError: If a row's gender is neither ``'M'`` nor ``'F'``.
        ValueError: If a row's count is not an integer.
        IndexError: If a non-blank row has fewer than three fields.
    """
    names = dict()
    genderMap = {'M': 0, 'F': 1}

    # newline='' lets the csv module do its own line-ending handling, and
    # the with-block closes the file even when a row fails to parse.
    with open('nepali_names.csv', 'r', newline='') as csvFile:
        for row in csv.reader(csvFile, delimiter=','):
            if not row:
                # csv.reader yields [] for a blank line; nothing to count.
                continue
            name = row[0].upper()
            gender = genderMap[row[1]]
            count = int(row[2])

            names.setdefault(name, [0, 0])[gender] += count

    print('\tImported Nepali names file')

    return names
63+
64+
65+
# Running this module as a script builds the name cache or loads the existing one.
if __name__ == '__main__':
    getNameList()

nepali_names.pickle

4.32 MB
Binary file not shown.

sample_nepali_names.csv

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
gokaran,M,10
2+
gokaran prasad,M,1
3+
gokaran purus,M,1
4+
gokaran raj,M,2
5+
gokarana,M,3
6+
gokarda,M,1
7+
gokarn,M,40
8+
gokarn bahadur,M,8
9+
gokarn gairi,M,1
10+
gokarn jang,M,1
11+
gokarn khan,M,1
12+
gokarn lal,M,1
13+
gokarn maya,F,1
14+
gokarn prasad,M,7
15+
gokarn raj,M,1
16+
gokarna,F,4
17+
gokarna,M,783
18+
gokarna bahadhur,M,4
19+
gokarna bahadue,M,1
20+
gokarna bahadur,M,121
21+
gokarna bahadur aale,M,1
22+
gokarna bhadhur,M,3
23+
gokarna bhadur,M,1
24+
tejakumari,M,1
25+
tejal,F,2
26+
tejal maya,F,1
27+
tejal yata,F,1
28+
tejamai,F,1
29+
tejan,M,4
30+
tejanath,M,1
31+
tejanatha,M,1
32+
tejandar,M,1
33+
tejandra,M,2
34+
tejap,M,1
35+
tejar,M,1
36+
tejas,M,6
37+
tejasakar,M,1
38+
tejasbhi,F,1
39+
tejasbi,F,6
40+
tejasbi,M,1
41+
tejash,M,6
42+
tejashi,F,1

0 commit comments

Comments
 (0)