import random

from nltk import NaiveBayesClassifier, classify

import name_loader

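# ---------------------------------------------------------------------------
# Assumed data contract: name_loader.getNameList() is expected to return two
# sequences (maleNames, femaleNames) whose items look like
# (name, male_count, female_count), e.g. ('TAYLOR', 1203, 2587), since
# _getProbDistr() reads the counts from indexes 1 and 2. The numbers in the
# example are purely illustrative.
# ---------------------------------------------------------------------------
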
class genderPredictor:
    """Naive Bayes classifier (NLTK) that guesses gender from a first name."""

    def getFeatures(self):
        maleNames, femaleNames = self._loadNames()

        featureset = []

        # One (features, label) pair per name: the spelling features from
        # _nameFeatures() plus the smoothed male/female probabilities derived
        # from the per-name counts.
        for nameList, gender in ((maleNames, 'M'), (femaleNames, 'F')):
            for nameTuple in nameList:
                features = self._nameFeatures(nameTuple[0])
                male_prob, female_prob = self._getProbDistr(nameTuple)
                features['male_prob'] = male_prob
                features['female_prob'] = female_prob
                featureset.append((features, gender))

        return featureset

    def trainAndTest(self, trainingPercent=0.80):
        """Shuffle the feature set, train on trainingPercent of it and
        return the accuracy measured on the held-out remainder."""
        featureset = self.getFeatures()
        random.shuffle(featureset)

        name_count = len(featureset)
        cut_point = int(name_count * trainingPercent)

        train_set = featureset[:cut_point]
        test_set = featureset[cut_point:]

        self.train(train_set)
        return self.test(test_set)

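    # Optional convenience sketch (saveClassifier/loadClassifier are added
    # helpers, not part of the NLTK API): a trained NLTK classifier can be
    # pickled so it does not have to be rebuilt on every run. The file name
    # below is only an illustrative default.
    def saveClassifier(self, path='gender_classifier.pickle'):
        import pickle
        with open(path, 'wb') as f:
            pickle.dump(self.classifier, f)

    def loadClassifier(self, path='gender_classifier.pickle'):
        import pickle
        with open(path, 'rb') as f:
            self.classifier = pickle.load(f)
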
    def classify(self, name):
        # Only the spelling features are available for an arbitrary input
        # name; the count-based male_prob/female_prob features seen during
        # training are left out, and NLTK simply ignores absent features.
        feats = self._nameFeatures(name)
        return self.classifier.classify(feats)

    def train(self, train_set):
        self.classifier = NaiveBayesClassifier.train(train_set)
        return self.classifier

    def test(self, test_set):
        return classify.accuracy(self.classifier, test_set)

    def _getProbDistr(self, nameTuple):
        # nameTuple is assumed to be (name, male_count, female_count); nudge
        # all-male or all-female names away from a hard 0.0/1.0 probability.
        male_prob = float(nameTuple[1]) / (nameTuple[1] + nameTuple[2])
        if male_prob == 1.0:
            male_prob = 0.99
        elif male_prob == 0.0:
            male_prob = 0.01
        female_prob = 1.0 - male_prob
        return male_prob, female_prob

    def getMostInformativeFeatures(self, n=5):
        return self.classifier.most_informative_features(n)

    def _loadNames(self):
        return name_loader.getNameList()

    def _nameFeatures(self, name):
        # Spelling features: gender tends to show in how a name ends, e.g.
        # 'ANNA' -> last_letter 'A', last_two 'NA', last_three 'NNA',
        # last_is_vowel True.
        name = name.upper()
        return {
            'last_letter': name[-1],
            'last_two': name[-2:],
            'last_three': name[-3:],
            'last_is_vowel': (name[-1] in 'AEIOUY')
        }

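# ---------------------------------------------------------------------------
# Optional sketch, assuming NLTK's bundled 'names' corpus has been fetched
# with nltk.download('names'). It trains the same spelling features on that
# corpus instead of the counted name list expected from name_loader, which
# can be useful when that data is not available. Nothing calls it by default.
# ---------------------------------------------------------------------------
def corpusDemo(trainingPercent=0.80):
    from nltk.corpus import names

    gp = genderPredictor()
    # The corpus has no per-name counts, so the male_prob/female_prob
    # features are simply omitted here.
    labeled = [(gp._nameFeatures(n), 'M') for n in names.words('male.txt')]
    labeled += [(gp._nameFeatures(n), 'F') for n in names.words('female.txt')]
    random.shuffle(labeled)

    cut_point = int(len(labeled) * trainingPercent)
    gp.train(labeled[:cut_point])
    return gp.test(labeled[cut_point:])
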
if __name__ == "__main__":
    gp = genderPredictor()
    accuracy = gp.trainAndTest()
    print('Accuracy: %f' % accuracy)
    print('Most Informative Features')
    for feat in gp.getMostInformativeFeatures(10):
        print('\t%s = %s' % feat)

    # Interactive loop: keep classifying names until the user types 'quit'.
    while True:
        name = input('Enter name to classify (or "quit" to exit): ').strip()
        if name == 'quit':
            break
        if not name:
            continue
        print('\n%s is classified as %s' % (name, gp.classify(name)))