diff --git a/.gitignore b/.gitignore index d8eff1c..a33edc5 100644 --- a/.gitignore +++ b/.gitignore @@ -110,4 +110,6 @@ com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties -atusdata/ +# Ignore the data folder ? +guess_lang/data/ +.DS_Store diff --git a/Using_guess_lang.ipynb b/Using_guess_lang.ipynb new file mode 100644 index 0000000..cfb7d23 --- /dev/null +++ b/Using_guess_lang.ipynb @@ -0,0 +1,279 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:4c467298348c6ff6bcbe571ee98f7c99f30feb8c4bd3e3d7d4f67a4b00a8a79e" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 79 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import guess_lang" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 80 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "!python guess_lang" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Extra Trees Random Forest\r\n", + "01 : Clojure \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "02 : Clojure \t Correct!\r\n", + "03 : Ruby \t Incorrect: Clojure\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "04 : Clojure \t Correct!\r\n", + "05 : Java \t Incorrect: Python\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "06 : Python \t Correct!\r\n", + "07 : Ruby \t Incorrect: Python\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "08 : Python \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "09 : Javascript \t Correct!\r\n", + "10 : Javascript \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "11 : Javascript \t Correct!\r\n", + "12 : Javascript \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "13 : Ruby \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "14 : Ruby \t Correct!\r\n", + "15 : Ruby \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "16 : Haskell \t Correct!\r\n", + "17 : Haskell \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "18 : Haskell \t Correct!\r\n", + "19 : Scheme \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "20 : Scheme \t Correct!\r\n", + "21 : Scheme \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "22 : Java \t Correct!\r\n", + "23 : Java \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "24 : Scala \t Correct!\r\n", + "25 : Scala \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "26 : Tcl \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "27 : Tcl \t Correct!\r\n", + "28 : Php \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "29 : Php \t Correct!\r\n", + "30 : Php \t Correct!\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "31 : Ocaml \t Correct!\r\n", + "32 : Ocaml \t Correct!\r\n", + "Score: 0.90625\r\n" + ] + } + ], + "prompt_number": 81 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "!python guess_lang other_tests/multi_table.java" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Extra Trees Random Forest\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "1 : Java\r\n" + ] + } + ], + "prompt_number": 85 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "!python guess_lang other_tests/scanner.py" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Extra Trees Random Forest\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "1 : Python\r\n" + ] + } + ], + "prompt_number": 86 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "!python guess_lang other_tests/methods.rb" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Extra Trees Random Forest\r\n" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "1 : Ruby\r\n" + ] + } + ], + "prompt_number": 87 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/classifier.data b/classifier.data new file mode 100644 index 0000000..0061e61 Binary files /dev/null and b/classifier.data differ diff --git a/guess_lang/.DS_Store b/guess_lang/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/guess_lang/.DS_Store differ diff --git a/guess_lang/__init__.py b/guess_lang/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/guess_lang/__main__.py b/guess_lang/__main__.py new file mode 100644 index 0000000..f702b17 --- /dev/null +++ b/guess_lang/__main__.py @@ -0,0 +1,143 @@ +import sys +import os +import pickle +import traverse_folders as tf +from learner import Learner +from classifier import Classifier +from sklearn.cluster import KMeans + + +class Language_Guesser: + + def __init__(self): + pass + +if __name__ == '__main__': + training_path = "guess_lang/data/" + testing_path = "guess_lang/test/" + single_file = False + use_pickle = True + + """ Check for command-line arguments. If -n, don't use pickled files. """ + if len(sys.argv) > 1: + if sys.argv[1] == '-n': + use_pickle = False + else: + testing_path = sys.argv[1] + single_file = True + try: + if sys.argv[2] == '-n': + use_pickle = False + except: + pass + + """ If learner object has been pickled, load it. + else, build the learner. (also check use_pickle arg) """ + learner_path = "learner.data" + if os.path.isfile(learner_path) and use_pickle: + learner_file = open(learner_path, 'rb') + learner = pickle.load(learner_file) + learner_file.close() + else: + """ Read in files to use as training data. """ + training_set = tf.build_train_set(training_path) + learner = Learner() + """ Build the DataFrame of features""" + for code, language in training_set: + learner.train(code, language) + + """ If the classifier has been pickled, load it. + Else build the classifier. (also check use_pickle arg) """ + classifier_path = "classifier.data" + if os.path.isfile(classifier_path) and use_pickle: + classifier_file = open(classifier_path, 'rb') + classifier = pickle.load(classifier_file) + classifier_file.close() + else: + classifier = Classifier(learner.training_df) + + """ Build the list of files to test from testing_path. + If user input a specified file, open it and prepare it for test. """ + testing_set = tf.build_test_set(testing_path) + answers = tf.get_answers("guess_lang/test.csv") + testing_set = sorted(testing_set, key=lambda x: x[0]) + + # print("Decision Tree") + # correct = 0 + # for test_number,test in testing_set: + # analysis = learner.analyze(test) + # decision = classifier.decision_tree(analysis)[0].lower() + # if decision == (answers[test_number]): + # correct += 1 + # if single_file: + # print(test_number, ": ", decision) + # else: + # print(test_number, ": ", decision, + # "\tCorrect: ", answers[test_number]) + # print("Score: {}".format(correct/32)) + + # print("Random Forest") + # correct = 0 + # for test_number, test in testing_set: + # analysis = learner.analyze(test) + # decision = classifier.random_forest(analysis)[0].lower() + # if decision == (answers[test_number]): + # correct += 1 + # if single_file: + # print(test_number, ": ", decision) + # else: + # print(test_number, ": ", decision, + # "\tCorrect: ", answers[test_number]) + # if not single_file: + # print("Score: {}".format(correct/32)) + + print("Extra Trees Random Forest") + correct = 0 + for test_number, test in testing_set: + analysis = learner.analyze(test) + decision = classifier.extreme_random_forest(analysis)[0].lower() + if decision == (answers[test_number]): + correct_string = "Correct!" + correct += 1 + else: + correct_string = ("Incorrect: {}" + .format(answers[test_number].title())) + if single_file: + print(test_number, ": ", decision.title()) + else: + print(str.zfill(str(test_number), 2), ": ", + str.rjust(decision.title(), 10), + "\t", correct_string) + if not single_file: + print("Score: {}".format(correct/32)) + + # print("Linear SVC") + # correct = 0 + # for test_number,test in testing_set: + # analysis = learner.analyze(test) + # decision = classifier.linear_svc(analysis)[0].lower() + # if decision == (answers[test_number]): + # correct += 1 + # #print(test_number, ": ", decision) + # print("Score: {}".format(correct/32)) + + # print("Cluster") + # correct = 0 + # for test_number,test in testing_set: + # analysis = learner.analyze(test) + # decision = classifier.cluster(analysis)[0] + # if decision == (answers[test_number]): + # correct += 1 + # #print(test_number, ": ", decision) + # print("Score: {}".format(correct/32)) + + """ Pickle learner and classifier if they aren't already pickled. + If user specified -n, pickle the new learner and classifier. """ + if not os.path.isfile(learner_path) or not use_pickle: + output = open(learner_path, 'wb') + pickle.dump(learner, output, protocol=2) + output.close() + if not os.path.isfile(classifier_path) or not use_pickle: + output = open(classifier_path, 'wb') + pickle.dump(classifier, output, protocol=2) + output.close() diff --git a/guess_lang/classifier.py b/guess_lang/classifier.py new file mode 100644 index 0000000..306b913 --- /dev/null +++ b/guess_lang/classifier.py @@ -0,0 +1,101 @@ +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import re +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.svm import LinearSVC +from sklearn.cluster import KMeans + +LANGUAGES = ['Clojure', 'Haskell', 'Java', 'JavaScript', 'OCaml', + 'Perl', 'PHP', 'Python', 'Ruby', 'Scala', 'Scheme', 'Tcl'] + + +class Classifier: + + def __init__(self, training_df): + """ Uses the training data and builds a dataFrame for the test files. + Classifiers are attributes of this class so that they can be saved + through pickling. We don't need to save the actual classifiers, + only the results of each classifier's fit() method. """ + self.training_df = training_df + self.testing_df = pd.DataFrame() + self.dt_fit = None + self.rf_fit = None + self.et_fit = None + self.lsvc_fit = None + self.cl_fit = None + + def __str__(self): + return str(self.testing_df) + + """ Each classification method is roughly the same. + If a fit() attribute has not been loaded through pickling, build a new + classifier, give it the features (all but the last column) and + the class (the last column). Perform a fit() + Make and return a prediction, given the testframe. """ + + def decision_tree(self, testframe): + code_count = len(self.testing_df.index) + if not self.dt_fit: + tree = DecisionTreeClassifier() + features = self.training_df.ix[:, :-1] + classes = self.training_df.ix[:, -1] + self.dt_fit = tree.fit(features, classes) + prediction = self.dt_fit.predict(testframe) + return prediction + + def random_forest(self, testframe): + code_count = len(self.testing_df.index) + if not self.rf_fit: + forest = RandomForestClassifier(n_estimators=15, + criterion='gini', + max_features=None) + features = self.training_df.ix[:, :-1] + classes = self.training_df.ix[:, -1] + self.rf_fit = forest.fit(features, classes) + prediction = self.rf_fit.predict(testframe) + return prediction + + def extreme_random_forest(self, testframe): + code_count = len(self.testing_df.index) + if not self.et_fit: + extra_trees = ExtraTreesClassifier(n_estimators=15, + criterion='gini', + max_features=None) + features = self.training_df.ix[:, :-1] + classes = self.training_df.ix[:, -1] + self.et_fit = extra_trees.fit(features, classes) + prediction = self.et_fit.predict(testframe) + return prediction + + def linear_svc(self, testframe): + code_count = len(self.testing_df.index) + if not self.lsvc_fit: + linear_svc = LinearSVC(loss='l1') + features = self.training_df.ix[:, :-1] + classes = self.training_df.ix[:, -1] + self.lsvc_fit = linear_svc.fit(features, classes) + prediction = self.lsvc_fit.predict(testframe) + return prediction + + def cluster(self, testframe): + """ Clustering is unsupervised learning so what if we cluster + the codes and then run each cluster through random forest + or another supervised algorithm in order to actually identify each. + """ + code_count = len(self.testing_df.index) + cluster = KMeans(12) + # cluster.set_params(LANGUAGES) + features = self.training_df.ix[:, :-1] + classes = self.training_df.ix[:, -1] + try: + classifier = cluster.fit(features) # ,classes) + except: + print(features) + print(classes) + + prediction = classifier.predict(testframe) + print(prediction) + return prediction diff --git a/guess_lang/data/.DS_Store b/guess_lang/data/.DS_Store new file mode 100644 index 0000000..34c5e8d Binary files /dev/null and b/guess_lang/data/.DS_Store differ diff --git a/guess_lang/learner.py b/guess_lang/learner.py new file mode 100644 index 0000000..91633f1 --- /dev/null +++ b/guess_lang/learner.py @@ -0,0 +1,80 @@ +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import re + + +class Learner: + """ A learner builds a dataframe of features to train on. + When a new file is input, add a new row onto the dataframe. + Loop through the list of features, which will fill in the + corresponding columns. + Each row will be a piece of code. """ + + def __init__(self): + self.training_df = pd.DataFrame() + # A List of feature functions + self.features = [r'\$[\D]', r'[^;];[^.]', r';;[^;]', r';;;', + r'include', r'let', r'{[^-]', r'{-', + r'import', r'var', r'@', r'#', r'=>', r'js\.', r'/\*', + r'->', '\(\*', r'|[^|]', r'& args', r'', 'Snakes', 'Turtles', 'Snails', 'Salamanders', 'Slugs', + 'Newts') +print "\n" + +def boffo(a, b, c, d) + print "a = #{a} b = #{b}, c = #{c}, d = #{d}\n" +end + +# Use * to adapt between arrays and arguments +a1 = ['snack', 'fast', 'junk', 'pizza'] +a2 = [4, 9] +boffo(*a1) +boffo(17, 3, *a2) diff --git a/other_tests/multi_table.java b/other_tests/multi_table.java new file mode 100644 index 0000000..68d8f6a --- /dev/null +++ b/other_tests/multi_table.java @@ -0,0 +1,16 @@ +import java.util.Scanner; + +class MultiplicationTable +{ + public static void main(String args[]) + { + int n, c; + System.out.println("Enter an integer to print it's multiplication table"); + Scanner in = new Scanner(System.in); + n = in.nextInt(); + System.out.println("Multiplication table of "+n+" is :-"); + + for ( c = 1 ; c <= 10 ; c++ ) + System.out.println(n+"*"+c+" = "+(n*c)); + } +} diff --git a/other_tests/scanner.py b/other_tests/scanner.py new file mode 100644 index 0000000..fa37fa1 --- /dev/null +++ b/other_tests/scanner.py @@ -0,0 +1,24 @@ +import re +import sys +import urllib2 +import BeautifulSoup + +usage = "Run the script: ./geolocate.py IPAddress" + +if len(sys.argv)!=2: + print(usage) + sys.exit(0) + +if len(sys.argv) > 1: + ipaddr = sys.argv[1] + +geody = "http://www.geody.com/geoip.php?ip=" + ipaddr +html_page = urllib2.urlopen(geody).read() +soup = BeautifulSoup.BeautifulSoup(html_page) + +# Filter paragraph containing geolocation info. +paragraph = soup('p')[3] + +# Remove html tags using regex. +geo_txt = re.sub(r'<.*?>', '', str(paragraph)) +print geo_txt[32:].strip() diff --git a/requirements.txt b/requirements.txt index 9170871..ec229bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ scikit-learn +Pandas textblob \ No newline at end of file