diff --git a/README.md b/README.md index 8380c5f..6f79665 100644 --- a/README.md +++ b/README.md @@ -1,102 +1,8 @@ -# Classify code snippets into programming languages +All appropriate files are included in the directory "plc". To have my classifier determine what language a code snippet was written in, do the following: +1. Copy your code snippet into the file "code_snippet.txt" or make a .txt file of a different name with code snippet inside +2. run "python guess_lang.py code_snippet.txt" from the command line. (If you've named a file differently than code_snippet.txt then use that name instead.) +The program will tell you what language it thinks it is, as well as a percentage of how confident it is in that assessment. -## Description - -Create a classifier that can take snippets of code and guess the programming language of the code. - -## Objectives - -### Learning Objectives - -After completing this assignment, you should understand: - -* Feature extraction -* Classification -* The varied syntax of programming languages - -### Performance Objectives - -After completing this assignment, you should be able to: - -* Build a robust classifier - -## Details - -### Deliverables - -* A Git repo called programming-language-classifier containing at least: - * `README.md` file explaining how to run your project - * a `requirements.txt` file - * a suite of tests for your project - -### Requirements - -* Passing unit tests -* No PEP8 or Pyflakes warnings or errors - -## Normal Mode - -### Getting a corpus of programming languages - -Option 1: Get code from the [Computer Language Benchmarks Game](http://benchmarksgame.alioth.debian.org/). You can [download their code](https://alioth.debian.org/snapshots.php?group_id=100815) directly. In the downloaded archive under `benchmarksgame/bench`, you'll find many directories with short programs in them. Using the file extensions of these files, you should be able to find out what programming language they are. - -Option 2: Scrape code from [Rosetta Code](http://rosettacode.org/wiki/Rosetta_Code). You will need to figure out how to scrape HTML and parse it. [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/) is your best bet for doing that. - -Option 3: Get code from GitHub somehow. The specifics of this are left up to you. - -You are allowed to use other code samples as well. - -**For your sanity, you only have to worry about the following languages:** - -* Clojure -* Haskell -* Java -* JavaScript -* OCaml -* Perl -* PHP -* Python -* Ruby -* Scala -* Scheme -* Tcl - -Feel more than free to add others! - -### Classifying new snippets - -Using your corpus, you should extract features for your classifier. Use whatever classifier engine that works best for you _and that you can explain how it works._ - -Your initial classifier should be able to take a string containing code and return a guessed language for it. It is recommended you also have a method that returns the snippet's percentage chance for each language in a dict. - -### Testing your classifier - -The `test/` directory contains code snippets. The file `test.csv` contains a list of the file names in the `test` directory and the language of each snippet. Use this set of snippets to test your classifier. _Do not use the test snippets for training your classifier, obviously._ - -### Code layout - -This project should be laid out in accordance with the project layout from _The Hacker's Guide to Python_. It should have tests for things which can be tested. Your classifier should be able to be run with a small controlled corpus for testing. - -Your project should also contain an IPython notebook that demonstrates use of your classifier. - -## Hard Mode - -In addition to the requirements from **Normal Mode**: - -Create a runnable Python file that can classify a snippet in a text file, run like this: - -`guess_lang.py code-snippet.txt` - -where `guess_lang.py` is whatever you name your program and `code-snippet.txt` is any snippet. Your program should print out the language it thinks the snippet is. - -To do this, you will likely want to either pre-parse your corpus and output it as features to load or save out your classifier for later use. Otherwise, you'll have to read your entire corpus every time you run the program. That's acceptable, but slow. - -You may want to add some command-line flags to your program. You could allow people to choose the corpus, for example, or to get percentage chances instead of one language. To understand how to write a command-line program with arguments and flags, see the [argparse](https://docs.python.org/3/library/argparse.html) module in the standard library. - -## Additional Resources - -* [TextBlob](http://textblob.readthedocs.org/en/dev/) -* [Beautiful Soup](http://www.crummy.com/software/BeautifulSoup/) -* [Rosetta Code](http://rosettacode.org/wiki/Rosetta_Code) -* [Working with Text Files](https://opentechschool.github.io/python-data-intro/core/text-files.html) +The IPython notebook "Code Classifier.ipynb" demonstrates how the classifier is built and runs it against the test codes found in the "test" directory. Supplemental functions are found in modules referenced by the imports at the start of the notebook. +An additional IPython notebook "split and test.ipynb" contains a classifier trained on a portion of the Benchmarks game code and tested on the remainder of the code. diff --git a/plc/Code Classifier.ipynb b/plc/Code Classifier.ipynb new file mode 100644 index 0000000..da9fbfe --- /dev/null +++ b/plc/Code Classifier.ipynb @@ -0,0 +1,432 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:38f00d516f6fc9736605d73e40bc267b2e9567b9f50d67a67165e3e7bce12bf8" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from data_load import*\n", + "from features import*\n", + "from classifier import*\n", + "from random_guesser import*" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 203 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import os\n", + "import pandas as pd\n", + "import re\n", + "import random" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 204 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "The lists below are the basis for my features. For the word_list and symbol_list, the program creates a feature based on the number of occurences of the word or symbol divided by the number of characters in the code snippet. For the endings list, if the code snippet ends with one of those strings, the feature receives a value of 10. If it doesn't have one of the listed endings, the feature is valued at 0." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "word_list = ['let', 'end', 'defn', 'function', 'fun', 'return', 'def', 'return', 'check', 'make', '->', '.format',\n", + " 'define', '::', 'done', 'type', 'rescue', 'print', 'elif', 'clone', 'display', '$format', 'echo', 'str',\n", + " 'join', '&&', 'val', 'Nil', 'object', '<-', '--', 'lambda', 'var', '//', 'tmpl', 'public function',\n", + " 'stdlib', '=>', 'final', 'case', 'impl']\n", + "symbol_list = ['$', '^', ',', ';', '&', '|', '!', '*', '@', '#', '(', '{', ' ']\n", + "endings = ['end', ')', '}']" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 205 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "The following function creates the data frame of features based on the corpus of code snippets pulled from the Computer Language Benchmarks game." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def data_frame_generator():\n", + " codelist = code_sucker()\n", + " typelist = type_getter()\n", + " df = pd.DataFrame(typelist, index=range(386))\n", + " df.columns = [\"Language\"]\n", + " df[\"Code\"] = codelist\n", + " df['Language'] = df.Language.apply(lambda x:x.lower())\n", + " for string in word_list:\n", + " def sub_function(code):\n", + " x = string_ratio(string, code)\n", + " return x\n", + " df[string] = df.Code.apply(sub_function)\n", + " for char in symbol_list:\n", + " def sub_function2(code):\n", + " y = character_ratio(code, char)\n", + " return y\n", + " df[char] = df.Code.apply(sub_function2)\n", + " for ending in endings:\n", + " def sub_function3(code):\n", + " z = string_end(ending, code)\n", + " return z\n", + " df['_' + ending] = df.Code.apply(sub_function3)\n", + " return df" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 206 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df = data_frame_generator()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 207 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#df.head(2)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 208 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "The following function creates the data frame of features based on the code snippets provided for testing the classifier." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def tdata_frame_generator():\n", + " test_codelist = tcode_sucker()\n", + " df = pd.read_csv(\"test.csv\")\n", + " df[\"Code\"] = test_codelist\n", + " for string in word_list:\n", + " def sub_function(code):\n", + " x = string_ratio(string, code)\n", + " return x\n", + " df[string] = df.Code.apply(sub_function)\n", + " for char in symbol_list:\n", + " def sub_function2(code):\n", + " y = character_ratio(code, char)\n", + " return y\n", + " df[char] = df.Code.apply(sub_function2)\n", + " for ending in endings:\n", + " def sub_function3(code):\n", + " z = string_end(ending, code)\n", + " return z\n", + " df['_' + ending] = df.Code.apply(sub_function3)\n", + " return df" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 209 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df = tdata_frame_generator()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 210 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train, x_test, y_train, y_test = create_xy(df, test_df, word_list[0], 'Language')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 211 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "I used a Decision Tree Classifier and a Gaussian Naive Bayes Classifier. The Gaussian NB classifier scored higher, and thus I used that in my guess_lang.py program to be run from the console. I also made a random guesser. This is included as a morale booster for whenever I feel like my classifiers are not sufficiently effective." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.tree import DecisionTreeClassifier" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 212 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "tree = DecisionTreeClassifier()\n", + "run_classifier(tree, x_train, x_test, y_train, y_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " clojure 1.00 0.75 0.86 4\n", + " haskell 0.67 0.67 0.67 3\n", + " java 0.50 1.00 0.67 2\n", + " javascript 1.00 0.50 0.67 4\n", + " ocaml 1.00 1.00 1.00 2\n", + " perl 0.00 0.00 0.00 0\n", + " php 1.00 0.33 0.50 3\n", + " python 0.40 0.50 0.44 4\n", + " ruby 0.50 0.67 0.57 3\n", + " scala 0.40 1.00 0.57 2\n", + " scheme 0.00 0.00 0.00 3\n", + "\n", + "avg / total 0.66 0.60 0.59 30\n", + "\n", + "[[3 0 0 0 0 0 0 0 1 0 0]\n", + " [0 2 0 0 0 0 0 1 0 0 0]\n", + " [0 0 2 0 0 0 0 0 0 0 0]\n", + " [0 0 0 2 0 0 0 1 0 1 0]\n", + " [0 0 0 0 2 0 0 0 0 0 0]\n", + " [0 0 0 0 0 0 0 0 0 0 0]\n", + " [0 0 0 0 0 1 1 0 0 1 0]\n", + " [0 0 1 0 0 0 0 2 1 0 0]\n", + " [0 0 1 0 0 0 0 0 2 0 0]\n", + " [0 0 0 0 0 0 0 0 0 2 0]\n", + " [0 1 0 0 0 0 0 1 0 1 0]]\n", + "0.58544973545\n" + ] + } + ], + "prompt_number": 213 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "gauss = GaussianNB()\n", + "run_classifier(gauss, x_train, x_test, y_train, y_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " clojure 1.00 1.00 1.00 4\n", + " haskell 1.00 0.67 0.80 3\n", + " java 0.67 1.00 0.80 2\n", + " javascript 1.00 0.25 0.40 4\n", + " ocaml 1.00 1.00 1.00 2\n", + " perl 0.00 0.00 0.00 0\n", + " php 0.38 1.00 0.55 3\n", + " python 1.00 1.00 1.00 4\n", + " ruby 1.00 0.33 0.50 3\n", + " scala 0.00 0.00 0.00 2\n", + " scheme 1.00 1.00 1.00 3\n", + "\n", + "avg / total 0.85 0.73 0.72 30\n", + "\n", + "[[4 0 0 0 0 0 0 0 0 0 0]\n", + " [0 2 0 0 0 0 0 0 0 1 0]\n", + " [0 0 2 0 0 0 0 0 0 0 0]\n", + " [0 0 0 1 0 0 3 0 0 0 0]\n", + " [0 0 0 0 2 0 0 0 0 0 0]\n", + " [0 0 0 0 0 0 0 0 0 0 0]\n", + " [0 0 0 0 0 0 3 0 0 0 0]\n", + " [0 0 0 0 0 0 0 4 0 0 0]\n", + " [0 0 0 0 0 1 1 0 1 0 0]\n", + " [0 0 1 0 0 0 1 0 0 0 0]\n", + " [0 0 0 0 0 0 0 0 0 0 3]]\n", + "0.724545454545\n" + ] + } + ], + "prompt_number": 214 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "random_guesser(y_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 215, + "text": [ + "0.0" + ] + } + ], + "prompt_number": 215 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Attempting to pickle my classifier" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import pickle" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 216 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pickleclf = GaussianNB()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 217 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pickleclf.fit(x_train, y_train)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 218, + "text": [ + "GaussianNB()" + ] + } + ], + "prompt_number": 218 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "file_Name = \"picklefile\"\n", + "fileObject = open(file_Name,'wb')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 219 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pickle.dump(pickleclf, fileObject)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 220 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fileObject.close()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 221 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fileObject = open('picklefile','r')\n", + "clf = pickle.load(fileObject)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "ename": "UnicodeDecodeError", + "evalue": "'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte", + "output_type": "pyerr", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mfileObject\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'picklefile'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mclf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfileObject\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/Users/bretrunestad/.pyenv/versions/3.4.2/lib/python3.4/codecs.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, input, final)\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[0;31m# decode input (taking the buffer into account)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 313\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconsumed\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfinal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 314\u001b[0m \u001b[0;31m# keep undecoded input until the next call\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mconsumed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte" + ] + } + ], + "prompt_number": 222 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/plc/classifier.py b/plc/classifier.py new file mode 100644 index 0000000..0539ca8 --- /dev/null +++ b/plc/classifier.py @@ -0,0 +1,22 @@ +from sklearn import metrics + + +def create_xy(df, test_df, startx, columny): + x_train = df.loc[:, startx:] + x_test = test_df.loc[:, startx:] + y_train = df[columny] + y_test = test_df[columny] + return x_train, x_test, y_train, y_test + + +def run_classifier(clf, x_train, x_test, y_train, y_test): + clf.fit(x_train, y_train) + predicted = clf.predict(x_test) + print(metrics.classification_report(y_test, predicted)) + print(metrics.confusion_matrix(y_test, predicted)) + print(metrics.f1_score(y_test, predicted)) + +def create_train(df, startx, columny): + x_train = df.loc[:, startx:] + y_train = df[columny] + return x_train, y_train diff --git a/plc/code_snippet.txt b/plc/code_snippet.txt new file mode 100644 index 0000000..9f1e454 --- /dev/null +++ b/plc/code_snippet.txt @@ -0,0 +1,14 @@ +module BinarySearch where + +a :: [Int] +a = [6, 13, 14, 25, 33, 43, 51, 53, 64, 72, 84, 93, 95, 96, 97] + +bsearch :: Int -> [Int] -> Int +bsearch _ [] = -1 +bsearch key xs + | key < val = bsearch key (take (mid-1) xs) + | key > val = bsearch key (drop (mid+1) xs) + | otherwise = val + where + mid = floor ((fromIntegral $ (length xs) - 1) / 2) + val = xs !! mid diff --git a/plc/data_load.py b/plc/data_load.py new file mode 100644 index 0000000..e68c016 --- /dev/null +++ b/plc/data_load.py @@ -0,0 +1,80 @@ +import os +import pandas as pd + +filename_dict = {".clj":"Clojure", + ".cljs": "Clojure", + ".edn": "Clojure", + ".clojure": "Clojure", + ".hs": "Haskell", + ".lhs": "Haskell", + ".java": "Java", + ".class": "Java", + ".jar": "Java", + ".js": "Javascript", + ".javascript": "Javascript", + ".ocaml": "OCaml", + ".ml": "OCaml", + ".pl": "Perl", + ".pm": "Perl", + ".t": "Perl", + ".pod": "Perl", + ".php": "PHP", + ".perl": "Perl", + ".phtml": "PHP", + ".php4": "PHP", + ".php3": "PHP", + ".php5": "PHP", + ".phps": "PHP", + ".py": "Python", + ".pyw": "Python", + ".pyc": "Python", + ".pyo": "Python", + ".pyd": "Python", + ".python3": "Python", + ".Python2": "Python", + ".rb": "Ruby", + ".rbw": "Ruby", + ".jruby": "Ruby", + ".scala": "Scala", + ".scm": "Scheme", + ".ss": "Scheme", + ".tcl": "Tcl", + ".racket": "Scheme", + ".ghc": "Haskell"} + +def tuple_maker(adict): + lista = [] + for key in adict: + lista.append(key) + tup = tuple(lista) + return tup + +def code_sucker(): + codelist = [] + for subdir, dirs, files in os.walk("bench/"): + for fname in files: + if fname.endswith(tuple_maker(filename_dict)): + #print(fname) + with open(os.path.join(subdir, fname)) as current_file: + codelist.append(current_file.read()) + return codelist + +def type_getter(): + rootDir = 'bench' + typelist = [] + for subdir, dirs, files in os.walk(rootDir): + for fname in files: + #print('\t%s' % fname) + name, extension = os.path.splitext(fname) + if extension in filename_dict: + #print(filename_dict[extension]) + typelist.append(filename_dict[extension]) + return typelist + +def tcode_sucker(): + codelist = [] + for subdir, dirs, files in os.walk("test/"): + for fname in files: + with open(os.path.join(subdir, fname)) as current_file: + codelist.append(current_file.read()) + return codelist diff --git a/plc/features.py b/plc/features.py new file mode 100644 index 0000000..4df7f87 --- /dev/null +++ b/plc/features.py @@ -0,0 +1,26 @@ +import re + +def character_counter(code, char): + counter = 0 + for _ in code: + if _ == char: + counter+=1 + return counter + +def character_ratio(code, char): + value = character_counter(code, char)/len(code) + return value + +def string_finder(string, code): + value = len(re.findall(string, code)) + return value + +def string_ratio(string, code): + value = string_finder(string, code)/len(code) + return value + +def string_end(string, code): + if code.endswith(string): + return 10 + else: + return 0 diff --git a/plc/guess_lang.py b/plc/guess_lang.py new file mode 100644 index 0000000..766a14e --- /dev/null +++ b/plc/guess_lang.py @@ -0,0 +1,89 @@ +from data_load import* +from features import* +from classifier import* + +from sklearn.naive_bayes import GaussianNB + +import os +import pandas as pd +import re +import random +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("filename") +args = parser.parse_args() + +word_list = ['let', 'end', 'defn', 'function', 'fun', 'return', 'def', 'return', 'check', 'make', '->', '.format', + 'define', '::', 'done', 'type', 'rescue', 'print', 'elif', 'clone', 'display', '$format', 'echo', 'str', + 'join', '&&', 'val', 'Nil', 'object', '<-', '--', 'lambda', 'var', '//', 'tmpl', 'public function', + 'stdlib', '=>', 'final', 'case', 'impl'] +symbol_list = ['$', '^', ',', ';', '&', '|', '!', '*', '@', '#'] +endings = ['end', ')', '}'] + +def data_frame_generator(): + codelist = code_sucker() + typelist = type_getter() + df = pd.DataFrame(typelist, index=range(386)) + df.columns = ["Language"] + df["Code"] = codelist + df['Language'] = df.Language.apply(lambda x:x.lower()) + for string in word_list: + def sub_function(code): + x = string_ratio(string, code) + return x + df[string] = df.Code.apply(sub_function) + for char in symbol_list: + def sub_function2(code): + y = character_ratio(code, char) + return y + df[char] = df.Code.apply(sub_function2) + for ending in endings: + def sub_function3(code): + z = string_end(ending, code) + return z + df['_' + ending] = df.Code.apply(sub_function3) + return df + +def x_data_frame_generator(): + test_list = [] + with open(args.filename) as test_file: + test_list.append(test_file.read()) + df = pd.DataFrame(test_list, index=range(1), columns=['Code']) + for string in word_list: + def sub_function(code): + x = string_ratio(string, code) + return x + df[string] = df.Code.apply(sub_function) + for char in symbol_list: + def sub_function2(code): + y = character_ratio(code, char) + return y + df[char] = df.Code.apply(sub_function2) + for ending in endings: + def sub_function3(code): + z = string_end(ending, code) + return z + df['_' + ending] = df.Code.apply(sub_function3) + return df + + +def probability_display(df): + n = gauss.predict_proba(xdf.loc[:, 'let':]) + prob_list = n.tolist()[0] + programs_list = ['Clojure', 'Haskell', 'Java', 'Javascript', 'Ocaml', 'Perl', 'Php', 'Python', 'Ruby', 'Scala', 'Scheme'] + percent = 0 + for item in prob_list: + if item > percent: + percent = item + idx = prob_list.index(item) + print("The code snippet is written in {}".format(programs_list[idx])) + print("Confidence: {}%".format(percent*100)) + +if __name__ == "__main__": + df = data_frame_generator() + gauss = GaussianNB() + x_train, y_train = create_train(df, word_list[0], 'Language') + gauss.fit(x_train, y_train) + xdf = x_data_frame_generator() + probability_display(xdf) diff --git a/plc/picklefile b/plc/picklefile new file mode 100644 index 0000000..2f1a3e6 Binary files /dev/null and b/plc/picklefile differ diff --git a/plc/random_guesser.py b/plc/random_guesser.py new file mode 100644 index 0000000..d155743 --- /dev/null +++ b/plc/random_guesser.py @@ -0,0 +1,29 @@ +import random + +def random_guesser(y_test): + lang_list = ['clojure', 'python', 'javascript', 'ruby', 'haskell', 'scheme', 'java', 'scala', 'php', 'ocaml'] + correct_count = 0 + total_count = 0 + for answer in y_test: + if answer == random.choice(lang_list): + correct_count+=1 + total_count+=1 + else: + total_count+=1 + return correct_count/total_count + + +def average_of_random_guesser(y_test, number): + alist = [] + for num in range(number): + alist.append(random_guesser(y_test)) + x = sum(alist)/len(alist) + return x + + +def max_of_random_guesser(y_test, number): + alist = [] + for num in range(number): + alist.append(random_guesser(y_test)) + x = max(alist) + return x diff --git a/plc/split and test.ipynb b/plc/split and test.ipynb new file mode 100644 index 0000000..fe1e4a7 --- /dev/null +++ b/plc/split and test.ipynb @@ -0,0 +1,211 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:d92b8193818cf4fb449ca66b377d7ceb050fe6af17428898526d4210e2ddb23a" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from data_load import*\n", + "from features import*\n", + "from classifier import*\n", + "from guess_lang import*" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import os\n", + "import pandas as pd\n", + "import re\n", + "import random" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 7 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "word_list = ['let', 'end', 'defn', 'function', 'fun', 'return', 'def', 'return', 'check', 'make', '->', '.format',\n", + " 'define', '::', 'done', 'type', 'rescue', 'print', 'elif', 'clone', 'display', '$format', 'echo', 'str',\n", + " 'join', '&&', 'val', 'Nil', 'object', '<-', '--', 'lambda', 'var', '//', 'tmpl', 'public function',\n", + " 'stdlib', '=>', 'final', 'case', 'impl']\n", + "symbol_list = ['$', '^', ',', ';', '&', '|', '!', '*', '@', '#']\n", + "endings = ['end', ')', '}']" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 8 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.cross_validation import train_test_split\n", + "from sklearn import metrics" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df = data_frame_generator()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 10 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "xdata = df.loc[:, 'let':]\n", + "ydata = df['Language']" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 11 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train, x_test, y_train, y_test = train_test_split(xdata, ydata, test_size=0.3, random_state=0)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 12 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.tree import DecisionTreeClassifier" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 13 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "tree = DecisionTreeClassifier()\n", + "run_classifier(tree, x_train, x_test, y_train, y_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " clojure 1.00 1.00 1.00 11\n", + " haskell 0.80 0.80 0.80 10\n", + " java 1.00 0.80 0.89 15\n", + " javascript 0.70 0.88 0.78 8\n", + " ocaml 0.75 1.00 0.86 9\n", + " perl 1.00 0.82 0.90 11\n", + " php 0.82 1.00 0.90 9\n", + " python 0.77 1.00 0.87 10\n", + " ruby 1.00 0.83 0.91 12\n", + " scala 1.00 0.93 0.96 14\n", + " scheme 1.00 0.71 0.83 7\n", + "\n", + "avg / total 0.91 0.89 0.89 116\n", + "\n", + "[[11 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 8 0 0 1 0 0 1 0 0 0]\n", + " [ 0 0 12 3 0 0 0 0 0 0 0]\n", + " [ 0 1 0 7 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 9 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 9 2 0 0 0 0]\n", + " [ 0 0 0 0 0 0 9 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 10 0 0 0]\n", + " [ 0 0 0 0 0 0 0 2 10 0 0]\n", + " [ 0 1 0 0 0 0 0 0 0 13 0]\n", + " [ 0 0 0 0 2 0 0 0 0 0 5]]\n", + "0.889563785928\n" + ] + } + ], + "prompt_number": 14 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "gauss = GaussianNB()\n", + "run_classifier(gauss, x_train, x_test, y_train, y_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " clojure 0.92 1.00 0.96 11\n", + " haskell 0.91 1.00 0.95 10\n", + " java 1.00 0.87 0.93 15\n", + " javascript 0.83 0.62 0.71 8\n", + " ocaml 1.00 1.00 1.00 9\n", + " perl 0.92 1.00 0.96 11\n", + " php 0.82 1.00 0.90 9\n", + " python 1.00 0.70 0.82 10\n", + " ruby 0.77 0.83 0.80 12\n", + " scala 0.87 0.93 0.90 14\n", + " scheme 1.00 1.00 1.00 7\n", + "\n", + "avg / total 0.91 0.91 0.90 116\n", + "\n", + "[[11 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 10 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 13 1 0 0 1 0 0 0 0]\n", + " [ 0 0 0 5 0 0 1 0 0 2 0]\n", + " [ 0 0 0 0 9 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 11 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 9 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 7 3 0 0]\n", + " [ 1 1 0 0 0 0 0 0 10 0 0]\n", + " [ 0 0 0 0 0 1 0 0 0 13 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 7]]\n", + "0.902561751579\n" + ] + } + ], + "prompt_number": 15 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/test.csv b/plc/test.csv similarity index 95% rename from test.csv rename to plc/test.csv index adbf5dd..289113e 100644 --- a/test.csv +++ b/plc/test.csv @@ -24,8 +24,6 @@ Filename,Language 23,java 24,scala 25,scala -26,tcl -27,tcl 28,php 29,php 30,php diff --git a/test/1 b/plc/test/01 similarity index 100% rename from test/1 rename to plc/test/01 diff --git a/test/2 b/plc/test/02 similarity index 100% rename from test/2 rename to plc/test/02 diff --git a/test/3 b/plc/test/03 similarity index 100% rename from test/3 rename to plc/test/03 diff --git a/test/4 b/plc/test/04 similarity index 100% rename from test/4 rename to plc/test/04 diff --git a/test/5 b/plc/test/05 similarity index 100% rename from test/5 rename to plc/test/05 diff --git a/test/6 b/plc/test/06 similarity index 100% rename from test/6 rename to plc/test/06 diff --git a/test/7 b/plc/test/07 similarity index 100% rename from test/7 rename to plc/test/07 diff --git a/test/8 b/plc/test/08 similarity index 100% rename from test/8 rename to plc/test/08 diff --git a/test/9 b/plc/test/09 similarity index 100% rename from test/9 rename to plc/test/09 diff --git a/test/10 b/plc/test/10 similarity index 100% rename from test/10 rename to plc/test/10 diff --git a/test/11 b/plc/test/11 similarity index 100% rename from test/11 rename to plc/test/11 diff --git a/test/12 b/plc/test/12 similarity index 100% rename from test/12 rename to plc/test/12 diff --git a/test/13 b/plc/test/13 similarity index 100% rename from test/13 rename to plc/test/13 diff --git a/test/14 b/plc/test/14 similarity index 100% rename from test/14 rename to plc/test/14 diff --git a/test/15 b/plc/test/15 similarity index 100% rename from test/15 rename to plc/test/15 diff --git a/test/16 b/plc/test/16 similarity index 100% rename from test/16 rename to plc/test/16 diff --git a/test/17 b/plc/test/17 similarity index 100% rename from test/17 rename to plc/test/17 diff --git a/test/18 b/plc/test/18 similarity index 100% rename from test/18 rename to plc/test/18 diff --git a/test/19 b/plc/test/19 similarity index 100% rename from test/19 rename to plc/test/19 diff --git a/test/20 b/plc/test/20 similarity index 100% rename from test/20 rename to plc/test/20 diff --git a/test/21 b/plc/test/21 similarity index 100% rename from test/21 rename to plc/test/21 diff --git a/test/22 b/plc/test/22 similarity index 100% rename from test/22 rename to plc/test/22 diff --git a/test/23 b/plc/test/23 similarity index 100% rename from test/23 rename to plc/test/23 diff --git a/test/24 b/plc/test/24 similarity index 100% rename from test/24 rename to plc/test/24 diff --git a/test/25 b/plc/test/25 similarity index 100% rename from test/25 rename to plc/test/25 diff --git a/test/28 b/plc/test/28 similarity index 100% rename from test/28 rename to plc/test/28 diff --git a/test/29 b/plc/test/29 similarity index 100% rename from test/29 rename to plc/test/29 diff --git a/test/30 b/plc/test/30 similarity index 100% rename from test/30 rename to plc/test/30 diff --git a/test/31 b/plc/test/31 similarity index 100% rename from test/31 rename to plc/test/31 diff --git a/test/32 b/plc/test/32 similarity index 100% rename from test/32 rename to plc/test/32 diff --git a/plc/tests/test_features.py b/plc/tests/test_features.py new file mode 100644 index 0000000..e297bf7 --- /dev/null +++ b/plc/tests/test_features.py @@ -0,0 +1,26 @@ +from plc.features import* + +def test_character_counter(): + testcode = "This is a string with some funky characters !@*!@*(#^&!)" + assert character_counter(testcode, 's') == 5 + assert character_counter(testcode, '!') == 3 + assert character_counter(testcode, ' ') == 8 + assert character_counter(testcode, '(') == 1 + +# testcode length is 56 + +def test_character_ratio(): + testcode = "This is a string with some funky characters !@*!@*(#^&!)" + assert character_ratio(testcode, ' ') == 8/56 + +def test_string_finder(): + test2code = ("""let min_depth = 4 let n = if Array.length + Sys.argv <> 2 then 0 else int_of_string Sys.argv.(1)let + max_depth = max (min_depth + 2) nlet stretch_depth = + max_depth + 1""") + assert string_finder("let", test2code) == 4 + +def test_string_end(): + test3code = "boring test string()" + assert string_end("ing()", test3code) == 10 + assert string_end("monkey", test3code) == 0 diff --git a/plc/workbook.ipynb b/plc/workbook.ipynb new file mode 100644 index 0000000..46b5484 --- /dev/null +++ b/plc/workbook.ipynb @@ -0,0 +1,1135 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:95e12a41859fa3e312230e9c8cd06933a5646f54cec92e1fc473f5eecb3bbc3e" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import os\n", + "import pandas as pd\n", + "import re" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 290 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "filename_dict = {\".clj\":\"Clojure\",\n", + " \".cljs\": \"Clojure\",\n", + " \".edn\": \"Clojure\",\n", + " \".clojure\": \"Clojure\",\n", + " \".hs\": \"Haskell\",\n", + " \".lhs\": \"Haskell\",\n", + " \".java\": \"Java\",\n", + " \".class\": \"Java\",\n", + " \".jar\": \"Java\",\n", + " \".js\": \"Javascript\",\n", + " \".javascript\": \"Javascript\",\n", + " \".ocaml\": \"OCaml\",\n", + " \".ml\": \"OCaml\",\n", + " \".pl\": \"Perl\",\n", + " \".pm\": \"Perl\",\n", + " \".t\": \"Perl\",\n", + " \".pod\": \"Perl\",\n", + " \".php\": \"PHP\",\n", + " \".perl\": \"Perl\",\n", + " \".phtml\": \"PHP\",\n", + " \".php4\": \"PHP\",\n", + " \".php3\": \"PHP\",\n", + " \".php5\": \"PHP\",\n", + " \".phps\": \"PHP\",\n", + " \".py\": \"Python\",\n", + " \".pyw\": \"Python\",\n", + " \".pyc\": \"Python\",\n", + " \".pyo\": \"Python\",\n", + " \".pyd\": \"Python\",\n", + " \".python3\": \"Python\",\n", + " \".Python2\": \"Python\",\n", + " \".rb\": \"Ruby\",\n", + " \".rbw\": \"Ruby\",\n", + " \".jruby\": \"Ruby\",\n", + " \".scala\": \"Scala\",\n", + " \".scm\": \"Scheme\",\n", + " \".ss\": \"Scheme\",\n", + " \".tcl\": \"Tcl\",\n", + " \".racket\": \"Scheme\",\n", + " \".ghc\": \"Haskell\"}" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 291 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def tuple_maker(adict):\n", + " lista = []\n", + " for key in adict:\n", + " lista.append(key)\n", + " tup = tuple(lista)\n", + " return tup" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 292 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def code_sucker():\n", + " codelist = []\n", + " for subdir, dirs, files in os.walk(\"bench/\"):\n", + " for fname in files:\n", + " if fname.endswith(tuple_maker(filename_dict)):\n", + " #print(fname)\n", + " with open(os.path.join(subdir, fname)) as current_file:\n", + " codelist.append(current_file.read()) \n", + " return codelist" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 293 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "codelist = code_sucker()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 294 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "len(codelist)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 295, + "text": [ + "386" + ] + } + ], + "prompt_number": 295 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def type_getter():\n", + " rootDir = 'bench'\n", + " typelist = []\n", + " for subdir, dirs, files in os.walk(rootDir):\n", + " for fname in files:\n", + " #print('\\t%s' % fname)\n", + " name, extension = os.path.splitext(fname)\n", + " if extension in filename_dict:\n", + " #print(filename_dict[extension])\n", + " typelist.append(filename_dict[extension])\n", + " return typelist" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 296 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "typelist = type_getter()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 297 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "len(typelist)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 298, + "text": [ + "386" + ] + } + ], + "prompt_number": 298 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df = pd.DataFrame(typelist, index=range(386))" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 299 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df.columns = [\"Language\"]\n", + "df[\"Code\"] = codelist" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 300 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df['Language'] = df.Language.apply(lambda x:x.lower())" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 301 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LanguageCode
0 clojure ;; The Computer Language Benchmarks Game\\n;; h...
1 clojure ;; The Computer Language Benchmarks Game\\n;; h...
2 clojure ;; The Computer Language Benchmarks Game\\n;; h...
3 haskell --\\n-- The Computer Language Benchmarks Game\\n...
4 haskell --\\n-- The Computer Language Benchmarks Game\\n...
5 haskell --\\n-- The Computer Language Benchmarks Game\\n...
6 java /* The Computer Language Benchmarks Game\\n h...
7 java /* The Computer Language Benchmarks Game\\n h...
8 java /* The Computer Language Benchmarks Game\\n * h...
9 javascript /* The Computer Language Benchmarks Game\\n h...
10 ruby # The Computer Language Shootout Benchmarks\\n#...
11 ruby # The Computer Language Benchmarks Game\\n# htt...
12 ruby # The Computer Language Benchmarks Game\\n# htt...
13 ruby # The Computer Language Benchmarks Game\\n# htt...
14 ocaml (* The Computer Language Benchmarks Game\\n * h...
15 ocaml (* The Computer Language Benchmarks Game\\n * h...
16 ocaml (* The Computer Language Benchmarks Game\\n * h...
17 perl # The Computer Language Benchmarks Game\\n# htt...
18 perl # The Computer Language Benchmarks Game\\n# htt...
19 php <?php \\n/* The Computer Language Benchmarks Ga...
20 php <?php \\n/* The Computer Language Benchmarks Ga...
21 php <?php \\n/* The Computer Language Benchmarks Ga...
22 php <?php \\n/* The Computer Language Benchmarks Ga...
23 python # The Computer Language Benchmarks Game\\n# htt...
24 scheme #lang racket/base\\n\\n;;; The Computer Language...
25 scheme #lang racket/base\\n\\n;;; The Computer Language...
26 scheme #lang racket/base\\n\\n;;; The Computer Language...
27 scala /* The Computer Language Benchmarks Game\\n h...
28 scala /* The Computer Language Benchmarks Game\\n h...
29 scala /* The Computer Language Benchmarks Game\\n h...
.........
356 python # The Computer Language Benchmarks Game\\n# htt...
357 python # The Computer Language Benchmarks Game\\n# htt...
358 python # The Computer Language Benchmarks Game\\n# htt...
359 scheme #lang racket/base\\n\\n;;; The Computer Language...
360 scheme #lang racket/base\\n\\n;;; The Computer Language...
361 scheme #lang racket/base\\n;; The Computer Language Be...
362 scala /* The Computer Language Benchmarks Game\\n h...
363 scala /* The Computer Language Benchmarks Game\\n h...
364 scala /* The Computer Language Benchmarks Game\\n h...
365 scala /* The Computer Language Benchmarks Game\\n h...
366 clojure ;; The Computer Language Benchmarks Game\\n;; h...
367 clojure ;; The Computer Language Benchmarks Game\\n;; h...
368 haskell -- The Computer Language Benchmarks Game\\n-- h...
369 java /**\\n * The Computer Language Benchmarks Game\\...
370 java /**\\n * The Computer Language Benchmarks Game\\...
371 java /**\\n * The Computer Language Benchmarks Game\\...
372 java /**\\n * The Computer Language Benchmarks Game\\...
373 java /**\\n * The Computer Language Benchmarks Game\\...
374 java /**\\n * The Computer Language Benchmarks Game\\...
375 ruby # The Computer Language Benchmarks Game\\n# htt...
376 ruby # The Computer Language Benchmarks Game\\n# htt...
377 ocaml (* The Computer Language Benchmarks Game\\n * h...
378 ocaml (* The Computer Language Benchmarks Game\\n * h...
379 ocaml (* The Computer Language Benchmarks Game\\n * h...
380 perl # The Computer Language Benchmarks Game\\n# htt...
381 perl # The Computer Language Benchmarks Game\\n# htt...
382 python # The Computer Language Benchmarks Game\\n# htt...
383 python # The Computer Language Benchmarks Game\\n# htt...
384 scheme #lang racket/base\\n\\n;;; The Computer Language...
385 scala /* The Computer Language Benchmarks Game\\n h...
\n", + "

386 rows \u00d7 2 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 302, + "text": [ + " Language Code\n", + "0 clojure ;; The Computer Language Benchmarks Game\\n;; h...\n", + "1 clojure ;; The Computer Language Benchmarks Game\\n;; h...\n", + "2 clojure ;; The Computer Language Benchmarks Game\\n;; h...\n", + "3 haskell --\\n-- The Computer Language Benchmarks Game\\n...\n", + "4 haskell --\\n-- The Computer Language Benchmarks Game\\n...\n", + "5 haskell --\\n-- The Computer Language Benchmarks Game\\n...\n", + "6 java /* The Computer Language Benchmarks Game\\n h...\n", + "7 java /* The Computer Language Benchmarks Game\\n h...\n", + "8 java /* The Computer Language Benchmarks Game\\n * h...\n", + "9 javascript /* The Computer Language Benchmarks Game\\n h...\n", + "10 ruby # The Computer Language Shootout Benchmarks\\n#...\n", + "11 ruby # The Computer Language Benchmarks Game\\n# htt...\n", + "12 ruby # The Computer Language Benchmarks Game\\n# htt...\n", + "13 ruby # The Computer Language Benchmarks Game\\n# htt...\n", + "14 ocaml (* The Computer Language Benchmarks Game\\n * h...\n", + "15 ocaml (* The Computer Language Benchmarks Game\\n * h...\n", + "16 ocaml (* The Computer Language Benchmarks Game\\n * h...\n", + "17 perl # The Computer Language Benchmarks Game\\n# htt...\n", + "18 perl # The Computer Language Benchmarks Game\\n# htt...\n", + "19 php >= readIO . head\\n let maxN = max (minN + 2) n\\n stretchN = maxN + 1\\n\\n -- stretch memory tree\\n let c = check (make 0 stretchN)\\n io \"stretch tree\" stretchN c\\n\\n -- allocate a long lived tree\\n let !long = make 0 maxN\\n\\n -- allocate, walk, and deallocate many bottom-up binary trees\\n let vs = depth minN maxN\\n mapM_ (\\\\((m,d,i)) -> io (show m ++ \"\\\\t trees\") d i) vs\\n\\n -- confirm the the long-lived binary tree still exists\\n io \"long lived tree\" maxN (check long)\\n\\n-- generate many trees\\ndepth :: Int -> Int -> [(Int,Int,Int)]\\ndepth d m\\n | d <= m = (2*n,d,sumT d n 0) : depth (d+2) m\\n | otherwise = []\\n where n = 1 `shiftL` (m - d + minN)\\n\\n-- allocate and check lots of trees\\nsumT :: Int -> Int -> Int -> Int\\nsumT d 0 t = t\\nsumT d i t = sumT d (i-1) (t + a + b)\\n where a = check (make i d)\\n b = check (make (-i) d)\\n\\n-- traverse the tree, counting up the nodes\\ncheck :: Tree -> Int\\ncheck Nil = 0\\ncheck (Node i l r) = i + check l - check r\\n\\n-- build a tree\\nmake :: Int -> Int -> Tree\\nmake i 0 = Node i Nil Nil\\nmake i d = Node i (make (i2-1) d2) (make i2 d2)\\n where i2 = 2*i; d2 = d-1\\n'" + ] + } + ], + "prompt_number": 303 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def length_getter(code):\n", + " return len(code)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 304 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def character_counter(code, char):\n", + " counter = 0\n", + " for _ in code:\n", + " if _ == char:\n", + " counter+=1\n", + " return counter\n", + "\n", + "def character_ratio(code, char):\n", + " value = character_counter(code, char)/len(code)\n", + " return value\n", + " " + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 305 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def comma_counter(code):\n", + " counter = 0\n", + " for _ in code:\n", + " if _ == ',':\n", + " counter+=1\n", + " return counter\n", + "\n", + "def semicolon_counter(code):\n", + " counter = 0\n", + " for _ in code:\n", + " if _ == ';':\n", + " counter+=1\n", + " return counter" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 306 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def string_finder(string, code):\n", + " value = len(re.findall(string, code))\n", + " return value\n", + "\n", + "def string_ratio(string, code):\n", + " value = string_finder(string, code)/len(code)\n", + " return value" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 307 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df['code length'] = df.Code.apply(length_getter)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 308 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "word_list = ['let', 'end', 'defn', 'function', 'fun', 'return', 'def', 'return', 'check', 'make', '->', '.format',\n", + " 'define', '::', '$', '^', ',', ';', '&', '|', '!']" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 309 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def data_frame_generator():\n", + " codelist = code_sucker()\n", + " typelist = type_getter()\n", + " df = pd.DataFrame(typelist, index=range(386))\n", + " df.columns = [\"Language\"]\n", + " df[\"Code\"] = codelist\n", + " df['Language'] = df.Language.apply(lambda x:x.lower())\n", + " for string in word_list:\n", + " def sub_function(code):\n", + " x = string_ratio(string, code)\n", + " return x\n", + " df[string] = df.Code.apply(sub_function)\n", + " return df" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 310 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df = data_frame_generator()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 311 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#df" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 312 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 312 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from textblob import TextBlob" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 313 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def code_lookup(df, num):\n", + " codeblob = TextBlob(df.iloc[num-1]['Code'])\n", + " print(df.iloc[num-1]['Language'])\n", + " print(\"\")\n", + " print(codeblob)\n", + " " + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 314 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "code_lookup(test_df, 13)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'Code'", + "output_type": "pyerr", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcode_lookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m13\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mcode_lookup\u001b[0;34m(df, num)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcode_lookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mcodeblob\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextBlob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnum\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Code'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnum\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Language'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcodeblob\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/bretrunestad/.pyenv/versions/charting/lib/python3.4/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 508\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 509\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 510\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 511\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misscalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/bretrunestad/.pyenv/versions/charting/lib/python3.4/site-packages/pandas/core/index.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m 1429\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mInvalidIndexError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1430\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1431\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1432\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pragma: no cover\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1433\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/bretrunestad/.pyenv/versions/charting/lib/python3.4/site-packages/pandas/core/index.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m 1415\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1416\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1417\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1418\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1419\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minferred_type\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'integer'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'boolean'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/bretrunestad/.pyenv/versions/charting/lib/python3.4/site-packages/pandas/index.so\u001b[0m in \u001b[0;36mpandas.index.IndexEngine.get_value (pandas/index.c:3096)\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m/Users/bretrunestad/.pyenv/versions/charting/lib/python3.4/site-packages/pandas/index.so\u001b[0m in \u001b[0;36mpandas.index.IndexEngine.get_value (pandas/index.c:2827)\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m/Users/bretrunestad/.pyenv/versions/charting/lib/python3.4/site-packages/pandas/index.so\u001b[0m in \u001b[0;36mpandas.index.IndexEngine.get_loc (pandas/index.c:3687)\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m/Users/bretrunestad/.pyenv/versions/charting/lib/python3.4/site-packages/pandas/hashtable.so\u001b[0m in \u001b[0;36mpandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12310)\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m/Users/bretrunestad/.pyenv/versions/charting/lib/python3.4/site-packages/pandas/hashtable.so\u001b[0m in \u001b[0;36mpandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12261)\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'Code'" + ] + } + ], + "prompt_number": 315 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def tcode_sucker():\n", + " codelist = []\n", + " for subdir, dirs, files in os.walk(\"test/\"):\n", + " for fname in files:\n", + " with open(os.path.join(subdir, fname)) as current_file:\n", + " codelist.append(current_file.read()) \n", + " return codelist" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def tdata_frame_generator():\n", + " test_codelist = tcode_sucker()\n", + " df = pd.read_csv(\"test.csv\")\n", + " df[\"Code\"] = test_codelist\n", + " for string in word_list:\n", + " def sub_function(code):\n", + " x = string_ratio(string, code)\n", + " return x\n", + " df[string] = df.Code.apply(sub_function)\n", + " return df" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df = tdata_frame_generator()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "codelist=tcode_sucker()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "codelist" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_codelist = tcode_sucker()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df = pd.read_csv(\"test.csv\")" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df[\"Testcode\"] = test_codelist" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df['code length'] = test_df.Testcode.apply(length_getter)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df['^'] = df.Code.apply(carat)\n", + "test_df[';'] = df.Code.apply(semicolon)\n", + "test_df[','] = df.Code.apply(comma)\n", + "test_df['&'] = df.Code.apply(carat)\n", + "test_df['!'] = df.Code.apply(semicolon)\n", + "test_df['|'] = df.Code.apply(comma)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df['let'] = df.Code.apply(let_ratio)\n", + "test_df['end'] = df.Code.apply(end_ratio)\n", + "test_df['fun'] = df.Code.apply(fun_ratio)\n", + "test_df['defn'] = df.Code.apply(defn_ratio)\n", + "test_df['def'] = df.Code.apply(def_ratio)\n", + "test_df['function'] = df.Code.apply(function_ratio)\n", + "test_df['slice'] = df.Code.apply(slice_ratio)\n", + "test_df['return'] = df.Code.apply(return_ratio)\n", + "test_df['define'] = df.Code.apply(define_ratio)\n", + "test_df['doublecolon'] = df.Code.apply(doublecolon_ratio)\n", + "test_df['check'] = df.Code.apply(check_ratio)\n", + "test_df['make'] = df.Code.apply(make_ratio)\n", + "test_df['.format'] = df.Code.apply(format_ratio)\n", + "test_df['->'] = df.Code.apply(arrow_ratio)\n", + "test_df['$'] = df.Code.apply(dollar_ratio)\n", + "test_df['printf'] = df.Code.apply(printf_ratio)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn import metrics" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def create_xy(df, test_df startx, columny):\n", + " x_train = df.loc[:, startx:]\n", + " x_test = test_df.loc[:, startx:]\n", + " y_train = df[columny]\n", + " y_test = test_df[columny]\n", + " return x_train, x_test, y_train, y_test" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train = df.loc[:,\"^\":]\n", + "x_test = test_df.loc[:, \"^\":]\n", + "y_train = df.Language\n", + "y_test = test_df.Language" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train, x_test, y_train, y_test = create_xy('let', 'Language')" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def run_classifier(clf, x_train, x_test, y_train, y_test):\n", + " clf.fit(x_train, y_train)\n", + " predicted = clf.predict(x_test)\n", + " print(metrics.classification_report(y_test, predicted))\n", + " print(metrics.confusion_matrix(y_test, predicted))\n", + " print(metrics.f1_score(y_test, predicted))" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.cross_validation import cross_val_score\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import RandomForestClassifier" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "tree = DecisionTreeClassifier()\n", + "run_classifier(tree, x_train, x_test, y_train, y_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "gauss = GaussianNB()\n", + "run_classifier(gauss, x_train, x_test, y_train, y_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9170871..76af5b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ +IPython[all] scikit-learn -textblob \ No newline at end of file +textblob diff --git a/test/26 b/test/26 deleted file mode 100644 index 182f919..0000000 --- a/test/26 +++ /dev/null @@ -1,35 +0,0 @@ -proc isaac::mix {a b c d e f g h} { - set a [expr {($a ^ ($b << 11)) & 0xffffffff}] - set d [expr {($d + $a) & 0xffffffff}] - set b [expr {($b + $c) & 0xffffffff}] - - set b [expr {($b ^ ($c >> 2)) & 0xffffffff}] - set e [expr {($e + $b) & 0xffffffff}] - set c [expr {($c + $d) & 0xffffffff}] - - set c [expr {($c ^ ($d << 8)) & 0xffffffff}] - set f [expr {($f + $c) & 0xffffffff}] - set d [expr {($d + $e) & 0xffffffff}] - - set d [expr {($d ^ ($e >> 16)) & 0xffffffff}] - set g [expr {($g + $d) & 0xffffffff}] - set e [expr {($e + $f) & 0xffffffff}] - - set e [expr {($e ^ ($f << 10)) & 0xffffffff}] - set h [expr {($h + $e) & 0xffffffff}] - set f [expr {($f + $g) & 0xffffffff}] - - set f [expr {($f ^ ($g >> 4)) & 0xffffffff}] - set a [expr {($a + $f) & 0xffffffff}] - set g [expr {($g + $h) & 0xffffffff}] - - set g [expr {($g ^ ($h << 8)) & 0xffffffff}] - set b [expr {($b + $g) & 0xffffffff}] - set h [expr {($h + $a) & 0xffffffff}] - - set h [expr {($h ^ ($a >> 9)) & 0xffffffff}] - set c [expr {($c + $h) & 0xffffffff}] - set a [expr {($a + $b) & 0xffffffff}] - - return [list $a $b $c $d $e $f $g $h] -} diff --git a/test/27 b/test/27 deleted file mode 100644 index 902ec5c..0000000 --- a/test/27 +++ /dev/null @@ -1,20 +0,0 @@ -proc twitter::follow {nick uhost hand chan argv} { - if {![channel get $chan twitter]} { return } - - if {[string length $argv] < 1} { - $twitter::output_cmd "PRIVMSG $chan :Usage: !follow " - return - } - - if {[catch {::twitlib::query $::twitlib::follow_url [list screen_name $argv]} result]} { - $twitter::output_cmd "PRIVMSG $chan :Twitter failed or already friends with $argv!" - return - } - - if {[dict exists $result error]} { - twitter::output $chan "Follow failed ($argv): [dict get $result error]" - return - } - - twitter::output $chan "Now following [dict get $result screen_name]!" -} \ No newline at end of file