diff --git a/spam.ipynb b/spam.ipynb new file mode 100644 index 0000000..e998c63 --- /dev/null +++ b/spam.ipynb @@ -0,0 +1,924 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:87246e0b638c75117130cc2dcee08ee41768dd2111d3f025edc934e53bfbb81c" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.feature_selection import VarianceThreshold\n", + "from sklearn import cluster\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn import metrics\n", + "from sklearn.cross_validation import cross_val_score\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn import tree\n", + "from sklearn.tree import DecisionTreeClassifier, export_graphviz\n", + "from sklearn import decomposition\n", + "from sklearn.externals.six import StringIO " + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 151 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Read the pandas data frame and get the values out as a numpy array" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "spam_file = pd.read_csv(\"spambase.data\", header=None)\n", + "spam=spam_file.values" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 131 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Get the last column out: Response Variable" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "spam_target = spam[0::,-1]\n", + "spam_target.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 144, + "text": [ + "array([ 1., 1., 1., ..., 0., 0., 0.])" + ] + } + ], + "prompt_number": 144 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Get all columns but the last as a numpy array of predictors" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "spam_data = spam[0::,:-1:]\n", + "spam_data.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 39, + "text": [ + "(4601, 57)" + ] + } + ], + "prompt_number": 39 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Split the dataset into train and test sets, keeping predictors and response separate" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train, x_test, y_train, y_test = train_test_split(spam_data, spam_target, test_size=0.4, random_state=0)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 40 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Gaussian Naive Bayes: 80%" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = GaussianNB()\n", + "clf = clf.fit(x_train, y_train)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 41 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "predicted = clf.predict(x_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 42 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print(metrics.classification_report(y_test, 
predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data, spam_target, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.94 0.73 0.83 1097\n", + " 1.0 0.70 0.93 0.80 744\n", + "\n", + "avg / total 0.85 0.82 0.82 1841\n", + "\n", + "[[806 291]\n", + " [ 49 695]]\n", + "0.803468208092\n" + ] + } + ], + "prompt_number": 44 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Decision Tree: 87%" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = tree.DecisionTreeClassifier()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 147 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = clf.fit(x_train, y_train)\n", + "predicted = clf.predict(x_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 148 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data, spam_target, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.92 0.91 0.91 1097\n", + " 1.0 0.87 0.88 0.88 744\n", + "\n", + "avg / total 0.90 0.90 0.90 1841\n", + "\n", + "[[999 98]\n", + " [ 88 656]]\n", + "0.875834445928\n" + ] + } + ], + "prompt_number": 149 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "\n", + "with open(\"spam_tree.dot\", 'w') as f:\n", + " f = export_graphviz(clf, out_file=f)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 152 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "!dot -Tpdf spam_tree.dot -o spam_tree.pdf" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 154 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Logistic Regression: 89%" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = LogisticRegression()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 52 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = clf.fit(x_train, y_train)\n", + "predicted = clf.predict(x_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 53 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data, spam_target, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.92 0.95 0.93 1097\n", + " 1.0 0.92 0.87 0.89 744\n", + "\n", + "avg / total 0.92 0.92 0.91 1841\n", + "\n", + "[[1037 60]\n", + " [ 96 648]]\n", + "0.892561983471\n" + ] + } + ], + "prompt_number": 54 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + 
"Nearest Neighbors: 68%" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = KNeighborsClassifier(n_neighbors=2)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 68 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = clf.fit(x_train, y_train)\n", + "predicted = clf.predict(x_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "[ 1. 0. 0. ..., 1. 1. 0.]\n" + ] + } + ], + "prompt_number": 75 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data, spam_target, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.76 0.94 0.84 1097\n", + " 1.0 0.86 0.56 0.68 744\n", + "\n", + "avg / total 0.80 0.79 0.77 1841\n", + "\n", + "[[1026 71]\n", + " [ 324 420]]\n", + "0.68016194332\n" + ] + } + ], + "prompt_number": 70 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Dimensionality Reduction: PCA" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pca = decomposition.RandomizedPCA(n_components=50, whiten=True)\n", + "pca.fit(x_train, y_train)\n", + "x_train_pca = pca.transform(x_train)\n", + "x_test_pca = pca.transform(x_test)\n", + "selected_vectors_trainx = x_train_pca[::,0:10:]\n", + "selected_vectors_testx = x_test_pca[::,0:10:]\n", + "#print(pca.explained_variance_ratio_)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 158 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Logistic Regression - Reduction through Principal Component Analysis 80%" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = LogisticRegression()\n", + "\n", + "clf = clf.fit(selected_vectors_trainx, y_train)\n", + "predicted = clf.predict(selected_vectors_testx)\n", + "\n", + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data, spam_target, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.86 0.90 0.88 1097\n", + " 1.0 0.84 0.78 0.81 744\n", + "\n", + "avg / total 0.85 0.85 0.85 1841\n", + "\n", + "[[990 107]\n", + " [166 578]]\n", + "0.808957312806\n" + ] + } + ], + "prompt_number": 159 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Reduction through Variance Threshold" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "sel = VarianceThreshold()\n", + "x_train_var = sel.fit_transform(x_train)\n", + "x_test_var = sel.fit_transform(x_test)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 111 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Logistic Regression - Reduction through Variance Threshold 89%" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = LogisticRegression()\n", + "\n", + "clf = 
clf.fit(x_train_var, y_train)\n", + "predicted = clf.predict(x_test_var)\n", + "\n", + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data, spam_target, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.91 0.94 0.93 1097\n", + " 1.0 0.91 0.87 0.89 744\n", + "\n", + "avg / total 0.91 0.91 0.91 1841\n", + "\n", + "[[1036 61]\n", + " [ 97 647]]\n", + "0.891184573003\n" + ] + } + ], + "prompt_number": 112 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Reduction through Variable Selection" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#Word Frequency\n", + "spam_data_words = spam[0::,0:48:]\n", + "spam_data_words.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 117, + "text": [ + "(4601, 48)" + ] + } + ], + "prompt_number": 117 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#char Frequency\n", + "spam_data_chars = spam[0::,48:54:]\n", + "spam_data_chars.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 121, + "text": [ + "(4601, 6)" + ] + } + ], + "prompt_number": 121 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#word + chars freqs\n", + "spam_data_words_chars = spam[0::,0:54:]\n", + "spam_data_words_chars.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 132, + "text": [ + "(4601, 54)" + ] + } + ], + "prompt_number": 132 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#word + chars freqs + cap avg\n", + "spam_data_words_chars_cap = spam[0::,0:55:]\n", + "spam_data_words_chars_cap.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 136, + "text": [ + "(4601, 55)" + ] + } + ], + "prompt_number": 136 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#capitals average\n", + "spam_data_cap_avg = spam[0::,54:55:]\n", + "spam_data_cap_avg.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 122, + "text": [ + "(4601, 1)" + ] + } + ], + "prompt_number": 122 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#capital longest\n", + "spam_data_cap_longest = spam[0::,55:56:]\n", + "spam_data_cap_longest.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 123, + "text": [ + "(4601, 1)" + ] + } + ], + "prompt_number": 123 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#capital total\n", + "spam_data_cap_total = spam[0::,56:57:]\n", + "spam_data_cap_total.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 124, + "text": [ + "(4601, 1)" + ] + } + ], + "prompt_number": 124 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Logistic Regression with Word and Char Frequencies: 88%" + ] + }, + { + 
"cell_type": "code", + "collapsed": false, + "input": [ + "x_train, x_test, y_train, y_test = train_test_split(spam_data_words_chars, spam_target, test_size=0.4, random_state=0)\n", + "clf = LogisticRegression()\n", + "\n", + "clf = clf.fit(x_train, y_train)\n", + "predicted = clf.predict(x_test)\n", + "\n", + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data_words_chars, spam_target, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.91 0.94 0.93 1097\n", + " 1.0 0.91 0.87 0.89 744\n", + "\n", + "avg / total 0.91 0.91 0.91 1841\n", + "\n", + "[[1032 65]\n", + " [ 100 644]]\n", + "0.88644184446\n" + ] + } + ], + "prompt_number": 138 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Logistic Regression with Word Frequencies: 87%" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train, x_test, y_train, y_test = train_test_split(spam_data_words, spam_target, test_size=0.4, random_state=0)\n", + "clf = LogisticRegression()\n", + "\n", + "clf = clf.fit(x_train, y_train)\n", + "predicted = clf.predict(x_test)\n", + "\n", + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data_words, spam_target, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.90 0.93 0.92 1097\n", + " 1.0 0.90 0.85 0.88 744\n", + "\n", + "avg / total 0.90 0.90 0.90 1841\n", + "\n", + "[[1025 72]\n", + " [ 108 636]]\n", + "0.876033057851\n" + ] + } + ], + "prompt_number": 140 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Logistic Regression with Char Frequencies: 72%" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train, x_test, y_train, y_test = train_test_split(spam_data_chars, spam_target, test_size=0.4, random_state=0)\n", + "clf = LogisticRegression()\n", + "\n", + "clf = clf.fit(x_train, y_train)\n", + "predicted = clf.predict(x_test)\n", + "\n", + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data_chars, spam_target, cv=5)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.78 0.95 0.86 1097\n", + " 1.0 0.90 0.60 0.72 744\n", + "\n", + "avg / total 0.83 0.81 0.80 1841\n", + "\n", + "[[1047 50]\n", + " [ 297 447]]\n", + "0.720386784851\n" + ] + } + ], + "prompt_number": 141 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Logistic Regression with Word and Char Frequencies as well as capital average: 88%" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_train, x_test, y_train, y_test = train_test_split(spam_data_words_chars_cap, spam_target, test_size=0.4, random_state=0)\n", + "clf = LogisticRegression()\n", + "\n", + "clf = clf.fit(x_train, 
y_train)\n", + "predicted = clf.predict(x_test)\n", + "\n", + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))\n", + "scores = cross_val_score(clf, spam_data_words_chars_cap, spam_target, cv=5)\n", + "\n" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.91 0.94 0.93 1097\n", + " 1.0 0.91 0.87 0.89 744\n", + "\n", + "avg / total 0.91 0.91 0.91 1841\n", + "\n", + "[[1034 63]\n", + " [ 98 646]]\n", + "0.889194769443\n" + ] + } + ], + "prompt_number": 142 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Conclusion" + ] + }, + { + "cell_type": "heading", + "level": 6, + "metadata": {}, + "source": [ + "The best performance was obtained by running a logistic regression on all variables. 89.5% success ratio. Results:\n", + "\n", + "Log Regression: 89%\n", + "Gaussian: 80%\n", + "Decision Tree: 87%\n", + "Nearest Neighbors: 67%\n", + "\n", + "Different feature reduction methods were also tested: principal components, variance threshold and different subsets of variables but there were no major improvements. It is important to notice that the char frequencies don't bring much to the model. More of the data variability is explained by the word frequencies only (87%). If 87% was an acceptable level of success, it would be better to use the model with just the word frequencies." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file