-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnlpQ3.py
More file actions
71 lines (47 loc) · 1.78 KB
/
nlpQ3.py
File metadata and controls
71 lines (47 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import nltk
import string
from collections import Counter
import json
import pandas as pd
from sklearn.externals import joblib
import numpy as np
import math
from scipy.sparse import hstack
# Load the review dump: the file is read line by line and every line is
# parsed as its own JSON document.
data = []
with open('yelp_train_academic_dataset_review.json') as f:
    for line in f:
        data.append(json.loads(line))

# Pull the regression target and the raw review text out by key.
# The original selected these fields by position in `dict.values()`, which
# depends on dict ordering and is not subscriptable on Python 3.
# NOTE(review): 'stars' and 'text' are the Yelp review keys that positions
# 5 and 3 are presumed to have held -- confirm against the data file.
star = [review['stars'] for review in data]
TEXT = [review['text'] for review in data]
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words features over unigrams and bigrams.
#   * min_df=0.01 drops terms that occur in fewer than 1% of the reviews.
#   * max_df=1.0 means "no upper document-frequency cut-off".  The original
#     passed 2.5, which is outside the documented [0.0, 1.0] range for a
#     float: old scikit-learn treated it as "no cut-off" too, newer releases
#     reject it.
# NOTE(review): with lowercase=False, capitalised stop words (e.g. "The")
# presumably are not matched by the 'english' stop list -- confirm intended.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=0.01,
    max_df=1.0,
    lowercase=False,
    stop_words='english',
)
X = bigram_vectorizer.fit_transform(TEXT)

# Instantiated but never applied in this script: X stays a raw count matrix.
transformer = TfidfTransformer()

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); keep the result a plain list either way so the
# pickled artifact has the same type on every version.
if hasattr(bigram_vectorizer, 'get_feature_names_out'):
    feature_names = bigram_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = bigram_vectorizer.get_feature_names()
from sklearn.linear_model import Ridge

# sklearn.cross_validation and sklearn.grid_search were folded into
# sklearn.model_selection and the old modules were later removed, so prefer
# the new location and fall back to the legacy modules on old installations.
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

# Hold-out split left over from the alpha-tuning experiment described below;
# the final model is fitted on the full data set and does not use it.
X_train, X_test, y_train, y_test = train_test_split(
    X, star, test_size=0.33, random_state=42)

# NOTE: an earlier (now disabled) experiment ran
# GridSearchCV(estimator=Rmodel, param_grid=dict(alpha=alphas)) on the
# training split with alphas = [50, 35, 20, 5].  A leftover remark said
# "alpha = 100 is the best", which matches neither that grid nor the value
# used here -- re-run the search before trusting alpha = 500.
Rmodel = Ridge(alpha=500)

# Fit on every review, then predict and score on the same data.
Rmodelfit = Rmodel.fit(X, star)
Rmodelpred = Rmodel.predict(X)
# Score is computed on the training data itself, so it is an optimistic
# estimate of how the model generalises.
Rmodelscore = Rmodel.score(X, star)
# NOTE(review): sklearn.externals.joblib is the copy of joblib bundled with
# older scikit-learn releases; newer releases no longer ship it, so this
# import only works on the legacy versions this script targets.
from sklearn.externals import joblib

# Persist the fitted ridge model and the vocabulary (feature names) it was
# trained on, each to its own pickle file in the working directory.
joblib.dump(value=Rmodel, filename='nlp_q3.pkl')
joblib.dump(value=feature_names, filename='feature_names_q3.pkl')