This repository was archived by the owner on Sep 21, 2021. It is now read-only.

Final Project Submission #198

Open
wants to merge 28 commits into master
28 commits
b79acdf
updating custom-training1.txt
serra98 Apr 12, 2021
a4a0f6e
new updates/adding sql queries
serra98 Apr 15, 2021
82d02e2
Merge branch 'master' into serra2
serra98 Apr 15, 2021
42d73b7
Merge pull request #12 from meganmastro/serra2
serra98 Apr 15, 2021
13de474
Update sql_queries.sql
serra98 Apr 16, 2021
c96127d
Merge pull request #13 from meganmastro/serra2
serra98 Apr 16, 2021
17f4148
created unsupervised/supervised folder
serra98 Apr 22, 2021
54119f1
Create README.md
meganmastro Apr 23, 2021
cabbd78
creating deliverable folder and adding scrum files in corresponding d…
serra98 Apr 29, 2021
34a3c2b
adding/moving SCRUM files to corresponding folder
serra98 Apr 29, 2021
c4009cc
pushing SCRUMs and reorganizing files
meganmastro Apr 30, 2021
ef8489c
adding final report and final extraction code
meganmastro May 1, 2021
37690eb
Update README.md
meganmastro May 1, 2021
f75df98
Update README.md
meganmastro May 1, 2021
dce4fea
Update README.md
meganmastro May 1, 2021
a45cbc9
Update README.md
meganmastro May 1, 2021
6e0317c
Update README.md
meganmastro May 1, 2021
eafb1bd
Update README.md
meganmastro May 1, 2021
0205d23
Update README.md
meganmastro May 1, 2021
3486fed
Update README.md
meganmastro May 1, 2021
ff24ed4
Update README.md
meganmastro May 1, 2021
26ec8f6
Update README.md
meganmastro May 1, 2021
53fc779
Update README.md
meganmastro May 1, 2021
b22f50b
Update README.md
meganmastro May 1, 2021
f38a3ac
Update README.md
meganmastro May 1, 2021
614d4b4
Update README.md
meganmastro May 1, 2021
ed5595e
Create README.md
meganmastro May 1, 2021
620b43a
Update README.md
meganmastro May 1, 2021
Binary file added Civera/Civera Project Final Report.pdf
Binary file not shown.
3,033 changes: 3,033 additions & 0 deletions Civera/Code/extraction_pipeline.ipynb

Large diffs are not rendered by default.

106 changes: 106 additions & 0 deletions Civera/Code/sql_queries.sql
@@ -0,0 +1,106 @@
--SQL CODE--

-- Get Distinct Actions --
SELECT distinct (c_a_index.action)
FROM wp_courtdocs.cdocs_case_action_index as c_a_index
WHERE c_a_index.action != " "

-- Already filled/Trained --
SELECT c_a_index.case_action_id, c_a_index.case_id, c_a_index.actor, c_a_index.action , c_a_index.description
FROM wp_courtdocs.cdocs_case_action_index as c_a_index
WHERE c_a_index.actor != " " and c_a_index.action != " "

-- Rows we have to update --
SELECT c_a_index.case_action_id, c_a_index.case_id, c_a_index.actor, c_a_index.action , c_a_index.description
FROM wp_courtdocs.cdocs_case_action_index as c_a_index
WHERE c_a_index.actor = " " or c_a_index.action = " "

-- Get Number of Rows Where Action is NULL or Actor is NULL
SELECT count(cdocs_case_action_index.case_action_id)
FROM wp_courtdocs.cdocs_case_action_index
where cdocs_case_action_index.actor = " " or cdocs_case_action_index.action = " "

-- HOW TO INSERT / EXAMPLE --
INSERT INTO wp_courtdocs_NORMALIZED.distinct_case_actions (action, description)
VALUES ('Continuance' , 'continued for payment'),
('Corporate disclosure statement' , 'Corporate disclosure statement filled by'),
('Counterclaim filed' , 'counterclaim filed by')

-- Table to Update later --
INSERT INTO wp_courtdocs_NORMALIZED.cdocs_case_action_index (case_action_id, case_id, actor, action, description, date_time, file_reference_number, last_indexed)


-- ROW NUMBER --
SELECT *
FROM (
SELECT @curRow := @curRow + 1 AS row_number, wp_courtdocs.cdocs_case_action_index.case_action_id
FROM wp_courtdocs.cdocs_case_action_index
JOIN (
SELECT @curRow := 0
) r
WHERE wp_courtdocs.cdocs_case_action_index.actor = " " or wp_courtdocs.cdocs_case_action_index.action = " "
) sub
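
-- Alternative (assumption: only if the server is MySQL 8.0+, which has window functions) --
-- Same numbering of the blank-actor/blank-action rows without session variables --
SELECT ROW_NUMBER() OVER (ORDER BY c_a_index.case_action_id) AS row_number,
       c_a_index.case_action_id
FROM wp_courtdocs.cdocs_case_action_index AS c_a_index
WHERE c_a_index.actor = " " or c_a_index.action = " "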


-- ROW NUMBER with UNIQUE CASE_ACTION_ID (Action = NULL or Actor = NULL) --
-- saved data in table "wp_court_docs_NORMALIZED.case_index_num" --
SELECT row_num, case_action_id
FROM wp_court_docs_NORMALIZED.case_index_num

-- Total Rows: 38159737 --

-- Dividing Work/Chunking --
-- example query: let's call this Query1 --
SELECT c_n.row_num, c_i.*
FROM wp_courtdocs.cdocs_case_action_index as c_i
INNER JOIN wp_court_docs_NORMALIZED.case_index_num as c_n on c_i.case_action_id = c_n.case_action_id
WHERE row_num < 50000


-- Just In case Unsupervised Learning doesn't work --
-- Training Set --

SELECT c_a_index.actor, c_a_index.action , c_a_index.description, c.description as preprocessed_desc
FROM wp_courtdocs.cdocs_case_action_index as c_a_index
INNER JOIN wp_courtdocs_NORMALIZED.distinct_case_actions as c on c_a_index.action = c.action
WHERE c_a_index.action != " " and c_a_index.actor != " " and c_a_index.description REGEXP
(SELECT GROUP_CONCAT(c.description SEPARATOR '|')
FROM wp_courtdocs_NORMALIZED.distinct_case_actions as c)
ORDER BY RAND() LIMIT 50000

-- Test Set --
SELECT c_a_index.actor, c_a_index.action , c_a_index.description
FROM wp_courtdocs.cdocs_case_action_index as c_a_index
WHERE c_a_index.action = " " and c_a_index.description REGEXP
(SELECT GROUP_CONCAT(c.description SEPARATOR '|')
FROM wp_courtdocs_NORMALIZED.distinct_case_actions as c) LIMIT 100

-- IF We have to USE REGEX / Might be helpful to use this --
SELECT c_a_index.action , c_a_index.actor, c_a_index.description
FROM wp_courtdocs.cdocs_case_action_index as c_a_index
where c_a_index.description REGEXP (SELECT GROUP_CONCAT(description SEPARATOR '|')
FROM wp_courtdocs_NORMALIZED.distinct_case_actions)


-- EXAMPLE FOR CONNECTING FROM PYTHON --
-- import mysql.connector
-- import pandas as pd

-- mydb = mysql.connector.connect(host='', user='', password='')

-- if mydb:
--     print("Connection Successful")
-- else:
--     print("Connection Unsuccessful")

-- mycursor = mydb.cursor()

-- query1 = ''' SELECT ... FROM ... '''
-- pd.read_sql_query(query1, mydb)

-- Dividing Work/Chunking --
-- example query: let's call this Query1 --
SELECT c_n.row_num, c_i.*
FROM wp_courtdocs.cdocs_case_action_index as c_i
INNER JOIN wp_court_docs_NORMALIZED.case_index_num as c_n on c_i.case_action_id = c_n.case_action_id
WHERE row_num < 50000

-- pd.read_sql_query(Query1, mydb)
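
-- Hedged sketch, not part of the original file: one way to drive the chunking idea --
-- above from Python, pulling one 50000-row slice of Query1 at a time by row_num.    --
-- The loop variable names and the per-slice WHERE bounds are assumptions.           --
-- chunk_size = 50000
-- total_rows = 38159737
-- for start in range(0, total_rows, chunk_size):
--     chunk_query = f'''
--         SELECT c_n.row_num, c_i.*
--         FROM wp_courtdocs.cdocs_case_action_index as c_i
--         INNER JOIN wp_court_docs_NORMALIZED.case_index_num as c_n
--             ON c_i.case_action_id = c_n.case_action_id
--         WHERE c_n.row_num >= {start} AND c_n.row_num < {start + chunk_size}'''
--     chunk_df = pd.read_sql_query(chunk_query, mydb)
--     # fill in actor/action for chunk_df here before moving to the next slice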
236 changes: 236 additions & 0 deletions Civera/Code/supervised/train_action.py
@@ -0,0 +1,236 @@
import swifter
import numpy as np
import pandas as pd
import seaborn as sns
import re
import math
from csv import writer
import copy
import os
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from sklearn.metrics import accuracy_score
import mysql.connector

mydb = mysql.connector.connect(host='', user='', password='')

if mydb:
    print("Connection Successful")
else:
    print("Connection Unsuccessful")

mycursor = mydb.cursor()

#code for new custom training Set
# SELECT distinct(c_a_index.action) ,
# count(c_a_index.description),
# ROUND(count(c_a_index.description) * 0.01,0) as rounded
# FROM wp_courtdocs.cdocs_case_action_index as c_a_index
# where c_a_index.action != " " and c_a_index.actor != " "
# group by c_a_index.action

# Load sql to dataframe
# Get Training Set (Action != NULL and Actor != NULL)
# Getting 50000 values first
custom_training = pd.read_csv("C:\\Users\\Serra\\Desktop\\CS506Spring2021Repository\\Civera\\Data\\custom-training.txt", error_bad_lines=False)
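
# Hedged sketch (an assumption, not the team's pipeline): one way custom-training.txt
# could be regenerated straight from the database, sampling roughly 1% of descriptions
# per action as the commented count query above suggests. Needs pandas >= 1.1 for
# DataFrameGroupBy.sample(); the query text and frac value here are illustrative.
# labeled_q = '''SELECT c_a_index.action, c_a_index.actor, c_a_index.description
#                FROM wp_courtdocs.cdocs_case_action_index as c_a_index
#                WHERE c_a_index.action != " " and c_a_index.actor != " "'''
# labeled = pd.read_sql_query(labeled_q, mydb)
# custom_training = labeled.groupby('action', group_keys=False).sample(frac=0.01, random_state=42)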


query1 = '''SELECT c_a_index.actor, c_a_index.action , c_a_index.description, c.description as preprocessed_desc FROM wp_courtdocs.cdocs_case_action_index as c_a_index INNER JOIN wp_courtdocs_NORMALIZED.distinct_case_actions as c on c_a_index.action = c.action where c_a_index.action != " " and c_a_index.actor != " " and c_a_index.description REGEXP (SELECT GROUP_CONCAT(c.description SEPARATOR '|') FROM wp_courtdocs_NORMALIZED.distinct_case_actions as c) ORDER BY RAND() LIMIT 50000'''
case_index_not_null = pd.read_sql_query(query1,mydb)
columns = ['action','description','preprocessed_desc']
trainSet3 = case_index_not_null[columns]
#print(trainSet1.head())


# Get Test Set (Action = NULL)
# Getting 1000 values first
query2 = '''SELECT c_a_index.actor, c_a_index.action , c_a_index.description FROM wp_courtdocs.cdocs_case_action_index as c_a_index where c_a_index.action = " " and c_a_index.description REGEXP (SELECT GROUP_CONCAT(c.description SEPARATOR '|') FROM wp_courtdocs_NORMALIZED.distinct_case_actions as c) LIMIT 1000'''
action_null = pd.read_sql_query(query2,mydb)
columns1 = ['action','description']
trainSet = custom_training
testSet = action_null[columns1]

print(trainSet.head())
print(testSet.head())

# Get Distinct Values of Actions Field with Index Number
path1 = "C:\\Users\\Serra\\Desktop\\CS506Spring2021Repository\\Civera\\Data\\distinct-case-actions.txt"
distinct_actions = pd.read_csv(path1)
print(distinct_actions.head())

#merge training set with distinct-case-actions.txt to get the index value for distinct actions
trainSet = trainSet.merge(distinct_actions, on='action')
print(trainSet.head())

r = re.compile(r'[^\w\s]+')

testSet['description'] = [r.sub('', x) for x in testSet['description'].tolist()]
testSet['description'] = testSet['description'].str.lower().str.split()
print(testSet.head())

stopwords = stopwords.words('english')
#remove stopwords in trainSet
# trainSet['description'] = trainSet['description'].apply(lambda x: [item for item in x if item not in stopwords])
# print("stopwords")
# print(trainSet.head())
# print()

#remove stopwords in testSet
testSet['description'] = testSet['description'].apply(lambda x: [item for item in x if item not in stopwords])
print(testSet.head())

#use Lemmatizer for train and test set
#lemmatizer = WordNetLemmatizer()
#trainSet['description'] = trainSet['description'].apply(lambda x:[lemmatizer.lemmatize(word) for word in x])
#testSet['description'] = testSet['description'].apply(lambda x:[lemmatizer.lemmatize(word) for word in x])

#remove duplicate words after lemmatizing
#trainSet['description'] = trainSet['description'].apply(lambda x:list(dict.fromkeys(x)))
#print()
#print('trainingSet after lemmatizer & removing dupes ')
#print(trainSet.head())

#testSet['description'] = testSet['description'].apply(lambda x:list(dict.fromkeys(x)))
#print()
#print('testSet after lemmatizer & removing dupes ')
#print(testSet.head())

#join back
# trainSet1['description'] = trainSet1 ['description'].apply(lambda x:' '.join(x))
#testSet['description'] = testSet['description'].apply(lambda x:[item for item in x if len(x) < 7])
testSet['description'] = testSet['description'].apply(lambda x:' '.join(x))
# print()

# trainSet1['description'] = trainSet1['description'].astype('str')
testSet['description'] = testSet['description'].astype('str')

tags = ["IN", "CC", "CD"]
testSet['description'] = testSet['description'].apply(lambda x:[a[0] for a in nltk.pos_tag(word_tokenize(x)) if a[1] not in tags ])
testSet['description'] = testSet['description'].apply(' '.join)
print(testSet.head())

#print("testSet - get rid of wrong words/misspelled words")
#words = set(nltk.corpus.words.words())
#testSet['description'] = testSet['description'].apply(lambda x:[w for w in nltk.wordpunct_tokenize(x) if w.lower() in words or not w.isalpha() ])
#print(testSet.head())

#testSet['description'] = testSet['description'].apply(' '.join)
#print(testSet.head())

#copy
trainSet1 = copy.deepcopy(trainSet)
testSet1 = copy.deepcopy(testSet)

print("preprocessing done")
print(trainSet1.head())
print(testSet1.head())

path6 = 'C:\\Users\\Serra\\Desktop\\CS506Spring2021Repository\\Civera\\Data\\preprocessed-test.txt'
testSet1.to_csv(path6, mode='w', index = False)

#train-test-split starts
#X = trainSet1['description']
X = trainSet1['description']
y = trainSet1['action_index']

print("train-test-split processing")
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)


clf1 = Pipeline([('tfidf', TfidfVectorizer()),('rdf',RandomForestClassifier()),])
# training data through the pipeline
clf1.fit(X_train, y_train)

#RandomForest Prediction
prediction1 = clf1.predict(testSet1['description'])
#prediction1 = clf1.predict(testSet1['preprocessed_desc'])
print(prediction1.shape)
print(prediction1)
print()

# Score on the held-out split (the pulled test set has no labels to compare against)
val_pred1 = clf1.predict(X_test)
print('RF accuracy score')
print(accuracy_score(y_test, val_pred1))
print('Mean squared error')
print(mean_squared_error(y_test, val_pred1))
print("RMSE on testing set = ", math.sqrt(mean_squared_error(y_test, val_pred1)))

print ('RF accuracy: TRAINING', clf1.score(X_train,y_train))
print ('RF accuracy: TESTING', clf1.score(X_test,y_test))

clf2 = Pipeline([('tfidf', TfidfVectorizer()),('mnb',MultinomialNB()),])
# training data through the pipeline
clf2.fit(X_train, y_train)

#MultinomialNB Prediction
prediction2 = clf2.predict(testSet1['description'])
#prediction2 = clf2.predict(testSet1['preprocessed_desc'])
print(prediction2.shape)

# Score on the held-out split
val_pred2 = clf2.predict(X_test)
print('MNB accuracy score')
print(accuracy_score(y_test, val_pred2))
print ('MNB accuracy: TRAINING', clf2.score(X_train,y_train))
print ('MNB accuracy: TESTING', clf2.score(X_test,y_test))


clf3 = Pipeline([('tfidf', TfidfVectorizer()),('lsvc', LinearSVC(dual=False,C = 0.2)),])
# training data through the pipeline
clf3.fit(X_train, y_train)

#LinearSVC Prediction
prediction3 = clf3.predict(testSet1['description'])
#prediction3 = clf3.predict(testSet1['preprocessed_desc'])
print(prediction3.shape)

# Score on the held-out split
val_pred3 = clf3.predict(X_test)
print('LinearSVC accuracy score')
print(accuracy_score(y_test, val_pred3))
#score:

# estimators=[('RDF',clf1),('MNB',clf2),('SVC',clf3)]
# ensemble = VotingClassifier(estimators, voting='hard')
# #fit model to training data
# ensemble.fit(X_train, y_train)

# prediction4 = ensemble.predict(testSet1['description'])
# print(prediction4.shape)

# print(accuracy_score(y_test, ensemble.predict(X_test)))

submission1 = pd.DataFrame({'description':testSet1['description'],'action_index':prediction1})
# #Visualize the first 5 rows
print("prediction 1")
print(submission1.head())
submission1 = submission1.merge(distinct_actions, on='action_index')
query3 = '''SELECT c.action, c.description as preprocessed_desc FROM wp_courtdocs_NORMALIZED.distinct_case_actions as c;'''
preprocessed_actions = pd.read_sql_query(query3,mydb)
#submission1 = submission1.merge(preprocessed_actions, on='action')

submission2 = pd.DataFrame({'description':testSet1['description'],'action_index':prediction2})
print("prediction 2")
print(submission2.head())
submission2 = submission2.merge(distinct_actions, on='action_index')
#submission1 = submission1.merge(preprocessed_actions, on='action')

path4 = 'C:\\Users\\Serra\\Desktop\\CS506Spring2021Repository\\Civera\\Data\\RandomForest-Prediction.txt'
path5 = 'C:\\Users\\Serra\\Desktop\\CS506Spring2021Repository\\Civera\\Data\\MultinomialNB-Prediction.txt'
submission1.to_csv(path4, mode='w', index = False)
submission2.to_csv(path5, mode='w', index = False)
# #testSet1.to_csv(path3, mode='w', index = False, header = False)
