This repository has been archived by the owner on Feb 2, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_ngrams.py
67 lines (52 loc) · 1.6 KB
/
generate_ngrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import sys
from collections import Counter

from config import db_params
from db import DB
from sentence import Sentence
def most_common(l, n):
    """Return the distinct items of ``l`` ranked by frequency, truncated to ``n``.

    Args:
        l: Iterable of hashable items.
        n: Slice bound applied to the ranked list (``ranked[:n]``), so at
            most ``n`` items are returned for a non-negative ``n``.

    Returns:
        A list of distinct items, most frequent first. Items with equal
        counts keep the iteration order of the underlying counter
        (first-seen order on Python 3.7+).
    """
    counts = Counter(l)
    # Rank the keys with a stable sort and slice, rather than calling
    # Counter.most_common(n), so the result for every value of ``n``
    # stays exactly what callers have always received.
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:n]
def main(n):
    """Print, for each n-gram of the contact descriptions, its most common NOGA codes.

    Queries up to 10000 rows from CONTACTS, lower-cases each activity
    description, extracts its n-grams through ``Sentence`` and records the
    row's first non-empty NOGA code against every n-gram. One line is then
    printed per n-gram in the form ``<ngram>, <code>, <code>, ...`` with at
    most five codes, most frequent first.

    Args:
        n: Value handed to ``Sentence(n)`` -- presumably the n-gram size;
            confirm against the ``Sentence`` class.
    """
    db = DB(db_params)
    result = db.query(
        'select activityDescription,noga1,noga2,noga3,noga4,noga5 from CONTACTS where noga1!="" AND '
        'activityDescription != "N/A" limit 10000'
    )

    # Maps each n-gram to the list of NOGA codes (one per row it appeared in).
    knowledge = {}
    for item in result:
        description = item[0].lower()

        # Build a real list: on Python 3 filter() returns an iterator that
        # cannot be indexed.
        nogas = [code for code in (item[1], item[2], item[3], item[4], item[5]) if code]
        if not nogas:
            # No usable code on this row; skip it rather than fail with an
            # IndexError below.
            continue
        # Only the first non-empty code of the row is used.
        noga = str(nogas[0])

        sentence_proc = Sentence(n)
        sentence_proc.set_desc(description)
        sentence_proc.tokenize()
        sentence_proc.generate_ngrams()

        for ngram in sentence_proc.get_ngrams():
            knowledge.setdefault(ngram, []).append(noga)

    # items() and the parenthesised print behave the same on Python 2 and 3.
    for key, value in knowledge.items():
        top_codes = ', '.join(most_common(value, 5))
        print('{0}, {1}'.format(key, top_codes))
def is_numeric(n):
    """Return True if ``int(n)`` succeeds, False otherwise.

    Args:
        n: Value to test, typically a command-line argument string.

    Returns:
        True when ``n`` can be converted with ``int``; False when the
        conversion raises ``ValueError`` (e.g. ``"abc"``) or ``TypeError``
        (e.g. ``None``).
    """
    try:
        int(n)
    except (TypeError, ValueError):
        return False
    return True
if __name__ == '__main__':
    # Expect exactly one command-line argument: an integer forwarded to main().
    args = sys.argv[1:]
    if len(args) != 1 or not is_numeric(args[0]):
        print('example: python {0} 3'.format(sys.argv[0]))
        # sys.exit is always available (the bare exit() helper is only
        # injected by the site module); a non-zero status tells the caller
        # that the invocation was invalid.
        sys.exit(1)
    main(int(args[0]))