# nblearn.py
# 75 lines (68 loc) · 1.9 KB
import sys,math
import pickle
# Naive Bayes trainer.
#
# Usage: nblearn.py <training-file> <model-file>
#
# Every non-blank line of the training file is read as
# "<label> <word> <word> ...".  The model file receives two consecutive
# pickles: the per-class model dictionary, then the label -> index mapping.


def train_model(lines):
    """Build the Naive Bayes model from an iterable of labelled lines.

    The first whitespace-separated token of each line is the class label
    (case-sensitive); the whole line is then lower-cased and split into
    words.  Blank lines are skipped and do not count as messages.

    Returns a ``(Dict, Dclass)`` tuple:

    * ``Dict`` maps each label to a dictionary with the keys
      ``'TOTAL'`` (number of lines carrying the label),
      ``'PROB'`` (log10 of the class prior),
      ``'WD'`` (word -> log10 of the add-one smoothed likelihood),
      ``'NEW'`` (log10 likelihood used for a word never seen in the class)
      and ``'WCount'`` (number of counted word occurrences in the class).
    * ``Dclass`` maps each label to the order in which it was first seen.

    Raises ``ValueError`` (from ``math.log10``) when a class has no counted
    words and the vocabulary size is not positive, e.g. when every line
    consists of nothing but its label.
    """
    Dict = {}
    Dclass = {}
    vocabulary = set()
    total_msg = 0

    for line in lines:
        if not line.split():
            # A blank line carries no label; indexing its first token
            # would raise IndexError.
            continue
        total_msg += 1
        name = line.split()[0]

        if name not in Dict:
            Dclass[name] = len(Dclass)
            Dict[name] = {'TOTAL': 1, 'PROB': 0.0, 'WD': {}, 'NEW': 0.0,
                          'WCount': 0}
        else:
            Dict[name]['TOTAL'] += 1

        entry = Dict[name]
        word_counts = entry['WD']
        label_token = name.lower()
        for word in line.lower().split():
            # Every token, the label included, is recorded here; the labels
            # are discounted again when the vocabulary size is computed.
            vocabulary.add(word)
            if word == label_token:
                # Tokens equal to the lower-cased label are never counted
                # for that label's own class, wherever they occur.
                continue
            word_counts[word] = word_counts.get(word, 0) + 1
            entry['WCount'] += 1

    # One vocabulary slot per class is subtracted to offset the label token
    # that starts every line.
    vocab_size = len(vocabulary) - len(Dclass)

    for name in Dclass:
        entry = Dict[name]
        entry['PROB'] = math.log10(entry['TOTAL']) - math.log10(total_msg)
        log_denominator = math.log10(entry['WCount'] + vocab_size)
        # Add-one smoothing: (count + 1) / (class word count + vocabulary).
        entry['WD'] = {
            word: math.log10(occurrences + 1) - log_denominator
            for word, occurrences in entry['WD'].items()
        }
        entry['NEW'] = math.log10(1) - log_denominator

    return Dict, Dclass


def main(argv=None):
    """Train from ``argv[1]`` and pickle the model into ``argv[2]``.

    ``argv`` defaults to ``sys.argv``.  The output file is opened only after
    training has succeeded, so a failed run does not truncate an existing
    model.  An ``IOError`` while reading or writing is reported on standard
    output and not re-raised.  Fewer than two file arguments end the program
    with a usage message.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        raise SystemExit('usage: nblearn.py <training-file> <model-file>')
    inputfile = args[1]
    outputfile = args[2]

    try:
        # The context managers close each file even on failure and never
        # touch a handle that was not successfully opened.
        with open(inputfile, 'r') as finame:
            Dict, Dclass = train_model(finame)
        with open(outputfile, 'wb') as foname:
            pickle.dump(Dict, foname)
            pickle.dump(Dclass, foname)
    except IOError:
        print('Error while reading from file')


if __name__ == "__main__":
    main()