-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathParser_CharanSingh.py
More file actions
126 lines (121 loc) · 3.2 KB
/
Copy pathParser_CharanSingh.py
File metadata and controls
126 lines (121 loc) · 3.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import random
import csv
# TODO: correct the heading of the Mother Teresa speech so that it fits on one line.
# TODO: correct the heading of "HEALTH: A TOP PRIORITY" in the index.
# Read the whole speeches file into memory as a list of lines (newlines kept).
# The `with` block guarantees the handle is closed even if reading fails.
with open("CharanSingh_Speeches.txt", "r") as speech_file:
    data = speech_file.readlines()

# Maps a heading (upper-case text, last two characters removed) to the date
# string built by extract_date(); populated by extract_contents().
index = dict()
def remove_fileHeaders():
    """Drop every line that precedes the first "Contents" line.

    Rebinds the module-level ``data`` so that it starts at the first line
    whose text, ignoring trailing whitespace, is exactly ``"Contents"``.
    ``data`` is left untouched when no such line exists.
    """
    global data
    start = next(
        (pos for pos, text in enumerate(data) if text.rstrip() == "Contents"),
        None,
    )
    if start is not None:
        data = data[start:]
def extract_contents():
    """Fill the module-level ``index`` with ``heading -> date`` entries.

    Every all-uppercase line of ``data`` is treated as a heading: its text
    minus the last two characters becomes the key, and the value is the date
    returned by ``extract_date`` for the lines that follow it. The
    "MESSAGES AND TRIBUTES" entry is never kept. As soon as
    "THE HOCKEY WIZARD" has been indexed, ``data`` is rebound to start
    19 lines after that heading and scanning stops.
    """
    global data
    for pos, text in enumerate(data):
        if text.isupper():
            # Strip the two trailing characters of the line to form the key.
            index[text[:-2]] = extract_date(data, pos + 1)
        # This heading must not end up in the index.
        index.pop("MESSAGES AND TRIBUTES", None)
        if "THE HOCKEY WIZARD" in index:
            # Presumably skips the remainder of the contents section
            # (fixed offset of 19 lines) -- TODO confirm against the input.
            data = data[pos + 19:]
            break
def extract_date(data, i):
    """Return the date of the first dated line at or after ``data[i]``.

    A line counts as dated when its third-from-last character is a digit and
    ``get_month`` finds a month name in it. The result is the concatenation
    ``year + month + day`` where:

    * ``month`` is the two-digit number reported by ``get_month``;
    * ``day`` is the two characters that end one character before the month
      name;
    * ``year`` is the four characters that start one character after the
      month name.

    Args:
        data: list of text lines.
        i: index of the first line to inspect.

    Returns:
        The date as a string (``YYYYMMDD`` for lines shaped like
        ``"... 15 August 1979 \\n"``).

    Raises:
        ValueError: if no dated line exists from index ``i`` onwards. The
            scan is bounded by the length of ``data`` so a missing date can
            no longer cause an endless loop.
    """
    for j in range(i, len(data)):
        line = data[j]
        try:
            # Cheap filter: the line must end with a digit followed by two
            # trailing characters.
            int(line[-3])
        except (IndexError, ValueError):
            continue
        found = get_month(line)
        if found == -1:
            # Ends with a digit but holds no month name: not a date line.
            continue
        month, find, end = found
        day = line[find - 3:find - 1]
        year = line[find + end + 1:find + end + 5]
        return year + month + day
    raise ValueError("no dated line found at or after index %d" % i)
def get_month(line):
    """Locate an English month name inside ``line``.

    Month names are tried in calendar order (January first), so when a line
    contains several of them the earliest month of the year wins, not the
    leftmost occurrence.

    Returns:
        A tuple ``(number, position, length)`` where ``number`` is the
        zero-padded two-digit month number as a string, ``position`` is the
        index of the first occurrence of the name in ``line`` and ``length``
        is the number of characters in the name. Returns ``-1`` when no
        month name is present.
    """
    month_names = (
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    )
    for number, name in enumerate(month_names, 1):
        position = line.find(name)
        if position != -1:
            return (str(number).zfill(2), position, len(name))
    return -1
# ---------------------------------------------------------------------------
# Driver: trim the front matter, build the heading -> date index, then copy
# the body of every indexed speech into its own text file.
# ---------------------------------------------------------------------------
remove_fileHeaders()
extract_contents()

# Directory that receives one text file per speech (machine-specific path).
OUTPUT_DIR = "/home/arsh/Dropbox/DATA_project_dataset_assembler_for_text_analysis/data_extraction_in_progress/charan_singh_arsh/"
# Line that switches copying back on after a double blank line
# (presumably the running page header of the book -- TODO confirm).
PAGE_HEADER = 'CHARAN SINGH: SELECTED SPEECHES \n'

i = 0
ctr = 0  # only used by the disabled metadata export below
speech_dates = []  # date of every speech written so far (formerly `list`, which shadowed the builtin)
while i < len(data):
    # A line is a speech heading when its upper-cased text, minus the last
    # two characters, is a key of the index.
    heading = data[i].upper()[:-2]
    if heading not in index:
        # Not a heading: move on, otherwise the loop would never advance.
        i += 1
        continue

    speech_dates.append(index[heading])
    # File name: "t" + date + random six-digit suffix.
    fname = "t" + index[heading] + str(random.randint(100000, 999999))

    j = i + 3   # skip the heading and the two lines that follow it
    flg = True  # True while lines are being copied to the output file
    with open(OUTPUT_DIR + fname + ".txt", "w") as f1:
        # Copy lines until the next indexed heading or the end of the text.
        while j < len(data) and data[j].upper()[:-2] not in index:
            if data[j] == "\n" and j + 1 < len(data) and data[j + 1] == "\n":
                # Two consecutive blank lines: stop copying until the next
                # PAGE_HEADER line is seen.
                j += 2
                flg = False
            elif data[j] == PAGE_HEADER:
                flg = True
                # Skip five lines when the line three below the header has a
                # digit as its third-from-last character, otherwise two.
                try:
                    int(data[j + 3][-3])
                except (IndexError, ValueError):
                    j += 2
                else:
                    j += 5
            else:
                if flg:
                    f1.write(data[j])
                j += 1

    # Disabled metadata export, kept for reference:
    # f2 = open("/home/arsh/Dropbox/DATA_project_dataset_assembler_for_text_analysis/data_extraction_in_progress/"+"charan_singh_metadata.csv", "a")
    # writer = csv.writer(f2)
    # ctr+=1
    # speech_head=""
    # for t in data[i].lower():
    # if t.isalpha():
    # speech_head+=t
    # data_row = [fname,"charansingh", index[data[i].upper()[:-2]][:4],index[data[i].upper()[:-2]][:6], index[data[i].upper()[:-2]], "pmo", "firstterm","xxxxx" ,"" , "speech", "other", "other", "x", "india", "ncr", "newdelhi","capital", "hindiother", speech_head, "book", "x", index[data[i].upper()[:-2]][:4],"charan"+index[data[i].upper()[:-2]][:4], ctr, fname[-6:], "completed", "x", ctr, ctr ]
    # writer.writerow(data_row)
    # f2.close()

    # Resume the outer scan where the inner loop stopped (next heading or EOF).
    i = j