text-processing/Parser_CharanSingh.py at master · jtmart/text-processing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import random
import csv
# TODO: correct the heading of the Mother Teresa speech so that it sits on one line
# TODO: correct the heading of "HEALTH: A TOP PRIORITY" in the index

# Read the whole speech collection into memory as a list of lines (each line
# keeps its trailing newline).  The `with` block guarantees the handle is
# closed even if reading fails part-way through.
with open("CharanSingh_Speeches.txt", "r") as speech_file:
	data = speech_file.readlines()
# Filled by extract_contents(): heading key -> date string from extract_date().
index = dict()

def remove_fileHeaders():
	"""Drop every line of the global ``data`` that precedes the "Contents" line.

	The first line whose text, ignoring trailing whitespace, is exactly
	"Contents" becomes the new first element of ``data``.  When no such line
	exists, ``data`` is left untouched.
	"""
	global data
	for position, line in enumerate(data):
		if line.rstrip() == "Contents":
			# Keep the marker line itself and everything after it.
			data = data[position:]
			return
def extract_contents():
	"""Populate the global ``index`` from the lines held in the global ``data``.

	Every line for which ``str.isupper()`` is true is treated as a heading:
	the line minus its last two characters becomes the key, and the value is
	whatever ``extract_date`` returns when searching from the following line.
	The "MESSAGES AND TRIBUTES" entry is discarded as soon as it appears.
	Once "THE HOCKEY WIZARD" has been recorded, ``data`` is cut down to start
	19 lines after that heading and scanning stops.
	"""
	global data
	for position, line in enumerate(data):
		if line.isupper():
			# [:-2] drops the last two characters of the line
			# (presumably a trailing space plus the newline -- confirm against the input).
			index[line[:-2]] = extract_date(data, position + 1)
		# Never keep this key; presumably a section title rather than a speech -- confirm.
		index.pop("MESSAGES AND TRIBUTES", None)
		if "THE HOCKEY WIZARD" in index:
			# NOTE(review): the 19-line offset is specific to the input file's
			# layout; its meaning cannot be determined from this script alone.
			data = data[position + 19:]
			break

def extract_date(data, i):
	"""Return the first date found at or after ``data[i]`` as a "YYYYMMDD" string.

	A line is considered only when its third-from-last character is a digit
	and ``get_month`` recognises a month name in it.  The day is read from
	the two characters that end one position before the month name, and the
	year from the four characters that start one position after it.

	Args:
		data: list of text lines.
		i: index of the first line to inspect.

	Returns:
		Year, two-digit month number and day concatenated.  A single-digit
		day is zero-padded so the result keeps its eight-character shape.

	Raises:
		ValueError: no line from ``i`` onwards holds a recognisable date.
			(Previously this case looped forever, because the bare
			``except`` also swallowed the IndexError raised past the end
			of ``data``.)
	"""
	for j in range(i, len(data)):
		line = data[j]
		try:
			# Cheap filter: candidate lines have a digit third from the end.
			int(line[-3])
		except (IndexError, ValueError):
			# Line shorter than three characters, or not a digit there.
			continue
		month_info = get_month(line)
		if month_info == -1:
			# Digit in the right place but no month name: keep looking.
			continue
		month, find, end = month_info
		# Clamp at 0 so a date at the very start of a line does not turn
		# into a negative (wrap-around) slice index.
		day = line[max(find - 3, 0):max(find - 1, 0)].strip().zfill(2)
		year = line[find + end + 1:find + end + 5]
		return year + month + day
	raise ValueError("no date found from line %d onwards" % i)

def get_month(line):
	"""Locate an English month name inside ``line``.

	Month names are tried in calendar order with a case-sensitive substring
	search, so when several names occur the earliest month of the year wins,
	not the left-most one.

	Returns:
		A tuple ``(number, position, length)`` where ``number`` is the
		two-digit month number as a string ("01".."12"), ``position`` is the
		index of the first occurrence of the name in ``line`` and ``length``
		is the number of characters in the name.  Returns ``-1`` when no
		month name is present.
	"""
	month_names = (
		"January", "February", "March", "April", "May", "June",
		"July", "August", "September", "October", "November", "December",
	)
	for number, name in enumerate(month_names, start=1):
		position = line.find(name)
		if position != -1:
			return ("%02d" % number, position, len(name))
	return -1

# Trim the front matter, then build `index` (heading key -> date string) and
# cut `data` down as done by extract_contents().
remove_fileHeaders()
extract_contents()

# Directory that receives one text file per speech; open() fails if it does not exist.
OUTPUT_DIR = "/home/arsh/Dropbox/DATA_project_dataset_assembler_for_text_analysis/data_extraction_in_progress/charan_singh_arsh/"
# Line that, wherever it occurs inside a speech body, is skipped together
# with some of the lines following it.
PAGE_HEADER = 'CHARAN SINGH: SELECTED SPEECHES \n'

i = 0
ctr = 0  # only referenced by the disabled metadata export below
speech_dates = []  # date string of every speech written, in file order
while i < len(data):
	heading = data[i].upper()[:-2]
	if heading not in index:
		# Not a speech heading: step forward.  (Falling through to `i = j`
		# here would use an unbound or stale `j`.)
		i += 1
		continue

	j = i + 3  # the body is taken to start three lines below the heading line
	flg = True  # True while body lines are being copied to the output file
	speech_dates.append(index[heading])
	# File name: "t" + date string + random six-digit suffix.
	fname = "t" + index[heading] + str(random.randint(100000, 999999))
	with open(OUTPUT_DIR + fname + ".txt", "w+") as f1:
		# Copy lines until the next known heading or the end of the input.
		while j < len(data) and data[j].upper()[:-2] not in index:
			if data[j] == "\n" and j + 1 < len(data) and data[j + 1] == "\n":
				# Two consecutive blank lines: skip both and suspend copying
				# until the next page header.
				j += 2
				flg = False
			elif data[j] == PAGE_HEADER:
				flg = True
				try:
					# A digit third-from-last on the line three below the
					# header means five lines are skipped instead of two
					# (presumably a date line follows the header -- confirm).
					int(data[j + 3][-3])
					j += 5
				except (IndexError, ValueError):
					j += 2
			else:
				if flg:
					f1.write(data[j])
				j += 1
	# Disabled metadata export, kept for reference:
	# f2 = open("/home/arsh/Dropbox/DATA_project_dataset_assembler_for_text_analysis/data_extraction_in_progress/"+"charan_singh_metadata.csv", "a")
	# writer = csv.writer(f2)
	# ctr+=1
	# speech_head=""
	# for t in data[i].lower():
	# 	if t.isalpha():
	# 		speech_head+=t
	# data_row = [fname,"charansingh", index[data[i].upper()[:-2]][:4],index[data[i].upper()[:-2]][:6], index[data[i].upper()[:-2]], "pmo", "firstterm","xxxxx" ,"" , "speech", "other", "other", "x", "india", "ncr", "newdelhi","capital", "hindiother", speech_head, "book", "x", index[data[i].upper()[:-2]][:4],"charan"+index[data[i].upper()[:-2]][:4], ctr, fname[-6:], "completed", "x", ctr, ctr ]
	# writer.writerow(data_row)
	# f2.close()
	# Resume at the line that ended the body (the next heading, or end of input).
	i = j