-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcanada_fsa_parser.py
191 lines (138 loc) · 5.2 KB
/
canada_fsa_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
'''
- Canadian FSA (Forward Sortation Area) parser
- Use Canada Post's listing to parse FSAs and Areas into a pandas dataframe and JSON format
- Example: https://www.canadapost.ca/assets/pdf/KB/nps/nps_nonlettermail_fsalist_jan2019.pdf (Jan 2019)
- Download the PDF to your computer, and enter the path to the downloaded file when prompted
'''
# Import dependencies
import PyPDF2
import re
import pandas as pd
import json
# Prompt for the location of the downloaded Canada Post PDF; the value is
# later handed to getFSAValues(), which opens it as a file.
file_location = input("Please enter the directory: ")
# Parse the file
def parse_pages(pdfReader):
    """
    Loops over all the pages from the Canada Post document, and returns parsed data

    Input: A PyPDF2 reader object. Both the current API (``reader.pages`` /
           ``page.extract_text()``) and the legacy API (``numPages`` /
           ``getPage()`` / ``extractText()``) are accepted.
    Output: pandas DataFrame with one row per parsed entry and two columns:
            'FSA'  - the first token of the entry
            'Area' - the remaining tokens of the entry joined with single spaces
    """
    ### Get the pages, preferring the current PyPDF2 API over the legacy one
    pages = getattr(pdfReader, 'pages', None)
    if pages is None:
        pages = [pdfReader.getPage(number) for number in range(pdfReader.numPages)]

    # Initialize the lists
    fsa_name_list = []
    fsa_desc_list = []

    ### Loop over pages
    for page in pages:
        # Extract text (extract_text on current PyPDF2, extractText on legacy)
        extract = getattr(page, 'extract_text', None) or page.extractText
        # Parse data: each entry is a list of tokens [FSA, word, word, ...]
        for entry in parse_page(extract()):
            # An empty entry carries no FSA code at all, so skip it instead
            # of failing on entry[0]
            if not entry:
                continue
            fsa_name_list.append(entry[0])
            # Join the description tokens into a sentence
            fsa_desc_list.append(' '.join(entry[1:]))

    ### Build the table
    df = pd.DataFrame()
    df['FSA'] = fsa_name_list
    df['Area'] = fsa_desc_list

    ### Return dataframe
    return df
def parse_page(strings):
    """
    Input: Takes in a string of Canada Post's listing of Forward Sortation Areas
    Output: Parsed FSA name and location in a large list

    The text is split on whitespace. Every token that is exactly an FSA code
    starts a new entry, and the entry runs up to (not including) the next FSA
    token. The last entry stops at the "(+) ADDITION" marker that follows it,
    or at the end of the text when no such marker follows.

    Returns a list of lists of tokens, e.g. [['A0A', 'Some', 'Area'], ...];
    an empty list when the text contains no FSA token.
    """
    ### Pattern identifying an FSA token: letter, digit, letter
    # NOTE(review): the `$` inside the character class is a literal dollar
    # sign, not an end-of-string anchor; kept as-is to preserve what is
    # matched - confirm the intent.
    fsa_pattern = re.compile(r'[A-Z][0-9][A-Z$]')

    ### Split up the input into tokens, separated by whitespace
    tokens = strings.split()

    # Indices of the tokens that are FSA codes, in order of appearance.
    # A single pass keeps them sorted and unique even when a code repeats.
    location_list = [
        idx for idx, token in enumerate(tokens) if fsa_pattern.fullmatch(token)
    ]
    if not location_list:
        return []

    ### Find where the last FSA's data ends
    # Default to the end of the text so the last FSA is never left empty
    last_index = len(tokens)
    # Stop one short of the end so tokens[idx + 1] is always valid; the last
    # matching "(+) ADDITION" marker after the final FSA wins
    for idx in range(location_list[-1] + 1, len(tokens) - 1):
        if tokens[idx] == '(+)' and tokens[idx + 1] == 'ADDITION':
            last_index = idx

    ### Construct the FSA list: each entry ends where the next one starts
    end_list = location_list[1:] + [last_index]
    fsa_full_list = [
        tokens[start:end] for start, end in zip(location_list, end_list)
    ]

    ### Return the list of lists
    return fsa_full_list
def getFSAValues(file_location):
    """
    Input: Path to the Canada Post FSA listing PDF on disk
    Output: Tuple (df, fsa_json) where df is the DataFrame built by
            parse_pages() and fsa_json is the same data as plain Python
            objects, obtained by round-tripping df.to_json() through json.loads
    """
    # PyPDF2 3.x removed PdfFileReader in favour of PdfReader; use the new
    # class when it exists and fall back to the legacy one otherwise
    reader_cls = getattr(PyPDF2, 'PdfReader', None) or PyPDF2.PdfFileReader

    # Open File; parsing stays inside the `with` block so the reader can
    # still read from the file, and the handle is closed afterwards even
    # when parsing fails
    with open(file_location, 'rb') as pdfFileObj:
        # creating a pdf reader object
        pdfReader = reader_cls(pdfFileObj)
        # Parse the pdf
        df = parse_pages(pdfReader)

    # Convert to JSON
    fsa_json = json.loads(df.to_json())

    # Return dataframe and json data
    return df, fsa_json
def _ask_yes_no(prompt):
    """Keep asking `prompt` until the user answers Y/y (True) or N/n (False)."""
    while True:
        response = input(prompt)
        if response in ('Y', 'y'):
            return True
        if response in ('N', 'n'):
            return False
        print("Please enter a valid response")


# Script entry: parse the PDF at file_location, then optionally save the
# result as an Excel workbook and/or as JSON text.
print("...Parsing the data")
try:
    dfvals, jsonvals = getFSAValues(file_location)
    print("...Success!")

    if _ask_yes_no("Do you want to save into Excel? (Y/n)"):
        dfvals.to_excel('FSA_Values.xlsx')
        print("...Saved into Excel file")

    if _ask_yes_no("Do you want to save into JSON format? (Y/n)"):
        # The JSON document is written to a .txt file
        with open('FSA_vals.txt', 'w') as outfile:
            json.dump(jsonvals, outfile)
        print("...Saved into TXT file")
# `Exception` (not a bare except) lets Ctrl-C / SystemExit terminate the
# script, and the message now says what actually went wrong
except Exception as err:
    print(f"Unknown error... ({type(err).__name__}: {err})")