-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfixPageFeature.py
62 lines (54 loc) · 2.03 KB
/
fixPageFeature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
"""
fixPageFeature.py: convert absolute page number to relative value
usage: python fixPageFeature.py < file
note: expected line input: label ... DATE=... ... PAGE=...
20190429 erikt(at)xs4all.nl
"""
import re
import sys
# Name under which this script was started; it is removed from sys.argv
# here and used as the prefix of error messages.
COMMAND = sys.argv.pop(0)
# Relative page position labels, ordered from the lowest relative page
# position (index 0) to the highest (index 4).
PAGELABELS = [
    "PGLBZERO",
    "PGLBONE",
    "PGLBTWO",
    "PGLBTHREE",
    "PGLBFOUR",
]
def readArticles():
    """
    readArticles: read article lines from standard input
    Each line is split on whitespace and searched for one DATE=... token
    and one PAGE=... token; the first token of each kind is used and the
    scan stops as soon as both have been found.
    returns: tuple (articles,dates,pages,maxPages) where articles holds
    the unmodified input lines, dates and pages hold the DATE string and
    the PAGE integer of the line at the same index, and maxPages maps
    each DATE string to the highest PAGE seen for it
    exits: via sys.exit with an "incomplete line" message when a line
    (including a blank line) lacks a DATE= or a PAGE= token
    raises: ValueError when the text after PAGE= is not an integer
    """
    datePrefix = "DATE="
    pagePrefix = "PAGE="
    articles = []
    dates = []
    pages = []
    maxPages = {}
    for line in sys.stdin:
        date = None
        page = None
        for field in line.split():
            # only the first token of each kind counts, so the three
            # result lists always keep exactly one entry per line
            if date is None and field.startswith(datePrefix):
                date = field[len(datePrefix):]
            elif page is None and field.startswith(pagePrefix):
                page = int(field[len(pagePrefix):])
            if date is not None and page is not None:
                break
        # completeness is checked after the scan, so a line whose last
        # token supplies the missing value is accepted
        if date is None or page is None:
            sys.exit(COMMAND+": incomplete line: "+line)
        articles.append(line)
        dates.append(date)
        pages.append(page)
        if date not in maxPages or page > maxPages[date]:
            maxPages[date] = page
    return(articles,dates,pages,maxPages)
def getGenreLabel(article):
    """
    getGenreLabel: separate the first token of a line from the rest
    returns: tuple (label,article) where label is the first
    whitespace-separated token and article is the remaining tokens
    joined by single spaces (empty string when there are none)
    raises: IndexError when the line contains no tokens
    """
    tokens = article.split()
    # indexing the first token keeps the IndexError for empty input
    genre = tokens[0]
    remainder = " ".join(tokens[1:])
    return(genre,remainder)
def getPageLabels(page,maxPage):
    """
    getPageLabels: convert an absolute page number to two relative labels
    The page is placed on a 0..1 scale as (page-1)/(maxPage-1) and the
    pair of adjacent PAGELABELS entries for its quarter is returned.
    page: absolute page number of the article
    maxPage: highest page number seen for the same date
    returns: tuple of two neighbouring labels from PAGELABELS
    """
    pageSpan = maxPage-1
    # when the highest page is 1 there is no range to scale over:
    # use position 0 instead of dividing by zero
    if pageSpan == 0: relativePage = 0.0
    else: relativePage = (page-1)/pageSpan
    if relativePage <= 0.25: return(PAGELABELS[0],PAGELABELS[1])
    if relativePage <= 0.50: return(PAGELABELS[1],PAGELABELS[2])
    if relativePage <= 0.75: return(PAGELABELS[2],PAGELABELS[3])
    return(PAGELABELS[3],PAGELABELS[4])
def printArticles(articles,dates,pages,maxPages):
    """
    printArticles: print every article with two page labels inserted
    For each line the output is: the first token, the two labels from
    getPageLabels, then the remaining tokens, separated by spaces.
    articles: list of input lines
    dates: list with the DATE string of each line
    pages: list with the PAGE number of each line
    maxPages: dictionary mapping a DATE string to its highest PAGE
    """
    for index,line in enumerate(articles):
        genre,remainder = getGenreLabel(line)
        # the labels depend on the page relative to the highest page
        # stored for the same date
        highestPage = maxPages[dates[index]]
        firstLabel,secondLabel = getPageLabels(pages[index],highestPage)
        print(genre,firstLabel,secondLabel,remainder)
def main(argv):
    """
    main: read the articles from standard input and print them with
    relative page labels
    argv: command line arguments; accepted but not used
    """
    # readArticles returns the four values in the order that
    # printArticles expects them
    printArticles(*readArticles())
# Run main only when this file is executed as a script and pass its
# return value on as the exit status of the process.
if __name__ == "__main__":
    exitStatus = main(sys.argv)
    sys.exit(exitStatus)