-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.py
More file actions
108 lines (77 loc) · 3.28 KB
/
parser.py
File metadata and controls
108 lines (77 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from bs4 import BeautifulSoup
import csv
import os
def _first_text(tag):
    """Return the stripped first child string of *tag*, or '' if unavailable."""
    if not tag or not tag.contents:
        return ''
    return tag.contents[0].strip()


def html_parser(book_path, url):
    """Extract one row of book fields from a saved book HTML page.

    Args:
        book_path: Path of the HTML file to parse.
        url: Source URL of the page; appended unchanged as the last field.

    Returns:
        A list of 12 values in this order: title, series, author, rating
        value, rating count, review count, plot, number of pages,
        publishing date, characters, setting, url. Fields that cannot be
        found are empty strings. Returns ``None`` when *book_path* does not
        exist.
    """
    if not os.path.exists(book_path):
        return None

    # Parse inside a context manager so the file handle is closed as soon
    # as the document tree has been built.
    with open(book_path) as book_file:
        html = BeautifulSoup(book_file, features='lxml')

    title = _first_text(html.find('h1', id='bookTitle'))

    # The series name lives in the first link under the series heading.
    # The slice drops the first and last character of the link text
    # (presumably the surrounding parentheses -- confirm against the pages).
    series_heading = html.find('h2', id='bookSeries')
    series_link = series_heading.find('a') if series_heading else None
    series = _first_text(series_link)[1:-1]

    author = _first_text(html.find('span', itemprop='name'))
    rating_value = _first_text(html.find('span', itemprop='ratingValue'))

    # Counts are stored in the `content` attribute of <meta> tags; a missing
    # attribute yields '' rather than a KeyError.
    rating_count_tag = html.find('meta', itemprop='ratingCount')
    rating_count = rating_count_tag.get('content', '') if rating_count_tag else ''
    review_count_tag = html.find('meta', itemprop='reviewCount')
    review_count = review_count_tag.get('content', '') if review_count_tag else ''

    description = html.find('div', {'id': 'description'})
    if description:
        lines = description.get_text().split('\n')
        if len(lines) == 5:
            # Five-line layout: the text of interest is the third line.
            plot = lines[2]
        elif len(lines) > 1:
            plot = lines[1]
        else:
            # No newline at all: fall back to the only line instead of
            # raising IndexError.
            plot = lines[0]
    else:
        plot = ''

    pages_tag = html.find('span', itemprop='numberOfPages')
    if pages_tag and pages_tag.contents:
        page_words = pages_tag.contents[0].split()
    else:
        page_words = []
    # Keep only the first whitespace-separated token of the span text.
    number_of_pages = page_words[0] if page_words else ''

    # The second div.row is used as the publication line: words 2-4 of its
    # first child are kept, and any 'by' substring is removed.
    rows = html.find_all('div', {'class': 'row'})
    if len(rows) > 1 and rows[1].contents:
        date_words = rows[1].contents[0].split()[1:4]
        publishing_date = ' '.join(date_words).replace('by', '').rstrip()
    else:
        publishing_date = ''

    # Collect the anchors once; both link-based fields filter the same list.
    # Building the strings unconditionally guarantees '' (not an empty
    # result list) when the page has no links, and get_text() copes with
    # anchors that contain nested tags.
    anchors = html.find_all('a', href=True)
    characters = ' '.join(
        a.get_text(' ') for a in anchors
        if a['href'].startswith('/characters/')
    )
    setting = ' '.join(
        a.get_text(' ') for a in anchors
        if a['href'].startswith('/places/')
    )

    return [
        title, series, author,
        rating_value, rating_count, review_count,
        plot, number_of_pages, publishing_date,
        characters, setting, url,
    ]
def create_tsv(create=False):
    """Parse every downloaded book page and write the rows to a TSV file.

    Reads one URL per line from ``book_links.txt``. The 1-based line number
    is the book number; it selects the saved page at
    ``books/page_<p>/<n>.html`` with 100 books per page directory. Each
    page is parsed with ``html_parser`` and the resulting row is written to
    ``./data/parsed_books.tsv`` after a header row. Books whose HTML file
    is missing are skipped.

    Args:
        create: When false (the default) the function does nothing. When
            true the TSV file is (re)created and filled.
    """
    if not create:
        return

    header = [
        'bookTitle', 'bookSeries', 'bookAuthors',
        'ratingValue', 'ratingCount', 'reviewCount',
        'Plot', 'numberOfPages', 'PublishingDate',
        'Characters', 'Setting', 'Url'
    ]

    # newline='' lets the csv writer control line endings itself, as the
    # csv module requires for files it writes to.
    with open('./data/parsed_books.tsv', 'w', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(header)

        with open('book_links.txt', 'r') as txt:
            for book_n, line in enumerate(txt, start=1):
                print('Book number:', book_n)
                # Books are stored 100 per directory: 1-100 -> page_1, ...
                page_n = (book_n - 1) // 100 + 1
                path = 'books/page_{}/{}.html'.format(page_n, book_n)
                # Lines read from the file keep their trailing newline;
                # strip it so the Url cell has no embedded line break.
                row = html_parser(path, line.strip())
                if row:
                    tsv_writer.writerow(row)