adm-homework-3/parser.py at main · mikcnt/adm-homework-3 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from bs4 import BeautifulSoup
import csv
import os

def _leading_text(tag):
    """Return the first child of *tag* as a string, or '' if there is none."""
    if tag is None or not tag.contents:
        return ''
    first = tag.contents[0]
    # Text nodes are str subclasses; a nested tag has to be flattened instead.
    return str(first) if isinstance(first, str) else first.get_text()


def _linked_names(anchors, prefix):
    """Join the text of every anchor in *anchors* whose href starts with *prefix*."""
    return ' '.join(
        ' '.join(
            child if isinstance(child, str) else child.get_text()
            for child in anchor.contents
        )
        for anchor in anchors
        if anchor['href'].startswith(prefix)
    )


def html_parser(book_path, url):
    """Extract one row of book fields from a saved HTML page.

    Args:
        book_path: Path of the HTML file to parse.
        url: Value stored unchanged as the last element of the row.

    Returns:
        ``None`` when ``book_path`` does not exist; otherwise a list of 12
        strings in this order: title, series, author, rating value, rating
        count, review count, plot, number of pages, publishing date,
        characters, setting, url.  A field whose element is missing from the
        page is returned as ``''``.
    """
    if not os.path.exists(book_path):
        return None

    # Binary mode lets BeautifulSoup detect the document encoding itself
    # instead of relying on the platform default; ``with`` closes the file.
    with open(book_path, 'rb') as book_file:
        html = BeautifulSoup(book_file, features='lxml')

    title = _leading_text(html.find('h1', id='bookTitle')).strip()

    series_header = html.find('h2', id='bookSeries')
    series_link = series_header.find('a') if series_header else None
    # [1:-1] drops the first and last character of the link text
    # (presumably the surrounding parentheses -- confirm against the pages).
    series = _leading_text(series_link).strip()[1:-1]

    author = _leading_text(html.find('span', itemprop='name')).strip()
    rating_value = _leading_text(
        html.find('span', itemprop='ratingValue')).strip()

    rating_meta = html.find('meta', itemprop='ratingCount')
    rating_count = rating_meta.get('content', '') if rating_meta else ''

    review_meta = html.find('meta', itemprop='reviewCount')
    review_count = review_meta.get('content', '') if review_meta else ''

    description = html.find('div', {'id': 'description'})
    if description:
        pieces = description.get_text().split('\n')
        if len(pieces) == 5:
            plot = pieces[2]
        elif len(pieces) > 1:
            plot = pieces[1]
        else:
            # No line breaks at all: keep the only piece rather than raising.
            plot = pieces[0]
    else:
        plot = ''

    page_words = _leading_text(
        html.find('span', itemprop='numberOfPages')).split()
    number_of_pages = page_words[0] if page_words else ''

    rows = html.find_all('div', {'class': 'row'})
    # The date is read from words 2-4 of the second "row" div's leading text.
    date_words = _leading_text(rows[1]).split()[1:4] if len(rows) > 1 else []
    if 'by' in date_words:
        # Everything from "by" onwards is not part of the date.
        date_words = date_words[:date_words.index('by')]
    publishing_date = ' '.join(date_words)

    # One scan of the anchors serves both link-based fields.
    anchors = html.find_all('a', href=True)
    characters = _linked_names(anchors, '/characters/')
    setting = _linked_names(anchors, '/places/')

    return [
        title, series, author,
        rating_value, rating_count, review_count,
        plot, number_of_pages, publishing_date,
        characters, setting, url,
    ]


def create_tsv(create=False):
    """Write ``./data/parsed_books.tsv`` from the downloaded book pages.

    Nothing happens unless ``create`` is true.  Otherwise a header row is
    written, then one row per line of ``book_links.txt`` for which
    ``html_parser`` returns a row.  Line ``n`` (1-based) is matched to the
    file ``books/page_<p>/<n>.html`` where ``p = (n - 1) // 100 + 1``.

    Args:
        create: When false (the default) the function returns immediately.

    Returns:
        None.

    Raises:
        FileNotFoundError: If ``book_links.txt`` does not exist; in that
            case the output file is not touched.
    """
    if not create:
        return

    header = [
        'bookTitle', 'bookSeries', 'bookAuthors',
        'ratingValue', 'ratingCount', 'reviewCount',
        'Plot', 'numberOfPages', 'PublishingDate',
        'Characters', 'Setting', 'Url'
        ]

    out_path = './data/parsed_books.tsv'
    # Opening the output would fail if the target directory were missing.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # The links file is opened first so a missing input does not truncate an
    # existing output.  newline='' is what the csv module requires in order
    # to control line endings itself.
    with open('book_links.txt', 'r', encoding='utf-8') as txt, \
            open(out_path, 'w', newline='', encoding='utf-8') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(header)
        for book_n, line in enumerate(txt, start=1):
            print('Book number:', book_n)
            page_n = (book_n - 1) // 100 + 1
            path = 'books/page_{}/{}.html'.format(page_n, book_n)
            # Drop the line terminator so it does not end up inside the
            # last TSV field.
            row = html_parser(path, line.strip())
            if row:
                tsv_writer.writerow(row)