-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_books.py
48 lines (33 loc) · 1.14 KB
/
get_books.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import csv
import sys
from bs4 import BeautifulSoup
import requests
def get_books(url: str):
    """Yield every book on a paginated GoodReads shelf listing.

    Fetches successive pages of *url* (via the ``page`` query parameter)
    until GoodReads returns a page with no book rows, yielding one dict
    per book.

    Parameters:
        url: GoodReads shelf URL to scrape.

    Yields:
        dict with keys ``"author"`` (str), ``"title"`` (str), and
        ``"link"`` (str, the book's relative href).

    Raises:
        requests.HTTPError: if a page request returns an error status.
        requests.Timeout: if a page request exceeds the timeout.
    """
    page = 1
    count = 0
    while True:
        # timeout: without it requests.get can block forever on a
        # stalled connection.
        req = requests.get(url, params={"page": page}, timeout=30)
        # Fail loudly on HTTP errors; otherwise an error page parses as
        # "no rows" and silently truncates the results.
        req.raise_for_status()
        soup = BeautifulSoup(req.text, "html.parser")
        rows = soup.select("tr[class='bookalike review']")
        if not rows:
            # When we go past the last page, GoodReads returns an empty page.
            break
        for row in rows:
            # select_one returns the first match directly.
            title = row.select_one("td[class='field title']").select_one("a")
            author = row.select_one("td[class='field author']").select_one("a")
            yield {
                "author": author.string,
                "title": title.attrs["title"],
                "link": title.attrs["href"],
            }
            count += 1
        print(f"finished page {page} with {count} books so far", file=sys.stderr)
        page += 1
# Column order for the CSV output; matches the keys yielded by get_books.
fields = ["author", "title", "link"]
if __name__ == "__main__":
    # Usage: python get_books.py <goodreads-shelf-url>  (CSV on stdout)
    shelf_url = sys.argv[1]
    out = csv.DictWriter(sys.stdout, fieldnames=fields)
    out.writeheader()
    for book in get_books(shelf_url):
        out.writerow(book)