Commit d224776

Add tests and credit Bulbapedia in README
This should resolve #2.
1 parent bc962bd

File tree

11 files changed: +24438 −62 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ __pycache__/
 pokemon
 tmp
 ptoos.epub
+ptoos-with-links.epub

 # C extensions
 *.so

README.md

Lines changed: 16 additions & 0 deletions
@@ -5,8 +5,24 @@ to descriptions and pictures of the Pokemon within the e-book itself.

 It works with the epub that you can download from [Daystar Eld's Patreon](https://www.patreon.com/daystareld/).

+## Usage
+
 ```shell
+pip install --user pipenv
 pipenv install
 pipenv shell
 python ptoos-xray.py "DaystarEld - Pokemon The Origin of Species.epub"
 ```
+
+## Run tests
+
+```shell
+pipenv install --dev
+pipenv run pytest
+```
+
+## Credits
+
+Full credit for the Pokemon names, images, and descriptions goes to
+[Bulbapedia](https://bulbapedia.bulbagarden.net) under
+[Attribution-NonCommercial-ShareAlike 2.5](https://creativecommons.org/licenses/by-nc-sa/2.5/).

pytest.ini

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = src
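
The `pythonpath = src` setting lets pytest import the project's modules straight from `src/` without an install step. A minimal sketch of what one of the new tests could look like (the commit's test files themselves are not rendered on this page, so the file name and assertion here are assumptions):

```python
# tests/test_pokemon.py - hypothetical sketch, not code from the commit.
# With pythonpath = src, modules under src/ are importable by bare name.
import pokemon


def test_download_to_file_keeps_existing_file(tmp_path):
    # download_to_file() returns early when the target file exists and
    # override is False (see the src/pokemon.py diff below), so the cached
    # file is left untouched and no HTTP request is made.
    cached = tmp_path / "pokedex.html"
    cached.write_text("cached contents")
    pokemon.download_to_file("https://example.invalid/", str(cached))
    assert cached.read_text() == "cached contents"
```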

src/epub.py

Lines changed: 11 additions & 6 deletions
@@ -1,6 +1,7 @@
 import ebooklib
 import logging
 import re
+import sys
 from bs4 import BeautifulSoup, Tag
 from bs4.element import NavigableString
 from ebooklib import epub
@@ -24,7 +25,7 @@ def create_pokedex_chapter(pokemon: List[Pokemon]) -> epub.EpubHtml:
     for p in pokemon:
         content.append(f'<h2 id="{POKEMON_ID_PREFIX}{p.name.lower()}">{p.name}</h2>')
         content.append(
-            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filepath}"/><br/></p>'
+            f' <p><img alt="[Pokemon {p.name}]" src="../{p.img_filename}"/><br/></p>'
         )
         for paragraph in p.description.split("\n"):
             content.append(f" <p>{paragraph}</p>")
@@ -80,8 +81,12 @@ def patch_paragraph(paragraph: Tag):
     chapter.content = str(soup)


-def patch(epub_filepath: str, pokemon: List[Pokemon]):
-    book = epub.read_epub(epub_filepath)
+def patch(epub_filename: str, pokemon: List[Pokemon]):
+    try:
+        book = epub.read_epub(epub_filename)
+    except Exception:
+        logging.exception("Failed to open epub.")
+        sys.exit(1)

     pokemon_lookup = {p.name.lower(): p for p in pokemon}
     chapters = [
@@ -103,17 +108,17 @@ def patch(epub_filepath: str, pokemon: List[Pokemon]):
         book.spine.append((chapter.id, "yes"))

     for p in pokemon:
-        image_content = open(p.img_filepath, "rb").read()
+        image_content = open(p.img_filename, "rb").read()
         img = epub.EpubItem(
             uid=p.name,
-            file_name=p.img_filepath,
+            file_name=p.img_filename,
             media_type="image/png",
             content=image_content,
         )
         book.add_item(img)

     console = Console()
-    epub_out = epub_filepath.replace(".", "-with-links.")
+    epub_out = epub_filename.replace(".", "-with-links.")
     with console.status(f"Writing {epub_out}"):
         epub.write_epub(epub_out, book, {})
     console.print(f"[green]✓[/green] [orange1]{epub_out}[/orange1] written")
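
With the new guard around `epub.read_epub`, an unreadable input now produces a logged traceback and a clean exit code instead of an unhandled crash. A small sketch of the failure path (the file name is made up):

```python
# Hypothetical sketch of patch()'s new failure path; "missing.epub" does
# not exist, so ebooklib raises inside read_epub().
import src.epub

src.epub.patch("missing.epub", pokemon=[])
# patch() catches the exception, logs "Failed to open epub." with the
# traceback attached, and exits with status 1.
```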

src/main.py

Lines changed: 2 additions & 1 deletion
@@ -11,11 +11,12 @@ def main():
         level=logging.INFO,
         format="%(message)s",
         datefmt="[%X]",
-        handlers=[RichHandler()],
+        handlers=[RichHandler(rich_tracebacks=True)],
     )
     try:
         ptoos_epub = sys.argv[1]
     except IndexError:
         ptoos_epub = "ptoos.epub"
+        logging.warning(f"No epub file provided. Defaulting to '{ptoos_epub}'.")
     pokemon = src.pokemon.get_pokemon()
     src.epub.patch(ptoos_epub, pokemon)
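
`rich_tracebacks=True` is a standard `RichHandler` option: when a log record carries exception info, Rich renders the traceback with its own formatting. That pairs with the new `logging.exception` call in `src/epub.py` above. A self-contained sketch of the setup this commit switches to:

```python
# Minimal sketch of the new logging configuration.
import logging

from rich.logging import RichHandler

logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    datefmt="[%X]",
    handlers=[RichHandler(rich_tracebacks=True)],
)

try:
    raise ValueError("boom")  # stand-in for a failing epub.read_epub()
except ValueError:
    # The exception info attached here is rendered as a Rich traceback.
    logging.exception("Failed to open epub.")
```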

src/pokemon.py

Lines changed: 71 additions & 55 deletions
@@ -20,90 +20,106 @@ class Pokemon(BaseModel):
     index: str
     html_url: str
     img_url: str
-    html_filepath: str
-    img_filepath: str
-    json_filepath: str
+    html_filename: str
+    img_filename: str
+    json_filename: str
     description: str = ""
     appears_in_book: bool = False


-def download_to_file(url: str, filepath: str, override=False):
-    """Downloads url into filepath."""
-    if os.path.isfile(filepath) and override is False:
-        logging.debug(f"'{filepath}' exists.")
+def download_to_file(url: str, filename: str, override=False):
+    """Downloads url into filename."""
+    if os.path.isfile(filename) and override is False:
+        logging.debug(f"'{filename}' exists.")
         return

     headers = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
     }
     r = requests.get(url, headers=headers)
     if r.status_code != 200:
-        logging.warning(f"Could not download '{filepath}'")
-        return
+        logging.critical(f"Could not download '{filename}'.")
+        sys.exit(1)

     # Works for text and images
-    with open(filepath, "wb") as f:
+    with open(filename, "wb") as f:
         for c in r:
             f.write(c)
-    logging.debug(f"'{filepath}' downloaded.")
+    logging.debug(f"'{filename}' downloaded.")


-def get_pokemon() -> List[Pokemon]:
-    """Scrape Pokemon from the Bulbapedia national dex"""
-    NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
-    download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
-    with open(NATIONAL_INDEX_FILEPATH, "r") as r:
-        soup = BeautifulSoup(r, "html.parser")
-    pokemon_list_soup: BeautifulSoup = soup.find(
-        id="List_of_Pokémon_by_National_Pokédex_number"
-    ).parent
-    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
+def download_national_index_html(national_index_filename: str):
+    download_to_file(NATIONAL_INDEX_URL, national_index_filename)
+

+def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
+    with open(national_index_filename, "r") as r:
+        soup = BeautifulSoup(r, "html.parser")
+    pokemon_list_soup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
+    generation_soups = pokemon_list_soup.find_next_siblings("h3")
     table_row_soups = []
     for generation_soup in generation_soups:
-        table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
-        tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
+        table_soup = generation_soup.find_next_sibling("table")
+        tbody_soup = generation_soup.find_next("tbody")
         # skip first row because it is the header
         table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    return table_row_soups
+
+
+def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
+    name = table_row_soup.find_next("th").next_element.attrs["title"]
+
+    # load Pokemon from JSON if it already exists
+    json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
+    if os.path.isfile(json_filename):
+        p = Pokemon.parse_file(json_filename)
+        logging.debug(f"Loaded '{p.json_filename}'.")
+        return p
+
+    index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
+    html_url = (
+        BULBAPEDIA_BASE_URL
+        + table_row_soup.find_next("th").next_element.attrs["href"]
+    )
+    img_url = table_row_soup.find("img").attrs["src"]
+    html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
+    img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
+    return Pokemon(
+        name=name,
+        index=index,
+        html_url=html_url,
+        img_url=img_url,
+        html_filename=html_filename,
+        img_filename=img_filename,
+        json_filename=json_filename,
+    )
+
+
+def get_pokemon() -> List[Pokemon]:
+    """Scrape Pokemon from the Bulbapedia national dex"""
+    if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
+        os.mkdir(POKEMON_CACHE_DIRECTORY)
+    national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
+    download_national_index_html(national_index_filename)
+    table_row_soups = get_pokemon_table_row_soups(national_index_filename)

     pokemon = []
     for table_row_soup in track(table_row_soups, description="Download Pokemon"):
-        name = table_row_soup.find_next("th").next_element.attrs["title"]
+        p = extract_pokemon_from_table_row(table_row_soup)

-        # ignore Galarian and Alolan Pokemon so
-        if pokemon and pokemon[-1].name == name:
+        # Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
+        if pokemon and pokemon[-1].name == p.name:
             continue
+        pokemon.append(p)

-        # load Pokemon from JSON if it already exists
-        json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
-        if os.path.isfile(json_filepath):
-            p = Pokemon.parse_file(json_filepath)
-            pokemon.append(p)
-            logging.debug(f"Loaded {p.json_filepath}.")
+        # Pokemon has already been downloaded
+        if p.description and os.path.isfile(p.img_filename):
             continue

-        index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
-        html_url = (
-            BULBAPEDIA_BASE_URL
-            + table_row_soup.find_next("th").next_element.attrs["href"]
-        )
-        img_url = table_row_soup.find("img").attrs["src"]
-        html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
-        img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
-        p = Pokemon(
-            name=name,
-            index=index,
-            html_url=html_url,
-            img_url=img_url,
-            html_filepath=html_filepath,
-            img_filepath=img_filepath,
-            json_filepath=json_filepath,
-        )
-        pokemon.append(p)
         extend_pokemon(p)
-        with open(p.json_filepath, "w") as f:
+        with open(p.json_filename, "w") as f:
             f.write(p.json())
-        logging.debug(f"Saved {p.json_filepath}.")
+        logging.debug(f"Saved {p.json_filename}.")

     # Filter out speculative Pokemon
     pokemon = [
@@ -117,8 +133,8 @@ def get_pokemon() -> List[Pokemon]:

 def extend_pokemon(p: Pokemon):
     """Add description and download Pokemon image"""
-    download_to_file(p.html_url, p.html_filepath)
-    with open(p.html_filepath, "r") as r:
+    download_to_file(p.html_url, p.html_filename)
+    with open(p.html_filename, "r") as r:
         soup = BeautifulSoup(r, "html.parser")
     content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]

@@ -136,4 +152,4 @@ def extend_pokemon(p: Pokemon):
     )
     img_url = img_url.replace("//", "https://")
     p.img_url = img_url
-    download_to_file(img_url, p.img_filepath)
+    download_to_file(img_url, p.img_filename)
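
The refactor splits the old monolithic `get_pokemon()` into three helpers that can be exercised independently, which is presumably what makes the new tests practical. A rough sketch of how they compose (a summary, not code from the commit; the bare imports assume pytest.ini's `pythonpath = src`):

```python
# Summary sketch of the refactored pipeline in src/pokemon.py.
import os

from pokemon import (
    POKEMON_CACHE_DIRECTORY,
    download_national_index_html,
    extract_pokemon_from_table_row,
    get_pokemon_table_row_soups,
)

os.makedirs(POKEMON_CACHE_DIRECTORY, exist_ok=True)
index_html = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")

download_national_index_html(index_html)         # step 1: fetch (or reuse cache)
rows = get_pokemon_table_row_soups(index_html)   # step 2: parse the dex tables
first = extract_pokemon_from_table_row(rows[0])  # step 3: one row -> Pokemon
print(first.name, first.index)
```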
