@@ -20,90 +20,106 @@ class Pokemon(BaseModel):
     index: str
     html_url: str
     img_url: str
-    html_filepath: str
-    img_filepath: str
-    json_filepath: str
+    html_filename: str
+    img_filename: str
+    json_filename: str
     description: str = ""
     appears_in_book: bool = False
 
 
-def download_to_file(url: str, filepath: str, override=False):
-    """Downloads url into filepath."""
-    if os.path.isfile(filepath) and override is False:
-        logging.debug(f"'{filepath}' exists.")
+def download_to_file(url: str, filename: str, override=False):
+    """Downloads url into filename."""
+    if os.path.isfile(filename) and override is False:
+        logging.debug(f"'{filename}' exists.")
         return
 
     headers = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
     }
     r = requests.get(url, headers=headers)
     if r.status_code != 200:
-        logging.warning(f"Could not download '{filepath}'")
-        return
+        logging.critical(f"Could not download '{filename}'.")
+        sys.exit(1)
 
     # Works for text and images
-    with open(filepath, "wb") as f:
+    with open(filename, "wb") as f:
         for c in r:
             f.write(c)
-    logging.debug(f"'{filepath}' downloaded.")
+    logging.debug(f"'{filename}' downloaded.")
 
 
-def get_pokemon() -> List[Pokemon]:
-    """Scrape Pokemon from the Bulbapedia national dex"""
-    NATIONAL_INDEX_FILEPATH = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
-    download_to_file(NATIONAL_INDEX_URL, NATIONAL_INDEX_FILEPATH)
-    with open(NATIONAL_INDEX_FILEPATH, "r") as r:
-        soup = BeautifulSoup(r, "html.parser")
-    pokemon_list_soup: BeautifulSoup = soup.find(
-        id="List_of_Pokémon_by_National_Pokédex_number"
-    ).parent
-    generation_soups: BeautifulSoup = pokemon_list_soup.find_next_siblings("h3")
+def download_national_index_html(national_index_filename: str):
+    download_to_file(NATIONAL_INDEX_URL, national_index_filename)
+
 
+def get_pokemon_table_row_soups(national_index_filename: str) -> List[BeautifulSoup]:
+    with open(national_index_filename, "r") as r:
+        soup = BeautifulSoup(r, "html.parser")
+    pokemon_list_soup = soup.find(id="List_of_Pokémon_by_National_Pokédex_number").parent
+    generation_soups = pokemon_list_soup.find_next_siblings("h3")
     table_row_soups = []
     for generation_soup in generation_soups:
-        table_soup: BeautifulSoup = generation_soup.find_next_sibling("table")
-        tbody_soup: BeautifulSoup = generation_soup.find_next("tbody")
+        table_soup = generation_soup.find_next_sibling("table")
+        tbody_soup = generation_soup.find_next("tbody")
         # skip first row because it is the header
         table_row_soups += tbody_soup.find_all("tr", recursive=False)[1:]
+    return table_row_soups
+
+
+def extract_pokemon_from_table_row(table_row_soup: BeautifulSoup) -> Pokemon:
+    name = table_row_soup.find_next("th").next_element.attrs["title"]
+
+    # load Pokemon from JSON if it already exists
+    json_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
+    if os.path.isfile(json_filename):
+        p = Pokemon.parse_file(json_filename)
+        logging.debug(f"Loaded '{p.json_filename}'.")
+        return p
+
+    index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
+    html_url = (
+        BULBAPEDIA_BASE_URL
+        + table_row_soup.find_next("th").next_element.attrs["href"]
+    )
+    img_url = table_row_soup.find("img").attrs["src"]
+    html_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
+    img_filename = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
+    return Pokemon(
+        name=name,
+        index=index,
+        html_url=html_url,
+        img_url=img_url,
+        html_filename=html_filename,
+        img_filename=img_filename,
+        json_filename=json_filename,
+    )
+
+
+def get_pokemon() -> List[Pokemon]:
+    """Scrape Pokemon from the Bulbapedia national dex"""
+    if not os.path.isdir(POKEMON_CACHE_DIRECTORY):
+        os.mkdir(POKEMON_CACHE_DIRECTORY)
+    national_index_filename = os.path.join(POKEMON_CACHE_DIRECTORY, "pokedex.html")
+    download_national_index_html(national_index_filename)
+    table_row_soups = get_pokemon_table_row_soups(national_index_filename)
 
     pokemon = []
     for table_row_soup in track(table_row_soups, description="Download Pokemon"):
-        name = table_row_soup.find_next("th").next_element.attrs["title"]
+        p = extract_pokemon_from_table_row(table_row_soup)
 
-        # ignore Galarian and Alolan Pokemon so
-        if pokemon and pokemon[-1].name == name:
+        # Ignore Galarian and Alolan Pokemon (Pokemon with the same name)
+        if pokemon and pokemon[-1].name == p.name:
             continue
+        pokemon.append(p)
 
-        # load Pokemon from JSON if it already exists
-        json_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".json")
-        if os.path.isfile(json_filepath):
-            p = Pokemon.parse_file(json_filepath)
-            pokemon.append(p)
-            logging.debug(f"Loaded {p.json_filepath}.")
+        # Pokemon has already been downloaded
+        if p.description and os.path.isfile(p.img_filename):
            continue
 
-        index = table_row_soup.find_next("td").next_sibling.next_sibling.text.strip()
-        html_url = (
-            BULBAPEDIA_BASE_URL
-            + table_row_soup.find_next("th").next_element.attrs["href"]
-        )
-        img_url = table_row_soup.find("img").attrs["src"]
-        html_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".html")
-        img_filepath = os.path.join(POKEMON_CACHE_DIRECTORY, name.lower() + ".png")
-        p = Pokemon(
-            name=name,
-            index=index,
-            html_url=html_url,
-            img_url=img_url,
-            html_filepath=html_filepath,
-            img_filepath=img_filepath,
-            json_filepath=json_filepath,
-        )
-        pokemon.append(p)
         extend_pokemon(p)
-        with open(p.json_filepath, "w") as f:
+        with open(p.json_filename, "w") as f:
             f.write(p.json())
-        logging.debug(f"Saved {p.json_filepath}.")
+        logging.debug(f"Saved {p.json_filename}.")
 
     # Filter out speculative Pokemon
     pokemon = [
@@ -117,8 +133,8 @@ def get_pokemon() -> List[Pokemon]:
 
 def extend_pokemon(p: Pokemon):
     """Add description and download Pokemon image"""
-    download_to_file(p.html_url, p.html_filepath)
-    with open(p.html_filepath, "r") as r:
+    download_to_file(p.html_url, p.html_filename)
+    with open(p.html_filename, "r") as r:
         soup = BeautifulSoup(r, "html.parser")
     content_soup: BeautifulSoup = soup.find(id="mw-content-text").contents[0]
 
@@ -136,4 +152,4 @@ def extend_pokemon(p: Pokemon):
     )
     img_url = img_url.replace("//", "https://")
     p.img_url = img_url
-    download_to_file(img_url, p.img_filepath)
+    download_to_file(img_url, p.img_filename)
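
For context, a minimal usage sketch of the pipeline as refactored by this commit. This is not part of the commit itself; it assumes the scraper module is importable as pokemon and that POKEMON_CACHE_DIRECTORY, NATIONAL_INDEX_URL, and BULBAPEDIA_BASE_URL remain defined at module level.

# Hypothetical usage sketch, not part of the commit above.
# Assumes the scraper module is importable as "pokemon".
from pokemon import get_pokemon

# get_pokemon() now creates the cache directory itself, downloads the
# national dex page, and caches each Pokemon as JSON on first run;
# subsequent runs load the cached JSON instead of re-downloading.
pokemon_list = get_pokemon()
for p in pokemon_list[:3]:
    print(p.index, p.name, p.json_filename)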