Skip to content

Commit 1c644a4

Browse files
committed
changed filename to museums_victoria_fetch.py
1 parent c120add commit 1c644a4

File tree

1 file changed

+236
-0
lines changed

1 file changed

+236
-0
lines changed
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
#!/usr/bin/env python
2+
"""
3+
Fetch CC Legal Tool usage from the Museums Victoria Collections API.
4+
"""
5+
6+
# Standard library
7+
import argparse
8+
import csv
9+
import json
10+
import os
11+
import sys
12+
import textwrap
13+
import traceback
14+
15+
# Third-party
16+
import requests
17+
from pygments import highlight
18+
from pygments.formatters import TerminalFormatter
19+
from pygments.lexers import PythonTracebackLexer
20+
from requests.adapters import HTTPAdapter
21+
from urllib3.util.retry import Retry
22+
23+
# Add parent directory so shared can be imported
24+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
25+
26+
# First-party/Local
27+
import shared # noqa: E402
28+
29+
# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
BASE_URL = "https://collections.museumsvictoria.com.au/api/search"
FILE_RECORDS = os.path.join(PATHS["data_phase"], "museums_raw.csv")
# Column names of the records CSV, in output order
HEADER_RECORDS = [
    "ID",
    "TITLE",
    "RECORD TYPE",
    "CONTENT LICENCE SHORT NAME",
    "MEDIA JSON",
]
# Pagination limit as defined by the API documentation
MAX_PER_PAGE = 100
QUARTER = os.path.basename(PATHS["data_quarter"])
# Values sent, one at a time, as the "recordtype" query parameter
RECORD_TYPES = ["article", "item", "species", "specimen"]
50+
51+
52+
def parse_arguments():
    """
    Build the command-line parser and return the parsed argument namespace.

    Reports a usage error through the parser when --enable-git is given
    without --enable-save.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    # Both options are plain on/off switches
    switches = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    parsed = parser.parse_args()
    if parsed.enable_git and not parsed.enable_save:
        parser.error("--enable-git requires --enable-save")
    return parsed
72+
73+
74+
def get_requests_session():
    """
    Create a requests session that retries HTTPS requests and sends the
    shared User-Agent header.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=shared.STATUS_FORCELIST,
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.mount("https://", https_adapter)
    # Museums Victoria API requires a User-Agent header
    session.headers.update({"User-Agent": shared.USER_AGENT})
    return session
88+
89+
90+
def sanitize_string(s):
    """
    Replace every line break in a string with a single space.

    CRLF, a lone newline, and a lone carriage return each become one space,
    so words on either side of a line break are never fused together.
    Non-string values are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # Handle CRLF first so a Windows line ending yields one space, not two
    return s.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
95+
96+
97+
def initialize_data_file(file_path, header):
    """
    Create the CSV file at file_path containing only the header row.

    Nothing is written when a file already exists at file_path.
    """
    if os.path.isfile(file_path):
        return
    with open(file_path, "w", encoding="utf-8", newline="\n") as csv_file:
        header_writer = csv.DictWriter(
            csv_file, fieldnames=header, dialect="unix"
        )
        header_writer.writeheader()
104+
105+
106+
def write_data(args, data):
    """
    Append the fetched records to the records CSV file (FILE_RECORDS).

    Does nothing unless args.enable_save is set. Each record becomes one row
    holding its id, title, record type, content licence short name, and a
    JSON list with the type and licence of each of its media items.

    A record whose "media" or "licence" value is missing or null is still
    written: its media list is saved as an empty JSON list and its licence
    short name as "Not Found".

    Returns args unchanged.
    """
    if not args.enable_save:
        return args
    LOGGER.info("Saving fetched data")
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    rows = []
    for record in data or []:
        # `or` covers both an absent key and an explicit null value
        media = record.get("media") or []
        licence = record.get("licence") or {}
        media_json_string = json.dumps(
            [
                {"type": i.get("type"), "licence": i.get("licence")}
                for i in media
            ]
        )
        content_license_short_name = licence.get("shortName", "Not Found")
        rows.append(
            {
                "ID": record.get("id"),
                "TITLE": record.get("title"),
                "RECORD TYPE": record.get("recordType"),
                "CONTENT LICENCE SHORT NAME": sanitize_string(
                    content_license_short_name
                ),
                "MEDIA JSON": sanitize_string(media_json_string),
            }
        )

    # Only touch the file when there is something to write, and open it once
    # per batch instead of once per record
    if rows:
        initialize_data_file(FILE_RECORDS, HEADER_RECORDS)
        with open(FILE_RECORDS, "a", encoding="utf-8", newline="\n") as file:
            writer = csv.DictWriter(
                file, fieldnames=HEADER_RECORDS, dialect="unix"
            )
            writer.writerows(rows)
    LOGGER.info(f"Successfully saved records to {FILE_RECORDS}")

    return args
143+
144+
145+
def fetch_museums_victoria_data(args, session):
    """
    Fetch every page of every record type in RECORD_TYPES from the Museums
    Victoria API and pass each page of results to write_data.

    The page count for a record type is read from the "headers" object of
    the first response for that type.

    Raises:
        shared.QuantifyingException: with exit code 1 when the request
            fails (HTTP error status or any other requests exception) or a
            KeyError occurs while reading the response.
    """
    for record_type in RECORD_TYPES:
        current_page = 1
        total_pages = None

        LOGGER.info(f"--- Starting fetch for: {record_type.upper()} ---")

        while True:
            # Construct the API query parameters
            params = {
                "recordtype": record_type,
                "perpage": MAX_PER_PAGE,
                "page": current_page,
                # The pagination metadata and results are read from the
                # "headers" and "response" members of the JSON body below
                "envelope": "true",
            }
            try:
                r = session.get(BASE_URL, params=params, timeout=30)
                r.raise_for_status()
                data = r.json()
                results = data.get("response", [])
            except requests.HTTPError as e:
                raise shared.QuantifyingException(
                    f"HTTP Error: {e}", 1
                ) from e
            except requests.RequestException as e:
                raise shared.QuantifyingException(
                    f"Request Exception: {e}", 1
                ) from e
            except KeyError as e:
                raise shared.QuantifyingException(f"KeyError: {e}", 1) from e

            # Save this page of results
            write_data(args, results)

            # Initialize total_pages on the first request for this record type
            if total_pages is None:
                headers = data.get("headers") or {}
                if headers.get("totalPages") is not None:
                    total_pages = int(headers["totalPages"])
                else:
                    # "totalResults" counts records, not pages: convert it
                    # with a ceiling division. Assumes the API honoured the
                    # requested page size -- TODO confirm.
                    total_results = int(headers.get("totalResults") or 0)
                    total_pages = -(-total_results // MAX_PER_PAGE)

            # An empty page means there is nothing further to fetch, even
            # if the reported page count says otherwise
            if not results:
                break

            # Advance, stopping after the last page
            current_page += 1
            if current_page > total_pages:
                break
193+
194+
195+
def main():
    """
    Run the script end to end: parse the command-line options, log the
    paths, call the shared git fetch-and-merge helper, fetch the Museums
    Victoria data, then call the shared git add/commit and push helpers.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])

    fetch_museums_victoria_data(args, get_requests_session())

    commit_message = f"Add and commit new Museums Victoria data for {QUARTER}"
    args = shared.git_add_and_commit(
        args, PATHS["repo"], PATHS["data_quarter"], commit_message
    )
    shared.git_push_changes(args, PATHS["repo"])
208+
209+
210+
if __name__ == "__main__":
    # Script entry point: run main() and turn each kind of failure into a
    # log message plus a process exit code.
    try:
        main()
    except shared.QuantifyingException as e:
        # Exit code 0 is logged as information; anything else as an error
        log_message = LOGGER.info if e.exit_code == 0 else LOGGER.error
        log_message(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Syntax-highlight the traceback for the terminal, then indent it
        highlighted = highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        )
        traceback_formatted = textwrap.indent(highlighted, " ")
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)

0 commit comments

Comments
 (0)