|
| 1 | +#!/usr/bin/env python |
| 2 | +""" |
| 3 | +Fetch CC Legal Tool usage from the Museums Victoria Collections API. |
| 4 | +""" |
| 5 | + |
| 6 | +# Standard library |
| 7 | +import argparse |
| 8 | +import csv |
| 9 | +import json |
| 10 | +import os |
| 11 | +import sys |
| 12 | +import textwrap |
| 13 | +import traceback |
| 14 | + |
| 15 | +# Third-party |
| 16 | +import requests |
| 17 | +from pygments import highlight |
| 18 | +from pygments.formatters import TerminalFormatter |
| 19 | +from pygments.lexers import PythonTracebackLexer |
| 20 | +from requests.adapters import HTTPAdapter |
| 21 | +from urllib3.util.retry import Retry |
| 22 | + |
| 23 | +# Add parent directory so shared can be imported |
| 24 | +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) |
| 25 | + |
| 26 | +# First-party/Local |
| 27 | +import shared # noqa: E402 |
| 28 | + |
# Setup: shared.setup() returns the logger and the paths mapping used below
LOGGER, PATHS = shared.setup(__file__)

# Constants
# Search endpoint queried for every record type
BASE_URL = "https://collections.museumsvictoria.com.au/api/search"
# CSV file that fetched records are appended to
FILE_RECORDS = os.path.join(PATHS["data_phase"], "museums_raw.csv")
# Column names (and column order) of FILE_RECORDS
HEADER_RECORDS = [
    "ID",
    "TITLE",
    "RECORD TYPE",
    "CONTENT LICENCE SHORT NAME",
    "MEDIA JSON",
]
MAX_PER_PAGE = 100  # Pagination limit as defined by the API documentation
# Last path component of the quarter directory, used in the commit message
QUARTER = os.path.basename(PATHS["data_quarter"])
# Type of record to return; each type is fetched in its own paginated pass
RECORD_TYPES = ["article", "item", "species", "specimen"]
| 50 | + |
| 51 | + |
def parse_arguments():
    """
    Build the command-line parser and return the parsed argument namespace.

    Both options are boolean flags. Requesting --enable-git without
    --enable-save is rejected through parser.error().
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    flags = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, help_text in flags:
        parser.add_argument(flag, action="store_true", help=help_text)
    parsed = parser.parse_args()
    # Git actions operate on saved results, so they require saving
    if parsed.enable_git and not parsed.enable_save:
        parser.error("--enable-git requires --enable-save")
    return parsed
| 72 | + |
| 73 | + |
def get_requests_session():
    """
    Create a requests session for HTTPS calls that retries failed requests
    and identifies itself with the shared User-Agent.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=shared.STATUS_FORCELIST,
    )
    adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.mount("https://", adapter)
    # Museums Victoria API requires a User-Agent header
    session.headers.update({"User-Agent": shared.USER_AGENT})
    return session
| 88 | + |
| 89 | + |
def sanitize_string(s):
    """
    Replace line breaks in a string with single spaces.

    A CRLF pair, a lone carriage return and a lone newline are each replaced
    by exactly one space, so text on either side of a line break is never
    joined together. Values that are not strings are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # Handle CRLF first so the pair collapses to one space rather than two
    return s.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
| 95 | + |
| 96 | + |
def initialize_data_file(file_path, header):
    """
    Create file_path holding only the CSV header row when it does not exist.

    A file that already exists is left untouched.
    """
    if os.path.isfile(file_path):
        return
    with open(file_path, "w", encoding="utf-8", newline="\n") as handle:
        header_writer = csv.DictWriter(
            handle, fieldnames=header, dialect="unix"
        )
        header_writer.writeheader()
| 104 | + |
| 105 | + |
def _record_to_row(record):
    """Flatten one API record dict into a CSV row keyed by HEADER_RECORDS."""
    # Guard against a missing or null "media"/"licence" value so a single
    # sparse record cannot abort the whole run
    media = record.get("media") or []
    licence = record.get("licence") or {}
    media_json_string = json.dumps(
        [
            {"type": item.get("type"), "licence": item.get("licence")}
            for item in media
        ]
    )
    return {
        "ID": record.get("id"),
        # Sanitized like the other text columns to keep one record per line
        "TITLE": sanitize_string(record.get("title")),
        "RECORD TYPE": record.get("recordType"),
        "CONTENT LICENCE SHORT NAME": sanitize_string(
            licence.get("shortName", "Not Found")
        ),
        "MEDIA JSON": sanitize_string(media_json_string),
    }


def write_data(args, data):
    """
    Append the fetched records to the FILE_RECORDS CSV file.

    Nothing is written unless args.enable_save is set. The data file (with
    its header row) is created on first use and opened once per call rather
    than once per record.

    Args:
        args: parsed command-line namespace; only enable_save is read.
        data: iterable of record dicts as returned by the API (may be empty
            or None).

    Returns:
        The args namespace, unchanged.
    """
    if not args.enable_save:
        return args
    LOGGER.info("Saving fetched data")
    os.makedirs(PATHS["data_phase"], exist_ok=True)
    initialize_data_file(FILE_RECORDS, HEADER_RECORDS)
    with open(FILE_RECORDS, "a", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER_RECORDS, dialect="unix"
        )
        writer.writerows(_record_to_row(record) for record in data or ())
    LOGGER.info(f"Successfully saved records to {FILE_RECORDS}")

    return args
| 143 | + |
| 144 | + |
def fetch_museums_victoria_data(args, session):
    """
    Fetch the records of every type in RECORD_TYPES from the Museums Victoria
    API, page by page, handing each page of results to write_data().

    Args:
        args: parsed command-line namespace, passed through to write_data().
        session: requests session used to issue the GET requests.

    Raises:
        shared.QuantifyingException: with exit code 1 when a request fails,
            returns an HTTP error status, or the body is not valid JSON.
    """
    for record_type in RECORD_TYPES:
        current_page = 1
        total_pages = None

        LOGGER.info(f"--- Starting fetch for: {record_type.upper()} ---")

        while True:
            # Construct the API query parameters; the envelope wraps the
            # pagination metadata ("headers") and the records ("response")
            # in the JSON body
            params = {
                "recordtype": record_type,
                "perpage": MAX_PER_PAGE,
                "page": current_page,
                "envelope": "true",
            }
            try:
                r = session.get(BASE_URL, params=params, timeout=30)
                r.raise_for_status()
                data = r.json()
            except requests.HTTPError as e:
                raise shared.QuantifyingException(
                    f"HTTP Error: {e}", 1
                ) from e
            except requests.RequestException as e:
                raise shared.QuantifyingException(
                    f"Request Exception: {e}", 1
                ) from e
            except ValueError as e:
                # Older requests versions raise a plain ValueError when the
                # body is not valid JSON
                raise shared.QuantifyingException(
                    f"Invalid JSON response: {e}", 1
                ) from e

            results = data.get("response") or []
            write_data(args, results)

            # An empty page means there is nothing further to fetch
            if not results:
                break

            # Initialize total_pages on the first request for this record
            # type
            if total_pages is None:
                headers = data.get("headers") or {}
                # NOTE(review): the envelope is expected to carry a
                # "totalPages" value - confirm against the API docs.
                # "totalResults" counts records, not pages, so when only it
                # is available the page count is derived by ceiling division.
                total_pages = headers.get("totalPages")
                if total_pages is None:
                    total_results = int(headers.get("totalResults", 0))
                    total_pages = -(-total_results // MAX_PER_PAGE)
                total_pages = int(total_pages)

            # Advance to the next page and stop once the last one is done
            current_page += 1
            if current_page > total_pages:
                break
| 193 | + |
| 194 | + |
def main():
    """
    Script entry point: parse options, log paths, run the shared git
    fetch/merge step, fetch the API data, then run the shared git
    add/commit and push steps.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)

    repo_path = PATHS["repo"]
    shared.git_fetch_and_merge(args, repo_path)

    fetch_museums_victoria_data(args, get_requests_session())

    commit_message = f"Add and commit new Museums Victoria data for {QUARTER}"
    args = shared.git_add_and_commit(
        args, repo_path, PATHS["data_quarter"], commit_message
    )
    shared.git_push_changes(args, repo_path)
| 208 | + |
| 209 | + |
if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # Exit code 0 marks an informational stop; anything else is an error
        log_message = LOGGER.info if e.exit_code == 0 else LOGGER.error
        log_message(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Syntax-highlight the traceback and indent it under the log line
        highlighted = highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        )
        traceback_formatted = textwrap.indent(highlighted, " ")
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)
0 commit comments