Commit 79e6ef5

fetching meaningful data about the size and distribution of the commons
1 parent 1c644a4 commit 79e6ef5

File tree

1 file changed: +91 -49 lines changed

scripts/1-fetch/museums_victoria_fetch.py

Lines changed: 91 additions & 49 deletions
@@ -6,11 +6,11 @@
 # Standard library
 import argparse
 import csv
-import json
 import os
 import sys
 import textwrap
 import traceback
+from collections import defaultdict
 
 # Third-party
 import requests
@@ -31,14 +31,18 @@
 
 # Constants
 BASE_URL = "https://collections.museumsvictoria.com.au/api/search"
-FILE_RECORDS = os.path.join(PATHS["data_phase"], "museums_raw.csv")
-HEADER_RECORDS = [
-    "ID",
-    "TITLE",
-    "RECORD TYPE",
-    "CONTENT LICENCE SHORT NAME",
-    "MEDIA JSON",
-]
+FILE1_COUNT = shared.path_join(
+    PATHS["data_phase"], "museums_victoria_1_count.csv"
+)
+FILE2_MEDIA = shared.path_join(
+    PATHS["data_phase"], "museums_victoria_2_count_by_media.csv"
+)
+FILE3_RECORD = shared.path_join(
+    PATHS["data_phase"], "museums_victoria_3_count_by_record.csv"
+)
+HEADER1_COUNT = ["TOOL IDENTIFIER", "COUNT"]
+HEADER2_MEDIA = ["TOOL IDENTIFIER", "MEDIA TYPE", "COUNT"]
+HEADER3_RECORD = ["TOOL IDENTIFIER", "RECORD TYPE", "COUNT"]
 MAX_PER_PAGE = 100  # Pagination limit as defined by the API documentation
 QUARTER = os.path.basename(PATHS["data_quarter"])
 RECORD_TYPES = [
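
Note (not part of the diff): the constants above swap the single raw-records CSV for three aggregate count files. As a rough illustration of the row shape each file uses, the sketch below writes one example row per file into the current directory with the headers from this hunk; the licence, media type, record type, and count values are hypothetical, and the real script writes into PATHS["data_phase"] instead.

# Illustrative sketch only, not part of the commit: the row shape of the
# three output CSVs. File names and headers come from the constants above;
# the licence, media type, record type, and counts are hypothetical values.
import csv

HEADER1_COUNT = ["TOOL IDENTIFIER", "COUNT"]
HEADER2_MEDIA = ["TOOL IDENTIFIER", "MEDIA TYPE", "COUNT"]
HEADER3_RECORD = ["TOOL IDENTIFIER", "RECORD TYPE", "COUNT"]

examples = {
    "museums_victoria_1_count.csv": (
        HEADER1_COUNT,
        {"TOOL IDENTIFIER": "CC BY", "COUNT": 12345},
    ),
    "museums_victoria_2_count_by_media.csv": (
        HEADER2_MEDIA,
        {"TOOL IDENTIFIER": "CC BY", "MEDIA TYPE": "image", "COUNT": 6789},
    ),
    "museums_victoria_3_count_by_record.csv": (
        HEADER3_RECORD,
        {"TOOL IDENTIFIER": "CC BY", "RECORD TYPE": "item", "COUNT": 2345},
    ),
}

# Write one example row per file, mirroring the CSV dialect the script uses.
for file_name, (header, row) in examples.items():
    with open(file_name, "w", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(file_obj, fieldnames=header, dialect="unix")
        writer.writeheader()
        writer.writerow(row)
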
@@ -103,51 +107,72 @@ def initialize_data_file(file_path, header):
         writer.writeheader()
 
 
-def write_data(args, data):
-    """
-    Saves the fetched records to a CSV file.
-    """
+def initialize_all_data_files(args):
     if not args.enable_save:
-        return args
-    LOGGER.info("Saving fetched data")
+        return
+
+    # Create data directory for this phase
     os.makedirs(PATHS["data_phase"], exist_ok=True)
-    for record in data:
-        media = record.get("media")
-        media_json_string = json.dumps(
-            [
-                {"type": i.get("type"), "licence": i.get("licence")}
-                for i in media
+
+    initialize_data_file(FILE1_COUNT, HEADER1_COUNT)
+    initialize_data_file(FILE2_MEDIA, HEADER2_MEDIA)
+    initialize_data_file(FILE3_RECORD, HEADER3_RECORD)
+
+
+def write_counts_to_csv(args, data: dict):
+    if not args.enable_save:
+        return
+    for data in data.items():
+        rows = []
+        file_path = data[0]
+        if file_path == FILE2_MEDIA:
+            fieldnames = HEADER2_MEDIA
+            for media_type in data[1].items():
+                rows.extend(
+                    {
+                        "TOOL IDENTIFIER": row[0],
+                        "MEDIA TYPE": media_type[0],
+                        "COUNT": row[1],
+                    }
+                    for row in media_type[1].items()
+                )
+        elif file_path == FILE3_RECORD:
+            fieldnames = HEADER3_RECORD
+            for record_type in data[1].items():
+                rows.extend(
+                    {
+                        "TOOL IDENTIFIER": row[0],
+                        "RECORD TYPE": record_type[0],
+                        "COUNT": row[1],
+                    }
+                    for row in record_type[1].items()
+                )
+        else:
+            fieldnames = HEADER1_COUNT
+            rows = [
+                {
+                    "TOOL IDENTIFIER": row[0],
+                    "COUNT": row[1],
+                }
+                for row in data[1].items()
             ]
-        )
-        content_license_short_name = record.get("licence", {}).get(
-            "shortName", "Not Found"
-        )
-        row = {
-            "ID": record.get("id"),
-            "TITLE": record.get("title"),
-            "RECORD TYPE": record.get("recordType"),
-            "CONTENT LICENCE SHORT NAME": sanitize_string(
-                content_license_short_name
-            ),
-            "MEDIA JSON": sanitize_string(media_json_string),
-        }
-        initialize_data_file(FILE_RECORDS, HEADER_RECORDS)
-        with open(FILE_RECORDS, "a", encoding="utf-8", newline="\n") as file:
+        with open(file_path, "a", encoding="utf-8", newline="\n") as file_obj:
             writer = csv.DictWriter(
-                file, fieldnames=HEADER_RECORDS, dialect="unix"
+                file_obj, fieldnames=fieldnames, dialect="unix"
            )
-            writer.writerow(row)
-    LOGGER.info(f"Successfully saved records to {FILE_RECORDS}")
+            writer.writerows(rows)
 
-    return args
 
-
-def fetch_museums_victoria_data(args, session):
+def fetch_museums_victoria_data(session):
     """
     Fetches all records with images from the Museums Victoria API by iterating
     through all record types and handling pagination.
     """
 
+    record_counts = defaultdict(lambda: defaultdict(int))
+    media_counts = defaultdict(lambda: defaultdict(int))
+    licences_count = defaultdict(int)
+
     # Iterate through each record type
     for record_type in RECORD_TYPES:
         current_page = 1
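
Note (not part of the diff): write_counts_to_csv indexes tuples positionally (data[0] is the file path, data[1] the nested counts, row[0] and row[1] a licence and its count). The same flattening, sketched standalone for the by-media file with named variables and hypothetical inputs, may be easier to follow:

# Standalone sketch, not the repository's code: the flattening that
# write_counts_to_csv performs for FILE2_MEDIA, written with named variables
# instead of tuple indexing. Media types, licences, and counts are hypothetical.
from collections import defaultdict

media_counts = defaultdict(lambda: defaultdict(int))
media_counts["image"]["CC BY"] += 3
media_counts["image"]["CC BY-NC"] += 1
media_counts["video"]["CC BY"] += 2

rows = [
    {"TOOL IDENTIFIER": licence, "MEDIA TYPE": media_type, "COUNT": count}
    for media_type, licences in media_counts.items()
    for licence, count in licences.items()
]
print(rows)
# [{'TOOL IDENTIFIER': 'CC BY', 'MEDIA TYPE': 'image', 'COUNT': 3}, ...]

Unpacking items() into named variables this way should produce the same rows as the committed tuple indexing.
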
@@ -168,36 +193,53 @@ def fetch_museums_victoria_data(args, session):
             try:
                 r = session.get(BASE_URL, params=params, timeout=30)
                 r.raise_for_status()
-                data = r.json()
-                results = data.get("response", [])
             except requests.HTTPError as e:
                 raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
             except requests.RequestException as e:
                 raise shared.QuantifyingException(f"Request Exception: {e}", 1)
             except KeyError as e:
                 raise shared.QuantifyingException(f"KeyError: {e}", 1)
+            data = r.json()
+            results = data.get("response", [])
+            for res in results:
+                media_list = res.get("media", [])
+                for media_item in media_list:
+                    licence_data = media_item.get("licence")
+
+                    # COUNTING THE UNIQUE LICENCE TYPES
+                    license_short_name = licence_data.get("shortName")
+                    if license_short_name:
+                        licences_count[license_short_name] += 1
 
-            # 3. Handle data and pagination metadata
-            write_data(args, results)
+                    # COUNTING LICENSES BY MEDIA TYPES
+                    media_type = media_item.get("type")
+                    media_counts[media_type][license_short_name] += 1
 
-            # Initialize total_pages on the first request for this record type
+                    # COUNTING LICENSES BY RECORD TYPES
+                    record_counts[record_type][license_short_name] += 1
             if total_pages is None:
                 headers = data.get("headers", {})
                 # total_pages = 1
                 total_pages = int(headers.get("totalResults", "0"))
 
-            # 4. Check for next page and break the loop if done
             current_page += 1
             if current_page > total_pages:
                 break
+    return {
+        FILE1_COUNT: licences_count,
+        FILE2_MEDIA: media_counts,
+        FILE3_RECORD: record_counts,
+    }
 
 
 def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
+    initialize_all_data_files(args)
     session = get_requests_session()
-    fetch_museums_victoria_data(args, session)
+    data = fetch_museums_victoria_data(session)
+    write_counts_to_csv(args, data)
     args = shared.git_add_and_commit(
         args,
         PATHS["repo"],
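
Note (not part of the diff): once the script has run with saving enabled (args.enable_save), a quick sanity check on the size and distribution data is to total one of the per-dimension files back into overall licence counts. A minimal sketch, assuming it runs from the directory holding the quarter's 1-fetch CSVs:

# Verification sketch, not part of the commit: totals the per-media counts
# back into an overall count per licence. Assumes the CSV produced by this
# script is present in the current directory.
import csv
from collections import defaultdict

file2_media = "museums_victoria_2_count_by_media.csv"

totals = defaultdict(int)
with open(file2_media, "r", encoding="utf-8", newline="") as file_obj:
    for row in csv.DictReader(file_obj):
        totals[row["TOOL IDENTIFIER"]] += int(row["COUNT"])

for licence, count in sorted(totals.items()):
    print(f"{licence}: {count}")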
