66# Standard library
77import argparse
88import csv
9- import json
109import os
1110import sys
1211import textwrap
1312import traceback
13+ from collections import defaultdict
1414
1515# Third-party
1616import requests
3131
3232# Constants
BASE_URL = "https://collections.museumsvictoria.com.au/api/search"
# Output CSV files, all created under the directory PATHS["data_phase"]
FILE1_COUNT = shared.path_join(
    PATHS["data_phase"], "museums_victoria_1_count.csv"
)
FILE2_MEDIA = shared.path_join(
    PATHS["data_phase"], "museums_victoria_2_count_by_media.csv"
)
FILE3_RECORD = shared.path_join(
    PATHS["data_phase"], "museums_victoria_3_count_by_record.csv"
)
# Column headers for the three output files above, in the same order
HEADER1_COUNT = ["TOOL IDENTIFIER", "COUNT"]
HEADER2_MEDIA = ["TOOL IDENTIFIER", "MEDIA TYPE", "COUNT"]
HEADER3_RECORD = ["TOOL IDENTIFIER", "RECORD TYPE", "COUNT"]
MAX_PER_PAGE = 100  # Pagination limit as defined by the API documentation
QUARTER = os.path.basename(PATHS["data_quarter"])
4448RECORD_TYPES = [
@@ -103,51 +107,72 @@ def initialize_data_file(file_path, header):
103107 writer .writeheader ()
104108
105109
def initialize_all_data_files(args):
    """
    Create the phase data directory and initialize every output CSV file.

    Each output file is passed, together with its header, to
    initialize_data_file. Nothing is done unless args.enable_save is set.
    """
    if not args.enable_save:
        return

    # The data directory for this phase must exist before files are created
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    outputs = (
        (FILE1_COUNT, HEADER1_COUNT),
        (FILE2_MEDIA, HEADER2_MEDIA),
        (FILE3_RECORD, HEADER3_RECORD),
    )
    for file_path, header in outputs:
        initialize_data_file(file_path, header)
120+
121+
def write_counts_to_csv(args, data: dict):
    """
    Append the aggregated counts to their CSV files.

    Args:
        args: parsed arguments; nothing is written unless args.enable_save
            is set.
        data: mapping of output file path to counts. For FILE2_MEDIA and
            FILE3_RECORD the value is a nested mapping of
            {group: {identifier: count}}; for any other path it is a flat
            mapping of {identifier: count}.
    """
    if not args.enable_save:
        return

    # Files holding nested counts: header to use and the column that names
    # the outer grouping key
    grouped_files = {
        FILE2_MEDIA: (HEADER2_MEDIA, "MEDIA TYPE"),
        FILE3_RECORD: (HEADER3_RECORD, "RECORD TYPE"),
    }

    for file_path, counts in data.items():
        rows = []
        if file_path in grouped_files:
            fieldnames, group_column = grouped_files[file_path]
            for group, group_counts in counts.items():
                rows.extend(
                    {
                        "TOOL IDENTIFIER": identifier,
                        group_column: group,
                        "COUNT": count,
                    }
                    for identifier, count in group_counts.items()
                )
        else:
            fieldnames = HEADER1_COUNT
            rows = [
                {"TOOL IDENTIFIER": identifier, "COUNT": count}
                for identifier, count in counts.items()
            ]

        # Append mode: rows are added after whatever the file already holds.
        # newline must be exactly "\n"; open() rejects any other spelling
        # such as "\n " with ValueError.
        with open(
            file_path, "a", encoding="utf-8", newline="\n"
        ) as file_obj:
            writer = csv.DictWriter(
                file_obj, fieldnames=fieldnames, dialect="unix"
            )
            writer.writerows(rows)
143165
144-
145- def fetch_museums_victoria_data (args , session ):
166+ def fetch_museums_victoria_data (session ):
146167 """
147168 Fetches all records with images from the Museums Victoria API by iterating
148169 through all record types and handling pagination.
149170 """
150171
172+ record_counts = defaultdict (lambda : defaultdict (int ))
173+ media_counts = defaultdict (lambda : defaultdict (int ))
174+ licences_count = defaultdict (int )
175+
151176 # Iterate through each record type
152177 for record_type in RECORD_TYPES :
153178 current_page = 1
@@ -168,36 +193,53 @@ def fetch_museums_victoria_data(args, session):
168193 try :
169194 r = session .get (BASE_URL , params = params , timeout = 30 )
170195 r .raise_for_status ()
171- data = r .json ()
172- results = data .get ("response" , [])
173196 except requests .HTTPError as e :
174197 raise shared .QuantifyingException (f"HTTP Error: { e } " , 1 )
175198 except requests .RequestException as e :
176199 raise shared .QuantifyingException (f"Request Exception: { e } " , 1 )
177200 except KeyError as e :
178201 raise shared .QuantifyingException (f"KeyError: { e } " , 1 )
202+ data = r .json ()
203+ results = data .get ("response" , [])
204+ for res in results :
205+ media_list = res .get ("media" , [])
206+ for media_item in media_list :
207+ licence_data = media_item .get ("licence" )
208+
209+ # COUNTING THE UNIQUE LICENCE TYPES
210+ license_short_name = licence_data .get ("shortName" )
211+ if license_short_name :
212+ licences_count [license_short_name ] += 1
179213
180- # 3. Handle data and pagination metadata
181- write_data (args , results )
214+ # COUNTING LICENSES BY MEDIA TYPES
215+ media_type = media_item .get ("type" )
216+ media_counts [media_type ][license_short_name ] += 1
182217
183- # Initialize total_pages on the first request for this record type
218+ # COUNTING LICENSES BY RECORD TYPES
219+ record_counts [record_type ][license_short_name ] += 1
184220 if total_pages is None :
185221 headers = data .get ("headers" , {})
186222 # total_pages = 1
187223 total_pages = int (headers .get ("totalResults" , "0" ))
188224
189- # 4. Check for next page and break the loop if done
190225 current_page += 1
191226 if current_page > total_pages :
192227 break
228+ return {
229+ FILE1_COUNT : licences_count ,
230+ FILE2_MEDIA : media_counts ,
231+ FILE3_RECORD : record_counts ,
232+ }
193233
194234
195235def main ():
196236 args = parse_arguments ()
197237 shared .paths_log (LOGGER , PATHS )
198238 shared .git_fetch_and_merge (args , PATHS ["repo" ])
239+ initialize_all_data_files (args )
199240 session = get_requests_session ()
200- fetch_museums_victoria_data (args , session )
241+ data = fetch_museums_victoria_data (session )
242+ write_counts_to_csv (args , data )
201243 args = shared .git_add_and_commit (
202244 args ,
203245 PATHS ["repo" ],
0 commit comments