From 74a099c0f4553e7c2b7bd9214a46c95da009035d Mon Sep 17 00:00:00 2001 From: opsmithe Date: Sat, 8 Nov 2025 08:48:11 +0100 Subject: [PATCH 01/19] Add simple name output script --- output_name.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 output_name.py diff --git a/output_name.py b/output_name.py new file mode 100644 index 00000000..f4462d71 --- /dev/null +++ b/output_name.py @@ -0,0 +1 @@ +print("John Doe") From 6cec0a3a828926acca36bfd7debee2a28d9059b3 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 10 Nov 2025 12:06:45 +0100 Subject: [PATCH 02/19] feat: Complete DOAJ integration with API v4 and country code generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Migrate from DOAJ API v3 to v4 for enhanced metadata access - Add comprehensive CC license analysis for academic journals - Implement publisher and geographic distribution analysis - Add programmatic ISO 3166-1 alpha-2 country code generation - Include automatic dependency resolution and error handling - Apply date filtering (default ≥2002) to prevent false positives - Generate 5 CSV files plus provenance for comprehensive analysis - Ensure static analysis compliance and comprehensive testing This integration enables quantification of institutional commitment to Creative Commons licensing in the scholarly publishing ecosystem. --- dev/generate_country_codes.py | 286 ++++++++++++++++ scripts/1-fetch/doaj_fetch.py | 591 ++++++++++++++++++++++++++++++++++ 2 files changed, 877 insertions(+) create mode 100644 dev/generate_country_codes.py create mode 100644 scripts/1-fetch/doaj_fetch.py diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py new file mode 100644 index 00000000..4bb7ebd8 --- /dev/null +++ b/dev/generate_country_codes.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python +""" +Generate ISO 3166-1 alpha-2 country codes YAML file for DOAJ fetch script. 
+""" +import os +import sys +import yaml + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "scripts")) +import shared + +# ISO 3166-1 alpha-2 country codes (official list) +COUNTRIES = [ + {"code": "AD", "name": "Andorra"}, + {"code": "AE", "name": "United Arab Emirates"}, + {"code": "AF", "name": "Afghanistan"}, + {"code": "AG", "name": "Antigua and Barbuda"}, + {"code": "AI", "name": "Anguilla"}, + {"code": "AL", "name": "Albania"}, + {"code": "AM", "name": "Armenia"}, + {"code": "AO", "name": "Angola"}, + {"code": "AQ", "name": "Antarctica"}, + {"code": "AR", "name": "Argentina"}, + {"code": "AS", "name": "American Samoa"}, + {"code": "AT", "name": "Austria"}, + {"code": "AU", "name": "Australia"}, + {"code": "AW", "name": "Aruba"}, + {"code": "AX", "name": "Åland Islands"}, + {"code": "AZ", "name": "Azerbaijan"}, + {"code": "BA", "name": "Bosnia and Herzegovina"}, + {"code": "BB", "name": "Barbados"}, + {"code": "BD", "name": "Bangladesh"}, + {"code": "BE", "name": "Belgium"}, + {"code": "BF", "name": "Burkina Faso"}, + {"code": "BG", "name": "Bulgaria"}, + {"code": "BH", "name": "Bahrain"}, + {"code": "BI", "name": "Burundi"}, + {"code": "BJ", "name": "Benin"}, + {"code": "BL", "name": "Saint Barthélemy"}, + {"code": "BM", "name": "Bermuda"}, + {"code": "BN", "name": "Brunei"}, + {"code": "BO", "name": "Bolivia"}, + {"code": "BQ", "name": "Caribbean Netherlands"}, + {"code": "BR", "name": "Brazil"}, + {"code": "BS", "name": "Bahamas"}, + {"code": "BT", "name": "Bhutan"}, + {"code": "BV", "name": "Bouvet Island"}, + {"code": "BW", "name": "Botswana"}, + {"code": "BY", "name": "Belarus"}, + {"code": "BZ", "name": "Belize"}, + {"code": "CA", "name": "Canada"}, + {"code": "CC", "name": "Cocos Islands"}, + {"code": "CD", "name": "Democratic Republic of the Congo"}, + {"code": "CF", "name": "Central African Republic"}, + {"code": "CG", "name": "Republic of the Congo"}, + {"code": "CH", "name": 
"Switzerland"}, + {"code": "CI", "name": "Côte d'Ivoire"}, + {"code": "CK", "name": "Cook Islands"}, + {"code": "CL", "name": "Chile"}, + {"code": "CM", "name": "Cameroon"}, + {"code": "CN", "name": "China"}, + {"code": "CO", "name": "Colombia"}, + {"code": "CR", "name": "Costa Rica"}, + {"code": "CU", "name": "Cuba"}, + {"code": "CV", "name": "Cape Verde"}, + {"code": "CW", "name": "Curaçao"}, + {"code": "CX", "name": "Christmas Island"}, + {"code": "CY", "name": "Cyprus"}, + {"code": "CZ", "name": "Czech Republic"}, + {"code": "DE", "name": "Germany"}, + {"code": "DJ", "name": "Djibouti"}, + {"code": "DK", "name": "Denmark"}, + {"code": "DM", "name": "Dominica"}, + {"code": "DO", "name": "Dominican Republic"}, + {"code": "DZ", "name": "Algeria"}, + {"code": "EC", "name": "Ecuador"}, + {"code": "EE", "name": "Estonia"}, + {"code": "EG", "name": "Egypt"}, + {"code": "EH", "name": "Western Sahara"}, + {"code": "ER", "name": "Eritrea"}, + {"code": "ES", "name": "Spain"}, + {"code": "ET", "name": "Ethiopia"}, + {"code": "FI", "name": "Finland"}, + {"code": "FJ", "name": "Fiji"}, + {"code": "FK", "name": "Falkland Islands"}, + {"code": "FM", "name": "Micronesia"}, + {"code": "FO", "name": "Faroe Islands"}, + {"code": "FR", "name": "France"}, + {"code": "GA", "name": "Gabon"}, + {"code": "GB", "name": "United Kingdom"}, + {"code": "GD", "name": "Grenada"}, + {"code": "GE", "name": "Georgia"}, + {"code": "GF", "name": "French Guiana"}, + {"code": "GG", "name": "Guernsey"}, + {"code": "GH", "name": "Ghana"}, + {"code": "GI", "name": "Gibraltar"}, + {"code": "GL", "name": "Greenland"}, + {"code": "GM", "name": "Gambia"}, + {"code": "GN", "name": "Guinea"}, + {"code": "GP", "name": "Guadeloupe"}, + {"code": "GQ", "name": "Equatorial Guinea"}, + {"code": "GR", "name": "Greece"}, + {"code": "GS", "name": "South Georgia"}, + {"code": "GT", "name": "Guatemala"}, + {"code": "GU", "name": "Guam"}, + {"code": "GW", "name": "Guinea-Bissau"}, + {"code": "GY", "name": "Guyana"}, + 
{"code": "HK", "name": "Hong Kong"}, + {"code": "HM", "name": "Heard Island"}, + {"code": "HN", "name": "Honduras"}, + {"code": "HR", "name": "Croatia"}, + {"code": "HT", "name": "Haiti"}, + {"code": "HU", "name": "Hungary"}, + {"code": "ID", "name": "Indonesia"}, + {"code": "IE", "name": "Ireland"}, + {"code": "IL", "name": "Israel"}, + {"code": "IM", "name": "Isle of Man"}, + {"code": "IN", "name": "India"}, + {"code": "IO", "name": "British Indian Ocean Territory"}, + {"code": "IQ", "name": "Iraq"}, + {"code": "IR", "name": "Iran"}, + {"code": "IS", "name": "Iceland"}, + {"code": "IT", "name": "Italy"}, + {"code": "JE", "name": "Jersey"}, + {"code": "JM", "name": "Jamaica"}, + {"code": "JO", "name": "Jordan"}, + {"code": "JP", "name": "Japan"}, + {"code": "KE", "name": "Kenya"}, + {"code": "KG", "name": "Kyrgyzstan"}, + {"code": "KH", "name": "Cambodia"}, + {"code": "KI", "name": "Kiribati"}, + {"code": "KM", "name": "Comoros"}, + {"code": "KN", "name": "Saint Kitts and Nevis"}, + {"code": "KP", "name": "North Korea"}, + {"code": "KR", "name": "South Korea"}, + {"code": "KW", "name": "Kuwait"}, + {"code": "KY", "name": "Cayman Islands"}, + {"code": "KZ", "name": "Kazakhstan"}, + {"code": "LA", "name": "Laos"}, + {"code": "LB", "name": "Lebanon"}, + {"code": "LC", "name": "Saint Lucia"}, + {"code": "LI", "name": "Liechtenstein"}, + {"code": "LK", "name": "Sri Lanka"}, + {"code": "LR", "name": "Liberia"}, + {"code": "LS", "name": "Lesotho"}, + {"code": "LT", "name": "Lithuania"}, + {"code": "LU", "name": "Luxembourg"}, + {"code": "LV", "name": "Latvia"}, + {"code": "LY", "name": "Libya"}, + {"code": "MA", "name": "Morocco"}, + {"code": "MC", "name": "Monaco"}, + {"code": "MD", "name": "Moldova"}, + {"code": "ME", "name": "Montenegro"}, + {"code": "MF", "name": "Saint Martin"}, + {"code": "MG", "name": "Madagascar"}, + {"code": "MH", "name": "Marshall Islands"}, + {"code": "MK", "name": "North Macedonia"}, + {"code": "ML", "name": "Mali"}, + {"code": "MM", "name": 
"Myanmar"}, + {"code": "MN", "name": "Mongolia"}, + {"code": "MO", "name": "Macao"}, + {"code": "MP", "name": "Northern Mariana Islands"}, + {"code": "MQ", "name": "Martinique"}, + {"code": "MR", "name": "Mauritania"}, + {"code": "MS", "name": "Montserrat"}, + {"code": "MT", "name": "Malta"}, + {"code": "MU", "name": "Mauritius"}, + {"code": "MV", "name": "Maldives"}, + {"code": "MW", "name": "Malawi"}, + {"code": "MX", "name": "Mexico"}, + {"code": "MY", "name": "Malaysia"}, + {"code": "MZ", "name": "Mozambique"}, + {"code": "NA", "name": "Namibia"}, + {"code": "NC", "name": "New Caledonia"}, + {"code": "NE", "name": "Niger"}, + {"code": "NF", "name": "Norfolk Island"}, + {"code": "NG", "name": "Nigeria"}, + {"code": "NI", "name": "Nicaragua"}, + {"code": "NL", "name": "Netherlands"}, + {"code": "NO", "name": "Norway"}, + {"code": "NP", "name": "Nepal"}, + {"code": "NR", "name": "Nauru"}, + {"code": "NU", "name": "Niue"}, + {"code": "NZ", "name": "New Zealand"}, + {"code": "OM", "name": "Oman"}, + {"code": "PA", "name": "Panama"}, + {"code": "PE", "name": "Peru"}, + {"code": "PF", "name": "French Polynesia"}, + {"code": "PG", "name": "Papua New Guinea"}, + {"code": "PH", "name": "Philippines"}, + {"code": "PK", "name": "Pakistan"}, + {"code": "PL", "name": "Poland"}, + {"code": "PM", "name": "Saint Pierre and Miquelon"}, + {"code": "PN", "name": "Pitcairn Islands"}, + {"code": "PR", "name": "Puerto Rico"}, + {"code": "PS", "name": "Palestine"}, + {"code": "PT", "name": "Portugal"}, + {"code": "PW", "name": "Palau"}, + {"code": "PY", "name": "Paraguay"}, + {"code": "QA", "name": "Qatar"}, + {"code": "RE", "name": "Réunion"}, + {"code": "RO", "name": "Romania"}, + {"code": "RS", "name": "Serbia"}, + {"code": "RU", "name": "Russia"}, + {"code": "RW", "name": "Rwanda"}, + {"code": "SA", "name": "Saudi Arabia"}, + {"code": "SB", "name": "Solomon Islands"}, + {"code": "SC", "name": "Seychelles"}, + {"code": "SD", "name": "Sudan"}, + {"code": "SE", "name": "Sweden"}, + 
{"code": "SG", "name": "Singapore"}, + {"code": "SH", "name": "Saint Helena"}, + {"code": "SI", "name": "Slovenia"}, + {"code": "SJ", "name": "Svalbard and Jan Mayen"}, + {"code": "SK", "name": "Slovakia"}, + {"code": "SL", "name": "Sierra Leone"}, + {"code": "SM", "name": "San Marino"}, + {"code": "SN", "name": "Senegal"}, + {"code": "SO", "name": "Somalia"}, + {"code": "SR", "name": "Suriname"}, + {"code": "SS", "name": "South Sudan"}, + {"code": "ST", "name": "São Tomé and Príncipe"}, + {"code": "SV", "name": "El Salvador"}, + {"code": "SX", "name": "Sint Maarten"}, + {"code": "SY", "name": "Syria"}, + {"code": "SZ", "name": "Eswatini"}, + {"code": "TC", "name": "Turks and Caicos Islands"}, + {"code": "TD", "name": "Chad"}, + {"code": "TF", "name": "French Southern Territories"}, + {"code": "TG", "name": "Togo"}, + {"code": "TH", "name": "Thailand"}, + {"code": "TJ", "name": "Tajikistan"}, + {"code": "TK", "name": "Tokelau"}, + {"code": "TL", "name": "Timor-Leste"}, + {"code": "TM", "name": "Turkmenistan"}, + {"code": "TN", "name": "Tunisia"}, + {"code": "TO", "name": "Tonga"}, + {"code": "TR", "name": "Turkey"}, + {"code": "TT", "name": "Trinidad and Tobago"}, + {"code": "TV", "name": "Tuvalu"}, + {"code": "TW", "name": "Taiwan"}, + {"code": "TZ", "name": "Tanzania"}, + {"code": "UA", "name": "Ukraine"}, + {"code": "UG", "name": "Uganda"}, + {"code": "UM", "name": "U.S. Minor Outlying Islands"}, + {"code": "US", "name": "United States"}, + {"code": "UY", "name": "Uruguay"}, + {"code": "UZ", "name": "Uzbekistan"}, + {"code": "VA", "name": "Vatican City"}, + {"code": "VC", "name": "Saint Vincent and the Grenadines"}, + {"code": "VE", "name": "Venezuela"}, + {"code": "VG", "name": "British Virgin Islands"}, + {"code": "VI", "name": "U.S. 
Virgin Islands"}, + {"code": "VN", "name": "Vietnam"}, + {"code": "VU", "name": "Vanuatu"}, + {"code": "WF", "name": "Wallis and Futuna"}, + {"code": "WS", "name": "Samoa"}, + {"code": "YE", "name": "Yemen"}, + {"code": "YT", "name": "Mayotte"}, + {"code": "ZA", "name": "South Africa"}, + {"code": "ZM", "name": "Zambia"}, + {"code": "ZW", "name": "Zimbabwe"}, +] + + +def main(): + """Generate ISO country codes YAML file.""" + repo_path = shared.path_join(os.path.dirname(__file__), "..") + output_file = shared.path_join(repo_path, "data", "iso_country_codes.yaml") + + header = [ + "# ISO 3166-1 alpha-2 country codes to country names mapping", + "# Used by DOAJ API for publisher country identification", + "# Generated programmatically by dev/generate_country_codes.py", + ] + + with open(output_file, "w", encoding="utf-8") as f: + f.write("\n".join(header) + "\n") + yaml.dump(COUNTRIES, f, default_flow_style=False, allow_unicode=True) + + print(f"Generated {output_file} with {len(COUNTRIES)} country codes") + + +if __name__ == "__main__": + main() diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py new file mode 100644 index 00000000..dcdd0157 --- /dev/null +++ b/scripts/1-fetch/doaj_fetch.py @@ -0,0 +1,591 @@ +#!/usr/bin/env python +""" +Fetch DOAJ journals with CC license information using API v4. + +Note: Articles do not contain license information in DOAJ API. + +Default filtering by oa_start >= 2002 to avoid false positives from journals +that retroactively adopted CC licenses. Creative Commons was founded in 2001 +and first licenses released in 2002. Journals with oa_start before 2002 may +show CC licenses due to later license updates, not original terms. + +Country Code Mapping: +This script requires ISO 3166-1 alpha-2 country codes for publisher analysis. +If data/iso_country_codes.yaml is missing, the script will automatically +generate it using dev/generate_country_codes.py. 
Users do not need to manually +create this file - it will be created programmatically when needed. +""" +# Standard library +import argparse +import csv +import os +import sys +import textwrap +import time +import traceback +from collections import Counter, defaultdict + +# Third-party +import requests +import yaml +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) + +# Constants +BASE_URL = "https://doaj.org/api/v4/search" +DEFAULT_DATE_BACK = 2002 # Creative Commons licenses first released in 2002 +DEFAULT_FETCH_LIMIT = 1000 +RATE_LIMIT_DELAY = 0.5 + +# CSV Headers +HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] +HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE", "COUNT"] +HEADER_PUBLISHER = [ + "TOOL_IDENTIFIER", + "PUBLISHER", + "COUNTRY_CODE", + "COUNTRY_NAME", + "COUNT", +] +HEADER_SUBJECT_REPORT = [ + "TOOL_IDENTIFIER", + "SUBJECT_CODE", + "SUBJECT_LABEL", + "COUNT", +] +HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] + +# CC License types +CC_LICENSE_TYPES = [ + "CC BY", + "CC BY-NC", + "CC BY-SA", + "CC BY-ND", + "CC BY-NC-SA", + "CC BY-NC-ND", + "CC0", + "UNKNOWN CC legal tool", +] + +# Language code to readable name mapping +LANGUAGE_NAMES = { + "EN": "English", + "ES": "Spanish", + "PT": "Portuguese", + "FR": "French", + "DE": "German", + "IT": "Italian", + "RU": "Russian", + "ZH": "Chinese", + "JA": "Japanese", + "AR": "Arabic", + "TR": "Turkish", + "NL": "Dutch", + "SV": "Swedish", + "NO": "Norwegian", + "DA": "Danish", + "FI": "Finnish", + "PL": "Polish", + "CS": "Czech", + "HU": "Hungarian", + "RO": "Romanian", + "BG": "Bulgarian", + "HR": "Croatian", + "SK": "Slovak", + "SL": "Slovenian", + "ET": "Estonian", + "LV": "Latvian", + "LT": 
"Lithuanian", + "EL": "Greek", + "CA": "Catalan", + "IS": "Icelandic", + "MK": "Macedonian", + "SR": "Serbian", + "UK": "Ukrainian", + "BE": "Belarusian", + "KO": "Korean", + "TH": "Thai", + "VI": "Vietnamese", + "ID": "Indonesian", + "MS": "Malay", + "HI": "Hindi", + "BN": "Bengali", + "UR": "Urdu", + "FA": "Persian", + "HE": "Hebrew", + "SW": "Swahili", + "AF": "Afrikaans", +} + + +# Load ISO 3166-1 alpha-2 country codes from YAML file +def load_country_names(): + """ + Load country code to name mapping from YAML file. + + Automatically generates data/iso_country_codes.yaml if missing using + dev/generate_country_codes.py. This ensures the script is self-contained + and does not require manual file creation by users. + + Returns: + dict: Mapping of ISO 3166-1 alpha-2 codes to country names + """ + country_file = shared.path_join( + PATHS["repo"], "data", "iso_country_codes.yaml" + ) + + # Generate country codes file if it doesn't exist + if not os.path.isfile(country_file): + LOGGER.info("Country codes file not found, generating it...") + generate_script = shared.path_join( + PATHS["repo"], "dev", "generate_country_codes.py" + ) + try: + # Standard library + import subprocess + + subprocess.run([sys.executable, generate_script], check=True) + LOGGER.info("Successfully generated country codes file") + except Exception as e: + LOGGER.error(f"Failed to generate country codes file: {e}") + raise shared.QuantifyingException( + f"Critical error generating country codes: {e}", exit_code=1 + ) + + try: + with open(country_file, "r", encoding="utf-8") as fh: + countries = yaml.safe_load(fh) + return {country["code"]: country["name"] for country in countries} + except Exception as e: + LOGGER.error(f"Failed to load country codes from {country_file}: {e}") + raise shared.QuantifyingException( + f"Critical error loading country codes: {e}", exit_code=1 + ) + + +# File Paths +FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") 
+FILE_DOAJ_SUBJECT_REPORT = shared.path_join( + PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" +) +FILE_DOAJ_LANGUAGE = shared.path_join( + PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" +) +FILE_DOAJ_YEAR = shared.path_join( + PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" +) +FILE_DOAJ_PUBLISHER = shared.path_join( + PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" +) +FILE_PROVENANCE = shared.path_join( + PATHS["data_1-fetch"], "doaj_provenance.yaml" +) + +# Runtime variables +QUARTER = os.path.basename(PATHS["data_quarter"]) + + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Fetch DOAJ journals with CC licenses using API v4" + ) + parser.add_argument( + "--limit", + type=int, + default=DEFAULT_FETCH_LIMIT, + help=f"Total journals to fetch (default: {DEFAULT_FETCH_LIMIT})", + ) + parser.add_argument( + "--date-back", + type=int, + default=DEFAULT_DATE_BACK, + help=f"Only include journals with oa_start year >= this value " + f"(default: {DEFAULT_DATE_BACK}). Set to 2002 to avoid false " + f"positives from journals that retroactively adopted CC licenses " + f"after Creative Commons was established. 
Journals starting " + f"before 2002 may show CC licenses due to later updates, not " + f"original licensing terms.", + ) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving data to CSV files", + ) + parser.add_argument( + "--enable-git", action="store_true", help="Enable git actions" + ) + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + return args + + +def initialize_data_file(file_path, headers): + """Initialize CSV file with headers if it doesn't exist.""" + if not os.path.isfile(file_path): + with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj: + writer = csv.DictWriter( + file_obj, fieldnames=headers, dialect="unix" + ) + writer.writeheader() + + +def initialize_all_data_files(args): + """Initialize all data files.""" + if not args.enable_save: + return + os.makedirs(PATHS["data_1-fetch"], exist_ok=True) + initialize_data_file(FILE_DOAJ_COUNT, HEADER_COUNT) + initialize_data_file(FILE_DOAJ_SUBJECT_REPORT, HEADER_SUBJECT_REPORT) + initialize_data_file(FILE_DOAJ_LANGUAGE, HEADER_LANGUAGE) + initialize_data_file(FILE_DOAJ_YEAR, HEADER_YEAR) + initialize_data_file(FILE_DOAJ_PUBLISHER, HEADER_PUBLISHER) + + +def extract_license_type(license_info): + """Extract CC license type from DOAJ license information.""" + if not license_info: + return "UNKNOWN CC legal tool" + for lic in license_info: + lic_type = lic.get("type", "") + if lic_type in CC_LICENSE_TYPES: + return lic_type + return "UNKNOWN CC legal tool" + + +def process_journals(session, args): + """Process DOAJ journals with CC licenses using API v4.""" + LOGGER.info("Fetching DOAJ journals...") + + license_counts = Counter() + subject_counts = defaultdict(Counter) + language_counts = defaultdict(Counter) + year_counts = defaultdict(Counter) + publisher_counts = defaultdict(Counter) + + total_processed = 0 + page = 1 + page_size = 100 + + while total_processed < args.limit: + 
LOGGER.info(f"Fetching journals page {page}...") + + url = f"{BASE_URL}/journals/*" + params = {"pageSize": page_size, "page": page} + + try: + response = session.get(url, params=params, timeout=30) + response.raise_for_status() + data = response.json() + except requests.exceptions.RequestException as e: + if hasattr(e, "response") and e.response.status_code == 400: + LOGGER.info(f"Reached end of available data at page {page}") + break + else: + LOGGER.error(f"Failed to fetch journals page {page}: {e}") + raise shared.QuantifyingException( + f"Critical API error on page {page}: {e}", exit_code=1 + ) + except (ValueError, KeyError) as e: + LOGGER.error(f"Failed to parse JSON response on page {page}: {e}") + raise shared.QuantifyingException( + f"Critical JSON parsing error on page {page}: {e}", exit_code=1 + ) + + try: + results = data.get("results", []) + if not results: + break + except (AttributeError, TypeError) as e: + LOGGER.error(f"Invalid API response structure on page {page}: {e}") + raise shared.QuantifyingException( + f"Critical API response format error on page {page}: {e}", + exit_code=1, + ) + + for journal in results: + if total_processed >= args.limit: + break + + try: + bibjson = journal.get("bibjson", {}) + + # Check for CC license + license_info = bibjson.get("license") + if not license_info: + continue + + license_type = extract_license_type(license_info) + if license_type == "UNKNOWN CC legal tool": + continue + + license_counts[license_type] += 1 + + # Extract subjects + subjects = bibjson.get("subject", []) + for subject in subjects: + if isinstance(subject, dict): + code = subject.get("code", "") + term = subject.get("term", "") + if code and term: + subject_counts[license_type][f"{code}|{term}"] += 1 + + # Extract year from oa_start (Open Access start year) + oa_start = bibjson.get("oa_start") + + # Apply date-back filter if specified + if args.date_back and oa_start and oa_start < args.date_back: + continue + + if oa_start: + 
year_counts[license_type][str(oa_start)] += 1 + else: + year_counts[license_type]["Unknown"] += 1 + + # Extract languages + languages = bibjson.get("language", []) + for lang in languages: + language_counts[license_type][lang] += 1 + + # Extract publisher information (new in v4) + publisher_info = bibjson.get("publisher", {}) + if publisher_info: + publisher_name = publisher_info.get("name", "Unknown") + publisher_country = publisher_info.get( + "country", "Unknown" + ) + publisher_key = f"{publisher_name}|{publisher_country}" + publisher_counts[license_type][publisher_key] += 1 + + total_processed += 1 + + except (KeyError, AttributeError, TypeError) as e: + LOGGER.warning( + f"Skipping malformed journal record on page {page}: {e}" + ) + continue + except Exception as e: + LOGGER.error( + f"Unexpected error processing journal on page {page}: {e}" + ) + raise shared.QuantifyingException( + f"Critical error processing journal data on page {page}: " + f"{e}", + exit_code=1, + ) + + page += 1 + time.sleep(RATE_LIMIT_DELAY) + + return ( + license_counts, + subject_counts, + language_counts, + year_counts, + publisher_counts, + total_processed, + ) + + +def save_count_data( + license_counts, + subject_counts, + language_counts, + year_counts, + publisher_counts, +): + """Save all collected data to CSV files.""" + + # Load country names from YAML + country_names = load_country_names() + + # Save license counts + with open(FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n") as fh: + writer = csv.DictWriter(fh, fieldnames=HEADER_COUNT, dialect="unix") + writer.writeheader() + for lic, count in license_counts.items(): + writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) + + # Save subject report + with open( + FILE_DOAJ_SUBJECT_REPORT, "w", encoding="utf-8", newline="\n" + ) as fh: + writer = csv.DictWriter( + fh, fieldnames=HEADER_SUBJECT_REPORT, dialect="unix" + ) + writer.writeheader() + for lic, subjects in subject_counts.items(): + for subject_info, count in 
subjects.items(): + if "|" in subject_info: + code, label = subject_info.split("|", 1) + else: + code, label = subject_info, subject_info + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "SUBJECT_CODE": code, + "SUBJECT_LABEL": label, + "COUNT": count, + } + ) + + # Save language counts with readable names + with open(FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n") as fh: + writer = csv.DictWriter(fh, fieldnames=HEADER_LANGUAGE, dialect="unix") + writer.writeheader() + for lic, languages in language_counts.items(): + for lang_code, count in languages.items(): + lang_name = LANGUAGE_NAMES.get(lang_code, lang_code) + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "LANGUAGE_CODE": lang_code, + "LANGUAGE": lang_name, + "COUNT": count, + } + ) + + # Save year counts + with open(FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n") as fh: + writer = csv.DictWriter(fh, fieldnames=HEADER_YEAR, dialect="unix") + writer.writeheader() + for lic, years in year_counts.items(): + for year, count in years.items(): + writer.writerow( + {"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": count} + ) + + # Save publisher counts + with open(FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n") as fh: + writer = csv.DictWriter( + fh, fieldnames=HEADER_PUBLISHER, dialect="unix" + ) + writer.writeheader() + for lic, publishers in publisher_counts.items(): + for publisher_info, count in publishers.items(): + if "|" in publisher_info: + publisher, country_code = publisher_info.split("|", 1) + else: + publisher, country_code = publisher_info, "Unknown" + + country_name = country_names.get(country_code, country_code) + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "PUBLISHER": publisher, + "COUNTRY_CODE": country_code, + "COUNTRY_NAME": country_name, + "COUNT": count, + } + ) + + +def query_doaj(args): + """Main function to query DOAJ API v4.""" + session = shared.get_session() + + LOGGER.info("Processing DOAJ journals with DOAJ API v4") + + # Process journals + ( + 
license_counts, + subject_counts, + language_counts, + year_counts, + publisher_counts, + journals_processed, + ) = process_journals(session, args) + + # Save results + if args.enable_save: + save_count_data( + license_counts, + subject_counts, + language_counts, + year_counts, + publisher_counts, + ) + + # Save provenance + provenance_data = { + "total_articles_fetched": 0, + "total_journals_fetched": journals_processed, + "total_processed": journals_processed, + "limit": args.limit, + "date_back_filter": args.date_back, + "quarter": QUARTER, + "script": os.path.basename(__file__), + "api_version": "v4", + "note": "Articles do not contain license information in DOAJ API", + } + + try: + with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as fh: + yaml.dump(provenance_data, fh, default_flow_style=False, indent=2) + except Exception as e: + LOGGER.error("Failed to write provenance file: %s", e) + raise shared.QuantifyingException( + f"Critical error writing provenance file: {e}", exit_code=1 + ) + + LOGGER.info(f"Total CC licensed journals processed: {journals_processed}") + LOGGER.info( + "Articles: 0 (DOAJ API doesn't provide license info for articles)" + ) + + +def main(): + """Main function.""" + LOGGER.info("Script execution started.") + args = parse_arguments() + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + initialize_all_data_files(args) + query_doaj(args) + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit new DOAJ CC license data for {QUARTER} using API v4", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + 
LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") + sys.exit(1) From 8c80845485bfbc0b12397b6a65cd5fcbc139dc4d Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 10 Nov 2025 12:14:48 +0100 Subject: [PATCH 03/19] Delete output_name.py --- output_name.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 output_name.py diff --git a/output_name.py b/output_name.py deleted file mode 100644 index f4462d71..00000000 --- a/output_name.py +++ /dev/null @@ -1 +0,0 @@ -print("John Doe") From 9f8df08d841451d402201bd1da370f33f6c9ce6d Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 10 Nov 2025 12:30:44 +0100 Subject: [PATCH 04/19] Make doaj_fetch.py executable --- scripts/1-fetch/doaj_fetch.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/1-fetch/doaj_fetch.py diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py old mode 100644 new mode 100755 From 2986e451a9bbd501e00f46f5fa18a8a1d290a13c Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 10 Nov 2025 12:35:17 +0100 Subject: [PATCH 05/19] Make generate_country_codes.py executable --- dev/generate_country_codes.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 dev/generate_country_codes.py diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py old mode 100644 new mode 100755 From 521b158a0b98cd92ba383709452870daa1597b03 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 10:17:25 +0100 Subject: [PATCH 06/19] Move file path constants to top-level before function definitions --- scripts/1-fetch/doaj_fetch.py | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py 
b/scripts/1-fetch/doaj_fetch.py index dcdd0157..509eea66 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -127,6 +127,24 @@ "AF": "Afrikaans", } +# File Paths +FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") +FILE_DOAJ_SUBJECT_REPORT = shared.path_join( + PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" +) +FILE_DOAJ_LANGUAGE = shared.path_join( + PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" +) +FILE_DOAJ_YEAR = shared.path_join( + PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" +) +FILE_DOAJ_PUBLISHER = shared.path_join( + PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" +) +FILE_PROVENANCE = shared.path_join( + PATHS["data_1-fetch"], "doaj_provenance.yaml" +) + # Load ISO 3166-1 alpha-2 country codes from YAML file def load_country_names(): @@ -173,24 +191,6 @@ def load_country_names(): ) -# File Paths -FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") -FILE_DOAJ_SUBJECT_REPORT = shared.path_join( - PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" -) -FILE_DOAJ_LANGUAGE = shared.path_join( - PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" -) -FILE_DOAJ_YEAR = shared.path_join( - PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" -) -FILE_DOAJ_PUBLISHER = shared.path_join( - PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" -) -FILE_PROVENANCE = shared.path_join( - PATHS["data_1-fetch"], "doaj_provenance.yaml" -) - # Runtime variables QUARTER = os.path.basename(PATHS["data_quarter"]) From 1b5312aafd37ed746645834bb9f7bae944c6a816 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 10:23:38 +0100 Subject: [PATCH 07/19] Organize constants alphabetically within logical groups --- scripts/1-fetch/doaj_fetch.py | 124 +++++++++++++++++----------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 509eea66..e63fee8c 100755 --- 
a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -47,6 +47,36 @@ DEFAULT_FETCH_LIMIT = 1000 RATE_LIMIT_DELAY = 0.5 +# CC License types +CC_LICENSE_TYPES = [ + "CC BY", + "CC BY-NC", + "CC BY-SA", + "CC BY-ND", + "CC BY-NC-SA", + "CC BY-NC-ND", + "CC0", + "UNKNOWN CC legal tool", +] + +# File Paths +FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") +FILE_DOAJ_LANGUAGE = shared.path_join( + PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" +) +FILE_DOAJ_PUBLISHER = shared.path_join( + PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" +) +FILE_DOAJ_SUBJECT_REPORT = shared.path_join( + PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" +) +FILE_PROVENANCE = shared.path_join( + PATHS["data_1-fetch"], "doaj_provenance.yaml" +) +FILE_DOAJ_YEAR = shared.path_join( + PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" +) + # CSV Headers HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE", "COUNT"] @@ -65,86 +95,56 @@ ] HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] -# CC License types -CC_LICENSE_TYPES = [ - "CC BY", - "CC BY-NC", - "CC BY-SA", - "CC BY-ND", - "CC BY-NC-SA", - "CC BY-NC-ND", - "CC0", - "UNKNOWN CC legal tool", -] - # Language code to readable name mapping LANGUAGE_NAMES = { + "AF": "Afrikaans", + "AR": "Arabic", + "BE": "Belarusian", + "BG": "Bulgarian", + "BN": "Bengali", + "CA": "Catalan", + "CS": "Czech", + "DA": "Danish", + "DE": "German", + "EL": "Greek", "EN": "English", "ES": "Spanish", - "PT": "Portuguese", + "ET": "Estonian", + "FA": "Persian", + "FI": "Finnish", "FR": "French", - "DE": "German", + "HE": "Hebrew", + "HI": "Hindi", + "HR": "Croatian", + "HU": "Hungarian", + "ID": "Indonesian", + "IS": "Icelandic", "IT": "Italian", - "RU": "Russian", - "ZH": "Chinese", "JA": "Japanese", - "AR": "Arabic", - "TR": "Turkish", + "KO": "Korean", + "LT": "Lithuanian", + "LV": "Latvian", + "MK": "Macedonian", + "MS": "Malay", "NL": 
"Dutch", - "SV": "Swedish", "NO": "Norwegian", - "DA": "Danish", - "FI": "Finnish", "PL": "Polish", - "CS": "Czech", - "HU": "Hungarian", + "PT": "Portuguese", "RO": "Romanian", - "BG": "Bulgarian", - "HR": "Croatian", + "RU": "Russian", "SK": "Slovak", "SL": "Slovenian", - "ET": "Estonian", - "LV": "Latvian", - "LT": "Lithuanian", - "EL": "Greek", - "CA": "Catalan", - "IS": "Icelandic", - "MK": "Macedonian", "SR": "Serbian", - "UK": "Ukrainian", - "BE": "Belarusian", - "KO": "Korean", + "SV": "Swedish", + "SW": "Swahili", "TH": "Thai", - "VI": "Vietnamese", - "ID": "Indonesian", - "MS": "Malay", - "HI": "Hindi", - "BN": "Bengali", + "TR": "Turkish", + "UK": "Ukrainian", "UR": "Urdu", - "FA": "Persian", - "HE": "Hebrew", - "SW": "Swahili", - "AF": "Afrikaans", + "VI": "Vietnamese", + "ZH": "Chinese", } -# File Paths -FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") -FILE_DOAJ_SUBJECT_REPORT = shared.path_join( - PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" -) -FILE_DOAJ_LANGUAGE = shared.path_join( - PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" -) -FILE_DOAJ_YEAR = shared.path_join( - PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" -) -FILE_DOAJ_PUBLISHER = shared.path_join( - PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" -) -FILE_PROVENANCE = shared.path_join( - PATHS["data_1-fetch"], "doaj_provenance.yaml" -) - # Load ISO 3166-1 alpha-2 country codes from YAML file def load_country_names(): From cd0d3f516273a1698192f663375506439fb4a621 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 10:34:06 +0100 Subject: [PATCH 08/19] Move subprocess import to top-level imports section --- scripts/1-fetch/doaj_fetch.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index e63fee8c..e7a80d65 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -19,6 +19,7 @@ import argparse import csv import 
os +import subprocess import sys import textwrap import time @@ -169,9 +170,6 @@ def load_country_names(): PATHS["repo"], "dev", "generate_country_codes.py" ) try: - # Standard library - import subprocess - subprocess.run([sys.executable, generate_script], check=True) LOGGER.info("Successfully generated country codes file") except Exception as e: From 29856ee5b750e6a007bc42f4c9982f2941f9dd30 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 11:11:23 +0100 Subject: [PATCH 09/19] Replace 'f' with 'file_object' for descriptive variable naming --- dev/generate_country_codes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py index 4bb7ebd8..1c2db056 100755 --- a/dev/generate_country_codes.py +++ b/dev/generate_country_codes.py @@ -275,9 +275,9 @@ def main(): "# Generated programmatically by dev/generate_country_codes.py", ] - with open(output_file, "w", encoding="utf-8") as f: - f.write("\n".join(header) + "\n") - yaml.dump(COUNTRIES, f, default_flow_style=False, allow_unicode=True) + with open(output_file, "w", encoding="utf-8") as file_object: + file_object.write("\n".join(header) + "\n") + yaml.dump(COUNTRIES, file_object, default_flow_style=False, allow_unicode=True) print(f"Generated {output_file} with {len(COUNTRIES)} country codes") From 33fa72333987934da8716c66c5d153e09cc9a60b Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 11:11:47 +0100 Subject: [PATCH 10/19] Replace 'fh' with 'file_object' for descriptive variable naming --- scripts/1-fetch/doaj_fetch.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index e7a80d65..250eb646 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -179,8 +179,8 @@ def load_country_names(): ) try: - with open(country_file, "r", encoding="utf-8") as fh: - countries = 
yaml.safe_load(fh) + with open(country_file, "r", encoding="utf-8") as file_object: + countries = yaml.safe_load(file_object) return {country["code"]: country["name"] for country in countries} except Exception as e: LOGGER.error(f"Failed to load country codes from {country_file}: {e}") @@ -409,8 +409,8 @@ def save_count_data( country_names = load_country_names() # Save license counts - with open(FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_COUNT, dialect="unix") + with open(FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n") as file_object: + writer = csv.DictWriter(file_object, fieldnames=HEADER_COUNT, dialect="unix") writer.writeheader() for lic, count in license_counts.items(): writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) @@ -418,9 +418,9 @@ def save_count_data( # Save subject report with open( FILE_DOAJ_SUBJECT_REPORT, "w", encoding="utf-8", newline="\n" - ) as fh: + ) as file_object: writer = csv.DictWriter( - fh, fieldnames=HEADER_SUBJECT_REPORT, dialect="unix" + file_object, fieldnames=HEADER_SUBJECT_REPORT, dialect="unix" ) writer.writeheader() for lic, subjects in subject_counts.items(): @@ -439,8 +439,8 @@ def save_count_data( ) # Save language counts with readable names - with open(FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_LANGUAGE, dialect="unix") + with open(FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n") as file_object: + writer = csv.DictWriter(file_object, fieldnames=HEADER_LANGUAGE, dialect="unix") writer.writeheader() for lic, languages in language_counts.items(): for lang_code, count in languages.items(): @@ -455,8 +455,8 @@ def save_count_data( ) # Save year counts - with open(FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_YEAR, dialect="unix") + with open(FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n") as file_object: + 
writer = csv.DictWriter(file_object, fieldnames=HEADER_YEAR, dialect="unix") writer.writeheader() for lic, years in year_counts.items(): for year, count in years.items(): @@ -465,9 +465,9 @@ def save_count_data( ) # Save publisher counts - with open(FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n") as fh: + with open(FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n") as file_object: writer = csv.DictWriter( - fh, fieldnames=HEADER_PUBLISHER, dialect="unix" + file_object, fieldnames=HEADER_PUBLISHER, dialect="unix" ) writer.writeheader() for lic, publishers in publisher_counts.items(): @@ -529,8 +529,8 @@ def query_doaj(args): } try: - with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as fh: - yaml.dump(provenance_data, fh, default_flow_style=False, indent=2) + with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as file_object: + yaml.dump(provenance_data, file_object, default_flow_style=False, indent=2) except Exception as e: LOGGER.error("Failed to write provenance file: %s", e) raise shared.QuantifyingException( From ae20f2423c7f6935721e6de37e4365ae67c1728b Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 11:38:28 +0100 Subject: [PATCH 11/19] Fix static analysis issues and code formatting --- dev/generate_country_codes.py | 19 ++++++++++++----- scripts/1-fetch/doaj_fetch.py | 39 +++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py index 1c2db056..a70ba79a 100755 --- a/dev/generate_country_codes.py +++ b/dev/generate_country_codes.py @@ -2,13 +2,17 @@ """ Generate ISO 3166-1 alpha-2 country codes YAML file for DOAJ fetch script. 
""" +# Standard library import os import sys + +# Third-party import yaml # Add parent directory so shared can be imported sys.path.append(os.path.join(os.path.dirname(__file__), "..", "scripts")) -import shared +# First-party/Local +import shared # noqa: E402 # ISO 3166-1 alpha-2 country codes (official list) COUNTRIES = [ @@ -268,17 +272,22 @@ def main(): """Generate ISO country codes YAML file.""" repo_path = shared.path_join(os.path.dirname(__file__), "..") output_file = shared.path_join(repo_path, "data", "iso_country_codes.yaml") - + header = [ "# ISO 3166-1 alpha-2 country codes to country names mapping", "# Used by DOAJ API for publisher country identification", "# Generated programmatically by dev/generate_country_codes.py", ] - + with open(output_file, "w", encoding="utf-8") as file_object: file_object.write("\n".join(header) + "\n") - yaml.dump(COUNTRIES, file_object, default_flow_style=False, allow_unicode=True) - + yaml.dump( + COUNTRIES, + file_object, + default_flow_style=False, + allow_unicode=True, + ) + print(f"Generated {output_file} with {len(COUNTRIES)} country codes") diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 250eb646..7a298b32 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -409,8 +409,12 @@ def save_count_data( country_names = load_country_names() # Save license counts - with open(FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n") as file_object: - writer = csv.DictWriter(file_object, fieldnames=HEADER_COUNT, dialect="unix") + with open( + FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_COUNT, dialect="unix" + ) writer.writeheader() for lic, count in license_counts.items(): writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) @@ -439,8 +443,12 @@ def save_count_data( ) # Save language counts with readable names - with open(FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n") 
as file_object: - writer = csv.DictWriter(file_object, fieldnames=HEADER_LANGUAGE, dialect="unix") + with open( + FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_LANGUAGE, dialect="unix" + ) writer.writeheader() for lic, languages in language_counts.items(): for lang_code, count in languages.items(): @@ -455,8 +463,12 @@ def save_count_data( ) # Save year counts - with open(FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n") as file_object: - writer = csv.DictWriter(file_object, fieldnames=HEADER_YEAR, dialect="unix") + with open( + FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_YEAR, dialect="unix" + ) writer.writeheader() for lic, years in year_counts.items(): for year, count in years.items(): @@ -465,7 +477,9 @@ def save_count_data( ) # Save publisher counts - with open(FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n") as file_object: + with open( + FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n" + ) as file_object: writer = csv.DictWriter( file_object, fieldnames=HEADER_PUBLISHER, dialect="unix" ) @@ -529,8 +543,15 @@ def query_doaj(args): } try: - with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as file_object: - yaml.dump(provenance_data, file_object, default_flow_style=False, indent=2) + with open( + FILE_PROVENANCE, "w", encoding="utf-8", newline="\n" + ) as file_object: + yaml.dump( + provenance_data, + file_object, + default_flow_style=False, + indent=2, + ) except Exception as e: LOGGER.error("Failed to write provenance file: %s", e) raise shared.QuantifyingException( From 3d75671a46be1b01224ccd52a8805c7bee05a7df Mon Sep 17 00:00:00 2001 From: opsmithe Date: Sat, 15 Nov 2025 19:31:23 +0100 Subject: [PATCH 12/19] Fix duplicate counting for journals with multiple CC license types --- scripts/1-fetch/doaj_fetch.py | 323 +++++++--------------------------- 
1 file changed, 63 insertions(+), 260 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 7a298b32..996befae 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -2,24 +2,24 @@ """ Fetch DOAJ journals with CC license information using API v4. +Focus: Journal-level CC license adoption and temporal trends. Note: Articles do not contain license information in DOAJ API. +This script focuses on essential data for quantifying Creative Commons adoption: +- Journal CC license counts by type +- Temporal trends (year-by-year adoption) + +Removed out-of-scope data: subjects, languages, publishers, countries. + Default filtering by oa_start >= 2002 to avoid false positives from journals that retroactively adopted CC licenses. Creative Commons was founded in 2001 and first licenses released in 2002. Journals with oa_start before 2002 may show CC licenses due to later license updates, not original terms. - -Country Code Mapping: -This script requires ISO 3166-1 alpha-2 country codes for publisher analysis. -If data/iso_country_codes.yaml is missing, the script will automatically -generate it using dev/generate_country_codes.py. Users do not need to manually -create this file - it will be created programmatically when needed. 
""" # Standard library import argparse import csv import os -import subprocess import sys import textwrap import time @@ -62,15 +62,6 @@ # File Paths FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") -FILE_DOAJ_LANGUAGE = shared.path_join( - PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" -) -FILE_DOAJ_PUBLISHER = shared.path_join( - PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" -) -FILE_DOAJ_SUBJECT_REPORT = shared.path_join( - PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" -) FILE_PROVENANCE = shared.path_join( PATHS["data_1-fetch"], "doaj_provenance.yaml" ) @@ -80,115 +71,8 @@ # CSV Headers HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] -HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE", "COUNT"] -HEADER_PUBLISHER = [ - "TOOL_IDENTIFIER", - "PUBLISHER", - "COUNTRY_CODE", - "COUNTRY_NAME", - "COUNT", -] -HEADER_SUBJECT_REPORT = [ - "TOOL_IDENTIFIER", - "SUBJECT_CODE", - "SUBJECT_LABEL", - "COUNT", -] HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] -# Language code to readable name mapping -LANGUAGE_NAMES = { - "AF": "Afrikaans", - "AR": "Arabic", - "BE": "Belarusian", - "BG": "Bulgarian", - "BN": "Bengali", - "CA": "Catalan", - "CS": "Czech", - "DA": "Danish", - "DE": "German", - "EL": "Greek", - "EN": "English", - "ES": "Spanish", - "ET": "Estonian", - "FA": "Persian", - "FI": "Finnish", - "FR": "French", - "HE": "Hebrew", - "HI": "Hindi", - "HR": "Croatian", - "HU": "Hungarian", - "ID": "Indonesian", - "IS": "Icelandic", - "IT": "Italian", - "JA": "Japanese", - "KO": "Korean", - "LT": "Lithuanian", - "LV": "Latvian", - "MK": "Macedonian", - "MS": "Malay", - "NL": "Dutch", - "NO": "Norwegian", - "PL": "Polish", - "PT": "Portuguese", - "RO": "Romanian", - "RU": "Russian", - "SK": "Slovak", - "SL": "Slovenian", - "SR": "Serbian", - "SV": "Swedish", - "SW": "Swahili", - "TH": "Thai", - "TR": "Turkish", - "UK": "Ukrainian", - "UR": "Urdu", - "VI": "Vietnamese", - "ZH": "Chinese", -} - - -# Load 
ISO 3166-1 alpha-2 country codes from YAML file -def load_country_names(): - """ - Load country code to name mapping from YAML file. - - Automatically generates data/iso_country_codes.yaml if missing using - dev/generate_country_codes.py. This ensures the script is self-contained - and does not require manual file creation by users. - - Returns: - dict: Mapping of ISO 3166-1 alpha-2 codes to country names - """ - country_file = shared.path_join( - PATHS["repo"], "data", "iso_country_codes.yaml" - ) - - # Generate country codes file if it doesn't exist - if not os.path.isfile(country_file): - LOGGER.info("Country codes file not found, generating it...") - generate_script = shared.path_join( - PATHS["repo"], "dev", "generate_country_codes.py" - ) - try: - subprocess.run([sys.executable, generate_script], check=True) - LOGGER.info("Successfully generated country codes file") - except Exception as e: - LOGGER.error(f"Failed to generate country codes file: {e}") - raise shared.QuantifyingException( - f"Critical error generating country codes: {e}", exit_code=1 - ) - - try: - with open(country_file, "r", encoding="utf-8") as file_object: - countries = yaml.safe_load(file_object) - return {country["code"]: country["name"] for country in countries} - except Exception as e: - LOGGER.error(f"Failed to load country codes from {country_file}: {e}") - raise shared.QuantifyingException( - f"Critical error loading country codes: {e}", exit_code=1 - ) - - # Runtime variables QUARTER = os.path.basename(PATHS["data_quarter"]) @@ -245,21 +129,21 @@ def initialize_all_data_files(args): return os.makedirs(PATHS["data_1-fetch"], exist_ok=True) initialize_data_file(FILE_DOAJ_COUNT, HEADER_COUNT) - initialize_data_file(FILE_DOAJ_SUBJECT_REPORT, HEADER_SUBJECT_REPORT) - initialize_data_file(FILE_DOAJ_LANGUAGE, HEADER_LANGUAGE) initialize_data_file(FILE_DOAJ_YEAR, HEADER_YEAR) - initialize_data_file(FILE_DOAJ_PUBLISHER, HEADER_PUBLISHER) -def extract_license_type(license_info): - """Extract 
CC license type from DOAJ license information.""" +def extract_license_types(license_info): + """Extract all CC license types from DOAJ license information.""" if not license_info: - return "UNKNOWN CC legal tool" + return [] + + cc_licenses = [] for lic in license_info: lic_type = lic.get("type", "") if lic_type in CC_LICENSE_TYPES: - return lic_type - return "UNKNOWN CC legal tool" + cc_licenses.append(lic_type) + + return cc_licenses def process_journals(session, args): @@ -267,10 +151,9 @@ def process_journals(session, args): LOGGER.info("Fetching DOAJ journals...") license_counts = Counter() - subject_counts = defaultdict(Counter) - language_counts = defaultdict(Counter) year_counts = defaultdict(Counter) - publisher_counts = defaultdict(Counter) + article_counts = defaultdict(int) # Track total articles per license type + processed_journals = set() # Track unique journals to avoid double counting total_processed = 0 page = 1 @@ -319,52 +202,45 @@ def process_journals(session, args): try: bibjson = journal.get("bibjson", {}) - # Check for CC license + # Get journal identifier to avoid double counting + journal_id = journal.get("id", "") + if not journal_id: + continue + + # Check for CC licenses license_info = bibjson.get("license") if not license_info: continue - license_type = extract_license_type(license_info) - if license_type == "UNKNOWN CC legal tool": + cc_license_types = extract_license_types(license_info) + if not cc_license_types: continue - license_counts[license_type] += 1 - - # Extract subjects - subjects = bibjson.get("subject", []) - for subject in subjects: - if isinstance(subject, dict): - code = subject.get("code", "") - term = subject.get("term", "") - if code and term: - subject_counts[license_type][f"{code}|{term}"] += 1 - - # Extract year from oa_start (Open Access start year) + # Extract article count and year once per journal + article_count = bibjson.get("article_count", 0) oa_start = bibjson.get("oa_start") # Apply date-back filter if 
specified if args.date_back and oa_start and oa_start < args.date_back: continue - if oa_start: - year_counts[license_type][str(oa_start)] += 1 - else: - year_counts[license_type]["Unknown"] += 1 - - # Extract languages - languages = bibjson.get("language", []) - for lang in languages: - language_counts[license_type][lang] += 1 - - # Extract publisher information (new in v4) - publisher_info = bibjson.get("publisher", {}) - if publisher_info: - publisher_name = publisher_info.get("name", "Unknown") - publisher_country = publisher_info.get( - "country", "Unknown" - ) - publisher_key = f"{publisher_name}|{publisher_country}" - publisher_counts[license_type][publisher_key] += 1 + # Count each license type this journal supports + for license_type in cc_license_types: + license_counts[license_type] += 1 + + # Add year data for each license type + if oa_start: + year_counts[license_type][str(oa_start)] += 1 + else: + year_counts[license_type]["Unknown"] += 1 + + # Add article count only once per unique journal (avoid double counting) + if journal_id not in processed_journals: + processed_journals.add(journal_id) + # Add full article count to each license type this journal supports + if article_count: + for license_type in cc_license_types: + article_counts[license_type] += article_count total_processed += 1 @@ -388,25 +264,18 @@ def process_journals(session, args): return ( license_counts, - subject_counts, - language_counts, year_counts, - publisher_counts, - total_processed, + article_counts, + len(processed_journals), # Return unique journal count ) def save_count_data( license_counts, - subject_counts, - language_counts, year_counts, - publisher_counts, + article_counts, ): - """Save all collected data to CSV files.""" - - # Load country names from YAML - country_names = load_country_names() + """Save essential journal data and article context to CSV files.""" # Save license counts with open( @@ -419,49 +288,6 @@ def save_count_data( for lic, count in 
license_counts.items(): writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) - # Save subject report - with open( - FILE_DOAJ_SUBJECT_REPORT, "w", encoding="utf-8", newline="\n" - ) as file_object: - writer = csv.DictWriter( - file_object, fieldnames=HEADER_SUBJECT_REPORT, dialect="unix" - ) - writer.writeheader() - for lic, subjects in subject_counts.items(): - for subject_info, count in subjects.items(): - if "|" in subject_info: - code, label = subject_info.split("|", 1) - else: - code, label = subject_info, subject_info - writer.writerow( - { - "TOOL_IDENTIFIER": lic, - "SUBJECT_CODE": code, - "SUBJECT_LABEL": label, - "COUNT": count, - } - ) - - # Save language counts with readable names - with open( - FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n" - ) as file_object: - writer = csv.DictWriter( - file_object, fieldnames=HEADER_LANGUAGE, dialect="unix" - ) - writer.writeheader() - for lic, languages in language_counts.items(): - for lang_code, count in languages.items(): - lang_name = LANGUAGE_NAMES.get(lang_code, lang_code) - writer.writerow( - { - "TOOL_IDENTIFIER": lic, - "LANGUAGE_CODE": lang_code, - "LANGUAGE": lang_name, - "COUNT": count, - } - ) - # Save year counts with open( FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n" @@ -476,32 +302,6 @@ def save_count_data( {"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": count} ) - # Save publisher counts - with open( - FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n" - ) as file_object: - writer = csv.DictWriter( - file_object, fieldnames=HEADER_PUBLISHER, dialect="unix" - ) - writer.writeheader() - for lic, publishers in publisher_counts.items(): - for publisher_info, count in publishers.items(): - if "|" in publisher_info: - publisher, country_code = publisher_info.split("|", 1) - else: - publisher, country_code = publisher_info, "Unknown" - - country_name = country_names.get(country_code, country_code) - writer.writerow( - { - "TOOL_IDENTIFIER": lic, - "PUBLISHER": publisher, - 
"COUNTRY_CODE": country_code, - "COUNTRY_NAME": country_name, - "COUNT": count, - } - ) - def query_doaj(args): """Main function to query DOAJ API v4.""" @@ -512,10 +312,8 @@ def query_doaj(args): # Process journals ( license_counts, - subject_counts, - language_counts, year_counts, - publisher_counts, + article_counts, journals_processed, ) = process_journals(session, args) @@ -523,15 +321,14 @@ def query_doaj(args): if args.enable_save: save_count_data( license_counts, - subject_counts, - language_counts, year_counts, - publisher_counts, + article_counts, ) # Save provenance + total_articles = sum(article_counts.values()) provenance_data = { - "total_articles_fetched": 0, + "total_articles_in_cc_journals": total_articles, "total_journals_fetched": journals_processed, "total_processed": journals_processed, "limit": args.limit, @@ -539,7 +336,7 @@ def query_doaj(args): "quarter": QUARTER, "script": os.path.basename(__file__), "api_version": "v4", - "note": "Articles do not contain license information in DOAJ API", + "note": "Article counts provide context for CC journal scope - individual article licenses unknown", } try: @@ -558,10 +355,16 @@ def query_doaj(args): f"Critical error writing provenance file: {e}", exit_code=1 ) - LOGGER.info(f"Total CC licensed journals processed: {journals_processed}") - LOGGER.info( - "Articles: 0 (DOAJ API doesn't provide license info for articles)" - ) + LOGGER.info(f"Unique CC-licensed journals processed: {journals_processed}") + + # Calculate total license availability instances + total_license_instances = sum(license_counts.values()) + LOGGER.info(f"Total CC license type instances: {total_license_instances}") + + # Calculate total articles for context + total_articles = sum(article_counts.values()) + LOGGER.info(f"Total articles in CC-licensed journals: {total_articles}") + LOGGER.info("Note: Journals supporting multiple CC license types are counted once per license type") def main(): From 
37e38552d085b8d989e7e207c6893a73704c5def Mon Sep 17 00:00:00 2001 From: opsmithe Date: Sat, 15 Nov 2025 20:56:37 +0100 Subject: [PATCH 13/19] Remove article counting logic due to DOAJ API limitations --- scripts/1-fetch/doaj_fetch.py | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 996befae..e7ffca7f 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -152,7 +152,6 @@ def process_journals(session, args): license_counts = Counter() year_counts = defaultdict(Counter) - article_counts = defaultdict(int) # Track total articles per license type processed_journals = set() # Track unique journals to avoid double counting total_processed = 0 @@ -216,8 +215,7 @@ def process_journals(session, args): if not cc_license_types: continue - # Extract article count and year once per journal - article_count = bibjson.get("article_count", 0) + # Extract year from oa_start (Open Access start year) oa_start = bibjson.get("oa_start") # Apply date-back filter if specified @@ -234,13 +232,9 @@ def process_journals(session, args): else: year_counts[license_type]["Unknown"] += 1 - # Add article count only once per unique journal (avoid double counting) + # Track unique journals to avoid double counting in statistics if journal_id not in processed_journals: processed_journals.add(journal_id) - # Add full article count to each license type this journal supports - if article_count: - for license_type in cc_license_types: - article_counts[license_type] += article_count total_processed += 1 @@ -265,7 +259,6 @@ def process_journals(session, args): return ( license_counts, year_counts, - article_counts, len(processed_journals), # Return unique journal count ) @@ -273,9 +266,8 @@ def process_journals(session, args): def save_count_data( license_counts, year_counts, - article_counts, ): - """Save essential journal data and article context to CSV files.""" + 
"""Save essential journal data to CSV files.""" # Save license counts with open( @@ -313,7 +305,6 @@ def query_doaj(args): ( license_counts, year_counts, - article_counts, journals_processed, ) = process_journals(session, args) @@ -322,13 +313,10 @@ def query_doaj(args): save_count_data( license_counts, year_counts, - article_counts, ) # Save provenance - total_articles = sum(article_counts.values()) provenance_data = { - "total_articles_in_cc_journals": total_articles, "total_journals_fetched": journals_processed, "total_processed": journals_processed, "limit": args.limit, @@ -336,7 +324,7 @@ def query_doaj(args): "quarter": QUARTER, "script": os.path.basename(__file__), "api_version": "v4", - "note": "Article counts provide context for CC journal scope - individual article licenses unknown", + "note": "Journal-level CC license data only - article counts not available via DOAJ API", } try: @@ -360,10 +348,6 @@ def query_doaj(args): # Calculate total license availability instances total_license_instances = sum(license_counts.values()) LOGGER.info(f"Total CC license type instances: {total_license_instances}") - - # Calculate total articles for context - total_articles = sum(article_counts.values()) - LOGGER.info(f"Total articles in CC-licensed journals: {total_articles}") LOGGER.info("Note: Journals supporting multiple CC license types are counted once per license type") From 8c7bea54a6e1095bab8c6f72f4cd2b1d38031eff Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 10:50:41 +0100 Subject: [PATCH 14/19] Add DOAJ API documentation and technical details to sources.md --- sources.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sources.md b/sources.md index 2f559bef..0307f154 100644 --- a/sources.md +++ b/sources.md @@ -42,6 +42,33 @@ tool paths. 
[prioritized-tool-urls]: data/prioritized-tool-urls.txt +## DOAJ (Directory of Open Access Journals) + +**Description:** DOAJ is a comprehensive directory of open access journals that provides metadata about journals and their licensing policies. The API allows access to journal-level information including Creative Commons license support, publication years, and publisher details. + +**Official API Documentation:** +- [DOAJ API Documentation](https://doaj.org/api/docs) +- [DOAJ API v4 Reference](https://doaj.org/api/v4/docs) +- [Base URL](https://doaj.org/api/v4/) + +**API Information:** +- No API key required +- Rate limiting: Reasonable use policy (no specific limits documented) +- Data format: JSON +- Pagination: configurable page size, up to a maximum of 100 results per page +- Search filters: Supports filtering by license type, publication year, subject, etc. +- License detection: Extracts CC license information from journal metadata + +**Technical Details:** +- Fetches journals that support Creative Commons licensing +- Aggregates license counts by type (CC BY, CC BY-SA, CC BY-NC, etc.)
+- Tracks license adoption by publication year +- Handles journals with multiple supported license types +- Generates provenance metadata for data lineage + +**Script:** [`scripts/1-fetch/doaj_fetch.py`](scripts/1-fetch/doaj_fetch.py) + + ## Europeana **Description:** From 1cdf721009d5fd766e2bf8d47e16c610b65882d6 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:12:59 +0100 Subject: [PATCH 15/19] Add country and language data collection using pycountry library --- Pipfile | 1 + Pipfile.lock | 11 ++++- scripts/1-fetch/doaj_fetch.py | 89 +++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/Pipfile b/Pipfile index 6ad3649a..3f109280 100644 --- a/Pipfile +++ b/Pipfile @@ -24,6 +24,7 @@ requests = ">=2.31.0" seaborn = "*" urllib3 = ">=2.5.0" wordcloud = "*" +pycountry = "*" [dev-packages] black = "*" diff --git a/Pipfile.lock b/Pipfile.lock index c111bd7d..b2d4f528 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "0658ee0a643ae80db9419f8ef32887877922d18a5da188b1b87e4e9a1849a4a1" + "sha256": "955abafb11bfd9c68a55197b650693c21577a7a98f4a3fe81d49454351ef7572" }, "pipfile-spec": 6, "requires": { @@ -1644,6 +1644,15 @@ "markers": "python_version >= '3.8'", "version": "==0.4.2" }, + "pycountry": { + "hashes": [ + "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221", + "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==24.6.1" + }, "pycparser": { "hashes": [ "sha256:78816d4f24add8f10a06d6f05b4d424ad9e96cfebf68a4ddc99c65c0720d00c2", diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index e7ffca7f..8f6cde9c 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -27,6 +27,7 @@ from collections import Counter, defaultdict # Third-party +import pycountry import requests import yaml from pygments import 
highlight @@ -62,6 +63,8 @@ # File Paths FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") +FILE_DOAJ_COUNTRY = shared.path_join(PATHS["data_1-fetch"], "doaj_3_count_by_country.csv") +FILE_DOAJ_LANGUAGE = shared.path_join(PATHS["data_1-fetch"], "doaj_5_count_by_language.csv") FILE_PROVENANCE = shared.path_join( PATHS["data_1-fetch"], "doaj_provenance.yaml" ) @@ -71,6 +74,8 @@ # CSV Headers HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] +HEADER_COUNTRY = ["TOOL_IDENTIFIER", "COUNTRY_CODE", "COUNTRY_NAME", "COUNT"] +HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE_NAME", "COUNT"] HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] # Runtime variables @@ -129,9 +134,33 @@ def initialize_all_data_files(args): return os.makedirs(PATHS["data_1-fetch"], exist_ok=True) initialize_data_file(FILE_DOAJ_COUNT, HEADER_COUNT) + initialize_data_file(FILE_DOAJ_COUNTRY, HEADER_COUNTRY) + initialize_data_file(FILE_DOAJ_LANGUAGE, HEADER_LANGUAGE) initialize_data_file(FILE_DOAJ_YEAR, HEADER_YEAR) +def get_country_name(country_code): + """Get country name from ISO 3166-1 alpha-2 code using pycountry.""" + if not country_code or country_code == "Unknown": + return "Unknown" + try: + country = pycountry.countries.get(alpha_2=country_code.upper()) + return country.name if country else country_code + except Exception: + return country_code + + +def get_language_name(language_code): + """Get language name from ISO 639-1 code using pycountry.""" + if not language_code or language_code == "Unknown": + return "Unknown" + try: + language = pycountry.languages.get(alpha_2=language_code.upper()) + return language.name if language else language_code + except Exception: + return language_code + + def extract_license_types(license_info): """Extract all CC license types from DOAJ license information.""" if not license_info: @@ -151,6 +180,8 @@ def process_journals(session, args): LOGGER.info("Fetching DOAJ journals...") license_counts = Counter() + 
country_counts = defaultdict(Counter) + language_counts = defaultdict(Counter) year_counts = defaultdict(Counter) processed_journals = set() # Track unique journals to avoid double counting @@ -232,6 +263,20 @@ def process_journals(session, args): else: year_counts[license_type]["Unknown"] += 1 + # Extract country information + publisher_info = bibjson.get("publisher", {}) + if isinstance(publisher_info, dict): + country_code = publisher_info.get("country", "Unknown") + country_counts[license_type][country_code] += 1 + + # Extract language information + languages = bibjson.get("language", []) + if languages: + for lang_code in languages: + language_counts[license_type][lang_code] += 1 + else: + language_counts[license_type]["Unknown"] += 1 + # Track unique journals to avoid double counting in statistics if journal_id not in processed_journals: processed_journals.add(journal_id) @@ -258,6 +303,8 @@ def process_journals(session, args): return ( license_counts, + country_counts, + language_counts, year_counts, len(processed_journals), # Return unique journal count ) @@ -265,6 +312,8 @@ def process_journals(session, args): def save_count_data( license_counts, + country_counts, + language_counts, year_counts, ): """Save essential journal data to CSV files.""" @@ -280,6 +329,42 @@ def save_count_data( for lic, count in license_counts.items(): writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) + # Save country counts with pycountry names + with open( + FILE_DOAJ_COUNTRY, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_COUNTRY, dialect="unix" + ) + writer.writeheader() + for lic, countries in country_counts.items(): + for country_code, count in countries.items(): + country_name = get_country_name(country_code) + writer.writerow({ + "TOOL_IDENTIFIER": lic, + "COUNTRY_CODE": country_code, + "COUNTRY_NAME": country_name, + "COUNT": count, + }) + + # Save language counts with pycountry names + with 
open( + FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_LANGUAGE, dialect="unix" + ) + writer.writeheader() + for lic, languages in language_counts.items(): + for lang_code, count in languages.items(): + lang_name = get_language_name(lang_code) + writer.writerow({ + "TOOL_IDENTIFIER": lic, + "LANGUAGE_CODE": lang_code, + "LANGUAGE_NAME": lang_name, + "COUNT": count, + }) + # Save year counts with open( FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n" @@ -304,6 +389,8 @@ def query_doaj(args): # Process journals ( license_counts, + country_counts, + language_counts, year_counts, journals_processed, ) = process_journals(session, args) @@ -312,6 +399,8 @@ def query_doaj(args): if args.enable_save: save_count_data( license_counts, + country_counts, + language_counts, year_counts, ) From f40dbf4bd654a013ac30abf4208a8792e8c70b33 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:21:37 +0100 Subject: [PATCH 16/19] Update exception handling to use structured QuantifyingException approach --- scripts/1-fetch/doaj_fetch.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 8f6cde9c..f2451f9d 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -199,20 +199,12 @@ def process_journals(session, args): response = session.get(url, params=params, timeout=30) response.raise_for_status() data = response.json() - except requests.exceptions.RequestException as e: - if hasattr(e, "response") and e.response.status_code == 400: - LOGGER.info(f"Reached end of available data at page {page}") - break - else: - LOGGER.error(f"Failed to fetch journals page {page}: {e}") - raise shared.QuantifyingException( - f"Critical API error on page {page}: {e}", exit_code=1 - ) - except (ValueError, KeyError) as e: - LOGGER.error(f"Failed to parse JSON response on page 
{page}: {e}") - raise shared.QuantifyingException( - f"Critical JSON parsing error on page {page}: {e}", exit_code=1 - ) + except requests.HTTPError as e: + raise shared.QuantifyingException(f"HTTP Error: {e}", 1) + except requests.RequestException as e: + raise shared.QuantifyingException(f"Request Exception: {e}", 1) + except KeyError as e: + raise shared.QuantifyingException(f"KeyError: {e}", 1) try: results = data.get("results", []) From daedef4ec31a65108adf43e1584b071eb301b596 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:28:45 +0100 Subject: [PATCH 17/19] Fix static analysis issues and handle 400 errors as end-of-data --- scripts/1-fetch/doaj_fetch.py | 72 ++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index f2451f9d..f9aa017a 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -5,7 +5,8 @@ Focus: Journal-level CC license adoption and temporal trends. Note: Articles do not contain license information in DOAJ API. 
-This script focuses on essential data for quantifying Creative Commons adoption: +This script focuses on essential data for quantifying Creative Commons +adoption: - Journal CC license counts by type - Temporal trends (year-by-year adoption) @@ -63,8 +64,12 @@ # File Paths FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") -FILE_DOAJ_COUNTRY = shared.path_join(PATHS["data_1-fetch"], "doaj_3_count_by_country.csv") -FILE_DOAJ_LANGUAGE = shared.path_join(PATHS["data_1-fetch"], "doaj_5_count_by_language.csv") +FILE_DOAJ_COUNTRY = shared.path_join( + PATHS["data_1-fetch"], "doaj_3_count_by_country.csv" +) +FILE_DOAJ_LANGUAGE = shared.path_join( + PATHS["data_1-fetch"], "doaj_5_count_by_language.csv" +) FILE_PROVENANCE = shared.path_join( PATHS["data_1-fetch"], "doaj_provenance.yaml" ) @@ -75,7 +80,12 @@ # CSV Headers HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] HEADER_COUNTRY = ["TOOL_IDENTIFIER", "COUNTRY_CODE", "COUNTRY_NAME", "COUNT"] -HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE_NAME", "COUNT"] +HEADER_LANGUAGE = [ + "TOOL_IDENTIFIER", + "LANGUAGE_CODE", + "LANGUAGE_NAME", + "COUNT", +] HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] # Runtime variables @@ -165,13 +175,13 @@ def extract_license_types(license_info): """Extract all CC license types from DOAJ license information.""" if not license_info: return [] - + cc_licenses = [] for lic in license_info: lic_type = lic.get("type", "") if lic_type in CC_LICENSE_TYPES: cc_licenses.append(lic_type) - + return cc_licenses @@ -183,7 +193,9 @@ def process_journals(session, args): country_counts = defaultdict(Counter) language_counts = defaultdict(Counter) year_counts = defaultdict(Counter) - processed_journals = set() # Track unique journals to avoid double counting + processed_journals = ( + set() + ) # Track unique journals to avoid double counting total_processed = 0 page = 1 @@ -200,6 +212,10 @@ def process_journals(session, args): response.raise_for_status() data = 
response.json() except requests.HTTPError as e: + # Handle 400 errors as end of data (DOAJ API behavior) + if hasattr(e, "response") and e.response.status_code == 400: + LOGGER.info(f"Reached end of available data at page {page}") + break raise shared.QuantifyingException(f"HTTP Error: {e}", 1) except requests.RequestException as e: raise shared.QuantifyingException(f"Request Exception: {e}", 1) @@ -260,7 +276,7 @@ def process_journals(session, args): if isinstance(publisher_info, dict): country_code = publisher_info.get("country", "Unknown") country_counts[license_type][country_code] += 1 - + # Extract language information languages = bibjson.get("language", []) if languages: @@ -332,12 +348,14 @@ def save_count_data( for lic, countries in country_counts.items(): for country_code, count in countries.items(): country_name = get_country_name(country_code) - writer.writerow({ - "TOOL_IDENTIFIER": lic, - "COUNTRY_CODE": country_code, - "COUNTRY_NAME": country_name, - "COUNT": count, - }) + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "COUNTRY_CODE": country_code, + "COUNTRY_NAME": country_name, + "COUNT": count, + } + ) # Save language counts with pycountry names with open( @@ -350,12 +368,14 @@ def save_count_data( for lic, languages in language_counts.items(): for lang_code, count in languages.items(): lang_name = get_language_name(lang_code) - writer.writerow({ - "TOOL_IDENTIFIER": lic, - "LANGUAGE_CODE": lang_code, - "LANGUAGE_NAME": lang_name, - "COUNT": count, - }) + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "LANGUAGE_CODE": lang_code, + "LANGUAGE_NAME": lang_name, + "COUNT": count, + } + ) # Save year counts with open( @@ -405,7 +425,10 @@ def query_doaj(args): "quarter": QUARTER, "script": os.path.basename(__file__), "api_version": "v4", - "note": "Journal-level CC license data only - article counts not available via DOAJ API", + "note": ( + "Journal-level CC license data only - " + "article counts not available via DOAJ API" + ), } try: @@ -425,11 
+448,14 @@ def query_doaj(args): ) LOGGER.info(f"Unique CC-licensed journals processed: {journals_processed}") - + # Calculate total license availability instances total_license_instances = sum(license_counts.values()) LOGGER.info(f"Total CC license type instances: {total_license_instances}") - LOGGER.info("Note: Journals supporting multiple CC license types are counted once per license type") + LOGGER.info( + "Note: Journals supporting multiple CC license types are " + "counted once per license type" + ) def main(): From f44ea662e798e414292551e47cd268c302903741 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:34:27 +0100 Subject: [PATCH 18/19] Order DOAJ API documentation links alphabetically --- sources.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/sources.md b/sources.md index 0307f154..203db0a5 100644 --- a/sources.md +++ b/sources.md @@ -47,9 +47,9 @@ tool paths. **Description:** DOAJ is a comprehensive directory of open access journals that provides metadata about journals and their licensing policies. The API allows access to journal-level information including Creative Commons license support, publication years, and publisher details. **Official API Documentation:** +- [Base URL](https://doaj.org/api/v4/) - [DOAJ API Documentation](https://doaj.org/api/docs) - [DOAJ API v4 Reference](https://doaj.org/api/v4/docs) -- [Base URL](https://doaj.org/api/v4/) **API Information:** - No API key required @@ -59,14 +59,6 @@ tool paths. - Search filters: Supports filtering by license type, publication year, subject, etc. - License detection: Extracts CC license information from journal metadata -**Technical Details:** -- Fetches journals that support Creative Commons licensing -- Aggregates license counts by type (CC BY, CC BY-SA, CC BY-NC, etc.) 
-- Tracks license adoption by publication year -- Handles journals with multiple supported license types -- Generates provenance metadata for data lineage - -**Script:** [`scripts/1-fetch/doaj_fetch.py`](scripts/1-fetch/doaj_fetch.py) ## Europeana From 85f6faf59c541f89f98d4dc9697376c29dad352e Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:37:58 +0100 Subject: [PATCH 19/19] Remove generate_country_codes.py - using pycountry library instead --- dev/generate_country_codes.py | 295 ---------------------------------- 1 file changed, 295 deletions(-) delete mode 100755 dev/generate_country_codes.py diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py deleted file mode 100755 index a70ba79a..00000000 --- a/dev/generate_country_codes.py +++ /dev/null @@ -1,295 +0,0 @@ -#!/usr/bin/env python -""" -Generate ISO 3166-1 alpha-2 country codes YAML file for DOAJ fetch script. -""" -# Standard library -import os -import sys - -# Third-party -import yaml - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..", "scripts")) -# First-party/Local -import shared # noqa: E402 - -# ISO 3166-1 alpha-2 country codes (official list) -COUNTRIES = [ - {"code": "AD", "name": "Andorra"}, - {"code": "AE", "name": "United Arab Emirates"}, - {"code": "AF", "name": "Afghanistan"}, - {"code": "AG", "name": "Antigua and Barbuda"}, - {"code": "AI", "name": "Anguilla"}, - {"code": "AL", "name": "Albania"}, - {"code": "AM", "name": "Armenia"}, - {"code": "AO", "name": "Angola"}, - {"code": "AQ", "name": "Antarctica"}, - {"code": "AR", "name": "Argentina"}, - {"code": "AS", "name": "American Samoa"}, - {"code": "AT", "name": "Austria"}, - {"code": "AU", "name": "Australia"}, - {"code": "AW", "name": "Aruba"}, - {"code": "AX", "name": "Åland Islands"}, - {"code": "AZ", "name": "Azerbaijan"}, - {"code": "BA", "name": "Bosnia and Herzegovina"}, - {"code": "BB", "name": "Barbados"}, - {"code": "BD", "name": 
"Bangladesh"}, - {"code": "BE", "name": "Belgium"}, - {"code": "BF", "name": "Burkina Faso"}, - {"code": "BG", "name": "Bulgaria"}, - {"code": "BH", "name": "Bahrain"}, - {"code": "BI", "name": "Burundi"}, - {"code": "BJ", "name": "Benin"}, - {"code": "BL", "name": "Saint Barthélemy"}, - {"code": "BM", "name": "Bermuda"}, - {"code": "BN", "name": "Brunei"}, - {"code": "BO", "name": "Bolivia"}, - {"code": "BQ", "name": "Caribbean Netherlands"}, - {"code": "BR", "name": "Brazil"}, - {"code": "BS", "name": "Bahamas"}, - {"code": "BT", "name": "Bhutan"}, - {"code": "BV", "name": "Bouvet Island"}, - {"code": "BW", "name": "Botswana"}, - {"code": "BY", "name": "Belarus"}, - {"code": "BZ", "name": "Belize"}, - {"code": "CA", "name": "Canada"}, - {"code": "CC", "name": "Cocos Islands"}, - {"code": "CD", "name": "Democratic Republic of the Congo"}, - {"code": "CF", "name": "Central African Republic"}, - {"code": "CG", "name": "Republic of the Congo"}, - {"code": "CH", "name": "Switzerland"}, - {"code": "CI", "name": "Côte d'Ivoire"}, - {"code": "CK", "name": "Cook Islands"}, - {"code": "CL", "name": "Chile"}, - {"code": "CM", "name": "Cameroon"}, - {"code": "CN", "name": "China"}, - {"code": "CO", "name": "Colombia"}, - {"code": "CR", "name": "Costa Rica"}, - {"code": "CU", "name": "Cuba"}, - {"code": "CV", "name": "Cape Verde"}, - {"code": "CW", "name": "Curaçao"}, - {"code": "CX", "name": "Christmas Island"}, - {"code": "CY", "name": "Cyprus"}, - {"code": "CZ", "name": "Czech Republic"}, - {"code": "DE", "name": "Germany"}, - {"code": "DJ", "name": "Djibouti"}, - {"code": "DK", "name": "Denmark"}, - {"code": "DM", "name": "Dominica"}, - {"code": "DO", "name": "Dominican Republic"}, - {"code": "DZ", "name": "Algeria"}, - {"code": "EC", "name": "Ecuador"}, - {"code": "EE", "name": "Estonia"}, - {"code": "EG", "name": "Egypt"}, - {"code": "EH", "name": "Western Sahara"}, - {"code": "ER", "name": "Eritrea"}, - {"code": "ES", "name": "Spain"}, - {"code": "ET", "name": 
"Ethiopia"}, - {"code": "FI", "name": "Finland"}, - {"code": "FJ", "name": "Fiji"}, - {"code": "FK", "name": "Falkland Islands"}, - {"code": "FM", "name": "Micronesia"}, - {"code": "FO", "name": "Faroe Islands"}, - {"code": "FR", "name": "France"}, - {"code": "GA", "name": "Gabon"}, - {"code": "GB", "name": "United Kingdom"}, - {"code": "GD", "name": "Grenada"}, - {"code": "GE", "name": "Georgia"}, - {"code": "GF", "name": "French Guiana"}, - {"code": "GG", "name": "Guernsey"}, - {"code": "GH", "name": "Ghana"}, - {"code": "GI", "name": "Gibraltar"}, - {"code": "GL", "name": "Greenland"}, - {"code": "GM", "name": "Gambia"}, - {"code": "GN", "name": "Guinea"}, - {"code": "GP", "name": "Guadeloupe"}, - {"code": "GQ", "name": "Equatorial Guinea"}, - {"code": "GR", "name": "Greece"}, - {"code": "GS", "name": "South Georgia"}, - {"code": "GT", "name": "Guatemala"}, - {"code": "GU", "name": "Guam"}, - {"code": "GW", "name": "Guinea-Bissau"}, - {"code": "GY", "name": "Guyana"}, - {"code": "HK", "name": "Hong Kong"}, - {"code": "HM", "name": "Heard Island"}, - {"code": "HN", "name": "Honduras"}, - {"code": "HR", "name": "Croatia"}, - {"code": "HT", "name": "Haiti"}, - {"code": "HU", "name": "Hungary"}, - {"code": "ID", "name": "Indonesia"}, - {"code": "IE", "name": "Ireland"}, - {"code": "IL", "name": "Israel"}, - {"code": "IM", "name": "Isle of Man"}, - {"code": "IN", "name": "India"}, - {"code": "IO", "name": "British Indian Ocean Territory"}, - {"code": "IQ", "name": "Iraq"}, - {"code": "IR", "name": "Iran"}, - {"code": "IS", "name": "Iceland"}, - {"code": "IT", "name": "Italy"}, - {"code": "JE", "name": "Jersey"}, - {"code": "JM", "name": "Jamaica"}, - {"code": "JO", "name": "Jordan"}, - {"code": "JP", "name": "Japan"}, - {"code": "KE", "name": "Kenya"}, - {"code": "KG", "name": "Kyrgyzstan"}, - {"code": "KH", "name": "Cambodia"}, - {"code": "KI", "name": "Kiribati"}, - {"code": "KM", "name": "Comoros"}, - {"code": "KN", "name": "Saint Kitts and Nevis"}, - {"code": 
"KP", "name": "North Korea"}, - {"code": "KR", "name": "South Korea"}, - {"code": "KW", "name": "Kuwait"}, - {"code": "KY", "name": "Cayman Islands"}, - {"code": "KZ", "name": "Kazakhstan"}, - {"code": "LA", "name": "Laos"}, - {"code": "LB", "name": "Lebanon"}, - {"code": "LC", "name": "Saint Lucia"}, - {"code": "LI", "name": "Liechtenstein"}, - {"code": "LK", "name": "Sri Lanka"}, - {"code": "LR", "name": "Liberia"}, - {"code": "LS", "name": "Lesotho"}, - {"code": "LT", "name": "Lithuania"}, - {"code": "LU", "name": "Luxembourg"}, - {"code": "LV", "name": "Latvia"}, - {"code": "LY", "name": "Libya"}, - {"code": "MA", "name": "Morocco"}, - {"code": "MC", "name": "Monaco"}, - {"code": "MD", "name": "Moldova"}, - {"code": "ME", "name": "Montenegro"}, - {"code": "MF", "name": "Saint Martin"}, - {"code": "MG", "name": "Madagascar"}, - {"code": "MH", "name": "Marshall Islands"}, - {"code": "MK", "name": "North Macedonia"}, - {"code": "ML", "name": "Mali"}, - {"code": "MM", "name": "Myanmar"}, - {"code": "MN", "name": "Mongolia"}, - {"code": "MO", "name": "Macao"}, - {"code": "MP", "name": "Northern Mariana Islands"}, - {"code": "MQ", "name": "Martinique"}, - {"code": "MR", "name": "Mauritania"}, - {"code": "MS", "name": "Montserrat"}, - {"code": "MT", "name": "Malta"}, - {"code": "MU", "name": "Mauritius"}, - {"code": "MV", "name": "Maldives"}, - {"code": "MW", "name": "Malawi"}, - {"code": "MX", "name": "Mexico"}, - {"code": "MY", "name": "Malaysia"}, - {"code": "MZ", "name": "Mozambique"}, - {"code": "NA", "name": "Namibia"}, - {"code": "NC", "name": "New Caledonia"}, - {"code": "NE", "name": "Niger"}, - {"code": "NF", "name": "Norfolk Island"}, - {"code": "NG", "name": "Nigeria"}, - {"code": "NI", "name": "Nicaragua"}, - {"code": "NL", "name": "Netherlands"}, - {"code": "NO", "name": "Norway"}, - {"code": "NP", "name": "Nepal"}, - {"code": "NR", "name": "Nauru"}, - {"code": "NU", "name": "Niue"}, - {"code": "NZ", "name": "New Zealand"}, - {"code": "OM", "name": 
"Oman"}, - {"code": "PA", "name": "Panama"}, - {"code": "PE", "name": "Peru"}, - {"code": "PF", "name": "French Polynesia"}, - {"code": "PG", "name": "Papua New Guinea"}, - {"code": "PH", "name": "Philippines"}, - {"code": "PK", "name": "Pakistan"}, - {"code": "PL", "name": "Poland"}, - {"code": "PM", "name": "Saint Pierre and Miquelon"}, - {"code": "PN", "name": "Pitcairn Islands"}, - {"code": "PR", "name": "Puerto Rico"}, - {"code": "PS", "name": "Palestine"}, - {"code": "PT", "name": "Portugal"}, - {"code": "PW", "name": "Palau"}, - {"code": "PY", "name": "Paraguay"}, - {"code": "QA", "name": "Qatar"}, - {"code": "RE", "name": "Réunion"}, - {"code": "RO", "name": "Romania"}, - {"code": "RS", "name": "Serbia"}, - {"code": "RU", "name": "Russia"}, - {"code": "RW", "name": "Rwanda"}, - {"code": "SA", "name": "Saudi Arabia"}, - {"code": "SB", "name": "Solomon Islands"}, - {"code": "SC", "name": "Seychelles"}, - {"code": "SD", "name": "Sudan"}, - {"code": "SE", "name": "Sweden"}, - {"code": "SG", "name": "Singapore"}, - {"code": "SH", "name": "Saint Helena"}, - {"code": "SI", "name": "Slovenia"}, - {"code": "SJ", "name": "Svalbard and Jan Mayen"}, - {"code": "SK", "name": "Slovakia"}, - {"code": "SL", "name": "Sierra Leone"}, - {"code": "SM", "name": "San Marino"}, - {"code": "SN", "name": "Senegal"}, - {"code": "SO", "name": "Somalia"}, - {"code": "SR", "name": "Suriname"}, - {"code": "SS", "name": "South Sudan"}, - {"code": "ST", "name": "São Tomé and Príncipe"}, - {"code": "SV", "name": "El Salvador"}, - {"code": "SX", "name": "Sint Maarten"}, - {"code": "SY", "name": "Syria"}, - {"code": "SZ", "name": "Eswatini"}, - {"code": "TC", "name": "Turks and Caicos Islands"}, - {"code": "TD", "name": "Chad"}, - {"code": "TF", "name": "French Southern Territories"}, - {"code": "TG", "name": "Togo"}, - {"code": "TH", "name": "Thailand"}, - {"code": "TJ", "name": "Tajikistan"}, - {"code": "TK", "name": "Tokelau"}, - {"code": "TL", "name": "Timor-Leste"}, - {"code": "TM", 
"name": "Turkmenistan"}, - {"code": "TN", "name": "Tunisia"}, - {"code": "TO", "name": "Tonga"}, - {"code": "TR", "name": "Turkey"}, - {"code": "TT", "name": "Trinidad and Tobago"}, - {"code": "TV", "name": "Tuvalu"}, - {"code": "TW", "name": "Taiwan"}, - {"code": "TZ", "name": "Tanzania"}, - {"code": "UA", "name": "Ukraine"}, - {"code": "UG", "name": "Uganda"}, - {"code": "UM", "name": "U.S. Minor Outlying Islands"}, - {"code": "US", "name": "United States"}, - {"code": "UY", "name": "Uruguay"}, - {"code": "UZ", "name": "Uzbekistan"}, - {"code": "VA", "name": "Vatican City"}, - {"code": "VC", "name": "Saint Vincent and the Grenadines"}, - {"code": "VE", "name": "Venezuela"}, - {"code": "VG", "name": "British Virgin Islands"}, - {"code": "VI", "name": "U.S. Virgin Islands"}, - {"code": "VN", "name": "Vietnam"}, - {"code": "VU", "name": "Vanuatu"}, - {"code": "WF", "name": "Wallis and Futuna"}, - {"code": "WS", "name": "Samoa"}, - {"code": "YE", "name": "Yemen"}, - {"code": "YT", "name": "Mayotte"}, - {"code": "ZA", "name": "South Africa"}, - {"code": "ZM", "name": "Zambia"}, - {"code": "ZW", "name": "Zimbabwe"}, -] - - -def main(): - """Generate ISO country codes YAML file.""" - repo_path = shared.path_join(os.path.dirname(__file__), "..") - output_file = shared.path_join(repo_path, "data", "iso_country_codes.yaml") - - header = [ - "# ISO 3166-1 alpha-2 country codes to country names mapping", - "# Used by DOAJ API for publisher country identification", - "# Generated programmatically by dev/generate_country_codes.py", - ] - - with open(output_file, "w", encoding="utf-8") as file_object: - file_object.write("\n".join(header) + "\n") - yaml.dump( - COUNTRIES, - file_object, - default_flow_style=False, - allow_unicode=True, - ) - - print(f"Generated {output_file} with {len(COUNTRIES)} country codes") - - -if __name__ == "__main__": - main()