From 74a099c0f4553e7c2b7bd9214a46c95da009035d Mon Sep 17 00:00:00 2001 From: opsmithe Date: Sat, 8 Nov 2025 08:48:11 +0100 Subject: [PATCH 01/19] Add simple name output script --- output_name.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 output_name.py diff --git a/output_name.py b/output_name.py new file mode 100644 index 00000000..f4462d71 --- /dev/null +++ b/output_name.py @@ -0,0 +1 @@ +print("John Doe") From 6cec0a3a828926acca36bfd7debee2a28d9059b3 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 10 Nov 2025 12:06:45 +0100 Subject: [PATCH 02/19] feat: Complete DOAJ integration with API v4 and country code generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Migrate from DOAJ API v3 to v4 for enhanced metadata access - Add comprehensive CC license analysis for academic journals - Implement publisher and geographic distribution analysis - Add programmatic ISO 3166-1 alpha-2 country code generation - Include automatic dependency resolution and error handling - Apply date filtering (default ≥2002) to prevent false positives - Generate 5 CSV files plus provenance for comprehensive analysis - Ensure static analysis compliance and comprehensive testing This integration enables quantification of institutional commitment to Creative Commons licensing in the scholarly publishing ecosystem. --- dev/generate_country_codes.py | 286 ++++++++++++++++ scripts/1-fetch/doaj_fetch.py | 591 ++++++++++++++++++++++++++++++++++ 2 files changed, 877 insertions(+) create mode 100644 dev/generate_country_codes.py create mode 100644 scripts/1-fetch/doaj_fetch.py diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py new file mode 100644 index 00000000..4bb7ebd8 --- /dev/null +++ b/dev/generate_country_codes.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python +""" +Generate ISO 3166-1 alpha-2 country codes YAML file for DOAJ fetch script. 
+""" +import os +import sys +import yaml + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "scripts")) +import shared + +# ISO 3166-1 alpha-2 country codes (official list) +COUNTRIES = [ + {"code": "AD", "name": "Andorra"}, + {"code": "AE", "name": "United Arab Emirates"}, + {"code": "AF", "name": "Afghanistan"}, + {"code": "AG", "name": "Antigua and Barbuda"}, + {"code": "AI", "name": "Anguilla"}, + {"code": "AL", "name": "Albania"}, + {"code": "AM", "name": "Armenia"}, + {"code": "AO", "name": "Angola"}, + {"code": "AQ", "name": "Antarctica"}, + {"code": "AR", "name": "Argentina"}, + {"code": "AS", "name": "American Samoa"}, + {"code": "AT", "name": "Austria"}, + {"code": "AU", "name": "Australia"}, + {"code": "AW", "name": "Aruba"}, + {"code": "AX", "name": "Åland Islands"}, + {"code": "AZ", "name": "Azerbaijan"}, + {"code": "BA", "name": "Bosnia and Herzegovina"}, + {"code": "BB", "name": "Barbados"}, + {"code": "BD", "name": "Bangladesh"}, + {"code": "BE", "name": "Belgium"}, + {"code": "BF", "name": "Burkina Faso"}, + {"code": "BG", "name": "Bulgaria"}, + {"code": "BH", "name": "Bahrain"}, + {"code": "BI", "name": "Burundi"}, + {"code": "BJ", "name": "Benin"}, + {"code": "BL", "name": "Saint Barthélemy"}, + {"code": "BM", "name": "Bermuda"}, + {"code": "BN", "name": "Brunei"}, + {"code": "BO", "name": "Bolivia"}, + {"code": "BQ", "name": "Caribbean Netherlands"}, + {"code": "BR", "name": "Brazil"}, + {"code": "BS", "name": "Bahamas"}, + {"code": "BT", "name": "Bhutan"}, + {"code": "BV", "name": "Bouvet Island"}, + {"code": "BW", "name": "Botswana"}, + {"code": "BY", "name": "Belarus"}, + {"code": "BZ", "name": "Belize"}, + {"code": "CA", "name": "Canada"}, + {"code": "CC", "name": "Cocos Islands"}, + {"code": "CD", "name": "Democratic Republic of the Congo"}, + {"code": "CF", "name": "Central African Republic"}, + {"code": "CG", "name": "Republic of the Congo"}, + {"code": "CH", "name": 
"Switzerland"}, + {"code": "CI", "name": "Côte d'Ivoire"}, + {"code": "CK", "name": "Cook Islands"}, + {"code": "CL", "name": "Chile"}, + {"code": "CM", "name": "Cameroon"}, + {"code": "CN", "name": "China"}, + {"code": "CO", "name": "Colombia"}, + {"code": "CR", "name": "Costa Rica"}, + {"code": "CU", "name": "Cuba"}, + {"code": "CV", "name": "Cape Verde"}, + {"code": "CW", "name": "Curaçao"}, + {"code": "CX", "name": "Christmas Island"}, + {"code": "CY", "name": "Cyprus"}, + {"code": "CZ", "name": "Czech Republic"}, + {"code": "DE", "name": "Germany"}, + {"code": "DJ", "name": "Djibouti"}, + {"code": "DK", "name": "Denmark"}, + {"code": "DM", "name": "Dominica"}, + {"code": "DO", "name": "Dominican Republic"}, + {"code": "DZ", "name": "Algeria"}, + {"code": "EC", "name": "Ecuador"}, + {"code": "EE", "name": "Estonia"}, + {"code": "EG", "name": "Egypt"}, + {"code": "EH", "name": "Western Sahara"}, + {"code": "ER", "name": "Eritrea"}, + {"code": "ES", "name": "Spain"}, + {"code": "ET", "name": "Ethiopia"}, + {"code": "FI", "name": "Finland"}, + {"code": "FJ", "name": "Fiji"}, + {"code": "FK", "name": "Falkland Islands"}, + {"code": "FM", "name": "Micronesia"}, + {"code": "FO", "name": "Faroe Islands"}, + {"code": "FR", "name": "France"}, + {"code": "GA", "name": "Gabon"}, + {"code": "GB", "name": "United Kingdom"}, + {"code": "GD", "name": "Grenada"}, + {"code": "GE", "name": "Georgia"}, + {"code": "GF", "name": "French Guiana"}, + {"code": "GG", "name": "Guernsey"}, + {"code": "GH", "name": "Ghana"}, + {"code": "GI", "name": "Gibraltar"}, + {"code": "GL", "name": "Greenland"}, + {"code": "GM", "name": "Gambia"}, + {"code": "GN", "name": "Guinea"}, + {"code": "GP", "name": "Guadeloupe"}, + {"code": "GQ", "name": "Equatorial Guinea"}, + {"code": "GR", "name": "Greece"}, + {"code": "GS", "name": "South Georgia"}, + {"code": "GT", "name": "Guatemala"}, + {"code": "GU", "name": "Guam"}, + {"code": "GW", "name": "Guinea-Bissau"}, + {"code": "GY", "name": "Guyana"}, + 
{"code": "HK", "name": "Hong Kong"}, + {"code": "HM", "name": "Heard Island"}, + {"code": "HN", "name": "Honduras"}, + {"code": "HR", "name": "Croatia"}, + {"code": "HT", "name": "Haiti"}, + {"code": "HU", "name": "Hungary"}, + {"code": "ID", "name": "Indonesia"}, + {"code": "IE", "name": "Ireland"}, + {"code": "IL", "name": "Israel"}, + {"code": "IM", "name": "Isle of Man"}, + {"code": "IN", "name": "India"}, + {"code": "IO", "name": "British Indian Ocean Territory"}, + {"code": "IQ", "name": "Iraq"}, + {"code": "IR", "name": "Iran"}, + {"code": "IS", "name": "Iceland"}, + {"code": "IT", "name": "Italy"}, + {"code": "JE", "name": "Jersey"}, + {"code": "JM", "name": "Jamaica"}, + {"code": "JO", "name": "Jordan"}, + {"code": "JP", "name": "Japan"}, + {"code": "KE", "name": "Kenya"}, + {"code": "KG", "name": "Kyrgyzstan"}, + {"code": "KH", "name": "Cambodia"}, + {"code": "KI", "name": "Kiribati"}, + {"code": "KM", "name": "Comoros"}, + {"code": "KN", "name": "Saint Kitts and Nevis"}, + {"code": "KP", "name": "North Korea"}, + {"code": "KR", "name": "South Korea"}, + {"code": "KW", "name": "Kuwait"}, + {"code": "KY", "name": "Cayman Islands"}, + {"code": "KZ", "name": "Kazakhstan"}, + {"code": "LA", "name": "Laos"}, + {"code": "LB", "name": "Lebanon"}, + {"code": "LC", "name": "Saint Lucia"}, + {"code": "LI", "name": "Liechtenstein"}, + {"code": "LK", "name": "Sri Lanka"}, + {"code": "LR", "name": "Liberia"}, + {"code": "LS", "name": "Lesotho"}, + {"code": "LT", "name": "Lithuania"}, + {"code": "LU", "name": "Luxembourg"}, + {"code": "LV", "name": "Latvia"}, + {"code": "LY", "name": "Libya"}, + {"code": "MA", "name": "Morocco"}, + {"code": "MC", "name": "Monaco"}, + {"code": "MD", "name": "Moldova"}, + {"code": "ME", "name": "Montenegro"}, + {"code": "MF", "name": "Saint Martin"}, + {"code": "MG", "name": "Madagascar"}, + {"code": "MH", "name": "Marshall Islands"}, + {"code": "MK", "name": "North Macedonia"}, + {"code": "ML", "name": "Mali"}, + {"code": "MM", "name": 
"Myanmar"}, + {"code": "MN", "name": "Mongolia"}, + {"code": "MO", "name": "Macao"}, + {"code": "MP", "name": "Northern Mariana Islands"}, + {"code": "MQ", "name": "Martinique"}, + {"code": "MR", "name": "Mauritania"}, + {"code": "MS", "name": "Montserrat"}, + {"code": "MT", "name": "Malta"}, + {"code": "MU", "name": "Mauritius"}, + {"code": "MV", "name": "Maldives"}, + {"code": "MW", "name": "Malawi"}, + {"code": "MX", "name": "Mexico"}, + {"code": "MY", "name": "Malaysia"}, + {"code": "MZ", "name": "Mozambique"}, + {"code": "NA", "name": "Namibia"}, + {"code": "NC", "name": "New Caledonia"}, + {"code": "NE", "name": "Niger"}, + {"code": "NF", "name": "Norfolk Island"}, + {"code": "NG", "name": "Nigeria"}, + {"code": "NI", "name": "Nicaragua"}, + {"code": "NL", "name": "Netherlands"}, + {"code": "NO", "name": "Norway"}, + {"code": "NP", "name": "Nepal"}, + {"code": "NR", "name": "Nauru"}, + {"code": "NU", "name": "Niue"}, + {"code": "NZ", "name": "New Zealand"}, + {"code": "OM", "name": "Oman"}, + {"code": "PA", "name": "Panama"}, + {"code": "PE", "name": "Peru"}, + {"code": "PF", "name": "French Polynesia"}, + {"code": "PG", "name": "Papua New Guinea"}, + {"code": "PH", "name": "Philippines"}, + {"code": "PK", "name": "Pakistan"}, + {"code": "PL", "name": "Poland"}, + {"code": "PM", "name": "Saint Pierre and Miquelon"}, + {"code": "PN", "name": "Pitcairn Islands"}, + {"code": "PR", "name": "Puerto Rico"}, + {"code": "PS", "name": "Palestine"}, + {"code": "PT", "name": "Portugal"}, + {"code": "PW", "name": "Palau"}, + {"code": "PY", "name": "Paraguay"}, + {"code": "QA", "name": "Qatar"}, + {"code": "RE", "name": "Réunion"}, + {"code": "RO", "name": "Romania"}, + {"code": "RS", "name": "Serbia"}, + {"code": "RU", "name": "Russia"}, + {"code": "RW", "name": "Rwanda"}, + {"code": "SA", "name": "Saudi Arabia"}, + {"code": "SB", "name": "Solomon Islands"}, + {"code": "SC", "name": "Seychelles"}, + {"code": "SD", "name": "Sudan"}, + {"code": "SE", "name": "Sweden"}, + 
{"code": "SG", "name": "Singapore"}, + {"code": "SH", "name": "Saint Helena"}, + {"code": "SI", "name": "Slovenia"}, + {"code": "SJ", "name": "Svalbard and Jan Mayen"}, + {"code": "SK", "name": "Slovakia"}, + {"code": "SL", "name": "Sierra Leone"}, + {"code": "SM", "name": "San Marino"}, + {"code": "SN", "name": "Senegal"}, + {"code": "SO", "name": "Somalia"}, + {"code": "SR", "name": "Suriname"}, + {"code": "SS", "name": "South Sudan"}, + {"code": "ST", "name": "São Tomé and Príncipe"}, + {"code": "SV", "name": "El Salvador"}, + {"code": "SX", "name": "Sint Maarten"}, + {"code": "SY", "name": "Syria"}, + {"code": "SZ", "name": "Eswatini"}, + {"code": "TC", "name": "Turks and Caicos Islands"}, + {"code": "TD", "name": "Chad"}, + {"code": "TF", "name": "French Southern Territories"}, + {"code": "TG", "name": "Togo"}, + {"code": "TH", "name": "Thailand"}, + {"code": "TJ", "name": "Tajikistan"}, + {"code": "TK", "name": "Tokelau"}, + {"code": "TL", "name": "Timor-Leste"}, + {"code": "TM", "name": "Turkmenistan"}, + {"code": "TN", "name": "Tunisia"}, + {"code": "TO", "name": "Tonga"}, + {"code": "TR", "name": "Turkey"}, + {"code": "TT", "name": "Trinidad and Tobago"}, + {"code": "TV", "name": "Tuvalu"}, + {"code": "TW", "name": "Taiwan"}, + {"code": "TZ", "name": "Tanzania"}, + {"code": "UA", "name": "Ukraine"}, + {"code": "UG", "name": "Uganda"}, + {"code": "UM", "name": "U.S. Minor Outlying Islands"}, + {"code": "US", "name": "United States"}, + {"code": "UY", "name": "Uruguay"}, + {"code": "UZ", "name": "Uzbekistan"}, + {"code": "VA", "name": "Vatican City"}, + {"code": "VC", "name": "Saint Vincent and the Grenadines"}, + {"code": "VE", "name": "Venezuela"}, + {"code": "VG", "name": "British Virgin Islands"}, + {"code": "VI", "name": "U.S. 
Virgin Islands"}, + {"code": "VN", "name": "Vietnam"}, + {"code": "VU", "name": "Vanuatu"}, + {"code": "WF", "name": "Wallis and Futuna"}, + {"code": "WS", "name": "Samoa"}, + {"code": "YE", "name": "Yemen"}, + {"code": "YT", "name": "Mayotte"}, + {"code": "ZA", "name": "South Africa"}, + {"code": "ZM", "name": "Zambia"}, + {"code": "ZW", "name": "Zimbabwe"}, +] + + +def main(): + """Generate ISO country codes YAML file.""" + repo_path = shared.path_join(os.path.dirname(__file__), "..") + output_file = shared.path_join(repo_path, "data", "iso_country_codes.yaml") + + header = [ + "# ISO 3166-1 alpha-2 country codes to country names mapping", + "# Used by DOAJ API for publisher country identification", + "# Generated programmatically by dev/generate_country_codes.py", + ] + + with open(output_file, "w", encoding="utf-8") as f: + f.write("\n".join(header) + "\n") + yaml.dump(COUNTRIES, f, default_flow_style=False, allow_unicode=True) + + print(f"Generated {output_file} with {len(COUNTRIES)} country codes") + + +if __name__ == "__main__": + main() diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py new file mode 100644 index 00000000..dcdd0157 --- /dev/null +++ b/scripts/1-fetch/doaj_fetch.py @@ -0,0 +1,591 @@ +#!/usr/bin/env python +""" +Fetch DOAJ journals with CC license information using API v4. + +Note: Articles do not contain license information in DOAJ API. + +Default filtering by oa_start >= 2002 to avoid false positives from journals +that retroactively adopted CC licenses. Creative Commons was founded in 2001 +and first licenses released in 2002. Journals with oa_start before 2002 may +show CC licenses due to later license updates, not original terms. + +Country Code Mapping: +This script requires ISO 3166-1 alpha-2 country codes for publisher analysis. +If data/iso_country_codes.yaml is missing, the script will automatically +generate it using dev/generate_country_codes.py. 
Users do not need to manually +create this file - it will be created programmatically when needed. +""" +# Standard library +import argparse +import csv +import os +import sys +import textwrap +import time +import traceback +from collections import Counter, defaultdict + +# Third-party +import requests +import yaml +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) + +# Constants +BASE_URL = "https://doaj.org/api/v4/search" +DEFAULT_DATE_BACK = 2002 # Creative Commons licenses first released in 2002 +DEFAULT_FETCH_LIMIT = 1000 +RATE_LIMIT_DELAY = 0.5 + +# CSV Headers +HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] +HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE", "COUNT"] +HEADER_PUBLISHER = [ + "TOOL_IDENTIFIER", + "PUBLISHER", + "COUNTRY_CODE", + "COUNTRY_NAME", + "COUNT", +] +HEADER_SUBJECT_REPORT = [ + "TOOL_IDENTIFIER", + "SUBJECT_CODE", + "SUBJECT_LABEL", + "COUNT", +] +HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] + +# CC License types +CC_LICENSE_TYPES = [ + "CC BY", + "CC BY-NC", + "CC BY-SA", + "CC BY-ND", + "CC BY-NC-SA", + "CC BY-NC-ND", + "CC0", + "UNKNOWN CC legal tool", +] + +# Language code to readable name mapping +LANGUAGE_NAMES = { + "EN": "English", + "ES": "Spanish", + "PT": "Portuguese", + "FR": "French", + "DE": "German", + "IT": "Italian", + "RU": "Russian", + "ZH": "Chinese", + "JA": "Japanese", + "AR": "Arabic", + "TR": "Turkish", + "NL": "Dutch", + "SV": "Swedish", + "NO": "Norwegian", + "DA": "Danish", + "FI": "Finnish", + "PL": "Polish", + "CS": "Czech", + "HU": "Hungarian", + "RO": "Romanian", + "BG": "Bulgarian", + "HR": "Croatian", + "SK": "Slovak", + "SL": "Slovenian", + "ET": "Estonian", + "LV": "Latvian", + "LT": 
"Lithuanian", + "EL": "Greek", + "CA": "Catalan", + "IS": "Icelandic", + "MK": "Macedonian", + "SR": "Serbian", + "UK": "Ukrainian", + "BE": "Belarusian", + "KO": "Korean", + "TH": "Thai", + "VI": "Vietnamese", + "ID": "Indonesian", + "MS": "Malay", + "HI": "Hindi", + "BN": "Bengali", + "UR": "Urdu", + "FA": "Persian", + "HE": "Hebrew", + "SW": "Swahili", + "AF": "Afrikaans", +} + + +# Load ISO 3166-1 alpha-2 country codes from YAML file +def load_country_names(): + """ + Load country code to name mapping from YAML file. + + Automatically generates data/iso_country_codes.yaml if missing using + dev/generate_country_codes.py. This ensures the script is self-contained + and does not require manual file creation by users. + + Returns: + dict: Mapping of ISO 3166-1 alpha-2 codes to country names + """ + country_file = shared.path_join( + PATHS["repo"], "data", "iso_country_codes.yaml" + ) + + # Generate country codes file if it doesn't exist + if not os.path.isfile(country_file): + LOGGER.info("Country codes file not found, generating it...") + generate_script = shared.path_join( + PATHS["repo"], "dev", "generate_country_codes.py" + ) + try: + # Standard library + import subprocess + + subprocess.run([sys.executable, generate_script], check=True) + LOGGER.info("Successfully generated country codes file") + except Exception as e: + LOGGER.error(f"Failed to generate country codes file: {e}") + raise shared.QuantifyingException( + f"Critical error generating country codes: {e}", exit_code=1 + ) + + try: + with open(country_file, "r", encoding="utf-8") as fh: + countries = yaml.safe_load(fh) + return {country["code"]: country["name"] for country in countries} + except Exception as e: + LOGGER.error(f"Failed to load country codes from {country_file}: {e}") + raise shared.QuantifyingException( + f"Critical error loading country codes: {e}", exit_code=1 + ) + + +# File Paths +FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") 
+FILE_DOAJ_SUBJECT_REPORT = shared.path_join( + PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" +) +FILE_DOAJ_LANGUAGE = shared.path_join( + PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" +) +FILE_DOAJ_YEAR = shared.path_join( + PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" +) +FILE_DOAJ_PUBLISHER = shared.path_join( + PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" +) +FILE_PROVENANCE = shared.path_join( + PATHS["data_1-fetch"], "doaj_provenance.yaml" +) + +# Runtime variables +QUARTER = os.path.basename(PATHS["data_quarter"]) + + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Fetch DOAJ journals with CC licenses using API v4" + ) + parser.add_argument( + "--limit", + type=int, + default=DEFAULT_FETCH_LIMIT, + help=f"Total journals to fetch (default: {DEFAULT_FETCH_LIMIT})", + ) + parser.add_argument( + "--date-back", + type=int, + default=DEFAULT_DATE_BACK, + help=f"Only include journals with oa_start year >= this value " + f"(default: {DEFAULT_DATE_BACK}). Set to 2002 to avoid false " + f"positives from journals that retroactively adopted CC licenses " + f"after Creative Commons was established. 
Journals starting " + f"before 2002 may show CC licenses due to later updates, not " + f"original licensing terms.", + ) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving data to CSV files", + ) + parser.add_argument( + "--enable-git", action="store_true", help="Enable git actions" + ) + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + return args + + +def initialize_data_file(file_path, headers): + """Initialize CSV file with headers if it doesn't exist.""" + if not os.path.isfile(file_path): + with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj: + writer = csv.DictWriter( + file_obj, fieldnames=headers, dialect="unix" + ) + writer.writeheader() + + +def initialize_all_data_files(args): + """Initialize all data files.""" + if not args.enable_save: + return + os.makedirs(PATHS["data_1-fetch"], exist_ok=True) + initialize_data_file(FILE_DOAJ_COUNT, HEADER_COUNT) + initialize_data_file(FILE_DOAJ_SUBJECT_REPORT, HEADER_SUBJECT_REPORT) + initialize_data_file(FILE_DOAJ_LANGUAGE, HEADER_LANGUAGE) + initialize_data_file(FILE_DOAJ_YEAR, HEADER_YEAR) + initialize_data_file(FILE_DOAJ_PUBLISHER, HEADER_PUBLISHER) + + +def extract_license_type(license_info): + """Extract CC license type from DOAJ license information.""" + if not license_info: + return "UNKNOWN CC legal tool" + for lic in license_info: + lic_type = lic.get("type", "") + if lic_type in CC_LICENSE_TYPES: + return lic_type + return "UNKNOWN CC legal tool" + + +def process_journals(session, args): + """Process DOAJ journals with CC licenses using API v4.""" + LOGGER.info("Fetching DOAJ journals...") + + license_counts = Counter() + subject_counts = defaultdict(Counter) + language_counts = defaultdict(Counter) + year_counts = defaultdict(Counter) + publisher_counts = defaultdict(Counter) + + total_processed = 0 + page = 1 + page_size = 100 + + while total_processed < args.limit: + 
LOGGER.info(f"Fetching journals page {page}...") + + url = f"{BASE_URL}/journals/*" + params = {"pageSize": page_size, "page": page} + + try: + response = session.get(url, params=params, timeout=30) + response.raise_for_status() + data = response.json() + except requests.exceptions.RequestException as e: + if hasattr(e, "response") and e.response.status_code == 400: + LOGGER.info(f"Reached end of available data at page {page}") + break + else: + LOGGER.error(f"Failed to fetch journals page {page}: {e}") + raise shared.QuantifyingException( + f"Critical API error on page {page}: {e}", exit_code=1 + ) + except (ValueError, KeyError) as e: + LOGGER.error(f"Failed to parse JSON response on page {page}: {e}") + raise shared.QuantifyingException( + f"Critical JSON parsing error on page {page}: {e}", exit_code=1 + ) + + try: + results = data.get("results", []) + if not results: + break + except (AttributeError, TypeError) as e: + LOGGER.error(f"Invalid API response structure on page {page}: {e}") + raise shared.QuantifyingException( + f"Critical API response format error on page {page}: {e}", + exit_code=1, + ) + + for journal in results: + if total_processed >= args.limit: + break + + try: + bibjson = journal.get("bibjson", {}) + + # Check for CC license + license_info = bibjson.get("license") + if not license_info: + continue + + license_type = extract_license_type(license_info) + if license_type == "UNKNOWN CC legal tool": + continue + + license_counts[license_type] += 1 + + # Extract subjects + subjects = bibjson.get("subject", []) + for subject in subjects: + if isinstance(subject, dict): + code = subject.get("code", "") + term = subject.get("term", "") + if code and term: + subject_counts[license_type][f"{code}|{term}"] += 1 + + # Extract year from oa_start (Open Access start year) + oa_start = bibjson.get("oa_start") + + # Apply date-back filter if specified + if args.date_back and oa_start and oa_start < args.date_back: + continue + + if oa_start: + 
year_counts[license_type][str(oa_start)] += 1 + else: + year_counts[license_type]["Unknown"] += 1 + + # Extract languages + languages = bibjson.get("language", []) + for lang in languages: + language_counts[license_type][lang] += 1 + + # Extract publisher information (new in v4) + publisher_info = bibjson.get("publisher", {}) + if publisher_info: + publisher_name = publisher_info.get("name", "Unknown") + publisher_country = publisher_info.get( + "country", "Unknown" + ) + publisher_key = f"{publisher_name}|{publisher_country}" + publisher_counts[license_type][publisher_key] += 1 + + total_processed += 1 + + except (KeyError, AttributeError, TypeError) as e: + LOGGER.warning( + f"Skipping malformed journal record on page {page}: {e}" + ) + continue + except Exception as e: + LOGGER.error( + f"Unexpected error processing journal on page {page}: {e}" + ) + raise shared.QuantifyingException( + f"Critical error processing journal data on page {page}: " + f"{e}", + exit_code=1, + ) + + page += 1 + time.sleep(RATE_LIMIT_DELAY) + + return ( + license_counts, + subject_counts, + language_counts, + year_counts, + publisher_counts, + total_processed, + ) + + +def save_count_data( + license_counts, + subject_counts, + language_counts, + year_counts, + publisher_counts, +): + """Save all collected data to CSV files.""" + + # Load country names from YAML + country_names = load_country_names() + + # Save license counts + with open(FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n") as fh: + writer = csv.DictWriter(fh, fieldnames=HEADER_COUNT, dialect="unix") + writer.writeheader() + for lic, count in license_counts.items(): + writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) + + # Save subject report + with open( + FILE_DOAJ_SUBJECT_REPORT, "w", encoding="utf-8", newline="\n" + ) as fh: + writer = csv.DictWriter( + fh, fieldnames=HEADER_SUBJECT_REPORT, dialect="unix" + ) + writer.writeheader() + for lic, subjects in subject_counts.items(): + for subject_info, count in 
subjects.items(): + if "|" in subject_info: + code, label = subject_info.split("|", 1) + else: + code, label = subject_info, subject_info + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "SUBJECT_CODE": code, + "SUBJECT_LABEL": label, + "COUNT": count, + } + ) + + # Save language counts with readable names + with open(FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n") as fh: + writer = csv.DictWriter(fh, fieldnames=HEADER_LANGUAGE, dialect="unix") + writer.writeheader() + for lic, languages in language_counts.items(): + for lang_code, count in languages.items(): + lang_name = LANGUAGE_NAMES.get(lang_code, lang_code) + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "LANGUAGE_CODE": lang_code, + "LANGUAGE": lang_name, + "COUNT": count, + } + ) + + # Save year counts + with open(FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n") as fh: + writer = csv.DictWriter(fh, fieldnames=HEADER_YEAR, dialect="unix") + writer.writeheader() + for lic, years in year_counts.items(): + for year, count in years.items(): + writer.writerow( + {"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": count} + ) + + # Save publisher counts + with open(FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n") as fh: + writer = csv.DictWriter( + fh, fieldnames=HEADER_PUBLISHER, dialect="unix" + ) + writer.writeheader() + for lic, publishers in publisher_counts.items(): + for publisher_info, count in publishers.items(): + if "|" in publisher_info: + publisher, country_code = publisher_info.split("|", 1) + else: + publisher, country_code = publisher_info, "Unknown" + + country_name = country_names.get(country_code, country_code) + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "PUBLISHER": publisher, + "COUNTRY_CODE": country_code, + "COUNTRY_NAME": country_name, + "COUNT": count, + } + ) + + +def query_doaj(args): + """Main function to query DOAJ API v4.""" + session = shared.get_session() + + LOGGER.info("Processing DOAJ journals with DOAJ API v4") + + # Process journals + ( + 
license_counts, + subject_counts, + language_counts, + year_counts, + publisher_counts, + journals_processed, + ) = process_journals(session, args) + + # Save results + if args.enable_save: + save_count_data( + license_counts, + subject_counts, + language_counts, + year_counts, + publisher_counts, + ) + + # Save provenance + provenance_data = { + "total_articles_fetched": 0, + "total_journals_fetched": journals_processed, + "total_processed": journals_processed, + "limit": args.limit, + "date_back_filter": args.date_back, + "quarter": QUARTER, + "script": os.path.basename(__file__), + "api_version": "v4", + "note": "Articles do not contain license information in DOAJ API", + } + + try: + with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as fh: + yaml.dump(provenance_data, fh, default_flow_style=False, indent=2) + except Exception as e: + LOGGER.error("Failed to write provenance file: %s", e) + raise shared.QuantifyingException( + f"Critical error writing provenance file: {e}", exit_code=1 + ) + + LOGGER.info(f"Total CC licensed journals processed: {journals_processed}") + LOGGER.info( + "Articles: 0 (DOAJ API doesn't provide license info for articles)" + ) + + +def main(): + """Main function.""" + LOGGER.info("Script execution started.") + args = parse_arguments() + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + initialize_all_data_files(args) + query_doaj(args) + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit new DOAJ CC license data for {QUARTER} using API v4", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + 
LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") + sys.exit(1) From 8c80845485bfbc0b12397b6a65cd5fcbc139dc4d Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 10 Nov 2025 12:14:48 +0100 Subject: [PATCH 03/19] Delete output_name.py --- output_name.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 output_name.py diff --git a/output_name.py b/output_name.py deleted file mode 100644 index f4462d71..00000000 --- a/output_name.py +++ /dev/null @@ -1 +0,0 @@ -print("John Doe") From 9f8df08d841451d402201bd1da370f33f6c9ce6d Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 10 Nov 2025 12:30:44 +0100 Subject: [PATCH 04/19] Make doaj_fetch.py executable --- scripts/1-fetch/doaj_fetch.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/1-fetch/doaj_fetch.py diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py old mode 100644 new mode 100755 From 2986e451a9bbd501e00f46f5fa18a8a1d290a13c Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 10 Nov 2025 12:35:17 +0100 Subject: [PATCH 05/19] Make generate_country_codes.py executable --- dev/generate_country_codes.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 dev/generate_country_codes.py diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py old mode 100644 new mode 100755 From 521b158a0b98cd92ba383709452870daa1597b03 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 10:17:25 +0100 Subject: [PATCH 06/19] Move file path constants to top-level before function definitions --- scripts/1-fetch/doaj_fetch.py | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py 
b/scripts/1-fetch/doaj_fetch.py index dcdd0157..509eea66 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -127,6 +127,24 @@ "AF": "Afrikaans", } +# File Paths +FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") +FILE_DOAJ_SUBJECT_REPORT = shared.path_join( + PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" +) +FILE_DOAJ_LANGUAGE = shared.path_join( + PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" +) +FILE_DOAJ_YEAR = shared.path_join( + PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" +) +FILE_DOAJ_PUBLISHER = shared.path_join( + PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" +) +FILE_PROVENANCE = shared.path_join( + PATHS["data_1-fetch"], "doaj_provenance.yaml" +) + # Load ISO 3166-1 alpha-2 country codes from YAML file def load_country_names(): @@ -173,24 +191,6 @@ def load_country_names(): ) -# File Paths -FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") -FILE_DOAJ_SUBJECT_REPORT = shared.path_join( - PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" -) -FILE_DOAJ_LANGUAGE = shared.path_join( - PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" -) -FILE_DOAJ_YEAR = shared.path_join( - PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" -) -FILE_DOAJ_PUBLISHER = shared.path_join( - PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" -) -FILE_PROVENANCE = shared.path_join( - PATHS["data_1-fetch"], "doaj_provenance.yaml" -) - # Runtime variables QUARTER = os.path.basename(PATHS["data_quarter"]) From 1b5312aafd37ed746645834bb9f7bae944c6a816 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 10:23:38 +0100 Subject: [PATCH 07/19] Organize constants alphabetically within logical groups --- scripts/1-fetch/doaj_fetch.py | 124 +++++++++++++++++----------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 509eea66..e63fee8c 100755 --- 
a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -47,6 +47,36 @@ DEFAULT_FETCH_LIMIT = 1000 RATE_LIMIT_DELAY = 0.5 +# CC License types +CC_LICENSE_TYPES = [ + "CC BY", + "CC BY-NC", + "CC BY-SA", + "CC BY-ND", + "CC BY-NC-SA", + "CC BY-NC-ND", + "CC0", + "UNKNOWN CC legal tool", +] + +# File Paths +FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") +FILE_DOAJ_LANGUAGE = shared.path_join( + PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" +) +FILE_DOAJ_PUBLISHER = shared.path_join( + PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" +) +FILE_DOAJ_SUBJECT_REPORT = shared.path_join( + PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" +) +FILE_PROVENANCE = shared.path_join( + PATHS["data_1-fetch"], "doaj_provenance.yaml" +) +FILE_DOAJ_YEAR = shared.path_join( + PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" +) + # CSV Headers HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE", "COUNT"] @@ -65,86 +95,56 @@ ] HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] -# CC License types -CC_LICENSE_TYPES = [ - "CC BY", - "CC BY-NC", - "CC BY-SA", - "CC BY-ND", - "CC BY-NC-SA", - "CC BY-NC-ND", - "CC0", - "UNKNOWN CC legal tool", -] - # Language code to readable name mapping LANGUAGE_NAMES = { + "AF": "Afrikaans", + "AR": "Arabic", + "BE": "Belarusian", + "BG": "Bulgarian", + "BN": "Bengali", + "CA": "Catalan", + "CS": "Czech", + "DA": "Danish", + "DE": "German", + "EL": "Greek", "EN": "English", "ES": "Spanish", - "PT": "Portuguese", + "ET": "Estonian", + "FA": "Persian", + "FI": "Finnish", "FR": "French", - "DE": "German", + "HE": "Hebrew", + "HI": "Hindi", + "HR": "Croatian", + "HU": "Hungarian", + "ID": "Indonesian", + "IS": "Icelandic", "IT": "Italian", - "RU": "Russian", - "ZH": "Chinese", "JA": "Japanese", - "AR": "Arabic", - "TR": "Turkish", + "KO": "Korean", + "LT": "Lithuanian", + "LV": "Latvian", + "MK": "Macedonian", + "MS": "Malay", "NL": 
"Dutch", - "SV": "Swedish", "NO": "Norwegian", - "DA": "Danish", - "FI": "Finnish", "PL": "Polish", - "CS": "Czech", - "HU": "Hungarian", + "PT": "Portuguese", "RO": "Romanian", - "BG": "Bulgarian", - "HR": "Croatian", + "RU": "Russian", "SK": "Slovak", "SL": "Slovenian", - "ET": "Estonian", - "LV": "Latvian", - "LT": "Lithuanian", - "EL": "Greek", - "CA": "Catalan", - "IS": "Icelandic", - "MK": "Macedonian", "SR": "Serbian", - "UK": "Ukrainian", - "BE": "Belarusian", - "KO": "Korean", + "SV": "Swedish", + "SW": "Swahili", "TH": "Thai", - "VI": "Vietnamese", - "ID": "Indonesian", - "MS": "Malay", - "HI": "Hindi", - "BN": "Bengali", + "TR": "Turkish", + "UK": "Ukrainian", "UR": "Urdu", - "FA": "Persian", - "HE": "Hebrew", - "SW": "Swahili", - "AF": "Afrikaans", + "VI": "Vietnamese", + "ZH": "Chinese", } -# File Paths -FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") -FILE_DOAJ_SUBJECT_REPORT = shared.path_join( - PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" -) -FILE_DOAJ_LANGUAGE = shared.path_join( - PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" -) -FILE_DOAJ_YEAR = shared.path_join( - PATHS["data_1-fetch"], "doaj_4_count_by_year.csv" -) -FILE_DOAJ_PUBLISHER = shared.path_join( - PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" -) -FILE_PROVENANCE = shared.path_join( - PATHS["data_1-fetch"], "doaj_provenance.yaml" -) - # Load ISO 3166-1 alpha-2 country codes from YAML file def load_country_names(): From cd0d3f516273a1698192f663375506439fb4a621 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 10:34:06 +0100 Subject: [PATCH 08/19] Move subprocess import to top-level imports section --- scripts/1-fetch/doaj_fetch.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index e63fee8c..e7a80d65 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -19,6 +19,7 @@ import argparse import csv import 
os +import subprocess import sys import textwrap import time @@ -169,9 +170,6 @@ def load_country_names(): PATHS["repo"], "dev", "generate_country_codes.py" ) try: - # Standard library - import subprocess - subprocess.run([sys.executable, generate_script], check=True) LOGGER.info("Successfully generated country codes file") except Exception as e: From 29856ee5b750e6a007bc42f4c9982f2941f9dd30 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 11:11:23 +0100 Subject: [PATCH 09/19] Replace 'f' with 'file_object' for descriptive variable naming --- dev/generate_country_codes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py index 4bb7ebd8..1c2db056 100755 --- a/dev/generate_country_codes.py +++ b/dev/generate_country_codes.py @@ -275,9 +275,9 @@ def main(): "# Generated programmatically by dev/generate_country_codes.py", ] - with open(output_file, "w", encoding="utf-8") as f: - f.write("\n".join(header) + "\n") - yaml.dump(COUNTRIES, f, default_flow_style=False, allow_unicode=True) + with open(output_file, "w", encoding="utf-8") as file_object: + file_object.write("\n".join(header) + "\n") + yaml.dump(COUNTRIES, file_object, default_flow_style=False, allow_unicode=True) print(f"Generated {output_file} with {len(COUNTRIES)} country codes") From 33fa72333987934da8716c66c5d153e09cc9a60b Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 11:11:47 +0100 Subject: [PATCH 10/19] Replace 'fh' with 'file_object' for descriptive variable naming --- scripts/1-fetch/doaj_fetch.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index e7a80d65..250eb646 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -179,8 +179,8 @@ def load_country_names(): ) try: - with open(country_file, "r", encoding="utf-8") as fh: - countries = 
yaml.safe_load(fh) + with open(country_file, "r", encoding="utf-8") as file_object: + countries = yaml.safe_load(file_object) return {country["code"]: country["name"] for country in countries} except Exception as e: LOGGER.error(f"Failed to load country codes from {country_file}: {e}") @@ -409,8 +409,8 @@ def save_count_data( country_names = load_country_names() # Save license counts - with open(FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_COUNT, dialect="unix") + with open(FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n") as file_object: + writer = csv.DictWriter(file_object, fieldnames=HEADER_COUNT, dialect="unix") writer.writeheader() for lic, count in license_counts.items(): writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) @@ -418,9 +418,9 @@ def save_count_data( # Save subject report with open( FILE_DOAJ_SUBJECT_REPORT, "w", encoding="utf-8", newline="\n" - ) as fh: + ) as file_object: writer = csv.DictWriter( - fh, fieldnames=HEADER_SUBJECT_REPORT, dialect="unix" + file_object, fieldnames=HEADER_SUBJECT_REPORT, dialect="unix" ) writer.writeheader() for lic, subjects in subject_counts.items(): @@ -439,8 +439,8 @@ def save_count_data( ) # Save language counts with readable names - with open(FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_LANGUAGE, dialect="unix") + with open(FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n") as file_object: + writer = csv.DictWriter(file_object, fieldnames=HEADER_LANGUAGE, dialect="unix") writer.writeheader() for lic, languages in language_counts.items(): for lang_code, count in languages.items(): @@ -455,8 +455,8 @@ def save_count_data( ) # Save year counts - with open(FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_YEAR, dialect="unix") + with open(FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n") as file_object: + 
writer = csv.DictWriter(file_object, fieldnames=HEADER_YEAR, dialect="unix") writer.writeheader() for lic, years in year_counts.items(): for year, count in years.items(): @@ -465,9 +465,9 @@ def save_count_data( ) # Save publisher counts - with open(FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n") as fh: + with open(FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n") as file_object: writer = csv.DictWriter( - fh, fieldnames=HEADER_PUBLISHER, dialect="unix" + file_object, fieldnames=HEADER_PUBLISHER, dialect="unix" ) writer.writeheader() for lic, publishers in publisher_counts.items(): @@ -529,8 +529,8 @@ def query_doaj(args): } try: - with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as fh: - yaml.dump(provenance_data, fh, default_flow_style=False, indent=2) + with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as file_object: + yaml.dump(provenance_data, file_object, default_flow_style=False, indent=2) except Exception as e: LOGGER.error("Failed to write provenance file: %s", e) raise shared.QuantifyingException( From ae20f2423c7f6935721e6de37e4365ae67c1728b Mon Sep 17 00:00:00 2001 From: opsmithe Date: Tue, 11 Nov 2025 11:38:28 +0100 Subject: [PATCH 11/19] Fix static analysis issues and code formatting --- dev/generate_country_codes.py | 19 ++++++++++++----- scripts/1-fetch/doaj_fetch.py | 39 +++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py index 1c2db056..a70ba79a 100755 --- a/dev/generate_country_codes.py +++ b/dev/generate_country_codes.py @@ -2,13 +2,17 @@ """ Generate ISO 3166-1 alpha-2 country codes YAML file for DOAJ fetch script. 
""" +# Standard library import os import sys + +# Third-party import yaml # Add parent directory so shared can be imported sys.path.append(os.path.join(os.path.dirname(__file__), "..", "scripts")) -import shared +# First-party/Local +import shared # noqa: E402 # ISO 3166-1 alpha-2 country codes (official list) COUNTRIES = [ @@ -268,17 +272,22 @@ def main(): """Generate ISO country codes YAML file.""" repo_path = shared.path_join(os.path.dirname(__file__), "..") output_file = shared.path_join(repo_path, "data", "iso_country_codes.yaml") - + header = [ "# ISO 3166-1 alpha-2 country codes to country names mapping", "# Used by DOAJ API for publisher country identification", "# Generated programmatically by dev/generate_country_codes.py", ] - + with open(output_file, "w", encoding="utf-8") as file_object: file_object.write("\n".join(header) + "\n") - yaml.dump(COUNTRIES, file_object, default_flow_style=False, allow_unicode=True) - + yaml.dump( + COUNTRIES, + file_object, + default_flow_style=False, + allow_unicode=True, + ) + print(f"Generated {output_file} with {len(COUNTRIES)} country codes") diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 250eb646..7a298b32 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -409,8 +409,12 @@ def save_count_data( country_names = load_country_names() # Save license counts - with open(FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n") as file_object: - writer = csv.DictWriter(file_object, fieldnames=HEADER_COUNT, dialect="unix") + with open( + FILE_DOAJ_COUNT, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_COUNT, dialect="unix" + ) writer.writeheader() for lic, count in license_counts.items(): writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) @@ -439,8 +443,12 @@ def save_count_data( ) # Save language counts with readable names - with open(FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n") 
as file_object: - writer = csv.DictWriter(file_object, fieldnames=HEADER_LANGUAGE, dialect="unix") + with open( + FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_LANGUAGE, dialect="unix" + ) writer.writeheader() for lic, languages in language_counts.items(): for lang_code, count in languages.items(): @@ -455,8 +463,12 @@ def save_count_data( ) # Save year counts - with open(FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n") as file_object: - writer = csv.DictWriter(file_object, fieldnames=HEADER_YEAR, dialect="unix") + with open( + FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_YEAR, dialect="unix" + ) writer.writeheader() for lic, years in year_counts.items(): for year, count in years.items(): @@ -465,7 +477,9 @@ def save_count_data( ) # Save publisher counts - with open(FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n") as file_object: + with open( + FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n" + ) as file_object: writer = csv.DictWriter( file_object, fieldnames=HEADER_PUBLISHER, dialect="unix" ) @@ -529,8 +543,15 @@ def query_doaj(args): } try: - with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as file_object: - yaml.dump(provenance_data, file_object, default_flow_style=False, indent=2) + with open( + FILE_PROVENANCE, "w", encoding="utf-8", newline="\n" + ) as file_object: + yaml.dump( + provenance_data, + file_object, + default_flow_style=False, + indent=2, + ) except Exception as e: LOGGER.error("Failed to write provenance file: %s", e) raise shared.QuantifyingException( From 3d75671a46be1b01224ccd52a8805c7bee05a7df Mon Sep 17 00:00:00 2001 From: opsmithe Date: Sat, 15 Nov 2025 19:31:23 +0100 Subject: [PATCH 12/19] Fix duplicate counting for journals with multiple CC license types --- scripts/1-fetch/doaj_fetch.py | 323 +++++++--------------------------- 
1 file changed, 63 insertions(+), 260 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 7a298b32..996befae 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -2,24 +2,24 @@ """ Fetch DOAJ journals with CC license information using API v4. +Focus: Journal-level CC license adoption and temporal trends. Note: Articles do not contain license information in DOAJ API. +This script focuses on essential data for quantifying Creative Commons adoption: +- Journal CC license counts by type +- Temporal trends (year-by-year adoption) + +Removed out-of-scope data: subjects, languages, publishers, countries. + Default filtering by oa_start >= 2002 to avoid false positives from journals that retroactively adopted CC licenses. Creative Commons was founded in 2001 and first licenses released in 2002. Journals with oa_start before 2002 may show CC licenses due to later license updates, not original terms. - -Country Code Mapping: -This script requires ISO 3166-1 alpha-2 country codes for publisher analysis. -If data/iso_country_codes.yaml is missing, the script will automatically -generate it using dev/generate_country_codes.py. Users do not need to manually -create this file - it will be created programmatically when needed. 
""" # Standard library import argparse import csv import os -import subprocess import sys import textwrap import time @@ -62,15 +62,6 @@ # File Paths FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") -FILE_DOAJ_LANGUAGE = shared.path_join( - PATHS["data_1-fetch"], "doaj_3_count_by_language.csv" -) -FILE_DOAJ_PUBLISHER = shared.path_join( - PATHS["data_1-fetch"], "doaj_5_count_by_publisher.csv" -) -FILE_DOAJ_SUBJECT_REPORT = shared.path_join( - PATHS["data_1-fetch"], "doaj_2_count_by_subject_report.csv" -) FILE_PROVENANCE = shared.path_join( PATHS["data_1-fetch"], "doaj_provenance.yaml" ) @@ -80,115 +71,8 @@ # CSV Headers HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] -HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE", "COUNT"] -HEADER_PUBLISHER = [ - "TOOL_IDENTIFIER", - "PUBLISHER", - "COUNTRY_CODE", - "COUNTRY_NAME", - "COUNT", -] -HEADER_SUBJECT_REPORT = [ - "TOOL_IDENTIFIER", - "SUBJECT_CODE", - "SUBJECT_LABEL", - "COUNT", -] HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] -# Language code to readable name mapping -LANGUAGE_NAMES = { - "AF": "Afrikaans", - "AR": "Arabic", - "BE": "Belarusian", - "BG": "Bulgarian", - "BN": "Bengali", - "CA": "Catalan", - "CS": "Czech", - "DA": "Danish", - "DE": "German", - "EL": "Greek", - "EN": "English", - "ES": "Spanish", - "ET": "Estonian", - "FA": "Persian", - "FI": "Finnish", - "FR": "French", - "HE": "Hebrew", - "HI": "Hindi", - "HR": "Croatian", - "HU": "Hungarian", - "ID": "Indonesian", - "IS": "Icelandic", - "IT": "Italian", - "JA": "Japanese", - "KO": "Korean", - "LT": "Lithuanian", - "LV": "Latvian", - "MK": "Macedonian", - "MS": "Malay", - "NL": "Dutch", - "NO": "Norwegian", - "PL": "Polish", - "PT": "Portuguese", - "RO": "Romanian", - "RU": "Russian", - "SK": "Slovak", - "SL": "Slovenian", - "SR": "Serbian", - "SV": "Swedish", - "SW": "Swahili", - "TH": "Thai", - "TR": "Turkish", - "UK": "Ukrainian", - "UR": "Urdu", - "VI": "Vietnamese", - "ZH": "Chinese", -} - - -# Load 
ISO 3166-1 alpha-2 country codes from YAML file -def load_country_names(): - """ - Load country code to name mapping from YAML file. - - Automatically generates data/iso_country_codes.yaml if missing using - dev/generate_country_codes.py. This ensures the script is self-contained - and does not require manual file creation by users. - - Returns: - dict: Mapping of ISO 3166-1 alpha-2 codes to country names - """ - country_file = shared.path_join( - PATHS["repo"], "data", "iso_country_codes.yaml" - ) - - # Generate country codes file if it doesn't exist - if not os.path.isfile(country_file): - LOGGER.info("Country codes file not found, generating it...") - generate_script = shared.path_join( - PATHS["repo"], "dev", "generate_country_codes.py" - ) - try: - subprocess.run([sys.executable, generate_script], check=True) - LOGGER.info("Successfully generated country codes file") - except Exception as e: - LOGGER.error(f"Failed to generate country codes file: {e}") - raise shared.QuantifyingException( - f"Critical error generating country codes: {e}", exit_code=1 - ) - - try: - with open(country_file, "r", encoding="utf-8") as file_object: - countries = yaml.safe_load(file_object) - return {country["code"]: country["name"] for country in countries} - except Exception as e: - LOGGER.error(f"Failed to load country codes from {country_file}: {e}") - raise shared.QuantifyingException( - f"Critical error loading country codes: {e}", exit_code=1 - ) - - # Runtime variables QUARTER = os.path.basename(PATHS["data_quarter"]) @@ -245,21 +129,21 @@ def initialize_all_data_files(args): return os.makedirs(PATHS["data_1-fetch"], exist_ok=True) initialize_data_file(FILE_DOAJ_COUNT, HEADER_COUNT) - initialize_data_file(FILE_DOAJ_SUBJECT_REPORT, HEADER_SUBJECT_REPORT) - initialize_data_file(FILE_DOAJ_LANGUAGE, HEADER_LANGUAGE) initialize_data_file(FILE_DOAJ_YEAR, HEADER_YEAR) - initialize_data_file(FILE_DOAJ_PUBLISHER, HEADER_PUBLISHER) -def extract_license_type(license_info): - """Extract 
CC license type from DOAJ license information.""" +def extract_license_types(license_info): + """Extract all CC license types from DOAJ license information.""" if not license_info: - return "UNKNOWN CC legal tool" + return [] + + cc_licenses = [] for lic in license_info: lic_type = lic.get("type", "") if lic_type in CC_LICENSE_TYPES: - return lic_type - return "UNKNOWN CC legal tool" + cc_licenses.append(lic_type) + + return cc_licenses def process_journals(session, args): @@ -267,10 +151,9 @@ def process_journals(session, args): LOGGER.info("Fetching DOAJ journals...") license_counts = Counter() - subject_counts = defaultdict(Counter) - language_counts = defaultdict(Counter) year_counts = defaultdict(Counter) - publisher_counts = defaultdict(Counter) + article_counts = defaultdict(int) # Track total articles per license type + processed_journals = set() # Track unique journals to avoid double counting total_processed = 0 page = 1 @@ -319,52 +202,45 @@ def process_journals(session, args): try: bibjson = journal.get("bibjson", {}) - # Check for CC license + # Get journal identifier to avoid double counting + journal_id = journal.get("id", "") + if not journal_id: + continue + + # Check for CC licenses license_info = bibjson.get("license") if not license_info: continue - license_type = extract_license_type(license_info) - if license_type == "UNKNOWN CC legal tool": + cc_license_types = extract_license_types(license_info) + if not cc_license_types: continue - license_counts[license_type] += 1 - - # Extract subjects - subjects = bibjson.get("subject", []) - for subject in subjects: - if isinstance(subject, dict): - code = subject.get("code", "") - term = subject.get("term", "") - if code and term: - subject_counts[license_type][f"{code}|{term}"] += 1 - - # Extract year from oa_start (Open Access start year) + # Extract article count and year once per journal + article_count = bibjson.get("article_count", 0) oa_start = bibjson.get("oa_start") # Apply date-back filter if 
specified if args.date_back and oa_start and oa_start < args.date_back: continue - if oa_start: - year_counts[license_type][str(oa_start)] += 1 - else: - year_counts[license_type]["Unknown"] += 1 - - # Extract languages - languages = bibjson.get("language", []) - for lang in languages: - language_counts[license_type][lang] += 1 - - # Extract publisher information (new in v4) - publisher_info = bibjson.get("publisher", {}) - if publisher_info: - publisher_name = publisher_info.get("name", "Unknown") - publisher_country = publisher_info.get( - "country", "Unknown" - ) - publisher_key = f"{publisher_name}|{publisher_country}" - publisher_counts[license_type][publisher_key] += 1 + # Count each license type this journal supports + for license_type in cc_license_types: + license_counts[license_type] += 1 + + # Add year data for each license type + if oa_start: + year_counts[license_type][str(oa_start)] += 1 + else: + year_counts[license_type]["Unknown"] += 1 + + # Add article count only once per unique journal (avoid double counting) + if journal_id not in processed_journals: + processed_journals.add(journal_id) + # Add full article count to each license type this journal supports + if article_count: + for license_type in cc_license_types: + article_counts[license_type] += article_count total_processed += 1 @@ -388,25 +264,18 @@ def process_journals(session, args): return ( license_counts, - subject_counts, - language_counts, year_counts, - publisher_counts, - total_processed, + article_counts, + len(processed_journals), # Return unique journal count ) def save_count_data( license_counts, - subject_counts, - language_counts, year_counts, - publisher_counts, + article_counts, ): - """Save all collected data to CSV files.""" - - # Load country names from YAML - country_names = load_country_names() + """Save essential journal data and article context to CSV files.""" # Save license counts with open( @@ -419,49 +288,6 @@ def save_count_data( for lic, count in 
license_counts.items(): writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) - # Save subject report - with open( - FILE_DOAJ_SUBJECT_REPORT, "w", encoding="utf-8", newline="\n" - ) as file_object: - writer = csv.DictWriter( - file_object, fieldnames=HEADER_SUBJECT_REPORT, dialect="unix" - ) - writer.writeheader() - for lic, subjects in subject_counts.items(): - for subject_info, count in subjects.items(): - if "|" in subject_info: - code, label = subject_info.split("|", 1) - else: - code, label = subject_info, subject_info - writer.writerow( - { - "TOOL_IDENTIFIER": lic, - "SUBJECT_CODE": code, - "SUBJECT_LABEL": label, - "COUNT": count, - } - ) - - # Save language counts with readable names - with open( - FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n" - ) as file_object: - writer = csv.DictWriter( - file_object, fieldnames=HEADER_LANGUAGE, dialect="unix" - ) - writer.writeheader() - for lic, languages in language_counts.items(): - for lang_code, count in languages.items(): - lang_name = LANGUAGE_NAMES.get(lang_code, lang_code) - writer.writerow( - { - "TOOL_IDENTIFIER": lic, - "LANGUAGE_CODE": lang_code, - "LANGUAGE": lang_name, - "COUNT": count, - } - ) - # Save year counts with open( FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n" @@ -476,32 +302,6 @@ def save_count_data( {"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": count} ) - # Save publisher counts - with open( - FILE_DOAJ_PUBLISHER, "w", encoding="utf-8", newline="\n" - ) as file_object: - writer = csv.DictWriter( - file_object, fieldnames=HEADER_PUBLISHER, dialect="unix" - ) - writer.writeheader() - for lic, publishers in publisher_counts.items(): - for publisher_info, count in publishers.items(): - if "|" in publisher_info: - publisher, country_code = publisher_info.split("|", 1) - else: - publisher, country_code = publisher_info, "Unknown" - - country_name = country_names.get(country_code, country_code) - writer.writerow( - { - "TOOL_IDENTIFIER": lic, - "PUBLISHER": publisher, - 
"COUNTRY_CODE": country_code, - "COUNTRY_NAME": country_name, - "COUNT": count, - } - ) - def query_doaj(args): """Main function to query DOAJ API v4.""" @@ -512,10 +312,8 @@ def query_doaj(args): # Process journals ( license_counts, - subject_counts, - language_counts, year_counts, - publisher_counts, + article_counts, journals_processed, ) = process_journals(session, args) @@ -523,15 +321,14 @@ def query_doaj(args): if args.enable_save: save_count_data( license_counts, - subject_counts, - language_counts, year_counts, - publisher_counts, + article_counts, ) # Save provenance + total_articles = sum(article_counts.values()) provenance_data = { - "total_articles_fetched": 0, + "total_articles_in_cc_journals": total_articles, "total_journals_fetched": journals_processed, "total_processed": journals_processed, "limit": args.limit, @@ -539,7 +336,7 @@ def query_doaj(args): "quarter": QUARTER, "script": os.path.basename(__file__), "api_version": "v4", - "note": "Articles do not contain license information in DOAJ API", + "note": "Article counts provide context for CC journal scope - individual article licenses unknown", } try: @@ -558,10 +355,16 @@ def query_doaj(args): f"Critical error writing provenance file: {e}", exit_code=1 ) - LOGGER.info(f"Total CC licensed journals processed: {journals_processed}") - LOGGER.info( - "Articles: 0 (DOAJ API doesn't provide license info for articles)" - ) + LOGGER.info(f"Unique CC-licensed journals processed: {journals_processed}") + + # Calculate total license availability instances + total_license_instances = sum(license_counts.values()) + LOGGER.info(f"Total CC license type instances: {total_license_instances}") + + # Calculate total articles for context + total_articles = sum(article_counts.values()) + LOGGER.info(f"Total articles in CC-licensed journals: {total_articles}") + LOGGER.info("Note: Journals supporting multiple CC license types are counted once per license type") def main(): From 
37e38552d085b8d989e7e207c6893a73704c5def Mon Sep 17 00:00:00 2001 From: opsmithe Date: Sat, 15 Nov 2025 20:56:37 +0100 Subject: [PATCH 13/19] Remove article counting logic due to DOAJ API limitations --- scripts/1-fetch/doaj_fetch.py | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 996befae..e7ffca7f 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -152,7 +152,6 @@ def process_journals(session, args): license_counts = Counter() year_counts = defaultdict(Counter) - article_counts = defaultdict(int) # Track total articles per license type processed_journals = set() # Track unique journals to avoid double counting total_processed = 0 @@ -216,8 +215,7 @@ def process_journals(session, args): if not cc_license_types: continue - # Extract article count and year once per journal - article_count = bibjson.get("article_count", 0) + # Extract year from oa_start (Open Access start year) oa_start = bibjson.get("oa_start") # Apply date-back filter if specified @@ -234,13 +232,9 @@ def process_journals(session, args): else: year_counts[license_type]["Unknown"] += 1 - # Add article count only once per unique journal (avoid double counting) + # Track unique journals to avoid double counting in statistics if journal_id not in processed_journals: processed_journals.add(journal_id) - # Add full article count to each license type this journal supports - if article_count: - for license_type in cc_license_types: - article_counts[license_type] += article_count total_processed += 1 @@ -265,7 +259,6 @@ def process_journals(session, args): return ( license_counts, year_counts, - article_counts, len(processed_journals), # Return unique journal count ) @@ -273,9 +266,8 @@ def process_journals(session, args): def save_count_data( license_counts, year_counts, - article_counts, ): - """Save essential journal data and article context to CSV files.""" + 
"""Save essential journal data to CSV files.""" # Save license counts with open( @@ -313,7 +305,6 @@ def query_doaj(args): ( license_counts, year_counts, - article_counts, journals_processed, ) = process_journals(session, args) @@ -322,13 +313,10 @@ def query_doaj(args): save_count_data( license_counts, year_counts, - article_counts, ) # Save provenance - total_articles = sum(article_counts.values()) provenance_data = { - "total_articles_in_cc_journals": total_articles, "total_journals_fetched": journals_processed, "total_processed": journals_processed, "limit": args.limit, @@ -336,7 +324,7 @@ def query_doaj(args): "quarter": QUARTER, "script": os.path.basename(__file__), "api_version": "v4", - "note": "Article counts provide context for CC journal scope - individual article licenses unknown", + "note": "Journal-level CC license data only - article counts not available via DOAJ API", } try: @@ -360,10 +348,6 @@ def query_doaj(args): # Calculate total license availability instances total_license_instances = sum(license_counts.values()) LOGGER.info(f"Total CC license type instances: {total_license_instances}") - - # Calculate total articles for context - total_articles = sum(article_counts.values()) - LOGGER.info(f"Total articles in CC-licensed journals: {total_articles}") LOGGER.info("Note: Journals supporting multiple CC license types are counted once per license type") From 8c7bea54a6e1095bab8c6f72f4cd2b1d38031eff Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 10:50:41 +0100 Subject: [PATCH 14/19] Add DOAJ API documentation and technical details to sources.md --- sources.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sources.md b/sources.md index 2f559bef..0307f154 100644 --- a/sources.md +++ b/sources.md @@ -42,6 +42,33 @@ tool paths. 
[prioritized-tool-urls]: data/prioritized-tool-urls.txt +## DOAJ (Directory of Open Access Journals) + +**Description:** DOAJ is a comprehensive directory of open access journals that provides metadata about journals and their licensing policies. The API allows access to journal-level information including Creative Commons license support, publication years, and publisher details. + +**Official API Documentation:** +- [DOAJ API Documentation](https://doaj.org/api/docs) +- [DOAJ API v4 Reference](https://doaj.org/api/v4/docs) +- [Base URL](https://doaj.org/api/v4/) + +**API Information:** +- No API key required +- Rate limiting: Reasonable use policy (no specific limits documented) +- Data format: JSON +- Pagination: configurable page size, up to a maximum of 100 results per page +- Search filters: Supports filtering by license type, publication year, subject, etc. +- License detection: Extracts CC license information from journal metadata + +**Technical Details:** +- Fetches journals that support Creative Commons licensing +- Aggregates license counts by type (CC BY, CC BY-SA, CC BY-NC, etc.)
+- Tracks license adoption by publication year +- Handles journals with multiple supported license types +- Generates provenance metadata for data lineage + +**Script:** [`scripts/1-fetch/doaj_fetch.py`](scripts/1-fetch/doaj_fetch.py) + + ## Europeana **Description:** From 1cdf721009d5fd766e2bf8d47e16c610b65882d6 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:12:59 +0100 Subject: [PATCH 15/19] Add country and language data collection using pycountry library --- Pipfile | 1 + Pipfile.lock | 11 ++++- scripts/1-fetch/doaj_fetch.py | 89 +++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/Pipfile b/Pipfile index 6ad3649a..3f109280 100644 --- a/Pipfile +++ b/Pipfile @@ -24,6 +24,7 @@ requests = ">=2.31.0" seaborn = "*" urllib3 = ">=2.5.0" wordcloud = "*" +pycountry = "*" [dev-packages] black = "*" diff --git a/Pipfile.lock b/Pipfile.lock index c111bd7d..b2d4f528 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "0658ee0a643ae80db9419f8ef32887877922d18a5da188b1b87e4e9a1849a4a1" + "sha256": "955abafb11bfd9c68a55197b650693c21577a7a98f4a3fe81d49454351ef7572" }, "pipfile-spec": 6, "requires": { @@ -1644,6 +1644,15 @@ "markers": "python_version >= '3.8'", "version": "==0.4.2" }, + "pycountry": { + "hashes": [ + "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221", + "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==24.6.1" + }, "pycparser": { "hashes": [ "sha256:78816d4f24add8f10a06d6f05b4d424ad9e96cfebf68a4ddc99c65c0720d00c2", diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index e7ffca7f..8f6cde9c 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -27,6 +27,7 @@ from collections import Counter, defaultdict # Third-party +import pycountry import requests import yaml from pygments import 
highlight @@ -62,6 +63,8 @@ # File Paths FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") +FILE_DOAJ_COUNTRY = shared.path_join(PATHS["data_1-fetch"], "doaj_3_count_by_country.csv") +FILE_DOAJ_LANGUAGE = shared.path_join(PATHS["data_1-fetch"], "doaj_5_count_by_language.csv") FILE_PROVENANCE = shared.path_join( PATHS["data_1-fetch"], "doaj_provenance.yaml" ) @@ -71,6 +74,8 @@ # CSV Headers HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] +HEADER_COUNTRY = ["TOOL_IDENTIFIER", "COUNTRY_CODE", "COUNTRY_NAME", "COUNT"] +HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE_NAME", "COUNT"] HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] # Runtime variables @@ -129,9 +134,33 @@ def initialize_all_data_files(args): return os.makedirs(PATHS["data_1-fetch"], exist_ok=True) initialize_data_file(FILE_DOAJ_COUNT, HEADER_COUNT) + initialize_data_file(FILE_DOAJ_COUNTRY, HEADER_COUNTRY) + initialize_data_file(FILE_DOAJ_LANGUAGE, HEADER_LANGUAGE) initialize_data_file(FILE_DOAJ_YEAR, HEADER_YEAR) +def get_country_name(country_code): + """Get country name from ISO 3166-1 alpha-2 code using pycountry.""" + if not country_code or country_code == "Unknown": + return "Unknown" + try: + country = pycountry.countries.get(alpha_2=country_code.upper()) + return country.name if country else country_code + except Exception: + return country_code + + +def get_language_name(language_code): + """Get language name from ISO 639-1 code using pycountry.""" + if not language_code or language_code == "Unknown": + return "Unknown" + try: + language = pycountry.languages.get(alpha_2=language_code.upper()) + return language.name if language else language_code + except Exception: + return language_code + + def extract_license_types(license_info): """Extract all CC license types from DOAJ license information.""" if not license_info: @@ -151,6 +180,8 @@ def process_journals(session, args): LOGGER.info("Fetching DOAJ journals...") license_counts = Counter() + 
country_counts = defaultdict(Counter) + language_counts = defaultdict(Counter) year_counts = defaultdict(Counter) processed_journals = set() # Track unique journals to avoid double counting @@ -232,6 +263,20 @@ def process_journals(session, args): else: year_counts[license_type]["Unknown"] += 1 + # Extract country information + publisher_info = bibjson.get("publisher", {}) + if isinstance(publisher_info, dict): + country_code = publisher_info.get("country", "Unknown") + country_counts[license_type][country_code] += 1 + + # Extract language information + languages = bibjson.get("language", []) + if languages: + for lang_code in languages: + language_counts[license_type][lang_code] += 1 + else: + language_counts[license_type]["Unknown"] += 1 + # Track unique journals to avoid double counting in statistics if journal_id not in processed_journals: processed_journals.add(journal_id) @@ -258,6 +303,8 @@ def process_journals(session, args): return ( license_counts, + country_counts, + language_counts, year_counts, len(processed_journals), # Return unique journal count ) @@ -265,6 +312,8 @@ def process_journals(session, args): def save_count_data( license_counts, + country_counts, + language_counts, year_counts, ): """Save essential journal data to CSV files.""" @@ -280,6 +329,42 @@ def save_count_data( for lic, count in license_counts.items(): writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": count}) + # Save country counts with pycountry names + with open( + FILE_DOAJ_COUNTRY, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_COUNTRY, dialect="unix" + ) + writer.writeheader() + for lic, countries in country_counts.items(): + for country_code, count in countries.items(): + country_name = get_country_name(country_code) + writer.writerow({ + "TOOL_IDENTIFIER": lic, + "COUNTRY_CODE": country_code, + "COUNTRY_NAME": country_name, + "COUNT": count, + }) + + # Save language counts with pycountry names + with 
open( + FILE_DOAJ_LANGUAGE, "w", encoding="utf-8", newline="\n" + ) as file_object: + writer = csv.DictWriter( + file_object, fieldnames=HEADER_LANGUAGE, dialect="unix" + ) + writer.writeheader() + for lic, languages in language_counts.items(): + for lang_code, count in languages.items(): + lang_name = get_language_name(lang_code) + writer.writerow({ + "TOOL_IDENTIFIER": lic, + "LANGUAGE_CODE": lang_code, + "LANGUAGE_NAME": lang_name, + "COUNT": count, + }) + # Save year counts with open( FILE_DOAJ_YEAR, "w", encoding="utf-8", newline="\n" @@ -304,6 +389,8 @@ def query_doaj(args): # Process journals ( license_counts, + country_counts, + language_counts, year_counts, journals_processed, ) = process_journals(session, args) @@ -312,6 +399,8 @@ def query_doaj(args): if args.enable_save: save_count_data( license_counts, + country_counts, + language_counts, year_counts, ) From f40dbf4bd654a013ac30abf4208a8792e8c70b33 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:21:37 +0100 Subject: [PATCH 16/19] Update exception handling to use structured QuantifyingException approach --- scripts/1-fetch/doaj_fetch.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index 8f6cde9c..f2451f9d 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -199,20 +199,12 @@ def process_journals(session, args): response = session.get(url, params=params, timeout=30) response.raise_for_status() data = response.json() - except requests.exceptions.RequestException as e: - if hasattr(e, "response") and e.response.status_code == 400: - LOGGER.info(f"Reached end of available data at page {page}") - break - else: - LOGGER.error(f"Failed to fetch journals page {page}: {e}") - raise shared.QuantifyingException( - f"Critical API error on page {page}: {e}", exit_code=1 - ) - except (ValueError, KeyError) as e: - LOGGER.error(f"Failed to parse JSON response on page 
{page}: {e}") - raise shared.QuantifyingException( - f"Critical JSON parsing error on page {page}: {e}", exit_code=1 - ) + except requests.HTTPError as e: + raise shared.QuantifyingException(f"HTTP Error: {e}", 1) + except requests.RequestException as e: + raise shared.QuantifyingException(f"Request Exception: {e}", 1) + except KeyError as e: + raise shared.QuantifyingException(f"KeyError: {e}", 1) try: results = data.get("results", []) From daedef4ec31a65108adf43e1584b071eb301b596 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:28:45 +0100 Subject: [PATCH 17/19] Fix static analysis issues and handle 400 errors as end-of-data --- scripts/1-fetch/doaj_fetch.py | 72 ++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/scripts/1-fetch/doaj_fetch.py b/scripts/1-fetch/doaj_fetch.py index f2451f9d..f9aa017a 100755 --- a/scripts/1-fetch/doaj_fetch.py +++ b/scripts/1-fetch/doaj_fetch.py @@ -5,7 +5,8 @@ Focus: Journal-level CC license adoption and temporal trends. Note: Articles do not contain license information in DOAJ API. 
-This script focuses on essential data for quantifying Creative Commons adoption: +This script focuses on essential data for quantifying Creative Commons +adoption: - Journal CC license counts by type - Temporal trends (year-by-year adoption) @@ -63,8 +64,12 @@ # File Paths FILE_DOAJ_COUNT = shared.path_join(PATHS["data_1-fetch"], "doaj_1_count.csv") -FILE_DOAJ_COUNTRY = shared.path_join(PATHS["data_1-fetch"], "doaj_3_count_by_country.csv") -FILE_DOAJ_LANGUAGE = shared.path_join(PATHS["data_1-fetch"], "doaj_5_count_by_language.csv") +FILE_DOAJ_COUNTRY = shared.path_join( + PATHS["data_1-fetch"], "doaj_3_count_by_country.csv" +) +FILE_DOAJ_LANGUAGE = shared.path_join( + PATHS["data_1-fetch"], "doaj_5_count_by_language.csv" +) FILE_PROVENANCE = shared.path_join( PATHS["data_1-fetch"], "doaj_provenance.yaml" ) @@ -75,7 +80,12 @@ # CSV Headers HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"] HEADER_COUNTRY = ["TOOL_IDENTIFIER", "COUNTRY_CODE", "COUNTRY_NAME", "COUNT"] -HEADER_LANGUAGE = ["TOOL_IDENTIFIER", "LANGUAGE_CODE", "LANGUAGE_NAME", "COUNT"] +HEADER_LANGUAGE = [ + "TOOL_IDENTIFIER", + "LANGUAGE_CODE", + "LANGUAGE_NAME", + "COUNT", +] HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"] # Runtime variables @@ -165,13 +175,13 @@ def extract_license_types(license_info): """Extract all CC license types from DOAJ license information.""" if not license_info: return [] - + cc_licenses = [] for lic in license_info: lic_type = lic.get("type", "") if lic_type in CC_LICENSE_TYPES: cc_licenses.append(lic_type) - + return cc_licenses @@ -183,7 +193,9 @@ def process_journals(session, args): country_counts = defaultdict(Counter) language_counts = defaultdict(Counter) year_counts = defaultdict(Counter) - processed_journals = set() # Track unique journals to avoid double counting + processed_journals = ( + set() + ) # Track unique journals to avoid double counting total_processed = 0 page = 1 @@ -200,6 +212,10 @@ def process_journals(session, args): response.raise_for_status() data = 
response.json() except requests.HTTPError as e: + # Handle 400 errors as end of data (DOAJ API behavior) + if hasattr(e, "response") and e.response.status_code == 400: + LOGGER.info(f"Reached end of available data at page {page}") + break raise shared.QuantifyingException(f"HTTP Error: {e}", 1) except requests.RequestException as e: raise shared.QuantifyingException(f"Request Exception: {e}", 1) @@ -260,7 +276,7 @@ def process_journals(session, args): if isinstance(publisher_info, dict): country_code = publisher_info.get("country", "Unknown") country_counts[license_type][country_code] += 1 - + # Extract language information languages = bibjson.get("language", []) if languages: @@ -332,12 +348,14 @@ def save_count_data( for lic, countries in country_counts.items(): for country_code, count in countries.items(): country_name = get_country_name(country_code) - writer.writerow({ - "TOOL_IDENTIFIER": lic, - "COUNTRY_CODE": country_code, - "COUNTRY_NAME": country_name, - "COUNT": count, - }) + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "COUNTRY_CODE": country_code, + "COUNTRY_NAME": country_name, + "COUNT": count, + } + ) # Save language counts with pycountry names with open( @@ -350,12 +368,14 @@ def save_count_data( for lic, languages in language_counts.items(): for lang_code, count in languages.items(): lang_name = get_language_name(lang_code) - writer.writerow({ - "TOOL_IDENTIFIER": lic, - "LANGUAGE_CODE": lang_code, - "LANGUAGE_NAME": lang_name, - "COUNT": count, - }) + writer.writerow( + { + "TOOL_IDENTIFIER": lic, + "LANGUAGE_CODE": lang_code, + "LANGUAGE_NAME": lang_name, + "COUNT": count, + } + ) # Save year counts with open( @@ -405,7 +425,10 @@ def query_doaj(args): "quarter": QUARTER, "script": os.path.basename(__file__), "api_version": "v4", - "note": "Journal-level CC license data only - article counts not available via DOAJ API", + "note": ( + "Journal-level CC license data only - " + "article counts not available via DOAJ API" + ), } try: @@ -425,11 
+448,14 @@ def query_doaj(args): ) LOGGER.info(f"Unique CC-licensed journals processed: {journals_processed}") - + # Calculate total license availability instances total_license_instances = sum(license_counts.values()) LOGGER.info(f"Total CC license type instances: {total_license_instances}") - LOGGER.info("Note: Journals supporting multiple CC license types are counted once per license type") + LOGGER.info( + "Note: Journals supporting multiple CC license types are " + "counted once per license type" + ) def main(): From f44ea662e798e414292551e47cd268c302903741 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:34:27 +0100 Subject: [PATCH 18/19] Order DOAJ API documentation links alphabetically --- sources.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/sources.md b/sources.md index 0307f154..203db0a5 100644 --- a/sources.md +++ b/sources.md @@ -47,9 +47,9 @@ tool paths. **Description:** DOAJ is a comprehensive directory of open access journals that provides metadata about journals and their licensing policies. The API allows access to journal-level information including Creative Commons license support, publication years, and publisher details. **Official API Documentation:** +- [Base URL](https://doaj.org/api/v4/) - [DOAJ API Documentation](https://doaj.org/api/docs) - [DOAJ API v4 Reference](https://doaj.org/api/v4/docs) -- [Base URL](https://doaj.org/api/v4/) **API Information:** - No API key required @@ -59,14 +59,6 @@ tool paths. - Search filters: Supports filtering by license type, publication year, subject, etc. - License detection: Extracts CC license information from journal metadata -**Technical Details:** -- Fetches journals that support Creative Commons licensing -- Aggregates license counts by type (CC BY, CC BY-SA, CC BY-NC, etc.) 
-- Tracks license adoption by publication year -- Handles journals with multiple supported license types -- Generates provenance metadata for data lineage - -**Script:** [`scripts/1-fetch/doaj_fetch.py`](scripts/1-fetch/doaj_fetch.py) ## Europeana From 85f6faf59c541f89f98d4dc9697376c29dad352e Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 11:37:58 +0100 Subject: [PATCH 19/19] Remove generate_country_codes.py - using pycountry library instead --- dev/generate_country_codes.py | 295 ---------------------------------- 1 file changed, 295 deletions(-) delete mode 100755 dev/generate_country_codes.py diff --git a/dev/generate_country_codes.py b/dev/generate_country_codes.py deleted file mode 100755 index a70ba79a..00000000 --- a/dev/generate_country_codes.py +++ /dev/null @@ -1,295 +0,0 @@ -#!/usr/bin/env python -""" -Generate ISO 3166-1 alpha-2 country codes YAML file for DOAJ fetch script. -""" -# Standard library -import os -import sys - -# Third-party -import yaml - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..", "scripts")) -# First-party/Local -import shared # noqa: E402 - -# ISO 3166-1 alpha-2 country codes (official list) -COUNTRIES = [ - {"code": "AD", "name": "Andorra"}, - {"code": "AE", "name": "United Arab Emirates"}, - {"code": "AF", "name": "Afghanistan"}, - {"code": "AG", "name": "Antigua and Barbuda"}, - {"code": "AI", "name": "Anguilla"}, - {"code": "AL", "name": "Albania"}, - {"code": "AM", "name": "Armenia"}, - {"code": "AO", "name": "Angola"}, - {"code": "AQ", "name": "Antarctica"}, - {"code": "AR", "name": "Argentina"}, - {"code": "AS", "name": "American Samoa"}, - {"code": "AT", "name": "Austria"}, - {"code": "AU", "name": "Australia"}, - {"code": "AW", "name": "Aruba"}, - {"code": "AX", "name": "Åland Islands"}, - {"code": "AZ", "name": "Azerbaijan"}, - {"code": "BA", "name": "Bosnia and Herzegovina"}, - {"code": "BB", "name": "Barbados"}, - {"code": "BD", "name": 
"Bangladesh"}, - {"code": "BE", "name": "Belgium"}, - {"code": "BF", "name": "Burkina Faso"}, - {"code": "BG", "name": "Bulgaria"}, - {"code": "BH", "name": "Bahrain"}, - {"code": "BI", "name": "Burundi"}, - {"code": "BJ", "name": "Benin"}, - {"code": "BL", "name": "Saint Barthélemy"}, - {"code": "BM", "name": "Bermuda"}, - {"code": "BN", "name": "Brunei"}, - {"code": "BO", "name": "Bolivia"}, - {"code": "BQ", "name": "Caribbean Netherlands"}, - {"code": "BR", "name": "Brazil"}, - {"code": "BS", "name": "Bahamas"}, - {"code": "BT", "name": "Bhutan"}, - {"code": "BV", "name": "Bouvet Island"}, - {"code": "BW", "name": "Botswana"}, - {"code": "BY", "name": "Belarus"}, - {"code": "BZ", "name": "Belize"}, - {"code": "CA", "name": "Canada"}, - {"code": "CC", "name": "Cocos Islands"}, - {"code": "CD", "name": "Democratic Republic of the Congo"}, - {"code": "CF", "name": "Central African Republic"}, - {"code": "CG", "name": "Republic of the Congo"}, - {"code": "CH", "name": "Switzerland"}, - {"code": "CI", "name": "Côte d'Ivoire"}, - {"code": "CK", "name": "Cook Islands"}, - {"code": "CL", "name": "Chile"}, - {"code": "CM", "name": "Cameroon"}, - {"code": "CN", "name": "China"}, - {"code": "CO", "name": "Colombia"}, - {"code": "CR", "name": "Costa Rica"}, - {"code": "CU", "name": "Cuba"}, - {"code": "CV", "name": "Cape Verde"}, - {"code": "CW", "name": "Curaçao"}, - {"code": "CX", "name": "Christmas Island"}, - {"code": "CY", "name": "Cyprus"}, - {"code": "CZ", "name": "Czech Republic"}, - {"code": "DE", "name": "Germany"}, - {"code": "DJ", "name": "Djibouti"}, - {"code": "DK", "name": "Denmark"}, - {"code": "DM", "name": "Dominica"}, - {"code": "DO", "name": "Dominican Republic"}, - {"code": "DZ", "name": "Algeria"}, - {"code": "EC", "name": "Ecuador"}, - {"code": "EE", "name": "Estonia"}, - {"code": "EG", "name": "Egypt"}, - {"code": "EH", "name": "Western Sahara"}, - {"code": "ER", "name": "Eritrea"}, - {"code": "ES", "name": "Spain"}, - {"code": "ET", "name": 
"Ethiopia"}, - {"code": "FI", "name": "Finland"}, - {"code": "FJ", "name": "Fiji"}, - {"code": "FK", "name": "Falkland Islands"}, - {"code": "FM", "name": "Micronesia"}, - {"code": "FO", "name": "Faroe Islands"}, - {"code": "FR", "name": "France"}, - {"code": "GA", "name": "Gabon"}, - {"code": "GB", "name": "United Kingdom"}, - {"code": "GD", "name": "Grenada"}, - {"code": "GE", "name": "Georgia"}, - {"code": "GF", "name": "French Guiana"}, - {"code": "GG", "name": "Guernsey"}, - {"code": "GH", "name": "Ghana"}, - {"code": "GI", "name": "Gibraltar"}, - {"code": "GL", "name": "Greenland"}, - {"code": "GM", "name": "Gambia"}, - {"code": "GN", "name": "Guinea"}, - {"code": "GP", "name": "Guadeloupe"}, - {"code": "GQ", "name": "Equatorial Guinea"}, - {"code": "GR", "name": "Greece"}, - {"code": "GS", "name": "South Georgia"}, - {"code": "GT", "name": "Guatemala"}, - {"code": "GU", "name": "Guam"}, - {"code": "GW", "name": "Guinea-Bissau"}, - {"code": "GY", "name": "Guyana"}, - {"code": "HK", "name": "Hong Kong"}, - {"code": "HM", "name": "Heard Island"}, - {"code": "HN", "name": "Honduras"}, - {"code": "HR", "name": "Croatia"}, - {"code": "HT", "name": "Haiti"}, - {"code": "HU", "name": "Hungary"}, - {"code": "ID", "name": "Indonesia"}, - {"code": "IE", "name": "Ireland"}, - {"code": "IL", "name": "Israel"}, - {"code": "IM", "name": "Isle of Man"}, - {"code": "IN", "name": "India"}, - {"code": "IO", "name": "British Indian Ocean Territory"}, - {"code": "IQ", "name": "Iraq"}, - {"code": "IR", "name": "Iran"}, - {"code": "IS", "name": "Iceland"}, - {"code": "IT", "name": "Italy"}, - {"code": "JE", "name": "Jersey"}, - {"code": "JM", "name": "Jamaica"}, - {"code": "JO", "name": "Jordan"}, - {"code": "JP", "name": "Japan"}, - {"code": "KE", "name": "Kenya"}, - {"code": "KG", "name": "Kyrgyzstan"}, - {"code": "KH", "name": "Cambodia"}, - {"code": "KI", "name": "Kiribati"}, - {"code": "KM", "name": "Comoros"}, - {"code": "KN", "name": "Saint Kitts and Nevis"}, - {"code": 
"KP", "name": "North Korea"}, - {"code": "KR", "name": "South Korea"}, - {"code": "KW", "name": "Kuwait"}, - {"code": "KY", "name": "Cayman Islands"}, - {"code": "KZ", "name": "Kazakhstan"}, - {"code": "LA", "name": "Laos"}, - {"code": "LB", "name": "Lebanon"}, - {"code": "LC", "name": "Saint Lucia"}, - {"code": "LI", "name": "Liechtenstein"}, - {"code": "LK", "name": "Sri Lanka"}, - {"code": "LR", "name": "Liberia"}, - {"code": "LS", "name": "Lesotho"}, - {"code": "LT", "name": "Lithuania"}, - {"code": "LU", "name": "Luxembourg"}, - {"code": "LV", "name": "Latvia"}, - {"code": "LY", "name": "Libya"}, - {"code": "MA", "name": "Morocco"}, - {"code": "MC", "name": "Monaco"}, - {"code": "MD", "name": "Moldova"}, - {"code": "ME", "name": "Montenegro"}, - {"code": "MF", "name": "Saint Martin"}, - {"code": "MG", "name": "Madagascar"}, - {"code": "MH", "name": "Marshall Islands"}, - {"code": "MK", "name": "North Macedonia"}, - {"code": "ML", "name": "Mali"}, - {"code": "MM", "name": "Myanmar"}, - {"code": "MN", "name": "Mongolia"}, - {"code": "MO", "name": "Macao"}, - {"code": "MP", "name": "Northern Mariana Islands"}, - {"code": "MQ", "name": "Martinique"}, - {"code": "MR", "name": "Mauritania"}, - {"code": "MS", "name": "Montserrat"}, - {"code": "MT", "name": "Malta"}, - {"code": "MU", "name": "Mauritius"}, - {"code": "MV", "name": "Maldives"}, - {"code": "MW", "name": "Malawi"}, - {"code": "MX", "name": "Mexico"}, - {"code": "MY", "name": "Malaysia"}, - {"code": "MZ", "name": "Mozambique"}, - {"code": "NA", "name": "Namibia"}, - {"code": "NC", "name": "New Caledonia"}, - {"code": "NE", "name": "Niger"}, - {"code": "NF", "name": "Norfolk Island"}, - {"code": "NG", "name": "Nigeria"}, - {"code": "NI", "name": "Nicaragua"}, - {"code": "NL", "name": "Netherlands"}, - {"code": "NO", "name": "Norway"}, - {"code": "NP", "name": "Nepal"}, - {"code": "NR", "name": "Nauru"}, - {"code": "NU", "name": "Niue"}, - {"code": "NZ", "name": "New Zealand"}, - {"code": "OM", "name": 
"Oman"}, - {"code": "PA", "name": "Panama"}, - {"code": "PE", "name": "Peru"}, - {"code": "PF", "name": "French Polynesia"}, - {"code": "PG", "name": "Papua New Guinea"}, - {"code": "PH", "name": "Philippines"}, - {"code": "PK", "name": "Pakistan"}, - {"code": "PL", "name": "Poland"}, - {"code": "PM", "name": "Saint Pierre and Miquelon"}, - {"code": "PN", "name": "Pitcairn Islands"}, - {"code": "PR", "name": "Puerto Rico"}, - {"code": "PS", "name": "Palestine"}, - {"code": "PT", "name": "Portugal"}, - {"code": "PW", "name": "Palau"}, - {"code": "PY", "name": "Paraguay"}, - {"code": "QA", "name": "Qatar"}, - {"code": "RE", "name": "Réunion"}, - {"code": "RO", "name": "Romania"}, - {"code": "RS", "name": "Serbia"}, - {"code": "RU", "name": "Russia"}, - {"code": "RW", "name": "Rwanda"}, - {"code": "SA", "name": "Saudi Arabia"}, - {"code": "SB", "name": "Solomon Islands"}, - {"code": "SC", "name": "Seychelles"}, - {"code": "SD", "name": "Sudan"}, - {"code": "SE", "name": "Sweden"}, - {"code": "SG", "name": "Singapore"}, - {"code": "SH", "name": "Saint Helena"}, - {"code": "SI", "name": "Slovenia"}, - {"code": "SJ", "name": "Svalbard and Jan Mayen"}, - {"code": "SK", "name": "Slovakia"}, - {"code": "SL", "name": "Sierra Leone"}, - {"code": "SM", "name": "San Marino"}, - {"code": "SN", "name": "Senegal"}, - {"code": "SO", "name": "Somalia"}, - {"code": "SR", "name": "Suriname"}, - {"code": "SS", "name": "South Sudan"}, - {"code": "ST", "name": "São Tomé and Príncipe"}, - {"code": "SV", "name": "El Salvador"}, - {"code": "SX", "name": "Sint Maarten"}, - {"code": "SY", "name": "Syria"}, - {"code": "SZ", "name": "Eswatini"}, - {"code": "TC", "name": "Turks and Caicos Islands"}, - {"code": "TD", "name": "Chad"}, - {"code": "TF", "name": "French Southern Territories"}, - {"code": "TG", "name": "Togo"}, - {"code": "TH", "name": "Thailand"}, - {"code": "TJ", "name": "Tajikistan"}, - {"code": "TK", "name": "Tokelau"}, - {"code": "TL", "name": "Timor-Leste"}, - {"code": "TM", 
"name": "Turkmenistan"}, - {"code": "TN", "name": "Tunisia"}, - {"code": "TO", "name": "Tonga"}, - {"code": "TR", "name": "Turkey"}, - {"code": "TT", "name": "Trinidad and Tobago"}, - {"code": "TV", "name": "Tuvalu"}, - {"code": "TW", "name": "Taiwan"}, - {"code": "TZ", "name": "Tanzania"}, - {"code": "UA", "name": "Ukraine"}, - {"code": "UG", "name": "Uganda"}, - {"code": "UM", "name": "U.S. Minor Outlying Islands"}, - {"code": "US", "name": "United States"}, - {"code": "UY", "name": "Uruguay"}, - {"code": "UZ", "name": "Uzbekistan"}, - {"code": "VA", "name": "Vatican City"}, - {"code": "VC", "name": "Saint Vincent and the Grenadines"}, - {"code": "VE", "name": "Venezuela"}, - {"code": "VG", "name": "British Virgin Islands"}, - {"code": "VI", "name": "U.S. Virgin Islands"}, - {"code": "VN", "name": "Vietnam"}, - {"code": "VU", "name": "Vanuatu"}, - {"code": "WF", "name": "Wallis and Futuna"}, - {"code": "WS", "name": "Samoa"}, - {"code": "YE", "name": "Yemen"}, - {"code": "YT", "name": "Mayotte"}, - {"code": "ZA", "name": "South Africa"}, - {"code": "ZM", "name": "Zambia"}, - {"code": "ZW", "name": "Zimbabwe"}, -] - - -def main(): - """Generate ISO country codes YAML file.""" - repo_path = shared.path_join(os.path.dirname(__file__), "..") - output_file = shared.path_join(repo_path, "data", "iso_country_codes.yaml") - - header = [ - "# ISO 3166-1 alpha-2 country codes to country names mapping", - "# Used by DOAJ API for publisher country identification", - "# Generated programmatically by dev/generate_country_codes.py", - ] - - with open(output_file, "w", encoding="utf-8") as file_object: - file_object.write("\n".join(header) + "\n") - yaml.dump( - COUNTRIES, - file_object, - default_flow_style=False, - allow_unicode=True, - ) - - print(f"Generated {output_file} with {len(COUNTRIES)} country codes") - - -if __name__ == "__main__": - main()