Added open_data_file function

Joyakis · Joyakis · commit 1a13e7921057 · 2025-11-24T13:15:14.000+03:00
diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
@@ -311,7 +311,9 @@ def main():
 
     # Count data
     file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
-    count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
+    )
     process_product_totals(args, count_data)
     process_latest_prior_retired_totals(args, count_data)
     process_totals_by_free_cultural(args, count_data)
@@ -321,17 +323,19 @@ def main():
     file2_language = shared.path_join(
         PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
     )
-    language_data = pd.read_csv(
-        file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    language_data = shared.open_data_file(
+        LOGGER,
+        file2_language,
+        usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"],
     )
     process_totals_by_language(args, language_data)
 
     # Country data
     file3_country = shared.path_join(
         PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
     )
-    country_data = pd.read_csv(
-        file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
+    country_data = shared.open_data_file(
+        LOGGER, file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
     )
     process_totals_by_country(args, country_data)
 
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
@@ -178,7 +178,9 @@ def main():
     shared.git_fetch_and_merge(args, PATHS["repo"])
 
     file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
-    count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
+    )
     process_totals_by_license(args, count_data)
     process_totals_by_restriction(args, count_data)
 
diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -80,7 +79,8 @@ def gcs_intro(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
     total_count = f"{data['Count'].sum():,d}"
     shared.update_readme(
         args,
@@ -111,7 +111,9 @@ def plot_products(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
+
     data = data[::-1]  # reverse order
 
     title = "Products totals and percentages"
@@ -156,7 +158,8 @@ def plot_tool_status(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "CC legal tools status"
@@ -199,7 +202,8 @@ def plot_latest_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Latest CC legal tools"
@@ -241,7 +245,8 @@ def plot_prior_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Prior CC legal tools"
@@ -286,7 +291,8 @@ def plot_retired_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Retired CC legal tools"
@@ -332,7 +338,8 @@ def plot_countries_highest_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Country"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
     total_count = f"{data['Count'].sum():,d}"
     data.sort_values(data_label, ascending=False, inplace=True)
     data = data[:10]  # limit to highest 10
@@ -385,7 +392,8 @@ def plot_languages_highest_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
     total_count = f"{data['Count'].sum():,d}"
     data.sort_values(data_label, ascending=False, inplace=True)
     data = data[:10]  # limit to highest 10
@@ -439,7 +447,7 @@ def plot_free_culture(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
 
     title = "Approved for Free Cultural Works"
     plt = plot.combined_plot(
diff --git a/scripts/3-report/github_report.py b/scripts/3-report/github_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -77,11 +76,8 @@ def load_data(args):
         PATHS["data"], f"{selected_quarter}", "1-fetch", "github_1_count.csv"
     )
 
-    if not os.path.exists(file_path):
-        LOGGER.error(f"Data file not found: {file_path}")
-        return pd.DataFrame()
+    data = shared.open_data_file(LOGGER, file_path)
 
-    data = pd.read_csv(file_path)
     LOGGER.info(f"Data loaded from {file_path}")
     return data
 
@@ -97,7 +93,8 @@ def github_intro(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "TOOL_IDENTIFIER"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
     total_repositories = data.loc["Total public repositories", "COUNT"]
     cc_total = data[data.index.str.startswith("CC")]["COUNT"].sum()
     cc_percentage = f"{(cc_total / total_repositories) * 100:.2f}%"
@@ -152,7 +149,8 @@ def plot_totals_by_license_type(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "License"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path)
+    data.set_index(name_label, inplace=True)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Totals by license type"
     plt = plot.combined_plot(
@@ -201,7 +199,7 @@ def plot_totals_by_restriction(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
     title = "Totals by restriction"
     plt = plot.combined_plot(
diff --git a/scripts/shared.py b/scripts/shared.py
@@ -6,6 +6,7 @@
 from datetime import datetime, timezone
 
 # Third-party
+import pandas as pd
 from git import InvalidGitRepositoryError, NoSuchPathError, Repo
 from pandas import PeriodIndex
 from requests import Session
@@ -66,6 +67,46 @@ def get_session(accept_header=None, session=None):
     return session
 
 
+def open_data_file(logger, file_path, usecols=None, index_col=None):
+    """
+    Open a CSV data file safely and convert expected errors into
+    QuantifyingException, so all process/report scripts share the same
+    error behavior.
+
+    usecols and index_col are passed through to pandas.read_csv (None
+    keeps the pandas defaults). logger is currently unused.
+    """
+    # File does not exist
+    if not os.path.isfile(file_path):
+        raise QuantifyingException(
+            message=f"Data file not found: {file_path}", exit_code=1
+        )
+
+    try:
+        # Reading the file
+        return pd.read_csv(file_path, usecols=usecols, index_col=index_col)
+
+    # Empty or invalid CSV file
+    except pd.errors.EmptyDataError as e:
+        raise QuantifyingException(
+            message=f"CSV file is empty or invalid: {file_path}", exit_code=1
+        ) from e
+
+    # Permission denied
+    except PermissionError as e:
+        raise QuantifyingException(
+            message=f"Permission denied when accessing data file: {file_path}",
+            exit_code=1,
+        ) from e
+
+    # Any other unexpected issue
+    except Exception as e:
+        raise QuantifyingException(
+            message=f"Unexpected error opening file '{file_path}': {str(e)}",
+            exit_code=1,
+        ) from e
+
+
 def git_fetch_and_merge(args, repo_path, branch=None):
     if not args.enable_git:
         return