Skip to content

Commit 1a13e79

Browse files
committed
Added open_data_file function
1 parent d5f457e commit 1a13e79

File tree

5 files changed

+77
-24
lines changed

5 files changed

+77
-24
lines changed

scripts/2-process/gcs_process.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,9 @@ def main():
311311

312312
# Count data
313313
file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
314-
count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
314+
count_data = shared.open_data_file(
315+
LOGGER, file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
316+
)
315317
process_product_totals(args, count_data)
316318
process_latest_prior_retired_totals(args, count_data)
317319
process_totals_by_free_cultural(args, count_data)
@@ -321,17 +323,19 @@ def main():
321323
file2_language = shared.path_join(
322324
PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
323325
)
324-
language_data = pd.read_csv(
325-
file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
326+
language_data = shared.open_data_file(
327+
LOGGER,
328+
file2_language,
329+
usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"],
326330
)
327331
process_totals_by_language(args, language_data)
328332

329333
# Country data
330334
file3_country = shared.path_join(
331335
PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
332336
)
333-
country_data = pd.read_csv(
334-
file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
337+
country_data = shared.open_data_file(
338+
LOGGER, file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
335339
)
336340
process_totals_by_country(args, country_data)
337341

scripts/2-process/github_process.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,9 @@ def main():
178178
shared.git_fetch_and_merge(args, PATHS["repo"])
179179

180180
file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
181-
count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
181+
count_data = shared.open_data_file(
182+
LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
183+
)
182184
process_totals_by_license(args, count_data)
183185
process_totals_by_restriction(args, count_data)
184186

scripts/3-report/gcs_report.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import traceback
1212

1313
# Third-party
14-
import pandas as pd
1514
from pygments import highlight
1615
from pygments.formatters import TerminalFormatter
1716
from pygments.lexers import PythonTracebackLexer
@@ -80,7 +79,8 @@ def gcs_intro(args):
8079
)
8180
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
8281
name_label = "CC legal tool product"
83-
data = pd.read_csv(file_path, index_col=name_label)
82+
data = shared.open_data_file(LOGGER, file_path)
83+
data.set_index(name_label, inplace=True)
8484
total_count = f"{data['Count'].sum():,d}"
8585
shared.update_readme(
8686
args,
@@ -111,7 +111,9 @@ def plot_products(args):
111111
)
112112
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
113113
name_label = "CC legal tool product"
114-
data = pd.read_csv(file_path, index_col=name_label)
114+
data = shared.open_data_file(LOGGER, file_path)
115+
data.set_index(name_label, inplace=True)
116+
115117
data = data[::-1] # reverse order
116118

117119
title = "Products totals and percentages"
@@ -156,7 +158,8 @@ def plot_tool_status(args):
156158
)
157159
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
158160
name_label = "CC legal tool"
159-
data = pd.read_csv(file_path, index_col=name_label)
161+
data = shared.open_data_file(LOGGER, file_path)
162+
data.set_index(name_label, inplace=True)
160163
data.sort_values(name_label, ascending=False, inplace=True)
161164

162165
title = "CC legal tools status"
@@ -199,7 +202,8 @@ def plot_latest_tools(args):
199202
)
200203
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
201204
name_label = "CC legal tool"
202-
data = pd.read_csv(file_path, index_col=name_label)
205+
data = shared.open_data_file(LOGGER, file_path)
206+
data.set_index(name_label, inplace=True)
203207
data.sort_values(name_label, ascending=False, inplace=True)
204208

205209
title = "Latest CC legal tools"
@@ -241,7 +245,8 @@ def plot_prior_tools(args):
241245
)
242246
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
243247
name_label = "CC legal tool"
244-
data = pd.read_csv(file_path, index_col=name_label)
248+
data = shared.open_data_file(LOGGER, file_path)
249+
data.set_index(name_label, inplace=True)
245250
data.sort_values(name_label, ascending=False, inplace=True)
246251

247252
title = "Prior CC legal tools"
@@ -286,7 +291,8 @@ def plot_retired_tools(args):
286291
)
287292
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
288293
name_label = "CC legal tool"
289-
data = pd.read_csv(file_path, index_col=name_label)
294+
data = shared.open_data_file(LOGGER, file_path)
295+
data.set_index(name_label, inplace=True)
290296
data.sort_values(name_label, ascending=False, inplace=True)
291297

292298
title = "Retired CC legal tools"
@@ -332,7 +338,8 @@ def plot_countries_highest_usage(args):
332338
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
333339
name_label = "Country"
334340
data_label = "Count"
335-
data = pd.read_csv(file_path, index_col=name_label)
341+
data = shared.open_data_file(LOGGER, file_path)
342+
data.set_index(name_label, inplace=True)
336343
total_count = f"{data['Count'].sum():,d}"
337344
data.sort_values(data_label, ascending=False, inplace=True)
338345
data = data[:10] # limit to highest 10
@@ -385,7 +392,8 @@ def plot_languages_highest_usage(args):
385392
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
386393
name_label = "Language"
387394
data_label = "Count"
388-
data = pd.read_csv(file_path, index_col=name_label)
395+
data = shared.open_data_file(LOGGER, file_path)
396+
data.set_index(name_label, inplace=True)
389397
total_count = f"{data['Count'].sum():,d}"
390398
data.sort_values(data_label, ascending=False, inplace=True)
391399
data = data[:10] # limit to highest 10
@@ -439,7 +447,7 @@ def plot_free_culture(args):
439447
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
440448
name_label = "Category"
441449
data_label = "Count"
442-
data = pd.read_csv(file_path, index_col=name_label)
450+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
443451

444452
title = "Approved for Free Cultural Works"
445453
plt = plot.combined_plot(

scripts/3-report/github_report.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import traceback
1212

1313
# Third-party
14-
import pandas as pd
1514
from pygments import highlight
1615
from pygments.formatters import TerminalFormatter
1716
from pygments.lexers import PythonTracebackLexer
@@ -77,11 +76,8 @@ def load_data(args):
7776
PATHS["data"], f"{selected_quarter}", "1-fetch", "github_1_count.csv"
7877
)
7978

80-
if not os.path.exists(file_path):
81-
LOGGER.error(f"Data file not found: {file_path}")
82-
return pd.DataFrame()
79+
data = shared.open_data_file(LOGGER, file_path)
8380

84-
data = pd.read_csv(file_path)
8581
LOGGER.info(f"Data loaded from {file_path}")
8682
return data
8783

@@ -97,7 +93,8 @@ def github_intro(args):
9793
)
9894
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
9995
name_label = "TOOL_IDENTIFIER"
100-
data = pd.read_csv(file_path, index_col=name_label)
96+
data = shared.open_data_file(LOGGER, file_path)
97+
data.set_index(name_label, inplace=True)
10198
total_repositories = data.loc["Total public repositories", "COUNT"]
10299
cc_total = data[data.index.str.startswith("CC")]["COUNT"].sum()
103100
cc_percentage = f"{(cc_total / total_repositories) * 100:.2f}%"
@@ -152,7 +149,8 @@ def plot_totals_by_license_type(args):
152149
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
153150
name_label = "License"
154151
data_label = "Count"
155-
data = pd.read_csv(file_path, index_col=name_label)
152+
data = shared.open_data_file(LOGGER, file_path)
153+
data.set_index(name_label, inplace=True)
156154
data.sort_values(data_label, ascending=True, inplace=True)
157155
title = "Totals by license type"
158156
plt = plot.combined_plot(
@@ -201,7 +199,7 @@ def plot_totals_by_restriction(args):
201199
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
202200
name_label = "Category"
203201
data_label = "Count"
204-
data = pd.read_csv(file_path, index_col=name_label)
202+
data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
205203
data.sort_values(name_label, ascending=False, inplace=True)
206204
title = "Totals by restriction"
207205
plt = plot.combined_plot(

scripts/shared.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from datetime import datetime, timezone
77

88
# Third-party
9+
import pandas as pd
910
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
1011
from pandas import PeriodIndex
1112
from requests import Session
@@ -66,6 +67,46 @@ def get_session(accept_header=None, session=None):
6667
return session
6768

6869

70+
def open_data_file(logger, file_path, usecols=None, index_col=None):
    """
    Open a CSV data file and convert expected errors into
    QuantifyingException.

    This function is shared so all process/report scripts use the same
    error behavior.

    Args:
        logger: Logger of the calling script. Currently unused by this
            function; it is accepted so every caller shares one call
            signature.
        file_path: Path of the CSV file to read.
        usecols: Optional column selection, forwarded unchanged to
            pandas.read_csv.
        index_col: Optional column(s) to use as the row index, forwarded
            unchanged to pandas.read_csv. Some report scripts call this
            helper with index_col, so it must be accepted here.

    Returns:
        The pandas.DataFrame produced by pandas.read_csv.

    Raises:
        QuantifyingException: (exit_code=1) if the file does not exist,
            is empty, cannot be accessed, or fails to load for any other
            reason.
    """
    # File does not exist (or is not a regular file)
    if not os.path.isfile(file_path):
        raise QuantifyingException(
            message=f"Data file not found: {file_path}", exit_code=1
        )

    try:
        return pd.read_csv(file_path, usecols=usecols, index_col=index_col)

    # Empty or invalid CSV file
    except pd.errors.EmptyDataError as e:
        # Chain the cause so the original pandas traceback is preserved
        raise QuantifyingException(
            message=f"CSV file is empty or invalid: {file_path}", exit_code=1
        ) from e

    # Permission denied
    except PermissionError as e:
        raise QuantifyingException(
            message=f"Permission denied when accessing data file: {file_path}",
            exit_code=1,
        ) from e

    # Any other unexpected issue (for example a parse error or a
    # usecols/index_col value that does not match the file's columns)
    except Exception as e:
        raise QuantifyingException(
            message=f"Unexpected error opening file '{file_path}': {e}",
            exit_code=1,
        ) from e
108+
109+
69110
def git_fetch_and_merge(args, repo_path, branch=None):
70111
if not args.enable_git:
71112
return

0 commit comments

Comments
 (0)