Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/create_test_conda_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.12', '3.13']
python-version: ['3.12', '3.13', '3.14']
container:
image: ghcr.io/noaa-gfdl/fre-cli:miniconda24_gcc14_v2
options: "--privileged --cap-add=sys_admin --cap-add=mknod --device=/dev/fuse --security-opt seccomp=unconfined --security-opt label=disable --security-opt apparmor=unconfined" # needed for podman
Expand Down
14 changes: 11 additions & 3 deletions fre/cmor/cmor_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,23 @@ def make_simple_varlist( dir_targ: str,

"""
# if the variable is in the filename, it's likely delimited by another period.
one_file = next(glob.iglob(os.path.join(dir_targ, "*.*.nc")), None)
if not one_file:
all_nc_files = glob.glob(os.path.join(dir_targ, "*.*.nc"))
if not all_nc_files:
fre_logger.error("No files found in the directory.") #uncovered
return None

one_datetime = None
search_pattern = None
try:
one_datetime = os.path.basename(one_file).split('.')[-3]
# Count files per datetime stamp and pick the most common one.
Copy link
Copy Markdown
Contributor

@singhd789 singhd789 Mar 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only thing I'm not fully understanding is why the datetime with the most files is picked. I understand it's for the representative datetime, but is that just to have like a full list of variables across datetimes? Would other (maybe not representative) datetimes have other variables present at all?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in the new form, it is 100% possible that other datetimes that do not have the most number of files do not contain a variable under another datetime. the function isn't supposed to remove all ambiguity for the user nor be an "ironclad" solid generator of variable lists, hence the function name, make_simple_var_list. it's supposed to get a user started

# This ensures we choose the most representative datetime, which
# maximises the number of variables found and avoids non-deterministic
# behaviour caused by filesystem-dependent glob ordering.
datetime_counts: Dict[str, int] = {}
for f in all_nc_files:
dt = os.path.basename(f).split('.')[-3]
datetime_counts[dt] = datetime_counts.get(dt, 0) + 1
one_datetime = max(datetime_counts, key=lambda k: datetime_counts[k])
except IndexError as e:
fre_logger.warning(' e = %s', e)
fre_logger.warning('WARNING: cannot find datetime in filenames, moving on and doing the best i can.')
Expand Down
44 changes: 21 additions & 23 deletions fre/cmor/tests/test_cmor_finder_make_simple_varlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
'''

import json
import os
import glob as glob_module
from unittest.mock import patch

Expand Down Expand Up @@ -190,40 +191,36 @@ def test_make_simple_varlist_mip_table_filter(tmp_path):
# ---- IndexError on datetime extraction (monkeypatched) ----
def test_make_simple_varlist_index_error_on_datetime(tmp_path):
"""
When ``os.path.basename(one_file).split('.')[-3]`` raises IndexError
(e.g. a file with fewer than 3 dot-segments sneaks in), the function
should catch it, set one_datetime = None, and fall back to the ``'*nc'``
search pattern.
When all discovered files have fewer than 3 dot-segments,
``split('.')[-3]`` raises IndexError. The function should catch it,
set one_datetime = None, and fall back to the ``'*nc'`` search pattern.
Covers the except IndexError branch and the one_datetime is None path.
"""
# Create a file that matches *.*.nc normally
# Create a real file so the fallback '*nc' glob returns something useful.
real_file = tmp_path / "model.19900101.temp.nc"
real_file.touch()

# Patch iglob so the *first* call (the *.*.nc probe) returns a pathological
# name with only one dot ("short.nc"), triggering IndexError on split('.')[-3].
# The *second* call (glob.glob with search_pattern) returns the real file.
# Patch glob.glob so the initial *.*.nc discovery call returns a file whose
# basename has fewer than 3 dot-separated segments, triggering IndexError on
# split('.')[-3]. The subsequent search-pattern glob uses the real filesystem.
fake_probe = str(tmp_path / "short.nc")
(tmp_path / "short.nc").touch()

original_iglob = glob_module.iglob
original_glob = glob_module.glob

def patched_iglob(pattern, **kwargs):
# Return the pathological file for the first probe
return iter([fake_probe])

def patched_glob(pattern, **kwargs):
# Return the real files for the search_pattern glob
if os.path.basename(pattern) == "*.*.nc":
# Return a pathological filename for the *.*.nc discovery call
return [fake_probe]
# Return real files for the search_pattern glob
return original_glob(pattern, **kwargs)

with patch('fre.cmor.cmor_finder.glob.iglob', side_effect=patched_iglob):
with patch('fre.cmor.cmor_finder.glob.glob', side_effect=patched_glob):
result = make_simple_varlist(str(tmp_path), None)
with patch('fre.cmor.cmor_finder.glob.glob', side_effect=patched_glob):
result = make_simple_varlist(str(tmp_path), None)

# short.nc split is ['short', 'nc'], [-3] raises IndexError
# one_datetime becomes None, search_pattern becomes '*nc'
# glob picks up *.nc files in the directory
# 'short.nc' has only 2 parts when split by '.', so [-3] raises IndexError;
# one_datetime becomes None, search_pattern becomes '*nc',
# and the real glob picks up *.nc files in the directory.
assert result is not None
assert isinstance(result, dict)

Expand Down Expand Up @@ -266,15 +263,16 @@ def test_make_simple_varlist_dedup_across_datetimes(tmp_path):
"""
Files with different datetime stamps but the same variable name
should be de-duplicated so the variable appears only once.
Covers the 'var_name already in target varlist, skip' branch.
The most-common datetime is selected and all variables for that
datetime are returned without duplicates.
"""
(tmp_path / "ocean.19900101.tos.nc").touch()
(tmp_path / "ocean.19900201.tos.nc").touch()
(tmp_path / "ocean.19900101.sos.nc").touch()
(tmp_path / "ocean.19900201.sos.nc").touch()

# All four files share datetime pattern "1990*" so they all get globbed;
# tos and sos each appear twice, the second occurrence triggers the skip.
# Both datetimes (19900101, 19900201) have 2 files each; the most-files
# datetime is picked, and both tos and sos appear in that set.
result = make_simple_varlist(str(tmp_path), None)

assert result is not None
Expand Down
Loading