Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/create_test_conda_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.12', '3.13']
python-version: ['3.12', '3.13', '3.14']
container:
image: ghcr.io/noaa-gfdl/fre-cli:miniconda24_gcc14_v2
options: "--privileged --cap-add=sys_admin --cap-add=mknod --device=/dev/fuse --security-opt seccomp=unconfined --security-opt label=disable --security-opt apparmor=unconfined" # needed for podman
Expand Down
53 changes: 14 additions & 39 deletions fre/cmor/cmor_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,36 +154,15 @@ def make_simple_varlist( dir_targ: str,

"""
# if the variable is in the filename, it's likely delimited by another period.
one_file = next(glob.iglob(os.path.join(dir_targ, "*.*.nc")), None)
if not one_file:
all_nc_files = glob.glob(os.path.join(dir_targ, "*.*.nc"))
if not all_nc_files:
fre_logger.error("No files found in the directory.") #uncovered
return None

one_datetime = None
search_pattern = None
try:
one_datetime = os.path.basename(one_file).split('.')[-3]
except IndexError as e:
fre_logger.warning(' e = %s', e)
fre_logger.warning('WARNING: cannot find datetime in filenames, moving on and doing the best i can.')

if one_datetime is None:
search_pattern = "*nc"
else:
search_pattern = f"*{one_datetime}*.nc"

# Find all files in the directory that match the datetime component
files = glob.glob(os.path.join(dir_targ, search_pattern))

# Check if any files were found
if not files:
fre_logger.error("No files found matching the pattern.") #uncovered
return None

if len(files) == 1:
fre_logger.warning("Warning: Only one file found matching the pattern.") #uncovered
if len(all_nc_files) == 1:
fre_logger.warning("Warning: Only one file found matching the pattern.")

fre_logger.info("Files found with %s in the filename. Number of files: %d", one_datetime, len(files))
fre_logger.info("Files found matching pattern. Number of files: %d", len(all_nc_files))

mip_vars = None
if json_mip_table is not None:
Expand All @@ -200,29 +179,25 @@ def make_simple_varlist( dir_targ: str,
mip_vars=[ key.split('_')[0] for key in full_mip_vars_list ]
fre_logger.debug('mip vars extracted for comparison when making var list: %s', mip_vars)


# Create a dictionary of variable names extracted from the filenames
var_list = {}
quick_vlist=[]
for targetfile in files:
# Build a deduplicated dict of variable names extracted from all filenames across
# all datetimes. Assigning to a dict naturally deduplicates while preserving
# first-seen insertion order (Python 3.7+).
var_list: Dict[str, str] = {}
for targetfile in all_nc_files:
var_name=os.path.basename(targetfile).split('.')[-2]
if mip_vars is not None and var_name not in mip_vars:
#fre_logger.debug('var_name %s not in mip_vars, omitting', var_name)
continue
var_list[var_name] = var_name

quick_vlist.append(var_name)

if len(quick_vlist) > 0:
var_list = { _var_name : _var_name for _var_name in quick_vlist }
else:
if not var_list:
fre_logger.warning('WARNING: no variables in target mip table found, or no matching pattern,'
' or not enough info in the filenames (i am expecting FRE-bronx like filenames)')
return None

# Write the variable list to the output JSON file
if output_variable_list is not None and len(var_list)>0:
if output_variable_list is not None:
try:
fre_logger.info('writing output variable list, %s', quick_vlist) # output_variable_list)
fre_logger.info('writing output variable list, %s', list(var_list.keys()))
with open(output_variable_list, 'w', encoding='utf-8') as f:
json.dump(var_list, f, indent=4)
except Exception as exc:
Expand Down
83 changes: 31 additions & 52 deletions fre/cmor/tests/test_cmor_finder_make_simple_varlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
'''

import json
import glob as glob_module
from unittest.mock import patch

import pytest
Expand Down Expand Up @@ -148,8 +147,9 @@ def test_make_simple_varlist_no_matching_pattern(tmp_path):
# ---- duplicate var_name skip coverage ----
def test_make_simple_varlist_deduplicates(tmp_path):
"""
When multiple files share the same var_name, the result should contain
the variable only once (duplicate skip path).
When multiple files share the same var_name across different datetimes,
the result should contain the variable only once. Variables that only
appear at a single datetime are still included.
"""
# Two files with var_name "temp" and one with "salt"
(tmp_path / "model.19900101.temp.nc").touch()
Expand Down Expand Up @@ -187,59 +187,18 @@ def test_make_simple_varlist_mip_table_filter(tmp_path):
assert "notinmip" not in result


# ---- IndexError on datetime extraction (monkeypatched) ----
def test_make_simple_varlist_index_error_on_datetime(tmp_path):
"""
When ``os.path.basename(one_file).split('.')[-3]`` raises IndexError
(e.g. a file with fewer than 3 dot-segments sneaks in), the function
should catch it, set one_datetime = None, and fall back to the ``'*nc'``
search pattern.
Covers the except IndexError branch and the one_datetime is None path.
"""
# Create a file that matches *.*.nc normally
real_file = tmp_path / "model.19900101.temp.nc"
real_file.touch()

# Patch iglob so the *first* call (the *.*.nc probe) returns a pathological
# name with only one dot ("short.nc"), triggering IndexError on split('.')[-3].
# The *second* call (glob.glob with search_pattern) returns the real file.
fake_probe = str(tmp_path / "short.nc")
(tmp_path / "short.nc").touch()

original_iglob = glob_module.iglob
original_glob = glob_module.glob

def patched_iglob(pattern, **kwargs):
# Return the pathological file for the first probe
return iter([fake_probe])

def patched_glob(pattern, **kwargs):
# Return the real files for the search_pattern glob
return original_glob(pattern, **kwargs)

with patch('fre.cmor.cmor_finder.glob.iglob', side_effect=patched_iglob):
with patch('fre.cmor.cmor_finder.glob.glob', side_effect=patched_glob):
result = make_simple_varlist(str(tmp_path), None)

# short.nc split is ['short', 'nc'], [-3] raises IndexError
# one_datetime becomes None, search_pattern becomes '*nc'
# glob picks up *.nc files in the directory
assert result is not None
assert isinstance(result, dict)


# ---- no files matching search pattern ----
def test_make_simple_varlist_no_files_matching_pattern(tmp_path):
"""
When the initial probe finds a file but the subsequent glob with the
derived search_pattern returns no files, the function should return None.
Covers the 'if not files' early return.
When glob finds no *.*.nc files in the directory the function should
return None. Covers the 'if not all_nc_files' early return by patching
glob.glob to return an empty list.
"""
# Create a file for the initial probe
# Create a file for the probe to land on (but the patch overrides it)
probe_file = tmp_path / "model.19900101.temp.nc"
probe_file.touch()

# Patch glob.glob to return empty list for the pattern-based search
# Patch glob.glob to return empty list
with patch('fre.cmor.cmor_finder.glob.glob', return_value=[]):
result = make_simple_varlist(str(tmp_path), None)

Expand All @@ -266,15 +225,16 @@ def test_make_simple_varlist_dedup_across_datetimes(tmp_path):
"""
Files with different datetime stamps but the same variable name
should be de-duplicated so the variable appears only once.
Covers the 'var_name already in target varlist, skip' branch.
All files across all datetimes are scanned; duplicate var names
are collapsed by dict assignment.
"""
(tmp_path / "ocean.19900101.tos.nc").touch()
(tmp_path / "ocean.19900201.tos.nc").touch()
(tmp_path / "ocean.19900101.sos.nc").touch()
(tmp_path / "ocean.19900201.sos.nc").touch()

# All four files share datetime pattern "1990*" so they all get globbed;
# tos and sos each appear twice, the second occurrence triggers the skip.
# All four files are scanned; tos and sos each appear twice but
# dict assignment deduplicates them.
result = make_simple_varlist(str(tmp_path), None)

assert result is not None
Expand All @@ -301,3 +261,22 @@ def test_make_simple_varlist_mip_table_no_match(tmp_path):

# No variables matched
assert result is None


# ---- variable only present at a minority datetime is still returned ----
def test_make_simple_varlist_minority_datetime_var_included(tmp_path):
"""
A variable that only appears at one datetime should still be returned
even when most files belong to a different datetime. Scanning all files
(not just those at the most-common datetime) guarantees this.
"""
# "temp" appears four times at 19900101; "salt" appears only once at 19900201.
for i in range(1, 5):
(tmp_path / f"model.1990010{i}.temp.nc").touch()
(tmp_path / "model.19900201.salt.nc").touch()

result = make_simple_varlist(str(tmp_path), None)

assert result is not None
assert "temp" in result
assert "salt" in result # must not be silently dropped
Loading