Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions esgpull/cli/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from esgpull.cli.utils import get_queries, init_esgpull, valid_name_tag
from esgpull.context import HintsDict, ResultSearch
from esgpull.exceptions import UnsetOptionsError
from esgpull.models import Dataset, File, FileStatus, Query
from esgpull.models import Dataset, File, FileStatus, Query, sql
from esgpull.tui import Verbosity
from esgpull.utils import format_size

Expand Down Expand Up @@ -190,11 +190,26 @@ def update(
f"Adding {len(unregistered_datasets)} new datasets to database."
)
esg.db.session.add_all(unregistered_datasets)
files_from_db = [
esg.db.get(File, f.sha) for f in qf.files if f in esg.db
]
registered_files = [f for f in files_from_db if f is not None]
unregistered_files = [f for f in qf.files if f not in esg.db]
# Get existing file_ids from DB to check by file_id (not SHA)
# This handles cases where the same file has different checksums
existing_file_ids = {
fid for fid in esg.db.scalars(sql.file.all_file_ids())
}
registered_files = []
unregistered_files = []
for f in qf.files:
if f.file_id in existing_file_ids:
# File exists in DB - fetch it by file_id
existing_sha = esg.db.scalars(
sql.file.with_file_id(f.file_id)
)
if existing_sha:
existing_file = esg.db.get(File, existing_sha[0])
if existing_file is not None:
registered_files.append(existing_file)
else:
# New file - not in DB yet
unregistered_files.append(f)
if len(unregistered_files) > 0:
esg.ui.print(
f"Adding {len(unregistered_files)} new files to database."
Expand Down
4 changes: 4 additions & 0 deletions esgpull/models/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ def with_status(*status: FileStatus) -> sa.Select[tuple[File]]:
def with_file_id(file_id: str) -> sa.Select[tuple[str]]:
return sa.select(File.sha).where(File.file_id == file_id).limit(1)

@staticmethod
def all_file_ids() -> sa.Select[tuple[str]]:
    """Build a SELECT over the ``file_id`` column of every registered File.

    Unlike :func:`with_file_id`, this returns all ids (no filter, no limit),
    letting callers build an in-memory set for O(1) membership checks.
    """
    statement = sa.select(File.file_id)
    return statement

@staticmethod
def total_size_with_status(
*status: FileStatus,
Expand Down
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from esgpull.config import Config
from esgpull.constants import CONFIG_FILENAME
from esgpull.database import Database
from esgpull.install_config import InstallConfig
from esgpull.models import File, FileStatus
from tests.utils import CEDA_NODE
Expand Down Expand Up @@ -50,3 +51,8 @@ def file():
)
f.compute_sha()
return f


@pytest.fixture
def db(config):
    """Provide a :class:`Database` instance wired to the test ``config``."""
    database = Database.from_config(config)
    return database
67 changes: 67 additions & 0 deletions tests/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,3 +506,70 @@ async def run_test():
]
assert len(duplicate_files) == 1
assert duplicate_files[0].checksum == "abc123"


def test_update_checks_files_by_file_id_not_sha(db):
    """Test that update separates files by file_id, not SHA.

    This verifies the fix for the bug where re-adding a query after removal
    would fail because files were checked by SHA instead of file_id.
    When the same file appears with different checksums (different replicas),
    it should still be recognized as the same file.
    """
    from esgpull.models import File, sql

    # Fields shared by both replicas of the same logical file.
    shared_fields = dict(
        file_id="dataset.v1.test.nc",
        dataset_id="dataset.v1",
        master_id="dataset.v1.test",
        version="v1",
        filename="test.nc",
        local_path="dataset/v1/test.nc",
        checksum_type="SHA256",
        size=1000,
    )

    # Register one replica in the database with a specific SHA.
    stored = File(
        url="https://example.com/test.nc",
        data_node="example.com",
        checksum="original_checksum_abc123",
        **shared_fields,
    )
    stored.compute_sha()
    db.add(stored)

    # The stored file_id must be visible through the all_file_ids query.
    known_ids = set(db.scalars(sql.file.all_file_ids()))
    assert "dataset.v1.test.nc" in known_ids

    # Simulate fetching the same logical file from another replica:
    # identical file_id, different checksum, hence a different SHA.
    replica = File(
        url="https://other-node.com/test.nc",
        data_node="other-node.com",
        checksum="different_checksum_def456",
        **shared_fields,
    )
    replica.compute_sha()

    # Distinct checksums must yield distinct SHAs...
    assert replica.sha != stored.sha
    # ...yet the replica is still recognized as an existing file by id.
    assert replica.file_id in known_ids

    # Resolving the file_id must yield exactly the stored SHA.
    resolved = db.scalars(sql.file.with_file_id(replica.file_id))
    assert resolved is not None
    assert len(resolved) == 1
    assert resolved[0] == stored.sha

    # Loading by that SHA must give back the originally stored record.
    loaded = db.get(File, resolved[0])
    assert loaded is not None
    assert loaded.file_id == "dataset.v1.test.nc"
    assert loaded.checksum == "original_checksum_abc123"
Loading