Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions esgpull/cli/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from esgpull.cli.utils import get_queries, init_esgpull, valid_name_tag
from esgpull.context import HintsDict, ResultSearch
from esgpull.exceptions import UnsetOptionsError
from esgpull.models import Dataset, File, FileStatus, Query
from esgpull.models import Dataset, File, FileStatus, Query, sql
from esgpull.tui import Verbosity
from esgpull.utils import format_size

Expand Down Expand Up @@ -190,11 +190,26 @@ def update(
f"Adding {len(unregistered_datasets)} new datasets to database."
)
esg.db.session.add_all(unregistered_datasets)
files_from_db = [
esg.db.get(File, f.sha) for f in qf.files if f in esg.db
]
registered_files = [f for f in files_from_db if f is not None]
unregistered_files = [f for f in qf.files if f not in esg.db]
# Get existing file_ids from DB to check by file_id (not SHA)
# This handles cases where the same file has different checksums
existing_file_ids = {
fid for fid in esg.db.scalars(sql.file.all_file_ids())
}
registered_files = []
unregistered_files = []
for f in qf.files:
if f.file_id in existing_file_ids:
# File exists in DB - fetch it by file_id
existing_sha = esg.db.scalars(
sql.file.with_file_id(f.file_id)
)
if existing_sha:
existing_file = esg.db.get(File, existing_sha[0])
if existing_file is not None:
registered_files.append(existing_file)
else:
# New file - not in DB yet
unregistered_files.append(f)
if len(unregistered_files) > 0:
esg.ui.print(
f"Adding {len(unregistered_files)} new files to database."
Expand Down
4 changes: 4 additions & 0 deletions esgpull/models/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ def with_status(*status: FileStatus) -> sa.Select[tuple[File]]:
def with_file_id(file_id: str) -> sa.Select[tuple[str]]:
return sa.select(File.sha).where(File.file_id == file_id).limit(1)

@staticmethod
def all_file_ids() -> sa.Select[tuple[str]]:
    """Build a SELECT over the ``file_id`` column of every registered File.

    Unlike :func:`with_file_id`, this returns all ids (no filter, no limit),
    letting callers build an in-memory set for O(1) membership checks.
    """
    statement = sa.select(File.file_id)
    return statement

@staticmethod
def total_size_with_status(
*status: FileStatus,
Expand Down
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from esgpull.config import Config
from esgpull.constants import CONFIG_FILENAME
from esgpull.database import Database
from esgpull.install_config import InstallConfig
from esgpull.models import File, FileStatus
from tests.utils import CEDA_NODE
Expand Down Expand Up @@ -50,3 +51,8 @@ def file():
)
f.compute_sha()
return f


@pytest.fixture
def db(config):
    """Provide a :class:`Database` instance wired to the test ``config``."""
    database = Database.from_config(config)
    return database
67 changes: 67 additions & 0 deletions tests/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,3 +506,70 @@ async def run_test():
]
assert len(duplicate_files) == 1
assert duplicate_files[0].checksum == "abc123"


def test_update_checks_files_by_file_id_not_sha(db):
    """Test that update separates files by file_id, not SHA.

    This verifies the fix for the bug where re-adding a query after removal
    would fail because files were checked by SHA instead of file_id.
    When the same file appears with different checksums (different replicas),
    it should still be recognized as the same file.
    """
    from esgpull.models import File, sql

    # Fields shared by both replicas of the same logical file.
    shared_fields = dict(
        file_id="dataset.v1.test.nc",
        dataset_id="dataset.v1",
        master_id="dataset.v1.test",
        version="v1",
        filename="test.nc",
        local_path="dataset/v1/test.nc",
        checksum_type="SHA256",
        size=1000,
    )

    # Register one replica in the database with a specific SHA.
    stored = File(
        url="https://example.com/test.nc",
        data_node="example.com",
        checksum="original_checksum_abc123",
        **shared_fields,
    )
    stored.compute_sha()
    db.add(stored)

    # The stored file_id must be visible through the all_file_ids query.
    known_ids = set(db.scalars(sql.file.all_file_ids()))
    assert "dataset.v1.test.nc" in known_ids

    # Simulate fetching the same logical file from another replica:
    # identical file_id, different checksum, hence a different SHA.
    replica = File(
        url="https://other-node.com/test.nc",
        data_node="other-node.com",
        checksum="different_checksum_def456",
        **shared_fields,
    )
    replica.compute_sha()

    # Distinct checksums must yield distinct SHAs...
    assert replica.sha != stored.sha
    # ...yet the replica is still recognized as an existing file by id.
    assert replica.file_id in known_ids

    # Resolving the file_id must yield exactly the stored SHA.
    resolved = db.scalars(sql.file.with_file_id(replica.file_id))
    assert resolved is not None
    assert len(resolved) == 1
    assert resolved[0] == stored.sha

    # Loading by that SHA must give back the originally stored record.
    loaded = db.get(File, resolved[0])
    assert loaded is not None
    assert loaded.file_id == "dataset.v1.test.nc"
    assert loaded.checksum == "original_checksum_abc123"
Loading