PR feedback

treysp · treysp · commit c7622f22ef9e · 2025-08-26T09:55:19.000-05:00
diff --git a/sqlmesh/core/console.py b/sqlmesh/core/console.py
@@ -9,6 +9,7 @@
 import textwrap
 from itertools import zip_longest
 from pathlib import Path
+from humanize import naturalsize, metric
 from hyperscript import h
 from rich.console import Console as RichConsole
 from rich.live import Live
@@ -4187,14 +4188,14 @@ def _create_evaluation_model_annotation(
     if execution_stats:
         rows_processed = execution_stats.total_rows_processed
         execution_stats_str += (
-            f"{_abbreviate_integer_count(rows_processed)} row{'s' if rows_processed != 1 else ''}"
+            f"{metric(rows_processed)} row{'s' if rows_processed != 1 else ''}"
             if rows_processed is not None and rows_processed >= 0
             else ""
         )
 
         bytes_processed = execution_stats.total_bytes_processed
         execution_stats_str += (
-            f"{', ' if execution_stats_str else ''}{_format_bytes(bytes_processed)}"
+            f"{', ' if execution_stats_str else ''}{naturalsize(bytes_processed, binary=True)}"
             if bytes_processed is not None and bytes_processed >= 0
             else ""
         )
@@ -4299,39 +4300,3 @@ def _calculate_annotation_str_len(
             + execution_stats_len,
         )
     return annotation_str_len
-
-
-# Convert number of bytes to a human-readable string
-# https://github.com/dbt-labs/dbt-adapters/blob/34fd178539dcb6f82e18e738adc03de7784c032f/dbt-bigquery/src/dbt/adapters/bigquery/connections.py#L165
-def _format_bytes(num_bytes: t.Optional[int]) -> str:
-    if num_bytes is not None and num_bytes >= 0:
-        if num_bytes < 1024:
-            return f"{num_bytes} bytes"
-
-        num_bytes_float = float(num_bytes) / 1024.0
-        for unit in ["KiB", "MiB", "GiB", "TiB", "PiB"]:
-            if num_bytes_float < 1024.0:
-                return f"{num_bytes_float:3.1f} {unit}"
-            num_bytes_float /= 1024.0
-
-        num_bytes_float *= 1024.0  # undo last division in loop
-        return f"{num_bytes_float:3.1f} {unit}"
-    return ""
-
-
-# Abbreviate integer count. Example: 1,000,000,000 -> 1b
-# https://github.com/dbt-labs/dbt-adapters/blob/34fd178539dcb6f82e18e738adc03de7784c032f/dbt-bigquery/src/dbt/adapters/bigquery/connections.py#L178
-def _abbreviate_integer_count(count: t.Optional[int]) -> str:
-    if count is not None and count >= 0:
-        if count < 1000:
-            return str(count)
-
-        count_float = float(count) / 1000.0
-        for unit in ["k", "m", "b", "t"]:
-            if count_float < 1000.0:
-                return f"{count_float:3.1f}{unit}".strip()
-            count_float /= 1000.0
-
-        count_float *= 1000.0  # undo last division in loop
-        return f"{count_float:3.1f}{unit}".strip()
-    return ""
diff --git a/sqlmesh/core/engine_adapter/base.py b/sqlmesh/core/engine_adapter/base.py
@@ -2458,16 +2458,12 @@ def _execute(self, sql: str, track_rows_processed: bool = False, **kwargs: t.Any
             and track_rows_processed
             and QueryExecutionTracker.is_tracking()
         ):
-            rowcount_raw = getattr(self.cursor, "rowcount", None)
-            rowcount = None
-            if rowcount_raw is not None:
+            if (rowcount := getattr(self.cursor, "rowcount", None)) and rowcount is not None:
                 try:
-                    rowcount = int(rowcount_raw)
+                    self._record_execution_stats(sql, int(rowcount))
                 except (TypeError, ValueError):
                     return
 
-            self._record_execution_stats(sql, rowcount)
-
     @contextlib.contextmanager
     def temp_table(
         self,
diff --git a/sqlmesh/core/scheduler.py b/sqlmesh/core/scheduler.py
@@ -20,6 +20,7 @@
     DeployabilityIndex,
     Snapshot,
     SnapshotId,
+    SnapshotIdBatch,
     SnapshotEvaluator,
     apply_auto_restatements,
     earliest_start_date,
@@ -533,7 +534,7 @@ def run_node(node: SchedulingUnit) -> None:
                     num_audits_failed = sum(1 for result in audit_results if result.count)
 
                     execution_stats = self.snapshot_evaluator.execution_tracker.get_execution_stats(
-                        f"{snapshot.snapshot_id}_{node.batch_index}"
+                        SnapshotIdBatch(snapshot_id=snapshot.snapshot_id, batch_id=node.batch_index)
                     )
 
                     self.console.update_snapshot_evaluation_progress(
diff --git a/sqlmesh/core/snapshot/__init__.py b/sqlmesh/core/snapshot/__init__.py
@@ -8,6 +8,7 @@
     SnapshotDataVersion as SnapshotDataVersion,
     SnapshotFingerprint as SnapshotFingerprint,
     SnapshotId as SnapshotId,
+    SnapshotIdBatch as SnapshotIdBatch,
     SnapshotIdLike as SnapshotIdLike,
     SnapshotInfoLike as SnapshotInfoLike,
     SnapshotIntervals as SnapshotIntervals,
diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py
@@ -162,6 +162,11 @@ def __str__(self) -> str:
         return f"SnapshotId<{self.name}: {self.identifier}>"
 
 
+class SnapshotIdBatch(PydanticModel, frozen=True):
+    snapshot_id: SnapshotId
+    batch_id: int
+
+
 class SnapshotNameVersion(PydanticModel, frozen=True):
     name: str
     version: str
diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py
@@ -61,6 +61,7 @@
     Intervals,
     Snapshot,
     SnapshotId,
+    SnapshotIdBatch,
     SnapshotInfoLike,
     SnapshotTableCleanupTask,
 )
@@ -171,7 +172,9 @@ def evaluate(
         Returns:
             The WAP ID of this evaluation if supported, None otherwise.
         """
-        with self.execution_tracker.track_execution(f"{snapshot.snapshot_id}_{batch_index}"):
+        with self.execution_tracker.track_execution(
+            SnapshotIdBatch(snapshot_id=snapshot.snapshot_id, batch_id=batch_index)
+        ):
             result = self._evaluate_snapshot(
                 start=start,
                 end=end,
diff --git a/sqlmesh/core/snapshot/execution_tracker.py b/sqlmesh/core/snapshot/execution_tracker.py
@@ -4,11 +4,12 @@
 from contextlib import contextmanager
 from threading import local, Lock
 from dataclasses import dataclass, field
+from sqlmesh.core.snapshot import SnapshotIdBatch
 
 
 @dataclass
 class QueryExecutionStats:
-    snapshot_batch_id: str
+    snapshot_id_batch: SnapshotIdBatch
     total_rows_processed: t.Optional[int] = None
     total_bytes_processed: t.Optional[int] = None
 
@@ -21,15 +22,15 @@ class QueryExecutionContext:
     It accumulates statistics from multiple cursor.execute() calls during a single snapshot evaluation.
 
     Attributes:
-        snapshot_batch_id: Identifier linking this context to a specific snapshot evaluation
+        snapshot_id_batch: Identifier linking this context to a specific snapshot evaluation
         stats: Running sum of cursor.rowcount and possibly bytes processed from all executed queries during evaluation
     """
 
-    snapshot_batch_id: str
+    snapshot_id_batch: SnapshotIdBatch
     stats: QueryExecutionStats = field(init=False)
 
     def __post_init__(self) -> None:
-        self.stats = QueryExecutionStats(snapshot_batch_id=self.snapshot_batch_id)
+        self.stats = QueryExecutionStats(snapshot_id_batch=self.snapshot_id_batch)
 
     def add_execution(
         self, sql: str, row_count: t.Optional[int], bytes_processed: t.Optional[int]
@@ -56,10 +57,12 @@ class QueryExecutionTracker:
     """Thread-local context manager for snapshot execution statistics, such as rows processed."""
 
     _thread_local = local()
-    _contexts: t.Dict[str, QueryExecutionContext] = {}
+    _contexts: t.Dict[SnapshotIdBatch, QueryExecutionContext] = {}
     _contexts_lock = Lock()
 
-    def get_execution_context(self, snapshot_id_batch: str) -> t.Optional[QueryExecutionContext]:
+    def get_execution_context(
+        self, snapshot_id_batch: SnapshotIdBatch
+    ) -> t.Optional[QueryExecutionContext]:
         with self._contexts_lock:
             return self._contexts.get(snapshot_id_batch)
 
@@ -69,10 +72,10 @@ def is_tracking(cls) -> bool:
 
     @contextmanager
     def track_execution(
-        self, snapshot_id_batch: str
+        self, snapshot_id_batch: SnapshotIdBatch
     ) -> t.Iterator[t.Optional[QueryExecutionContext]]:
         """Context manager for tracking snapshot execution statistics such as row counts and bytes processed."""
-        context = QueryExecutionContext(snapshot_batch_id=snapshot_id_batch)
+        context = QueryExecutionContext(snapshot_id_batch=snapshot_id_batch)
         self._thread_local.context = context
         with self._contexts_lock:
             self._contexts[snapshot_id_batch] = context
@@ -90,7 +93,9 @@ def record_execution(
         if context is not None:
             context.add_execution(sql, row_count, bytes_processed)
 
-    def get_execution_stats(self, snapshot_id_batch: str) -> t.Optional[QueryExecutionStats]:
+    def get_execution_stats(
+        self, snapshot_id_batch: SnapshotIdBatch
+    ) -> t.Optional[QueryExecutionStats]:
         with self._contexts_lock:
             context = self._contexts.get(snapshot_id_batch)
             self._contexts.pop(snapshot_id_batch, None)
diff --git a/tests/core/engine_adapter/integration/test_integration_snowflake.py b/tests/core/engine_adapter/integration/test_integration_snowflake.py
@@ -13,6 +13,7 @@
 from tests.core.engine_adapter.integration import TestContext
 from sqlmesh import model, ExecutionContext
 from pytest_mock import MockerFixture
+from sqlmesh.core.snapshot import SnapshotId, SnapshotIdBatch
 from sqlmesh.core.snapshot.execution_tracker import (
     QueryExecutionContext,
     QueryExecutionTracker,
@@ -322,7 +323,9 @@ def test_rows_tracker(
 
     add_execution_spy = mocker.spy(QueryExecutionContext, "add_execution")
 
-    with tracker.track_execution("a"):
+    with tracker.track_execution(
+        SnapshotIdBatch(snapshot_id=SnapshotId(name="a", identifier="a"), batch_id=0)
+    ):
         # Snowflake doesn't report row counts for CTAS, so this should not be tracked
         engine_adapter.execute(
             "CREATE TABLE a (id int) AS SELECT 1 as id", track_rows_processed=True
@@ -332,6 +335,8 @@ def test_rows_tracker(
 
     assert add_execution_spy.call_count == 2
 
-    stats = tracker.get_execution_stats("a")
+    stats = tracker.get_execution_stats(
+        SnapshotIdBatch(snapshot_id=SnapshotId(name="a", identifier="a"), batch_id=0)
+    )
     assert stats is not None
     assert stats.total_rows_processed == 3
diff --git a/tests/core/test_execution_tracker.py b/tests/core/test_execution_tracker.py
@@ -3,11 +3,12 @@
 from concurrent.futures import ThreadPoolExecutor
 
 from sqlmesh.core.snapshot.execution_tracker import QueryExecutionStats, QueryExecutionTracker
+from sqlmesh.core.snapshot import SnapshotIdBatch, SnapshotId
 
 
 def test_execution_tracker_thread_isolation() -> None:
-    def worker(id: str, row_counts: list[int]) -> QueryExecutionStats:
-        with execution_tracker.track_execution(id) as ctx:
+    def worker(id: SnapshotId, row_counts: list[int]) -> QueryExecutionStats:
+        with execution_tracker.track_execution(SnapshotIdBatch(snapshot_id=id, batch_id=0)) as ctx:
             assert execution_tracker.is_tracking()
 
             for count in row_counts:
@@ -20,18 +21,30 @@ def worker(id: str, row_counts: list[int]) -> QueryExecutionStats:
 
     with ThreadPoolExecutor() as executor:
         futures = [
-            executor.submit(worker, "batch_A", [10, 5]),
-            executor.submit(worker, "batch_B", [3, 7]),
+            executor.submit(worker, SnapshotId(name="batch_A", identifier="batch_A"), [10, 5]),
+            executor.submit(worker, SnapshotId(name="batch_B", identifier="batch_B"), [3, 7]),
         ]
         results = [f.result() for f in futures]
 
     # Main thread has no active tracking context
     assert not execution_tracker.is_tracking()
-    execution_tracker.record_execution("q", 10, None)
-    assert execution_tracker.get_execution_stats("q") is None
 
     # Order of results is not deterministic, so look up by id
-    by_batch = {s.snapshot_batch_id: s for s in results}
-
-    assert by_batch["batch_A"].total_rows_processed == 15
-    assert by_batch["batch_B"].total_rows_processed == 10
+    by_batch = {s.snapshot_id_batch: s for s in results}
+
+    assert (
+        by_batch[
+            SnapshotIdBatch(
+                snapshot_id=SnapshotId(name="batch_A", identifier="batch_A"), batch_id=0
+            )
+        ].total_rows_processed
+        == 15
+    )
+    assert (
+        by_batch[
+            SnapshotIdBatch(
+                snapshot_id=SnapshotId(name="batch_B", identifier="batch_B"), batch_id=0
+            )
+        ].total_rows_processed
+        == 10
+    )