Skip to content

Commit 2bbbe75

Browse files
committed
Make tracking fully instance-based by passing to engine adapter
1 parent db350b9 commit 2bbbe75

File tree

7 files changed

+42
-77
lines changed

7 files changed

+42
-77
lines changed

.circleci/continue_config.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,8 @@ workflows:
297297
name: cloud_engine_<< matrix.engine >>
298298
context:
299299
- sqlmesh_cloud_database_integration
300-
requires:
301-
- engine_tests_docker
300+
# requires:
301+
# - engine_tests_docker
302302
matrix:
303303
parameters:
304304
engine:
@@ -310,10 +310,10 @@ workflows:
310310
- athena
311311
- fabric
312312
- gcp-postgres
313-
filters:
314-
branches:
315-
only:
316-
- main
313+
# filters:
314+
# branches:
315+
# only:
316+
# - main
317317
- ui_style
318318
- ui_test
319319
- vscode_test

sqlmesh/core/engine_adapter/base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ def __init__(
135135
shared_connection: bool = False,
136136
correlation_id: t.Optional[CorrelationId] = None,
137137
schema_differ_overrides: t.Optional[t.Dict[str, t.Any]] = None,
138+
query_execution_tracker: t.Optional[QueryExecutionTracker] = None,
138139
**kwargs: t.Any,
139140
):
140141
self.dialect = dialect.lower() or self.DIALECT
@@ -158,6 +159,7 @@ def __init__(
158159
self._multithreaded = multithreaded
159160
self.correlation_id = correlation_id
160161
self._schema_differ_overrides = schema_differ_overrides
162+
self._query_execution_tracker = query_execution_tracker
161163

162164
def with_settings(self, **kwargs: t.Any) -> EngineAdapter:
163165
extra_kwargs = {
@@ -2448,15 +2450,17 @@ def _log_sql(
24482450
def _record_execution_stats(
24492451
self, sql: str, rowcount: t.Optional[int] = None, bytes_processed: t.Optional[int] = None
24502452
) -> None:
2451-
QueryExecutionTracker.record_execution(sql, rowcount, bytes_processed)
2453+
if self._query_execution_tracker:
2454+
self._query_execution_tracker.record_execution(sql, rowcount, bytes_processed)
24522455

24532456
def _execute(self, sql: str, track_rows_processed: bool = False, **kwargs: t.Any) -> None:
24542457
self.cursor.execute(sql, **kwargs)
24552458

24562459
if (
24572460
self.SUPPORTS_QUERY_EXECUTION_TRACKING
24582461
and track_rows_processed
2459-
and QueryExecutionTracker.is_tracking()
2462+
and self._query_execution_tracker
2463+
and self._query_execution_tracker.is_tracking()
24602464
):
24612465
if (
24622466
rowcount := getattr(self.cursor, "rowcount", None)

sqlmesh/core/engine_adapter/bigquery.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
)
2424
from sqlmesh.core.node import IntervalUnit
2525
from sqlmesh.core.schema_diff import TableAlterOperation, NestedSupport
26-
from sqlmesh.core.snapshot.execution_tracker import QueryExecutionTracker
2726
from sqlmesh.utils import optional_import, get_source_columns_to_types
2827
from sqlmesh.utils.date import to_datetime
2928
from sqlmesh.utils.errors import SQLMeshError
@@ -1097,7 +1096,11 @@ def _execute(
10971096
self.cursor._set_rowcount(query_results)
10981097
self.cursor._set_description(query_results.schema)
10991098

1100-
if track_rows_processed and QueryExecutionTracker.is_tracking():
1099+
if (
1100+
track_rows_processed
1101+
and self._query_execution_tracker
1102+
and self._query_execution_tracker.is_tracking()
1103+
):
11011104
num_rows = None
11021105
if query_job.statement_type == "CREATE_TABLE_AS_SELECT":
11031106
# since table was just created, number rows in table == number rows processed
@@ -1106,7 +1109,9 @@ def _execute(
11061109
elif query_job.statement_type in ["INSERT", "DELETE", "MERGE", "UPDATE"]:
11071110
num_rows = query_job.num_dml_affected_rows
11081111

1109-
QueryExecutionTracker.record_execution(sql, num_rows, query_job.total_bytes_processed)
1112+
self._query_execution_tracker.record_execution(
1113+
sql, num_rows, query_job.total_bytes_processed
1114+
)
11101115

11111116
def _get_data_objects(
11121117
self, schema_name: SchemaName, object_names: t.Optional[t.Set[str]] = None

sqlmesh/core/engine_adapter/snowflake.py

Lines changed: 1 addition & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import contextlib
44
import logging
5-
import re
65
import typing as t
76

87
from sqlglot import exp
@@ -24,7 +23,6 @@
2423
SourceQuery,
2524
set_catalog,
2625
)
27-
from sqlmesh.core.snapshot.execution_tracker import QueryExecutionTracker
2826
from sqlmesh.utils import optional_import, get_source_columns_to_types
2927
from sqlmesh.utils.errors import SQLMeshError
3028
from sqlmesh.utils.pandas import columns_to_types_from_dtypes
@@ -189,7 +187,7 @@ def _create_table(
189187
table_description=table_description,
190188
column_descriptions=column_descriptions,
191189
table_kind=table_kind,
192-
track_rows_processed=track_rows_processed,
190+
track_rows_processed=False,
193191
**kwargs,
194192
)
195193

@@ -667,41 +665,3 @@ def close(self) -> t.Any:
667665
self._connection_pool.set_attribute(self.SNOWPARK, None)
668666

669667
return super().close()
670-
671-
def _record_execution_stats(
672-
self, sql: str, rowcount: t.Optional[int] = None, bytes_processed: t.Optional[int] = None
673-
) -> None:
674-
"""Snowflake does not report row counts for CTAS like other DML operations.
675-
676-
They neither report the sentinel value -1 nor do they report 0 rows. Instead, they report a rowcount
677-
of 1 and return a single data row containing one of the strings:
678-
- "Table <table_name> successfully created."
679-
- "<table_name> already exists, statement succeeded."
680-
681-
We do not want to record the incorrect row count of 1, so we check whether that row contains the table
682-
successfully created string. If so, we return early and do not record the row count.
683-
684-
Ref: https://github.com/snowflakedb/snowflake-connector-python/issues/645
685-
"""
686-
if rowcount == 1:
687-
results = self.cursor.fetchone()
688-
if results:
689-
try:
690-
results_str = str(results[0])
691-
except (TypeError, ValueError, IndexError):
692-
return
693-
694-
# Snowflake identifiers may be:
695-
# - An unquoted contiguous set of [a-zA-Z0-9_$] characters
696-
# - A double-quoted string that may contain spaces and nested double-quotes represented by `""`. Example: " my ""table"" name "
697-
is_created = re.match(
698-
r'Table [a-zA-Z0-9_$ "]*? successfully created\.', results_str
699-
)
700-
is_already_exists = re.match(
701-
r'[a-zA-Z0-9_$ "]*? already exists, statement succeeded\.',
702-
results_str,
703-
)
704-
if is_created or is_already_exists:
705-
return
706-
707-
QueryExecutionTracker.record_execution(sql, rowcount, bytes_processed)

sqlmesh/core/snapshot/evaluator.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,14 +130,18 @@ def __init__(
130130
self.adapters = (
131131
adapters if isinstance(adapters, t.Dict) else {selected_gateway or "": adapters}
132132
)
133+
self.execution_tracker = QueryExecutionTracker()
134+
self.adapters = {
135+
gateway: adapter.with_settings(query_execution_tracker=self.execution_tracker)
136+
for gateway, adapter in self.adapters.items()
137+
}
133138
self.adapter = (
134139
next(iter(self.adapters.values()))
135140
if not selected_gateway
136141
else self.adapters[selected_gateway]
137142
)
138143
self.selected_gateway = selected_gateway
139144
self.ddl_concurrent_tasks = ddl_concurrent_tasks
140-
self.execution_tracker = QueryExecutionTracker()
141145

142146
def evaluate(
143147
self,

sqlmesh/core/snapshot/execution_tracker.py

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import typing as t
44
from contextlib import contextmanager
5-
from threading import local, Lock
5+
from threading import local
66
from dataclasses import dataclass, field
77
from sqlmesh.core.snapshot import SnapshotIdBatch
88

@@ -56,19 +56,17 @@ def get_execution_stats(self) -> QueryExecutionStats:
5656
class QueryExecutionTracker:
5757
"""Thread-local context manager for snapshot execution statistics, such as rows processed."""
5858

59-
_thread_local = local()
60-
_contexts: t.Dict[SnapshotIdBatch, QueryExecutionContext] = {}
61-
_contexts_lock = Lock()
59+
def __init__(self) -> None:
60+
self._thread_local = local()
61+
self._contexts: t.Dict[SnapshotIdBatch, QueryExecutionContext] = {}
6262

6363
def get_execution_context(
6464
self, snapshot_id_batch: SnapshotIdBatch
6565
) -> t.Optional[QueryExecutionContext]:
66-
with self._contexts_lock:
67-
return self._contexts.get(snapshot_id_batch)
66+
return self._contexts.get(snapshot_id_batch)
6867

69-
@classmethod
70-
def is_tracking(cls) -> bool:
71-
return getattr(cls._thread_local, "context", None) is not None
68+
def is_tracking(self) -> bool:
69+
return getattr(self._thread_local, "context", None) is not None
7270

7371
@contextmanager
7472
def track_execution(
@@ -77,26 +75,23 @@ def track_execution(
7775
"""Context manager for tracking snapshot execution statistics such as row counts and bytes processed."""
7876
context = QueryExecutionContext(snapshot_id_batch=snapshot_id_batch)
7977
self._thread_local.context = context
80-
with self._contexts_lock:
81-
self._contexts[snapshot_id_batch] = context
78+
self._contexts[snapshot_id_batch] = context
8279

8380
try:
8481
yield context
8582
finally:
8683
self._thread_local.context = None
8784

88-
@classmethod
8985
def record_execution(
90-
cls, sql: str, row_count: t.Optional[int], bytes_processed: t.Optional[int]
86+
self, sql: str, row_count: t.Optional[int], bytes_processed: t.Optional[int]
9187
) -> None:
92-
context = getattr(cls._thread_local, "context", None)
88+
context = getattr(self._thread_local, "context", None)
9389
if context is not None:
9490
context.add_execution(sql, row_count, bytes_processed)
9591

9692
def get_execution_stats(
9793
self, snapshot_id_batch: SnapshotIdBatch
9894
) -> t.Optional[QueryExecutionStats]:
99-
with self._contexts_lock:
100-
context = self._contexts.get(snapshot_id_batch)
101-
self._contexts.pop(snapshot_id_batch, None)
95+
context = self._contexts.get(snapshot_id_batch)
96+
self._contexts.pop(snapshot_id_batch, None)
10297
return context.get_execution_stats() if context else None

tests/core/engine_adapter/integration/test_integration_snowflake.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -327,16 +327,13 @@ def test_rows_tracker(
327327
SnapshotIdBatch(snapshot_id=SnapshotId(name="a", identifier="a"), batch_id=0)
328328
):
329329
# Snowflake doesn't report row counts for CTAS, so this should not be tracked
330-
engine_adapter.execute(
331-
"CREATE TABLE a (id int) AS SELECT 1 as id", track_rows_processed=True
332-
)
333-
engine_adapter.execute("INSERT INTO a VALUES (2), (3)", track_rows_processed=True)
334-
engine_adapter.execute("INSERT INTO a VALUES (4)", track_rows_processed=True)
330+
engine_adapter._create_table("a", exp.select("1 as id"))
335331

336-
assert add_execution_spy.call_count == 2
332+
assert add_execution_spy.call_count == 0
337333

338334
stats = tracker.get_execution_stats(
339335
SnapshotIdBatch(snapshot_id=SnapshotId(name="a", identifier="a"), batch_id=0)
340336
)
341337
assert stats is not None
342-
assert stats.total_rows_processed == 3
338+
assert stats.total_rows_processed is None
339+
assert stats.total_bytes_processed is None

0 commit comments

Comments (0)