14 | 14 | SourceQuery, |
15 | 15 | ) |
16 | 16 | from sqlmesh.core.engine_adapter.spark import SparkEngineAdapter |
| 17 | +from sqlmesh.engines.spark.db_api.spark_session import SparkSessionCursor |
17 | 18 | from sqlmesh.core.node import IntervalUnit |
18 | 19 | from sqlmesh.core.schema_diff import SchemaDiffer |
19 | 20 | from sqlmesh.core.snapshot.execution_tracker import QueryExecutionTracker |
@@ -380,38 +381,59 @@ def _record_execution_stats( |
380 | 381 | except: |
381 | 382 | return |
382 | 383 |
383 | | - history = self.cursor.fetchall_arrow() |
384 | | - if history.num_rows: |
385 | | - history_df = history.to_pandas() |
386 | | - write_df = history_df[history_df["operation"] == "WRITE"] |
387 | | - write_df = write_df[write_df["timestamp"] == write_df["timestamp"].max()] |
388 | | - if not write_df.empty: |
389 | | - metrics = write_df["operationMetrics"][0] |
390 | | - if metrics: |
391 | | - rowcount = None |
392 | | - rowcount_str = [ |
393 | | - metric[1] for metric in metrics if metric[0] == "numOutputRows" |
394 | | - ] |
395 | | - if rowcount_str: |
396 | | - try: |
397 | | - rowcount = int(rowcount_str[0]) |
398 | | - except (TypeError, ValueError): |
399 | | - pass |
400 | | - |
401 | | - bytes_processed = None |
402 | | - bytes_str = [ |
403 | | - metric[1] for metric in metrics if metric[0] == "numOutputBytes" |
404 | | - ] |
405 | | - if bytes_str: |
406 | | - try: |
407 | | - bytes_processed = int(bytes_str[0]) |
408 | | - except (TypeError, ValueError): |
409 | | - pass |
410 | | - |
411 | | - if rowcount is not None or bytes_processed is not None: |
412 | | - # if no rows were written, df contains 0 for bytes but no value for rows |
413 | | - rowcount = ( |
414 | | - 0 if rowcount is None and bytes_processed is not None else rowcount |
415 | | - ) |
416 | | - |
417 | | - QueryExecutionTracker.record_execution(sql, rowcount, bytes_processed) |
| 384 | + history = ( |
| 385 | + self.cursor.fetchdf() |
| 386 | + if isinstance(self.cursor, SparkSessionCursor) |
| 387 | + else self.cursor.fetchall_arrow() |
| 388 | + ) |
| 389 | + if history is not None: |
| 390 | + from pandas import DataFrame as PandasDataFrame |
| 391 | + from pyspark.sql import DataFrame as PySparkDataFrame |
| 392 | + from pyspark.sql.connect.dataframe import DataFrame as PySparkConnectDataFrame |
| 393 | + |
| 394 | + history_df = None |
| 395 | + if isinstance(history, PandasDataFrame): |
| 396 | + history_df = history |
| 397 | + elif isinstance(history, (PySparkDataFrame, PySparkConnectDataFrame)): |
| 398 | + history_df = history.toPandas() |
| 399 | + else: |
| 400 | + # arrow table |
| 401 | + history_df = history.to_pandas() |
| 402 | + |
| 403 | + if history_df is not None and not history_df.empty: |
| 404 | + write_df = history_df[history_df["operation"] == "WRITE"] |
| 405 | + write_df = write_df[write_df["timestamp"] == write_df["timestamp"].max()] |
| 406 | + if not write_df.empty: |
| 407 | + metrics = write_df["operationMetrics"][0] |
| 408 | + if metrics: |
| 409 | + rowcount = None |
| 410 | + rowcount_str = [ |
| 411 | + metric[1] for metric in metrics if metric[0] == "numOutputRows" |
| 412 | + ] |
| 413 | + if rowcount_str: |
| 414 | + try: |
| 415 | + rowcount = int(rowcount_str[0]) |
| 416 | + except (TypeError, ValueError): |
| 417 | + pass |
| 418 | + |
| 419 | + bytes_processed = None |
| 420 | + bytes_str = [ |
| 421 | + metric[1] for metric in metrics if metric[0] == "numOutputBytes" |
| 422 | + ] |
| 423 | + if bytes_str: |
| 424 | + try: |
| 425 | + bytes_processed = int(bytes_str[0]) |
| 426 | + except (TypeError, ValueError): |
| 427 | + pass |
| 428 | + |
| 429 | + if rowcount is not None or bytes_processed is not None: |
| 430 | + # if no rows were written, the metrics report bytes but omit the row count, so default it to 0 |
| 431 | + rowcount = ( |
| 432 | + 0 |
| 433 | + if rowcount is None and bytes_processed is not None |
| 434 | + else rowcount |
| 435 | + ) |
| 436 | + |
| 437 | + QueryExecutionTracker.record_execution( |
| 438 | + sql, rowcount, bytes_processed |
| 439 | + ) |
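For reference, a minimal, self-contained sketch of the metric-extraction step above, assuming the Delta `DESCRIBE HISTORY` result has already been normalized to a pandas DataFrame and that `operationMetrics` arrives either as key/value pairs (the Arrow map representation) or as a plain dict. `extract_write_stats` and the sample rows are illustrative only, not part of the adapter:

```python
import typing as t

import pandas as pd


def extract_write_stats(history_df: pd.DataFrame) -> t.Tuple[t.Optional[int], t.Optional[int]]:
    """Return (rowcount, bytes_processed) for the most recent WRITE, or (None, None)."""
    write_df = history_df[history_df["operation"] == "WRITE"]
    write_df = write_df[write_df["timestamp"] == write_df["timestamp"].max()]
    if write_df.empty:
        return None, None

    # dict() accepts both a list of (key, value) pairs and an existing dict.
    metrics = dict(write_df["operationMetrics"].iloc[0] or {})

    def to_int(value: t.Optional[str]) -> t.Optional[int]:
        try:
            return int(value)  # type: ignore[arg-type]
        except (TypeError, ValueError):
            return None

    rowcount = to_int(metrics.get("numOutputRows"))
    bytes_processed = to_int(metrics.get("numOutputBytes"))
    # A write of zero rows reports byte metrics but no row count, so default it to 0.
    if rowcount is None and bytes_processed is not None:
        rowcount = 0
    return rowcount, bytes_processed


# Example usage with two fabricated history rows; the most recent WRITE wins.
history = pd.DataFrame(
    {
        "operation": ["WRITE", "OPTIMIZE"],
        "timestamp": [2, 1],
        "operationMetrics": [[("numOutputRows", "10"), ("numOutputBytes", "2048")], None],
    }
)
print(extract_write_stats(history))  # (10, 2048)
```

The sketch uses positional `.iloc[0]` rather than label-based `[0]`, so it does not depend on which index labels survive the filtering.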