From c19732f78e179ea7a1d6d1f99157a502d5bee4af Mon Sep 17 00:00:00 2001 From: varun-edachali-dbx Date: Mon, 21 Jul 2025 10:47:37 +0000 Subject: [PATCH 1/7] comparator Signed-off-by: varun-edachali-dbx --- examples/experimental/comparator.py | 605 ++++++++++++++++++++++++++++ 1 file changed, 605 insertions(+) create mode 100755 examples/experimental/comparator.py diff --git a/examples/experimental/comparator.py b/examples/experimental/comparator.py new file mode 100755 index 000000000..e497c142f --- /dev/null +++ b/examples/experimental/comparator.py @@ -0,0 +1,605 @@ +#!/usr/bin/env python3 +""" +Python Connector Comparator + +This script compares the results between the Thrift backend and the SEA backend +of the Databricks SQL Python connector. It executes the same queries against both +backends and compares the results to ensure they match. + +Environment variables required: +- DATABRICKS_SERVER_HOSTNAME: The hostname of the Databricks server +- DATABRICKS_HTTP_PATH: The HTTP path of the Databricks server +- DATABRICKS_TOKEN: The token to use for authentication +- DATABRICKS_CATALOG: (Optional) The catalog to use +""" + +import os +import sys +import logging +import time +from typing import Any, Dict, List, Optional, Tuple, Union, Callable +import json +import pandas as pd +import pyarrow as pa +from databricks.sql.client import Connection, Cursor +from databricks.sql.types import Row + +# Set up logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +# Constants +DEFAULT_CATALOG = "main" +DEFAULT_SCHEMA = "default" +DEFAULT_ARRAY_SIZE = 1000 +DEFAULT_BUFFER_SIZE = 10485760 # 10MB + + +class ComparisonResult: + """Class to store and report comparison results.""" + + def __init__(self, test_name: str, query: str, args: List[Any] = None): + self.test_name = test_name + self.query = query + self.args = args or [] + self.differences = [] + self.success = True + self.thrift_time = 0.0 + self.sea_time = 0.0 + self.thrift_error = None + self.sea_error = None + + def add_difference(self, message: str, thrift_value: Any = None, sea_value: Any = None): + """Add a difference to the result.""" + self.differences.append({ + "message": message, + "thrift_value": thrift_value, + "sea_value": sea_value + }) + self.success = False + + def __str__(self) -> str: + """String representation of the comparison result.""" + result = f"Test: {self.test_name}\n" + result += f"Query: {self.query}\n" + + if self.args: + result += f"Args: {self.args}\n" + + result += f"Success: {self.success}\n" + result += f"Thrift time: {self.thrift_time:.4f}s, SEA time: {self.sea_time:.4f}s\n" + + if self.thrift_error: + result += f"Thrift error: {self.thrift_error}\n" + + if self.sea_error: + result += f"SEA error: {self.sea_error}\n" + + if not self.success: + result += "Differences:\n" + for diff in self.differences: + result += f" - {diff['message']}\n" + if diff.get('thrift_value') is not None: + result += f" Thrift: {diff['thrift_value']}\n" + if diff.get('sea_value') is not None: + result += f" SEA: {diff['sea_value']}\n" + + return result + + +class PythonConnectorComparator: + """ + Compares the Thrift and SEA backends of the Databricks SQL Python connector. + """ + + def __init__( + self, + server_hostname: str, + http_path: str, + access_token: str, + catalog: str = DEFAULT_CATALOG, + schema: str = DEFAULT_SCHEMA, + array_size: int = DEFAULT_ARRAY_SIZE, + buffer_size_bytes: int = DEFAULT_BUFFER_SIZE, + report_file: str = "python-connector-comparison-report.txt" + ): + """ + Initialize the comparator with connection parameters. + + Args: + server_hostname: Databricks server hostname + http_path: HTTP path for the SQL warehouse + access_token: Access token for authentication + catalog: Catalog name to use + schema: Schema name to use + array_size: Array size for result fetching + buffer_size_bytes: Buffer size for result fetching + report_file: Path to the report file + """ + self.server_hostname = server_hostname + self.http_path = http_path + self.access_token = access_token + self.catalog = catalog + self.schema = schema + self.array_size = array_size + self.buffer_size_bytes = buffer_size_bytes + self.report_file = report_file + + self.thrift_connection = None + self.sea_connection = None + self.results = [] + + def setup_connections(self): + """Set up connections to both backends.""" + logger.info("Setting up connections to Thrift and SEA backends") + + # Create Thrift connection + self.thrift_connection = Connection( + server_hostname=self.server_hostname, + http_path=self.http_path, + access_token=self.access_token, + catalog=self.catalog, + schema=self.schema, + use_sea=False, # Explicitly use Thrift backend + user_agent_entry="Python-Connector-Comparator" + ) + + # Create SEA connection + self.sea_connection = Connection( + server_hostname=self.server_hostname, + http_path=self.http_path, + access_token=self.access_token, + catalog=self.catalog, + schema=self.schema, + use_sea=True, # Explicitly use SEA backend + user_agent_entry="Python-Connector-Comparator" + ) + + logger.info("Connections established successfully") + + def close_connections(self): + """Close connections to both backends.""" + logger.info("Closing connections") + + if self.thrift_connection: + self.thrift_connection.close() + + if self.sea_connection: + self.sea_connection.close() + + logger.info("Connections closed successfully") + + def compare_cursor_description( + self, thrift_cursor: Cursor, sea_cursor: Cursor, result: ComparisonResult + ): + """ + Compare cursor descriptions between Thrift and SEA backends. + + Args: + thrift_cursor: Cursor from Thrift backend + sea_cursor: Cursor from SEA backend + result: ComparisonResult to update with findings + """ + thrift_desc = thrift_cursor.description + sea_desc = sea_cursor.description + + if thrift_desc is None and sea_desc is None: + return + + if thrift_desc is None: + result.add_difference("Thrift description is None but SEA description is not") + return + + if sea_desc is None: + result.add_difference("SEA description is None but Thrift description is not") + return + + if len(thrift_desc) != len(sea_desc): + result.add_difference( + f"Description length mismatch: Thrift has {len(thrift_desc)} columns, SEA has {len(sea_desc)}", + thrift_desc, + sea_desc + ) + return + + for i, (thrift_col, sea_col) in enumerate(zip(thrift_desc, sea_desc)): + # Compare each element of the description tuple + for j, (thrift_val, sea_val) in enumerate(zip(thrift_col, sea_col)): + if thrift_val != sea_val: + element_names = ["name", "type_code", "display_size", "internal_size", + "precision", "scale", "null_ok"] + element_name = element_names[j] if j < len(element_names) else f"element_{j}" + + result.add_difference( + f"Column {i} ({thrift_col[0]}) {element_name} mismatch", + thrift_val, + sea_val + ) + + def compare_rows( + self, thrift_rows: List[Row], sea_rows: List[Row], result: ComparisonResult + ): + """ + Compare rows returned by both backends using the asDict method. + + Args: + thrift_rows: Rows from Thrift backend + sea_rows: Rows from SEA backend + result: ComparisonResult to update with findings + """ + if len(thrift_rows) != len(sea_rows): + result.add_difference( + f"Row count mismatch: Thrift returned {len(thrift_rows)}, SEA returned {len(sea_rows)}" + ) + # Continue comparison with the smaller set + min_rows = min(len(thrift_rows), len(sea_rows)) + thrift_rows = thrift_rows[:min_rows] + sea_rows = sea_rows[:min_rows] + + for i, (thrift_row, sea_row) in enumerate(zip(thrift_rows, sea_rows)): + # Convert rows to dictionaries for comparison + try: + thrift_dict = thrift_row.asDict(recursive=True) + sea_dict = sea_row.asDict(recursive=True) + + if thrift_dict != sea_dict: + # Find which fields differ + all_fields = set(thrift_dict.keys()) | set(sea_dict.keys()) + + for field in all_fields: + thrift_value = thrift_dict.get(field) + sea_value = sea_dict.get(field) + + if field not in thrift_dict: + result.add_difference( + f"Row {i}: Field '{field}' missing in Thrift row", + None, + sea_value + ) + elif field not in sea_dict: + result.add_difference( + f"Row {i}: Field '{field}' missing in SEA row", + thrift_value, + None + ) + elif thrift_value != sea_value: + result.add_difference( + f"Row {i}, field '{field}' value mismatch", + thrift_value, + sea_value + ) + except (AttributeError, TypeError) as e: + # If asDict fails, fall back to direct comparison + if thrift_row != sea_row: + result.add_difference( + f"Row {i} mismatch (asDict failed: {str(e)})", + thrift_row, + sea_row + ) + + def compare_arrow_tables( + self, thrift_table: pa.Table, sea_table: pa.Table, result: ComparisonResult + ): + """ + Compare Arrow tables returned by both backends. + + Args: + thrift_table: Arrow table from Thrift backend + sea_table: Arrow table from SEA backend + result: ComparisonResult to update with findings + """ + # Compare schema + thrift_schema = thrift_table.schema + sea_schema = sea_table.schema + + if len(thrift_schema) != len(sea_schema): + result.add_difference( + f"Arrow schema field count mismatch: Thrift has {len(thrift_schema)}, SEA has {len(sea_schema)}", + thrift_schema, + sea_schema + ) + else: + for i, (thrift_field, sea_field) in enumerate(zip(thrift_schema, sea_schema)): + if thrift_field.name != sea_field.name: + result.add_difference( + f"Arrow schema field {i} name mismatch", + thrift_field.name, + sea_field.name + ) + + if str(thrift_field.type) != str(sea_field.type): + result.add_difference( + f"Arrow schema field {i} ({thrift_field.name}) type mismatch", + str(thrift_field.type), + str(sea_field.type) + ) + + # Compare row count + if thrift_table.num_rows != sea_table.num_rows: + result.add_difference( + f"Arrow table row count mismatch: Thrift has {thrift_table.num_rows}, SEA has {sea_table.num_rows}" + ) + + # Convert to pandas for easier comparison + try: + thrift_df = thrift_table.to_pandas() + sea_df = sea_table.to_pandas() + + # Compare dataframes + if not thrift_df.equals(sea_df): + # Find differing rows + if thrift_df.shape[0] == sea_df.shape[0] and thrift_df.shape[1] == sea_df.shape[1]: + # Same dimensions, compare cell by cell + for col in thrift_df.columns: + if col in sea_df.columns: + mask = thrift_df[col] != sea_df[col] + if mask.any(): + diff_indices = mask[mask].index.tolist() + if len(diff_indices) > 3: # Limit to first 3 differences + diff_indices = diff_indices[:3] + + for idx in diff_indices: + result.add_difference( + f"Arrow data mismatch at row {idx}, column '{col}'", + thrift_df.loc[idx, col], + sea_df.loc[idx, col] + ) + except Exception as e: + result.add_difference(f"Error comparing Arrow tables: {str(e)}") + + def execute_and_compare( + self, + query: str, + test_name: str, + fetch_method: str = "fetchall", + args: List[Any] = None, + fetch_size: int = None + ) -> ComparisonResult: + """ + Execute a query on both backends and compare the results. + + Args: + query: SQL query to execute + test_name: Name of the test for reporting + fetch_method: Method to use for fetching results (fetchall, fetchmany, fetchone, + fetchall_arrow, fetchmany_arrow) + args: Arguments to pass to the query + fetch_size: Size to use for fetchmany/fetchmany_arrow + + Returns: + ComparisonResult with the comparison details + """ + result = ComparisonResult(test_name, query, args) + + # Create cursors + thrift_cursor = self.thrift_connection.cursor( + arraysize=self.array_size, + buffer_size_bytes=self.buffer_size_bytes + ) + + sea_cursor = self.sea_connection.cursor( + arraysize=self.array_size, + buffer_size_bytes=self.buffer_size_bytes + ) + + try: + # Execute query on Thrift backend + start_time = time.time() + thrift_cursor.execute(query, args) + result.thrift_time = time.time() - start_time + + # Execute query on SEA backend + start_time = time.time() + sea_cursor.execute(query, args) + result.sea_time = time.time() - start_time + + # Compare cursor descriptions + self.compare_cursor_description(thrift_cursor, sea_cursor, result) + + # Fetch and compare results based on fetch_method + if fetch_method == "fetchall": + thrift_rows = thrift_cursor.fetchall() + sea_rows = sea_cursor.fetchall() + self.compare_rows(thrift_rows, sea_rows, result) + + elif fetch_method == "fetchmany": + size = fetch_size or self.array_size + thrift_rows = thrift_cursor.fetchmany(size) + sea_rows = sea_cursor.fetchmany(size) + self.compare_rows(thrift_rows, sea_rows, result) + + elif fetch_method == "fetchone": + thrift_row = thrift_cursor.fetchone() + sea_row = sea_cursor.fetchone() + if thrift_row is None and sea_row is None: + pass # Both returned None, which is fine + elif thrift_row is None: + result.add_difference("Thrift returned None but SEA returned a row", None, sea_row) + elif sea_row is None: + result.add_difference("SEA returned None but Thrift returned a row", thrift_row, None) + else: + self.compare_rows([thrift_row], [sea_row], result) + + elif fetch_method == "fetchall_arrow": + thrift_table = thrift_cursor.fetchall_arrow() + sea_table = sea_cursor.fetchall_arrow() + self.compare_arrow_tables(thrift_table, sea_table, result) + + elif fetch_method == "fetchmany_arrow": + size = fetch_size or self.array_size + thrift_table = thrift_cursor.fetchmany_arrow(size) + sea_table = sea_cursor.fetchmany_arrow(size) + self.compare_arrow_tables(thrift_table, sea_table, result) + + else: + result.add_difference(f"Unknown fetch method: {fetch_method}") + + except Exception as e: + logger.exception(f"Error in test {test_name}") + result.success = False + result.add_difference(f"Exception: {str(e)}") + + finally: + # Close cursors + thrift_cursor.close() + sea_cursor.close() + + return result + + def run_comparison_tests(self): + """Run a set of comparison tests between Thrift and SEA backends.""" + logger.info("Starting comparison tests") + + # Following the JDBC comparator approach with a single TPC-DS query + # Adjust the table path if needed based on your environment + tpc_query = "SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5" + + # Test with fetchall + self.results.append( + self.execute_and_compare( + tpc_query, + "TPC-DS query - fetchall", + "fetchall" + ) + ) + + # Test with fetchmany + self.results.append( + self.execute_and_compare( + tpc_query, + "TPC-DS query - fetchmany", + "fetchmany", + fetch_size=2 + ) + ) + + # Test with fetchone + self.results.append( + self.execute_and_compare( + tpc_query, + "TPC-DS query - fetchone", + "fetchone" + ) + ) + + # Test with fetchall_arrow + self.results.append( + self.execute_and_compare( + tpc_query, + "TPC-DS query - fetchall_arrow", + "fetchall_arrow" + ) + ) + + # Test with fetchmany_arrow + self.results.append( + self.execute_and_compare( + tpc_query, + "TPC-DS query - fetchmany_arrow", + "fetchmany_arrow", + fetch_size=2 + ) + ) + + logger.info(f"Completed {len(self.results)} comparison tests") + + def generate_report(self): + """Generate a report of the comparison results.""" + logger.info(f"Generating report to {self.report_file}") + + with open(self.report_file, "w") as f: + f.write("Python Connector Comparison Report\n") + f.write("=================================\n\n") + f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n") + f.write(f"Server: {self.server_hostname}\n") + f.write(f"HTTP Path: {self.http_path}\n\n") + + # Summary + total_tests = len(self.results) + passed_tests = sum(1 for r in self.results if r.success) + f.write(f"Summary: {passed_tests}/{total_tests} tests passed\n\n") + + # Performance summary + total_thrift_time = sum(r.thrift_time for r in self.results) + total_sea_time = sum(r.sea_time for r in self.results) + f.write(f"Total execution time - Thrift: {total_thrift_time:.4f}s, SEA: {total_sea_time:.4f}s\n") + + if total_thrift_time > 0: + percentage = (total_sea_time / total_thrift_time - 1) * 100 + f.write(f"SEA is {percentage:.2f}% {'slower' if percentage > 0 else 'faster'} than Thrift\n\n") + + # Test details + f.write("Test Details\n") + f.write("------------\n\n") + + for i, result in enumerate(self.results): + f.write(f"Test {i+1}: {result.test_name}\n") + f.write(f"{'PASSED' if result.success else 'FAILED'}\n") + f.write(f"Query: {result.query}\n") + + if result.args: + f.write(f"Args: {result.args}\n") + + f.write(f"Thrift time: {result.thrift_time:.4f}s, SEA time: {result.sea_time:.4f}s\n") + + if not result.success: + f.write("Differences:\n") + for diff in result.differences: + f.write(f" - {diff['message']}\n") + if diff.get('thrift_value') is not None: + f.write(f" Thrift: {diff['thrift_value']}\n") + if diff.get('sea_value') is not None: + f.write(f" SEA: {diff['sea_value']}\n") + + f.write("\n") + + logger.info(f"Report generated: {self.report_file}") + + # Print summary to console + print(f"\nSummary: {passed_tests}/{total_tests} tests passed") + print(f"Total execution time - Thrift: {total_thrift_time:.4f}s, SEA: {total_sea_time:.4f}s") + if total_thrift_time > 0: + percentage = (total_sea_time / total_thrift_time - 1) * 100 + print(f"SEA is {percentage:.2f}% {'slower' if percentage > 0 else 'faster'} than Thrift") + print(f"Detailed report saved to: {self.report_file}") + + def run(self): + """Run the full comparison workflow.""" + try: + self.setup_connections() + self.run_comparison_tests() + self.generate_report() + finally: + self.close_connections() + + +def main(): + """Main entry point.""" + # Check required environment variables + required_vars = ["DATABRICKS_SERVER_HOSTNAME", "DATABRICKS_HTTP_PATH", "DATABRICKS_TOKEN"] + missing_vars = [var for var in required_vars if not os.environ.get(var)] + + if missing_vars: + logger.error(f"Missing required environment variables: {', '.join(missing_vars)}") + logger.error("Please set these variables before running the comparator.") + sys.exit(1) + + # Get connection parameters from environment + server_hostname = os.environ["DATABRICKS_SERVER_HOSTNAME"] + http_path = os.environ["DATABRICKS_HTTP_PATH"] + access_token = os.environ["DATABRICKS_TOKEN"] + catalog = os.environ.get("DATABRICKS_CATALOG", DEFAULT_CATALOG) + + # Create and run comparator + comparator = PythonConnectorComparator( + server_hostname=server_hostname, + http_path=http_path, + access_token=access_token, + catalog=catalog + ) + + comparator.run() + + +if __name__ == "__main__": + main() \ No newline at end of file From 17a65dbcf794840912917efce07d780ee8448979 Mon Sep 17 00:00:00 2001 From: varun-edachali-dbx Date: Mon, 21 Jul 2025 16:19:56 +0530 Subject: [PATCH 2/7] align description with Thrift Signed-off-by: varun-edachali-dbx --- src/databricks/sql/backend/sea/backend.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/databricks/sql/backend/sea/backend.py b/src/databricks/sql/backend/sea/backend.py index 42677b903..386522612 100644 --- a/src/databricks/sql/backend/sea/backend.py +++ b/src/databricks/sql/backend/sea/backend.py @@ -318,15 +318,23 @@ def _extract_description_from_manifest( columns = [] for col_data in columns_data: # Format: (name, type_code, display_size, internal_size, precision, scale, null_ok) + name = col_data.get("name", "") + type_name = col_data.get("type_name", "") + type_name = ( + type_name[:-5] if type_name.endswith("_TYPE") else type_name + ).lower() + precision = col_data.get("type_precision") + scale = col_data.get("type_scale") + columns.append( ( - col_data.get("name", ""), # name - col_data.get("type_name", ""), # type_code + name, # name + type_name, # type_code None, # display_size (not provided by SEA) None, # internal_size (not provided by SEA) - col_data.get("precision"), # precision - col_data.get("scale"), # scale - col_data.get("nullable", True), # null_ok + precision, # precision + scale, # scale + None, # null_ok ) ) From f29198dc74a5a6cd2f127f3478cf174ada0dc20a Mon Sep 17 00:00:00 2001 From: varun-edachali-dbx Date: Mon, 21 Jul 2025 16:22:15 +0530 Subject: [PATCH 3/7] preliminary comparison report Signed-off-by: varun-edachali-dbx --- .../python-connector-comparison-report.txt | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 examples/experimental/python-connector-comparison-report.txt diff --git a/examples/experimental/python-connector-comparison-report.txt b/examples/experimental/python-connector-comparison-report.txt new file mode 100644 index 000000000..87df81fe9 --- /dev/null +++ b/examples/experimental/python-connector-comparison-report.txt @@ -0,0 +1,60 @@ +Python Connector Comparison Report +================================= + +Date: 2025-07-21 16:21:27 +Server: benchmarking-staging-aws-us-west-2.staging.cloud.databricks.com +HTTP Path: /sql/1.0/warehouses/17661fca65a0e4fc + +Summary: 0/5 tests passed + +Total execution time - Thrift: 2.3825s, SEA: 3.0076s +SEA is 26.24% slower than Thrift + +Test Details +------------ + +Test 1: TPC-DS query - fetchall +FAILED +Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +Thrift time: 0.4903s, SEA time: 0.6477s +Differences: + - Column 17 (cs_order_number) type_code mismatch + Thrift: bigint + SEA: long + +Test 2: TPC-DS query - fetchmany +FAILED +Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +Thrift time: 0.4870s, SEA time: 0.5810s +Differences: + - Column 17 (cs_order_number) type_code mismatch + Thrift: bigint + SEA: long + +Test 3: TPC-DS query - fetchone +FAILED +Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +Thrift time: 0.4784s, SEA time: 0.5651s +Differences: + - Column 17 (cs_order_number) type_code mismatch + Thrift: bigint + SEA: long + +Test 4: TPC-DS query - fetchall_arrow +FAILED +Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +Thrift time: 0.4549s, SEA time: 0.6195s +Differences: + - Column 17 (cs_order_number) type_code mismatch + Thrift: bigint + SEA: long + +Test 5: TPC-DS query - fetchmany_arrow +FAILED +Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +Thrift time: 0.4720s, SEA time: 0.5943s +Differences: + - Column 17 (cs_order_number) type_code mismatch + Thrift: bigint + SEA: long + From a1a41dc4805ed571312ecfa8486afe832d79b010 Mon Sep 17 00:00:00 2001 From: varun-edachali-dbx Date: Thu, 24 Jul 2025 06:58:24 +0000 Subject: [PATCH 4/7] more verbose comparator, add more tests Signed-off-by: varun-edachali-dbx --- examples/experimental/comparator.py | 1075 ++++++++++++++--- .../python-connector-comparison-report.md | 443 +++++++ .../python-connector-comparison-report.txt | 60 - 3 files changed, 1345 insertions(+), 233 deletions(-) create mode 100644 examples/experimental/python-connector-comparison-report.md delete mode 100644 examples/experimental/python-connector-comparison-report.txt diff --git a/examples/experimental/comparator.py b/examples/experimental/comparator.py index e497c142f..62b08c7f0 100755 --- a/examples/experimental/comparator.py +++ b/examples/experimental/comparator.py @@ -25,7 +25,9 @@ from databricks.sql.types import Row # Set up logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) # Constants @@ -49,41 +51,43 @@ def __init__(self, test_name: str, query: str, args: List[Any] = None): self.thrift_error = None self.sea_error = None - def add_difference(self, message: str, thrift_value: Any = None, sea_value: Any = None): + def add_difference( + self, message: str, thrift_value: Any = None, sea_value: Any = None + ): """Add a difference to the result.""" - self.differences.append({ - "message": message, - "thrift_value": thrift_value, - "sea_value": sea_value - }) + self.differences.append( + {"message": message, "thrift_value": thrift_value, "sea_value": sea_value} + ) self.success = False def __str__(self) -> str: """String representation of the comparison result.""" result = f"Test: {self.test_name}\n" result += f"Query: {self.query}\n" - + if self.args: result += f"Args: {self.args}\n" - + result += f"Success: {self.success}\n" - result += f"Thrift time: {self.thrift_time:.4f}s, SEA time: {self.sea_time:.4f}s\n" - + result += ( + f"Thrift time: {self.thrift_time:.4f}s, SEA time: {self.sea_time:.4f}s\n" + ) + if self.thrift_error: result += f"Thrift error: {self.thrift_error}\n" - + if self.sea_error: result += f"SEA error: {self.sea_error}\n" - + if not self.success: result += "Differences:\n" for diff in self.differences: result += f" - {diff['message']}\n" - if diff.get('thrift_value') is not None: + if diff.get("thrift_value") is not None: result += f" Thrift: {diff['thrift_value']}\n" - if diff.get('sea_value') is not None: + if diff.get("sea_value") is not None: result += f" SEA: {diff['sea_value']}\n" - + return result @@ -101,7 +105,7 @@ def __init__( schema: str = DEFAULT_SCHEMA, array_size: int = DEFAULT_ARRAY_SIZE, buffer_size_bytes: int = DEFAULT_BUFFER_SIZE, - report_file: str = "python-connector-comparison-report.txt" + report_file: str = "python-connector-comparison-report.md", ): """ Initialize the comparator with connection parameters. @@ -124,7 +128,7 @@ def __init__( self.array_size = array_size self.buffer_size_bytes = buffer_size_bytes self.report_file = report_file - + self.thrift_connection = None self.sea_connection = None self.results = [] @@ -132,7 +136,7 @@ def __init__( def setup_connections(self): """Set up connections to both backends.""" logger.info("Setting up connections to Thrift and SEA backends") - + # Create Thrift connection self.thrift_connection = Connection( server_hostname=self.server_hostname, @@ -141,9 +145,9 @@ def setup_connections(self): catalog=self.catalog, schema=self.schema, use_sea=False, # Explicitly use Thrift backend - user_agent_entry="Python-Connector-Comparator" + user_agent_entry="Python-Connector-Comparator", ) - + # Create SEA connection self.sea_connection = Connection( server_hostname=self.server_hostname, @@ -152,21 +156,21 @@ def setup_connections(self): catalog=self.catalog, schema=self.schema, use_sea=True, # Explicitly use SEA backend - user_agent_entry="Python-Connector-Comparator" + user_agent_entry="Python-Connector-Comparator", ) - + logger.info("Connections established successfully") def close_connections(self): """Close connections to both backends.""" logger.info("Closing connections") - + if self.thrift_connection: self.thrift_connection.close() - + if self.sea_connection: self.sea_connection.close() - + logger.info("Connections closed successfully") def compare_cursor_description( @@ -174,7 +178,7 @@ def compare_cursor_description( ): """ Compare cursor descriptions between Thrift and SEA backends. - + Args: thrift_cursor: Cursor from Thrift backend sea_cursor: Cursor from SEA backend @@ -182,38 +186,51 @@ def compare_cursor_description( """ thrift_desc = thrift_cursor.description sea_desc = sea_cursor.description - + if thrift_desc is None and sea_desc is None: return - + if thrift_desc is None: - result.add_difference("Thrift description is None but SEA description is not") + result.add_difference( + "Thrift description is None but SEA description is not" + ) return - + if sea_desc is None: - result.add_difference("SEA description is None but Thrift description is not") + result.add_difference( + "SEA description is None but Thrift description is not" + ) return - + if len(thrift_desc) != len(sea_desc): result.add_difference( f"Description length mismatch: Thrift has {len(thrift_desc)} columns, SEA has {len(sea_desc)}", thrift_desc, - sea_desc + sea_desc, ) return - + for i, (thrift_col, sea_col) in enumerate(zip(thrift_desc, sea_desc)): # Compare each element of the description tuple for j, (thrift_val, sea_val) in enumerate(zip(thrift_col, sea_col)): if thrift_val != sea_val: - element_names = ["name", "type_code", "display_size", "internal_size", - "precision", "scale", "null_ok"] - element_name = element_names[j] if j < len(element_names) else f"element_{j}" - + element_names = [ + "name", + "type_code", + "display_size", + "internal_size", + "precision", + "scale", + "null_ok", + ] + element_name = ( + element_names[j] if j < len(element_names) else f"element_{j}" + ) + result.add_difference( f"Column {i} ({thrift_col[0]}) {element_name} mismatch", thrift_val, - sea_val + sea_val, ) def compare_rows( @@ -221,7 +238,7 @@ def compare_rows( ): """ Compare rows returned by both backends using the asDict method. - + Args: thrift_rows: Rows from Thrift backend sea_rows: Rows from SEA backend @@ -235,38 +252,35 @@ def compare_rows( min_rows = min(len(thrift_rows), len(sea_rows)) thrift_rows = thrift_rows[:min_rows] sea_rows = sea_rows[:min_rows] - + + # Track fields that are consistently missing across all rows + fields_missing_in_thrift = set() + fields_missing_in_sea = set() + field_value_mismatches = {} # Track per-row mismatches + for i, (thrift_row, sea_row) in enumerate(zip(thrift_rows, sea_rows)): # Convert rows to dictionaries for comparison try: thrift_dict = thrift_row.asDict(recursive=True) sea_dict = sea_row.asDict(recursive=True) - + if thrift_dict != sea_dict: # Find which fields differ all_fields = set(thrift_dict.keys()) | set(sea_dict.keys()) - + for field in all_fields: thrift_value = thrift_dict.get(field) sea_value = sea_dict.get(field) - + if field not in thrift_dict: - result.add_difference( - f"Row {i}: Field '{field}' missing in Thrift row", - None, - sea_value - ) + fields_missing_in_thrift.add(field) elif field not in sea_dict: - result.add_difference( - f"Row {i}: Field '{field}' missing in SEA row", - thrift_value, - None - ) + fields_missing_in_sea.add(field) elif thrift_value != sea_value: - result.add_difference( - f"Row {i}, field '{field}' value mismatch", - thrift_value, - sea_value + if field not in field_value_mismatches: + field_value_mismatches[field] = [] + field_value_mismatches[field].append( + (i, thrift_value, sea_value) ) except (AttributeError, TypeError) as e: # If asDict fails, fall back to direct comparison @@ -274,7 +288,59 @@ def compare_rows( result.add_difference( f"Row {i} mismatch (asDict failed: {str(e)})", thrift_row, - sea_row + sea_row, + ) + + # Report consistently missing fields once + if fields_missing_in_thrift: + for field in fields_missing_in_thrift: + result.add_difference(f"Field '{field}' missing in all Thrift rows") + + if fields_missing_in_sea: + for field in fields_missing_in_sea: + result.add_difference(f"Field '{field}' missing in all SEA rows") + + # Report value mismatches + for field, mismatches in field_value_mismatches.items(): + # If all rows have the same mismatch pattern, report it once + if len(mismatches) == len(thrift_rows): + # Check if all values are the same + thrift_values = [m[1] for m in mismatches] + sea_values = [m[2] for m in mismatches] + + if all(v == thrift_values[0] for v in thrift_values) and all( + v == sea_values[0] for v in sea_values + ): + result.add_difference( + f"Field '{field}' value mismatch in all rows", + thrift_values[0], + sea_values[0], + ) + else: + # Values differ across rows, report first few examples + examples = mismatches[:3] # Limit to first 3 examples + for row_idx, thrift_val, sea_val in examples: + result.add_difference( + f"Row {row_idx}, field '{field}' value mismatch", + thrift_val, + sea_val, + ) + if len(mismatches) > 3: + result.add_difference( + f"... and {len(mismatches) - 3} more rows with '{field}' mismatches" + ) + else: + # Not all rows have this mismatch, report individually (up to 3) + examples = mismatches[:3] + for row_idx, thrift_val, sea_val in examples: + result.add_difference( + f"Row {row_idx}, field '{field}' value mismatch", + thrift_val, + sea_val, + ) + if len(mismatches) > 3: + result.add_difference( + f"... and {len(mismatches) - 3} more rows with '{field}' mismatches" ) def compare_arrow_tables( @@ -282,7 +348,7 @@ def compare_arrow_tables( ): """ Compare Arrow tables returned by both backends. - + Args: thrift_table: Arrow table from Thrift backend sea_table: Arrow table from SEA backend @@ -291,58 +357,65 @@ def compare_arrow_tables( # Compare schema thrift_schema = thrift_table.schema sea_schema = sea_table.schema - + if len(thrift_schema) != len(sea_schema): result.add_difference( f"Arrow schema field count mismatch: Thrift has {len(thrift_schema)}, SEA has {len(sea_schema)}", thrift_schema, - sea_schema + sea_schema, ) else: - for i, (thrift_field, sea_field) in enumerate(zip(thrift_schema, sea_schema)): + for i, (thrift_field, sea_field) in enumerate( + zip(thrift_schema, sea_schema) + ): if thrift_field.name != sea_field.name: result.add_difference( f"Arrow schema field {i} name mismatch", thrift_field.name, - sea_field.name + sea_field.name, ) - + if str(thrift_field.type) != str(sea_field.type): result.add_difference( f"Arrow schema field {i} ({thrift_field.name}) type mismatch", str(thrift_field.type), - str(sea_field.type) + str(sea_field.type), ) - + # Compare row count if thrift_table.num_rows != sea_table.num_rows: result.add_difference( f"Arrow table row count mismatch: Thrift has {thrift_table.num_rows}, SEA has {sea_table.num_rows}" ) - + # Convert to pandas for easier comparison try: thrift_df = thrift_table.to_pandas() sea_df = sea_table.to_pandas() - + # Compare dataframes if not thrift_df.equals(sea_df): # Find differing rows - if thrift_df.shape[0] == sea_df.shape[0] and thrift_df.shape[1] == sea_df.shape[1]: + if ( + thrift_df.shape[0] == sea_df.shape[0] + and thrift_df.shape[1] == sea_df.shape[1] + ): # Same dimensions, compare cell by cell for col in thrift_df.columns: if col in sea_df.columns: mask = thrift_df[col] != sea_df[col] if mask.any(): diff_indices = mask[mask].index.tolist() - if len(diff_indices) > 3: # Limit to first 3 differences + if ( + len(diff_indices) > 3 + ): # Limit to first 3 differences diff_indices = diff_indices[:3] - + for idx in diff_indices: result.add_difference( f"Arrow data mismatch at row {idx}, column '{col}'", thrift_df.loc[idx, col], - sea_df.loc[idx, col] + sea_df.loc[idx, col], ) except Exception as e: result.add_difference(f"Error comparing Arrow tables: {str(e)}") @@ -353,214 +426,864 @@ def execute_and_compare( test_name: str, fetch_method: str = "fetchall", args: List[Any] = None, - fetch_size: int = None + fetch_size: int = None, ) -> ComparisonResult: """ Execute a query on both backends and compare the results. - + Args: query: SQL query to execute test_name: Name of the test for reporting - fetch_method: Method to use for fetching results (fetchall, fetchmany, fetchone, + fetch_method: Method to use for fetching results (fetchall, fetchmany, fetchone, fetchall_arrow, fetchmany_arrow) args: Arguments to pass to the query fetch_size: Size to use for fetchmany/fetchmany_arrow - + Returns: ComparisonResult with the comparison details """ result = ComparisonResult(test_name, query, args) - + # Create cursors thrift_cursor = self.thrift_connection.cursor( - arraysize=self.array_size, - buffer_size_bytes=self.buffer_size_bytes + arraysize=self.array_size, buffer_size_bytes=self.buffer_size_bytes ) - + sea_cursor = self.sea_connection.cursor( - arraysize=self.array_size, - buffer_size_bytes=self.buffer_size_bytes + arraysize=self.array_size, buffer_size_bytes=self.buffer_size_bytes ) - + try: # Execute query on Thrift backend start_time = time.time() thrift_cursor.execute(query, args) result.thrift_time = time.time() - start_time - + # Execute query on SEA backend start_time = time.time() sea_cursor.execute(query, args) result.sea_time = time.time() - start_time - + # Compare cursor descriptions self.compare_cursor_description(thrift_cursor, sea_cursor, result) - + # Fetch and compare results based on fetch_method if fetch_method == "fetchall": thrift_rows = thrift_cursor.fetchall() sea_rows = sea_cursor.fetchall() self.compare_rows(thrift_rows, sea_rows, result) - + elif fetch_method == "fetchmany": size = fetch_size or self.array_size thrift_rows = thrift_cursor.fetchmany(size) sea_rows = sea_cursor.fetchmany(size) self.compare_rows(thrift_rows, sea_rows, result) - + elif fetch_method == "fetchone": thrift_row = thrift_cursor.fetchone() sea_row = sea_cursor.fetchone() if thrift_row is None and sea_row is None: pass # Both returned None, which is fine elif thrift_row is None: - result.add_difference("Thrift returned None but SEA returned a row", None, sea_row) + result.add_difference( + "Thrift returned None but SEA returned a row", None, sea_row + ) elif sea_row is None: - result.add_difference("SEA returned None but Thrift returned a row", thrift_row, None) + result.add_difference( + "SEA returned None but Thrift returned a row", thrift_row, None + ) else: self.compare_rows([thrift_row], [sea_row], result) - + elif fetch_method == "fetchall_arrow": thrift_table = thrift_cursor.fetchall_arrow() sea_table = sea_cursor.fetchall_arrow() self.compare_arrow_tables(thrift_table, sea_table, result) - + elif fetch_method == "fetchmany_arrow": size = fetch_size or self.array_size thrift_table = thrift_cursor.fetchmany_arrow(size) sea_table = sea_cursor.fetchmany_arrow(size) self.compare_arrow_tables(thrift_table, sea_table, result) - + else: result.add_difference(f"Unknown fetch method: {fetch_method}") - + except Exception as e: logger.exception(f"Error in test {test_name}") result.success = False result.add_difference(f"Exception: {str(e)}") - + finally: # Close cursors thrift_cursor.close() sea_cursor.close() - + return result + def compare_metadata_results( + self, thrift_rows: List[Row], sea_rows: List[Row], result: ComparisonResult + ): + """ + Compare metadata results (like catalogs, schemas, tables) between backends. + These have specific columns that we want to validate. + """ + # First do regular row comparison + self.compare_rows(thrift_rows, sea_rows, result) + + # Additional validation could be added here for specific metadata formats + + def test_metadata_methods(self): + """Test catalog, schema, table, and column metadata methods.""" + logger.info("Testing metadata methods") + + # Test catalogs() + try: + thrift_cursor = self.thrift_connection.cursor() + sea_cursor = self.sea_connection.cursor() + + result = ComparisonResult("catalogs()", "catalogs()") + + start_time = time.time() + thrift_cursor.catalogs() + result.thrift_time = time.time() - start_time + thrift_catalogs = thrift_cursor.fetchall() + + start_time = time.time() + sea_cursor.catalogs() + result.sea_time = time.time() - start_time + sea_catalogs = sea_cursor.fetchall() + + self.compare_cursor_description(thrift_cursor, sea_cursor, result) + self.compare_metadata_results(thrift_catalogs, sea_catalogs, result) + + self.results.append(result) + + thrift_cursor.close() + sea_cursor.close() + except Exception as e: + logger.exception("Error testing catalogs()") + result = ComparisonResult("catalogs()", "catalogs()") + result.success = False + result.add_difference(f"Exception: {str(e)}") + self.results.append(result) + + # Test schemas() with various parameters + test_cases = [ + ("schemas() - no params", None, None), + ("schemas() - with catalog", self.catalog, None), + ("schemas() - with pattern", self.catalog, "def%"), + ] + + for test_name, catalog_arg, schema_arg in test_cases: + try: + thrift_cursor = self.thrift_connection.cursor() + sea_cursor = self.sea_connection.cursor() + + result = ComparisonResult( + test_name, f"schemas({catalog_arg}, {schema_arg})" + ) + + start_time = time.time() + thrift_cursor.schemas(catalog_arg, schema_arg) + result.thrift_time = time.time() - start_time + thrift_schemas = thrift_cursor.fetchall() + + start_time = time.time() + sea_cursor.schemas(catalog_arg, schema_arg) + result.sea_time = time.time() - start_time + sea_schemas = sea_cursor.fetchall() + + self.compare_cursor_description(thrift_cursor, sea_cursor, result) + self.compare_metadata_results(thrift_schemas, sea_schemas, result) + + self.results.append(result) + + thrift_cursor.close() + sea_cursor.close() + except Exception as e: + logger.exception(f"Error testing {test_name}") + result = ComparisonResult( + test_name, f"schemas({catalog_arg}, {schema_arg})" + ) + result.success = False + result.add_difference(f"Exception: {str(e)}") + self.results.append(result) + + # Test tables() with various parameters + table_test_cases = [ + ("tables() - no params", None, None, None, None), + ("tables() - with catalog", self.catalog, None, None, None), + ("tables() - with schema", self.catalog, self.schema, None, None), + ( + "tables() - with table pattern", + self.catalog, + self.schema, + "%sales", + None, + ), + ( + "tables() - with table types", + self.catalog, + self.schema, + None, + ["TABLE", "VIEW"], + ), + ] + + for test_name, cat, sch, tab, types in table_test_cases: + try: + thrift_cursor = self.thrift_connection.cursor() + sea_cursor = self.sea_connection.cursor() + + result = ComparisonResult( + test_name, f"tables({cat}, {sch}, {tab}, {types})" + ) + + start_time = time.time() + thrift_cursor.tables(cat, sch, tab, types) + result.thrift_time = time.time() - start_time + thrift_tables = thrift_cursor.fetchall() + + start_time = time.time() + sea_cursor.tables(cat, sch, tab, types) + result.sea_time = time.time() - start_time + sea_tables = sea_cursor.fetchall() + + self.compare_cursor_description(thrift_cursor, sea_cursor, result) + self.compare_metadata_results(thrift_tables, sea_tables, result) + + self.results.append(result) + + thrift_cursor.close() + sea_cursor.close() + except Exception as e: + logger.exception(f"Error testing {test_name}") + result = ComparisonResult( + test_name, f"tables({cat}, {sch}, {tab}, {types})" + ) + result.success = False + result.add_difference(f"Exception: {str(e)}") + self.results.append(result) + + # Test columns() - let's use a known table + column_test_cases = [ + ( + "columns() - specific table", + self.catalog, + "tpcds_sf100_delta", + "catalog_sales", + None, + ), + ( + "columns() - with column pattern", + self.catalog, + "tpcds_sf100_delta", + "catalog_sales", + "cs_%", + ), + ] + + for test_name, cat, sch, tab, col in column_test_cases: + try: + thrift_cursor = self.thrift_connection.cursor() + sea_cursor = self.sea_connection.cursor() + + result = ComparisonResult( + test_name, f"columns({cat}, {sch}, {tab}, {col})" + ) + + start_time = time.time() + thrift_cursor.columns(cat, sch, tab, col) + result.thrift_time = time.time() - start_time + thrift_columns = thrift_cursor.fetchall() + + start_time = time.time() + sea_cursor.columns(cat, sch, tab, col) + result.sea_time = time.time() - start_time + sea_columns = sea_cursor.fetchall() + + self.compare_cursor_description(thrift_cursor, sea_cursor, result) + self.compare_metadata_results(thrift_columns, sea_columns, result) + + self.results.append(result) + + thrift_cursor.close() + sea_cursor.close() + except Exception as e: + logger.exception(f"Error testing {test_name}") + result = ComparisonResult( + test_name, f"columns({cat}, {sch}, {tab}, {col})" + ) + result.success = False + result.add_difference(f"Exception: {str(e)}") + self.results.append(result) + + def test_fetch_variations(self): + """Test various sequences of fetch operations.""" + logger.info("Testing fetch operation variations") + + # Test: fetchone x3, then fetchmany, then fetchone + query = "SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 10" + + try: + thrift_cursor = self.thrift_connection.cursor() + sea_cursor = self.sea_connection.cursor() + + result = ComparisonResult("Fetch variation - mixed operations", query) + + # Execute on both backends + start_time = time.time() + thrift_cursor.execute(query) + thrift_exec_time = time.time() - start_time + + start_time = time.time() + sea_cursor.execute(query) + sea_exec_time = time.time() - start_time + + # Compare descriptions first + self.compare_cursor_description(thrift_cursor, sea_cursor, result) + + # Fetch sequence: 3x fetchone + thrift_rows = [] + sea_rows = [] + + for i in range(3): + thrift_row = thrift_cursor.fetchone() + sea_row = sea_cursor.fetchone() + + if thrift_row is None and sea_row is None: + break + elif thrift_row is None: + result.add_difference( + f"fetchone {i+1}: Thrift returned None but SEA returned a row" + ) + break + elif sea_row is None: + result.add_difference( + f"fetchone {i+1}: SEA returned None but Thrift returned a row" + ) + break + else: + thrift_rows.append(thrift_row) + sea_rows.append(sea_row) + + # Then fetchmany(3) + thrift_many = thrift_cursor.fetchmany(3) + sea_many = sea_cursor.fetchmany(3) + thrift_rows.extend(thrift_many) + sea_rows.extend(sea_many) + + # Then another fetchone + thrift_last = thrift_cursor.fetchone() + sea_last = sea_cursor.fetchone() + + if thrift_last is not None: + thrift_rows.append(thrift_last) + if sea_last is not None: + sea_rows.append(sea_last) + + # Compare all fetched rows + self.compare_rows(thrift_rows, sea_rows, result) + + result.thrift_time = thrift_exec_time + result.sea_time = sea_exec_time + + self.results.append(result) + + thrift_cursor.close() + sea_cursor.close() + except Exception as e: + logger.exception("Error in fetch variation test") + result.success = False + result.add_difference(f"Exception: {str(e)}") + self.results.append(result) + + # Test: fetchall after partial fetch + try: + thrift_cursor = self.thrift_connection.cursor() + sea_cursor = self.sea_connection.cursor() + + result = ComparisonResult( + "Fetch variation - fetchmany then fetchall", query + ) + + # Execute on both backends + thrift_cursor.execute(query) + sea_cursor.execute(query) + + # Fetch first 2 rows + thrift_first = thrift_cursor.fetchmany(2) + sea_first = sea_cursor.fetchmany(2) + + # Then fetch remaining + thrift_rest = thrift_cursor.fetchall() + sea_rest = sea_cursor.fetchall() + + # Combine results + thrift_all = thrift_first + thrift_rest + sea_all = sea_first + sea_rest + + self.compare_rows(thrift_all, sea_all, result) + + self.results.append(result) + + thrift_cursor.close() + sea_cursor.close() + except Exception as e: + logger.exception("Error in fetchmany/fetchall variation test") + result = ComparisonResult( + "Fetch variation - fetchmany then fetchall", query + ) + result.success = False + result.add_difference(f"Exception: {str(e)}") + self.results.append(result) + + def test_edge_cases(self): + """Test edge cases like empty results, NULL values, etc.""" + logger.info("Testing edge cases") + + # Test empty result set + empty_query = "SELECT * FROM main.tpcds_sf100_delta.catalog_sales WHERE 1=0" + self.results.append( + self.execute_and_compare( + empty_query, "Edge case - empty result set", "fetchall" + ) + ) + + # Test NULL values + null_query = "SELECT NULL as null_col, 'test' as string_col, 123 as int_col" + self.results.append( + self.execute_and_compare(null_query, "Edge case - NULL values", "fetchall") + ) + + # Test various data types + types_query = """ + SELECT + CAST(123 AS TINYINT) as tiny_col, + CAST(456 AS SMALLINT) as small_col, + CAST(789 AS INT) as int_col, + CAST(123456789 AS BIGINT) as big_col, + CAST(123.45 AS FLOAT) as float_col, + CAST(678.90 AS DOUBLE) as double_col, + CAST(123.456 AS DECIMAL(10,3)) as decimal_col, + 'test_string' as string_col, + TRUE as bool_col, + CAST('2023-01-01' AS DATE) as date_col, + CAST('2023-01-01 12:34:56' AS TIMESTAMP) as timestamp_col, + ARRAY(1,2,3) as array_col, + STRUCT(1 as a, 'b' as b) as struct_col, + MAP('key1', 'value1', 'key2', 'value2') as map_col + """ + self.results.append( + self.execute_and_compare( + types_query, "Edge case - various data types", "fetchall" + ) + ) + + # Test large result set (but limited) + large_query = "SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 1000" + self.results.append( + self.execute_and_compare( + large_query, "Edge case - larger result set (1000 rows)", "fetchall" + ) + ) + + def test_parameterized_queries(self): + """Test parameterized queries with both native and inline parameters.""" + logger.info("Testing parameterized queries") + + # Native parameter test (if supported) + if not self.thrift_connection.use_inline_params: + # Test with named parameters + named_query = "SELECT * FROM main.tpcds_sf100_delta.catalog_sales WHERE cs_sold_date_sk = :date_sk LIMIT 5" + params = {"date_sk": 2451088} + + result = self.execute_and_compare( + named_query, + "Parameterized query - named parameters", + "fetchall", + args=params, + ) + self.results.append(result) + + # Test with positional parameters + pos_query = "SELECT * FROM main.tpcds_sf100_delta.catalog_sales WHERE cs_sold_date_sk = ? AND cs_sold_time_sk = ? LIMIT 5" + params = [2451088, 48000] + + result = self.execute_and_compare( + pos_query, + "Parameterized query - positional parameters", + "fetchall", + args=params, + ) + self.results.append(result) + + def test_executemany(self): + """Test executemany method.""" + logger.info("Testing executemany") + + # Create a temporary table for testing + create_table_query = """ + CREATE TABLE IF NOT EXISTS main.default.comparator_test_table ( + id INT, + value STRING + ) USING DELTA + """ + + try: + # Create table on both backends + thrift_cursor = self.thrift_connection.cursor() + sea_cursor = self.sea_connection.cursor() + + thrift_cursor.execute(create_table_query) + sea_cursor.execute(create_table_query) + + # Test executemany with INSERT + insert_query = ( + "INSERT INTO main.default.comparator_test_table VALUES (?, ?)" + ) + data = [(1, "one"), (2, "two"), (3, "three")] + + result = ComparisonResult("executemany - INSERT", insert_query, data) + + try: + start_time = time.time() + thrift_cursor.executemany(insert_query, data) + result.thrift_time = time.time() - start_time + except Exception as e: + result.thrift_error = str(e) + + try: + start_time = time.time() + sea_cursor.executemany(insert_query, data) + result.sea_time = time.time() - start_time + except Exception as e: + result.sea_error = str(e) + + # If both succeeded, compare the inserted data + if not result.thrift_error and not result.sea_error: + # Verify data + verify_query = ( + "SELECT * FROM main.default.comparator_test_table ORDER BY id" + ) + + thrift_cursor.execute(verify_query) + thrift_data = thrift_cursor.fetchall() + + sea_cursor.execute(verify_query) + sea_data = sea_cursor.fetchall() + + self.compare_rows(thrift_data, sea_data, result) + else: + result.success = False + if result.thrift_error and result.sea_error: + result.add_difference("Both backends failed with errors") + elif result.thrift_error: + result.add_difference("Only Thrift backend failed") + else: + result.add_difference("Only SEA backend failed") + + self.results.append(result) + + # Cleanup + cleanup_query = "DROP TABLE IF EXISTS main.default.comparator_test_table" + try: + thrift_cursor.execute(cleanup_query) + except: + pass + try: + sea_cursor.execute(cleanup_query) + except: + pass + + thrift_cursor.close() + sea_cursor.close() + except Exception as e: + logger.exception("Error in executemany test") + result = ComparisonResult("executemany - INSERT", insert_query, data) + result.success = False + result.add_difference(f"Exception during test setup: {str(e)}") + self.results.append(result) + + def test_cursor_description(self): + """Test cursor.description property in detail.""" + logger.info("Testing cursor.description property") + + # Test description before execute + try: + thrift_cursor = self.thrift_connection.cursor() + sea_cursor = self.sea_connection.cursor() + + result = ComparisonResult( + "description - before execute", "No query executed" + ) + + if thrift_cursor.description is None and sea_cursor.description is None: + result.success = True + elif thrift_cursor.description is None: + result.add_difference("Thrift description is None but SEA is not") + elif sea_cursor.description is None: + result.add_difference("SEA description is None but Thrift is not") + else: + result.add_difference("Both should be None before execute") + + self.results.append(result) + + thrift_cursor.close() + sea_cursor.close() + except Exception as e: + logger.exception("Error testing description before execute") + result = ComparisonResult( + "description - before execute", "No query executed" + ) + result.success = False + result.add_difference(f"Exception: {str(e)}") + self.results.append(result) + + # Test description after non-SELECT statement + try: + thrift_cursor = self.thrift_connection.cursor() + sea_cursor = self.sea_connection.cursor() + + show_query = "SHOW TABLES IN main.default" + result = ComparisonResult("description - after SHOW statement", show_query) + + thrift_cursor.execute(show_query) + sea_cursor.execute(show_query) + + self.compare_cursor_description(thrift_cursor, sea_cursor, result) + + self.results.append(result) + + thrift_cursor.close() + sea_cursor.close() + except Exception as e: + logger.exception("Error testing description after SHOW") + result = ComparisonResult("description - after SHOW statement", show_query) + result.success = False + result.add_difference(f"Exception: {str(e)}") + self.results.append(result) + def run_comparison_tests(self): """Run a set of comparison tests between Thrift and SEA backends.""" logger.info("Starting comparison tests") - - # Following the JDBC comparator approach with a single TPC-DS query - # Adjust the table path if needed based on your environment + + # Basic fetch method tests with TPC-DS query tpc_query = "SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5" - + # Test with fetchall self.results.append( - self.execute_and_compare( - tpc_query, - "TPC-DS query - fetchall", - "fetchall" - ) + self.execute_and_compare(tpc_query, "TPC-DS query - fetchall", "fetchall") ) - + # Test with fetchmany self.results.append( self.execute_and_compare( - tpc_query, - "TPC-DS query - fetchmany", - "fetchmany", - fetch_size=2 + tpc_query, "TPC-DS query - fetchmany", "fetchmany", fetch_size=2 ) ) - + # Test with fetchone self.results.append( - self.execute_and_compare( - tpc_query, - "TPC-DS query - fetchone", - "fetchone" - ) + self.execute_and_compare(tpc_query, "TPC-DS query - fetchone", "fetchone") ) - + # Test with fetchall_arrow self.results.append( self.execute_and_compare( - tpc_query, - "TPC-DS query - fetchall_arrow", - "fetchall_arrow" + tpc_query, "TPC-DS query - fetchall_arrow", "fetchall_arrow" ) ) - + # Test with fetchmany_arrow self.results.append( self.execute_and_compare( tpc_query, "TPC-DS query - fetchmany_arrow", "fetchmany_arrow", - fetch_size=2 + fetch_size=2, ) ) - + + # Run additional test suites + self.test_cursor_description() + self.test_metadata_methods() + self.test_fetch_variations() + self.test_edge_cases() + self.test_parameterized_queries() + self.test_executemany() + logger.info(f"Completed {len(self.results)} comparison tests") def generate_report(self): - """Generate a report of the comparison results.""" + """Generate a report of the comparison results in JDBC comparator format.""" logger.info(f"Generating report to {self.report_file}") - + with open(self.report_file, "w") as f: - f.write("Python Connector Comparison Report\n") - f.write("=================================\n\n") - f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n") - f.write(f"Server: {self.server_hostname}\n") - f.write(f"HTTP Path: {self.http_path}\n\n") - + # Header + f.write("# Python Connector Comparison Report\n\n") + f.write(f"**Date:** {time.strftime('%Y-%m-%d %H:%M:%S')}\n") + f.write(f"**Server:** {self.server_hostname}\n") + f.write(f"**HTTP Path:** {self.http_path}\n\n") + # Summary total_tests = len(self.results) passed_tests = sum(1 for r in self.results if r.success) - f.write(f"Summary: {passed_tests}/{total_tests} tests passed\n\n") - + failed_tests = total_tests - passed_tests + + f.write("## Summary\n\n") + f.write(f"- **Total Tests:** {total_tests}\n") + f.write(f"- **Passed:** {passed_tests}\n") + f.write(f"- **Failed:** {failed_tests}\n\n") + # Performance summary total_thrift_time = sum(r.thrift_time for r in self.results) total_sea_time = sum(r.sea_time for r in self.results) - f.write(f"Total execution time - Thrift: {total_thrift_time:.4f}s, SEA: {total_sea_time:.4f}s\n") - + f.write("## Performance Summary\n\n") + f.write(f"- **Total Thrift Execution Time:** {total_thrift_time:.4f}s\n") + f.write(f"- **Total SEA Execution Time:** {total_sea_time:.4f}s\n") + if total_thrift_time > 0: percentage = (total_sea_time / total_thrift_time - 1) * 100 - f.write(f"SEA is {percentage:.2f}% {'slower' if percentage > 0 else 'faster'} than Thrift\n\n") - - # Test details - f.write("Test Details\n") - f.write("------------\n\n") - - for i, result in enumerate(self.results): - f.write(f"Test {i+1}: {result.test_name}\n") - f.write(f"{'PASSED' if result.success else 'FAILED'}\n") - f.write(f"Query: {result.query}\n") - - if result.args: - f.write(f"Args: {result.args}\n") - - f.write(f"Thrift time: {result.thrift_time:.4f}s, SEA time: {result.sea_time:.4f}s\n") - + f.write( + f"- **SEA Performance:** {percentage:+.2f}% {'slower' if percentage > 0 else 'faster'} than Thrift\n\n" + ) + + # Test results in JDBC comparator format + f.write("## Test Results\n\n") + + for result in self.results: + # Determine query type + if "metadata" in result.test_name.lower() or any( + x in result.query.lower() + for x in ["catalogs()", "schemas(", "tables(", "columns("] + ): + query_type = "Cursor Metadata Methods" + elif "description" in result.test_name.lower(): + query_type = "Cursor Properties" + elif "executemany" in result.test_name.lower(): + query_type = "Batch Operations" + elif "parameterized" in result.test_name.lower(): + query_type = "Parameterized Queries" + else: + query_type = "SQL Query" + + f.write(f"**Query Type:** {query_type}\n") + f.write(f"**Query/Method:** {result.query}\n") + + if result.args and result.args != []: + f.write( + f"**Method Arguments:** {', '.join(str(arg) for arg in result.args)}\n" + ) + + f.write("============================\n\n") + if not result.success: - f.write("Differences:\n") + # Group differences by type + metadata_diffs = [] + data_diffs = [] + other_diffs = [] + for diff in result.differences: - f.write(f" - {diff['message']}\n") - if diff.get('thrift_value') is not None: - f.write(f" Thrift: {diff['thrift_value']}\n") - if diff.get('sea_value') is not None: - f.write(f" SEA: {diff['sea_value']}\n") - - f.write("\n") - + msg = diff["message"] + if any( + x in msg.lower() + for x in [ + "description", + "column", + "type", + "schema", + "catalog", + ] + ): + metadata_diffs.append(diff) + elif any( + x in msg.lower() for x in ["row", "value", "data", "count"] + ): + data_diffs.append(diff) + else: + other_diffs.append(diff) + + # Write metadata differences + if metadata_diffs: + f.write("**Metadata Differences:**\n") + f.write("---------------------\n") + f.write("Column Metadata:\n") + for diff in metadata_diffs: + f.write(f" - {diff['message']}") + if ( + diff.get("thrift_value") is not None + and diff.get("sea_value") is not None + ): + f.write( + f": {diff['thrift_value']} vs {diff['sea_value']}" + ) + f.write("\n") + f.write("\n") + + # Write data differences + if data_diffs: + f.write("**Data Differences:**\n") + f.write("-----------------\n") + f.write("Row Data:\n") + for diff in data_diffs: + f.write(f" - {diff['message']}") + if ( + diff.get("thrift_value") is not None + and diff.get("sea_value") is not None + ): + f.write( + f": {diff['thrift_value']} vs {diff['sea_value']}" + ) + f.write("\n") + f.write("\n") + + # Write other differences + if other_diffs: + f.write("**Other Differences:**\n") + f.write("-----------------\n") + for diff in other_diffs: + f.write(f" - {diff['message']}") + if ( + diff.get("thrift_value") is not None + and diff.get("sea_value") is not None + ): + f.write( + f": {diff['thrift_value']} vs {diff['sea_value']}" + ) + f.write("\n") + f.write("\n") + + # Write error information if present + if result.thrift_error or result.sea_error: + f.write("**Errors:**\n") + f.write("--------\n") + if result.thrift_error: + f.write(f" - Thrift Error: {result.thrift_error}\n") + if result.sea_error: + f.write(f" - SEA Error: {result.sea_error}\n") + f.write("\n") + else: + f.write("**Result:** PASSED\n") + f.write( + f"**Execution Time:** Thrift: {result.thrift_time:.4f}s, SEA: {result.sea_time:.4f}s\n\n" + ) + + f.write("============================\n\n") + logger.info(f"Report generated: {self.report_file}") - + # Print summary to console print(f"\nSummary: {passed_tests}/{total_tests} tests passed") - print(f"Total execution time - Thrift: {total_thrift_time:.4f}s, SEA: {total_sea_time:.4f}s") + print( + f"Total execution time - Thrift: {total_thrift_time:.4f}s, SEA: {total_sea_time:.4f}s" + ) if total_thrift_time > 0: percentage = (total_sea_time / total_thrift_time - 1) * 100 - print(f"SEA is {percentage:.2f}% {'slower' if percentage > 0 else 'faster'} than Thrift") + print( + f"SEA is {percentage:+.2f}% {'slower' if percentage > 0 else 'faster'} than Thrift" + ) print(f"Detailed report saved to: {self.report_file}") def run(self): @@ -576,30 +1299,36 @@ def run(self): def main(): """Main entry point.""" # Check required environment variables - required_vars = ["DATABRICKS_SERVER_HOSTNAME", "DATABRICKS_HTTP_PATH", "DATABRICKS_TOKEN"] + required_vars = [ + "DATABRICKS_SERVER_HOSTNAME", + "DATABRICKS_HTTP_PATH", + "DATABRICKS_TOKEN", + ] missing_vars = [var for var in required_vars if not os.environ.get(var)] - + if missing_vars: - logger.error(f"Missing required environment variables: {', '.join(missing_vars)}") + logger.error( + f"Missing required environment variables: {', '.join(missing_vars)}" + ) logger.error("Please set these variables before running the comparator.") sys.exit(1) - + # Get connection parameters from environment server_hostname = os.environ["DATABRICKS_SERVER_HOSTNAME"] http_path = os.environ["DATABRICKS_HTTP_PATH"] access_token = os.environ["DATABRICKS_TOKEN"] catalog = os.environ.get("DATABRICKS_CATALOG", DEFAULT_CATALOG) - + # Create and run comparator comparator = PythonConnectorComparator( server_hostname=server_hostname, http_path=http_path, access_token=access_token, - catalog=catalog + catalog=catalog, ) - + comparator.run() if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/experimental/python-connector-comparison-report.md b/examples/experimental/python-connector-comparison-report.md new file mode 100644 index 000000000..1cb926707 --- /dev/null +++ b/examples/experimental/python-connector-comparison-report.md @@ -0,0 +1,443 @@ +# Python Connector Comparison Report + +**Date:** 2025-07-24 06:52:12 +**Server:** benchmarking-staging-aws-us-west-2.staging.cloud.databricks.com +**HTTP Path:** /sql/1.0/warehouses/17661fca65a0e4fc + +## Summary + +- **Total Tests:** 27 +- **Passed:** 4 +- **Failed:** 23 + +## Performance Summary + +- **Total Thrift Execution Time:** 9.5053s +- **Total SEA Execution Time:** 12.2712s +- **SEA Performance:** +29.10% slower than Thrift + +## Test Results + +**Query Type:** SQL Query +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** SQL Query +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** SQL Query +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** SQL Query +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** SQL Query +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** Cursor Properties +**Query/Method:** No query executed +============================ + +**Result:** PASSED +**Execution Time:** Thrift: 0.0000s, SEA: 0.0000s + +============================ + +**Query Type:** Cursor Properties +**Query/Method:** SHOW TABLES IN main.default +============================ + +**Result:** PASSED +**Execution Time:** Thrift: 0.0000s, SEA: 0.0000s + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** catalogs() +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 0 (TABLE_CAT) name mismatch: TABLE_CAT vs catalog + - Field 'catalog' missing in all Thrift rows + +**Data Differences:** +----------------- +Row Data: + - Field 'TABLE_CAT' missing in all SEA rows + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** schemas(None, None) +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Exception: Catalog name is required for get_schemas + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** schemas(main, None) +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Description length mismatch: Thrift has 2 columns, SEA has 1: [('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_CATALOG', 'string', None, None, None, None, None)] vs [('databaseName', 'string', None, None, None, None, None)] + - Field 'TABLE_CATALOG' missing in all SEA rows + +**Data Differences:** +----------------- +Row Data: + - Field 'databaseName' missing in all Thrift rows + - Field 'TABLE_SCHEM' missing in all SEA rows + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** schemas(main, def%) +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Description length mismatch: Thrift has 2 columns, SEA has 1: [('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_CATALOG', 'string', None, None, None, None, None)] vs [('databaseName', 'string', None, None, None, None, None)] + +**Data Differences:** +----------------- +Row Data: + - Row count mismatch: Thrift returned 1, SEA returned 0 + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** tables(None, None, None, None) +============================ + +**Other Differences:** +----------------- + - Exception: Command failed: BAD_REQUEST - Inline byte limit exceeded. Statements executed with disposition=INLINE can have a result size of at most 26214400 bytes. Please execute the statement with disposition=EXTERNAL_LINKS if you want to download the full result. + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** tables(main, None, None, None) +============================ + +**Other Differences:** +----------------- + - Exception: Command failed: BAD_REQUEST - Inline byte limit exceeded. Statements executed with disposition=INLINE can have a result size of at most 26214400 bytes. Please execute the statement with disposition=EXTERNAL_LINKS if you want to download the full result. + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** tables(main, default, None, None) +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Description length mismatch: Thrift has 10 columns, SEA has 7: [('TABLE_CAT', 'string', None, None, None, None, None), ('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_NAME', 'string', None, None, None, None, None), ('TABLE_TYPE', 'string', None, None, None, None, None), ('REMARKS', 'string', None, None, None, None, None), ('TYPE_CAT', 'string', None, None, None, None, None), ('TYPE_SCHEM', 'string', None, None, None, None, None), ('TYPE_NAME', 'string', None, None, None, None, None), ('SELF_REFERENCING_COL_NAME', 'string', None, None, None, None, None), ('REF_GENERATION', 'string', None, None, None, None, None)] vs [('namespace', 'string', None, None, None, None, None), ('tableName', 'string', None, None, None, None, None), ('isTemporary', 'boolean', None, None, None, None, None), ('information', 'string', None, None, None, None, None), ('catalogName', 'string', None, None, None, None, None), ('tableType', 'string', None, None, None, None, None), ('remarks', 'string', None, None, None, None, None)] + - Field 'tableType' missing in all Thrift rows + - Field 'catalogName' missing in all Thrift rows + - Field 'TYPE_CAT' missing in all SEA rows + - Field 'TYPE_NAME' missing in all SEA rows + - Field 'TABLE_TYPE' missing in all SEA rows + - Field 'TYPE_SCHEM' missing in all SEA rows + +**Data Differences:** +----------------- +Row Data: + - Field 'tableName' missing in all Thrift rows + - Field 'remarks' missing in all Thrift rows + - Field 'namespace' missing in all Thrift rows + - Field 'isTemporary' missing in all Thrift rows + - Field 'information' missing in all Thrift rows + - Field 'SELF_REFERENCING_COL_NAME' missing in all SEA rows + - Field 'TABLE_SCHEM' missing in all SEA rows + - Field 'TABLE_CAT' missing in all SEA rows + - Field 'REMARKS' missing in all SEA rows + - Field 'TABLE_NAME' missing in all SEA rows + - Field 'REF_GENERATION' missing in all SEA rows + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** tables(main, default, %sales, None) +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Description length mismatch: Thrift has 10 columns, SEA has 7: [('TABLE_CAT', 'string', None, None, None, None, None), ('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_NAME', 'string', None, None, None, None, None), ('TABLE_TYPE', 'string', None, None, None, None, None), ('REMARKS', 'string', None, None, None, None, None), ('TYPE_CAT', 'string', None, None, None, None, None), ('TYPE_SCHEM', 'string', None, None, None, None, None), ('TYPE_NAME', 'string', None, None, None, None, None), ('SELF_REFERENCING_COL_NAME', 'string', None, None, None, None, None), ('REF_GENERATION', 'string', None, None, None, None, None)] vs [('namespace', 'string', None, None, None, None, None), ('tableName', 'string', None, None, None, None, None), ('isTemporary', 'boolean', None, None, None, None, None), ('information', 'string', None, None, None, None, None), ('catalogName', 'string', None, None, None, None, None), ('tableType', 'string', None, None, None, None, None), ('remarks', 'string', None, None, None, None, None)] + +**Data Differences:** +----------------- +Row Data: + - Row count mismatch: Thrift returned 1, SEA returned 0 + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** tables(main, default, None, ['TABLE', 'VIEW']) +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Description length mismatch: Thrift has 10 columns, SEA has 7: [('TABLE_CAT', 'string', None, None, None, None, None), ('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_NAME', 'string', None, None, None, None, None), ('TABLE_TYPE', 'string', None, None, None, None, None), ('REMARKS', 'string', None, None, None, None, None), ('TYPE_CAT', 'string', None, None, None, None, None), ('TYPE_SCHEM', 'string', None, None, None, None, None), ('TYPE_NAME', 'string', None, None, None, None, None), ('SELF_REFERENCING_COL_NAME', 'string', None, None, None, None, None), ('REF_GENERATION', 'string', None, None, None, None, None)] vs [('namespace', 'string', None, None, None, None, None), ('tableName', 'string', None, None, None, None, None), ('isTemporary', 'boolean', None, None, None, None, None), ('information', 'string', None, None, None, None, None), ('catalogName', 'string', None, None, None, None, None), ('tableType', 'string', None, None, None, None, None), ('remarks', 'string', None, None, None, None, None)] + - Field 'tableType' missing in all Thrift rows + - Field 'catalogName' missing in all Thrift rows + - Field 'TYPE_CAT' missing in all SEA rows + - Field 'TYPE_NAME' missing in all SEA rows + - Field 'TABLE_TYPE' missing in all SEA rows + - Field 'TYPE_SCHEM' missing in all SEA rows + +**Data Differences:** +----------------- +Row Data: + - Field 'tableName' missing in all Thrift rows + - Field 'remarks' missing in all Thrift rows + - Field 'namespace' missing in all Thrift rows + - Field 'isTemporary' missing in all Thrift rows + - Field 'information' missing in all Thrift rows + - Field 'SELF_REFERENCING_COL_NAME' missing in all SEA rows + - Field 'TABLE_SCHEM' missing in all SEA rows + - Field 'TABLE_CAT' missing in all SEA rows + - Field 'REMARKS' missing in all SEA rows + - Field 'TABLE_NAME' missing in all SEA rows + - Field 'REF_GENERATION' missing in all SEA rows + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** columns(main, tpcds_sf100_delta, catalog_sales, None) +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Description length mismatch: Thrift has 23 columns, SEA has 13: [('TABLE_CAT', 'string', None, None, None, None, None), ('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_NAME', 'string', None, None, None, None, None), ('COLUMN_NAME', 'string', None, None, None, None, None), ('DATA_TYPE', 'int', None, None, None, None, None), ('TYPE_NAME', 'string', None, None, None, None, None), ('COLUMN_SIZE', 'int', None, None, None, None, None), ('BUFFER_LENGTH', 'tinyint', None, None, None, None, None), ('DECIMAL_DIGITS', 'int', None, None, None, None, None), ('NUM_PREC_RADIX', 'int', None, None, None, None, None), ('NULLABLE', 'int', None, None, None, None, None), ('REMARKS', 'string', None, None, None, None, None), ('COLUMN_DEF', 'string', None, None, None, None, None), ('SQL_DATA_TYPE', 'int', None, None, None, None, None), ('SQL_DATETIME_SUB', 'int', None, None, None, None, None), ('CHAR_OCTET_LENGTH', 'int', None, None, None, None, None), ('ORDINAL_POSITION', 'int', None, None, None, None, None), ('IS_NULLABLE', 'string', None, None, None, None, None), ('SCOPE_CATALOG', 'string', None, None, None, None, None), ('SCOPE_SCHEMA', 'string', None, None, None, None, None), ('SCOPE_TABLE', 'string', None, None, None, None, None), ('SOURCE_DATA_TYPE', 'smallint', None, None, None, None, None), ('IS_AUTO_INCREMENT', 'string', None, None, None, None, None)] vs [('col_name', 'string', None, None, None, None, None), ('catalogName', 'string', None, None, None, None, None), ('namespace', 'string', None, None, None, None, None), ('tableName', 'string', None, None, None, None, None), ('columnType', 'string', None, None, None, None, None), ('columnSize', 'int', None, None, None, None, None), ('decimalDigits', 'int', None, None, None, None, None), ('radix', 'int', None, None, None, None, None), ('isNullable', 'string', None, None, None, None, None), ('remarks', 'string', None, None, None, None, None), ('ordinalPosition', 'int', None, None, None, None, None), ('isAutoIncrement', 'string', None, None, None, None, None), ('isGenerated', 'string', None, None, None, None, None)] + - Field 'columnType' missing in all Thrift rows + - Field 'columnSize' missing in all Thrift rows + - Field 'catalogName' missing in all Thrift rows + - Field 'COLUMN_SIZE' missing in all SEA rows + - Field 'COLUMN_DEF' missing in all SEA rows + - Field 'TYPE_NAME' missing in all SEA rows + - Field 'SOURCE_DATA_TYPE' missing in all SEA rows + - Field 'SQL_DATA_TYPE' missing in all SEA rows + - Field 'SCOPE_CATALOG' missing in all SEA rows + - Field 'COLUMN_NAME' missing in all SEA rows + - Field 'SCOPE_SCHEMA' missing in all SEA rows + - Field 'DATA_TYPE' missing in all SEA rows + +**Data Differences:** +----------------- +Row Data: + - Field 'isAutoIncrement' missing in all Thrift rows + - Field 'tableName' missing in all Thrift rows + - Field 'remarks' missing in all Thrift rows + - Field 'namespace' missing in all Thrift rows + - Field 'radix' missing in all Thrift rows + - Field 'isGenerated' missing in all Thrift rows + - Field 'ordinalPosition' missing in all Thrift rows + - Field 'isNullable' missing in all Thrift rows + - Field 'col_name' missing in all Thrift rows + - Field 'decimalDigits' missing in all Thrift rows + - Field 'NULLABLE' missing in all SEA rows + - Field 'BUFFER_LENGTH' missing in all SEA rows + - Field 'TABLE_NAME' missing in all SEA rows + - Field 'NUM_PREC_RADIX' missing in all SEA rows + - Field 'DECIMAL_DIGITS' missing in all SEA rows + - Field 'SQL_DATETIME_SUB' missing in all SEA rows + - Field 'IS_NULLABLE' missing in all SEA rows + - Field 'TABLE_SCHEM' missing in all SEA rows + - Field 'SCOPE_TABLE' missing in all SEA rows + - Field 'IS_AUTO_INCREMENT' missing in all SEA rows + - Field 'TABLE_CAT' missing in all SEA rows + - Field 'ORDINAL_POSITION' missing in all SEA rows + - Field 'CHAR_OCTET_LENGTH' missing in all SEA rows + - Field 'REMARKS' missing in all SEA rows + +============================ + +**Query Type:** Cursor Metadata Methods +**Query/Method:** columns(main, tpcds_sf100_delta, catalog_sales, cs_%) +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Description length mismatch: Thrift has 23 columns, SEA has 13: [('TABLE_CAT', 'string', None, None, None, None, None), ('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_NAME', 'string', None, None, None, None, None), ('COLUMN_NAME', 'string', None, None, None, None, None), ('DATA_TYPE', 'int', None, None, None, None, None), ('TYPE_NAME', 'string', None, None, None, None, None), ('COLUMN_SIZE', 'int', None, None, None, None, None), ('BUFFER_LENGTH', 'tinyint', None, None, None, None, None), ('DECIMAL_DIGITS', 'int', None, None, None, None, None), ('NUM_PREC_RADIX', 'int', None, None, None, None, None), ('NULLABLE', 'int', None, None, None, None, None), ('REMARKS', 'string', None, None, None, None, None), ('COLUMN_DEF', 'string', None, None, None, None, None), ('SQL_DATA_TYPE', 'int', None, None, None, None, None), ('SQL_DATETIME_SUB', 'int', None, None, None, None, None), ('CHAR_OCTET_LENGTH', 'int', None, None, None, None, None), ('ORDINAL_POSITION', 'int', None, None, None, None, None), ('IS_NULLABLE', 'string', None, None, None, None, None), ('SCOPE_CATALOG', 'string', None, None, None, None, None), ('SCOPE_SCHEMA', 'string', None, None, None, None, None), ('SCOPE_TABLE', 'string', None, None, None, None, None), ('SOURCE_DATA_TYPE', 'smallint', None, None, None, None, None), ('IS_AUTO_INCREMENT', 'string', None, None, None, None, None)] vs [('col_name', 'string', None, None, None, None, None), ('catalogName', 'string', None, None, None, None, None), ('namespace', 'string', None, None, None, None, None), ('tableName', 'string', None, None, None, None, None), ('columnType', 'string', None, None, None, None, None), ('columnSize', 'int', None, None, None, None, None), ('decimalDigits', 'int', None, None, None, None, None), ('radix', 'int', None, None, None, None, None), ('isNullable', 'string', None, None, None, None, None), ('remarks', 'string', None, None, None, None, None), ('ordinalPosition', 'int', None, None, None, None, None), ('isAutoIncrement', 'string', None, None, None, None, None), ('isGenerated', 'string', None, None, None, None, None)] + +**Data Differences:** +----------------- +Row Data: + - Row count mismatch: Thrift returned 34, SEA returned 0 + +============================ + +**Query Type:** SQL Query +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 10 +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** SQL Query +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 10 +============================ + +**Result:** PASSED +**Execution Time:** Thrift: 0.0000s, SEA: 0.0000s + +============================ + +**Query Type:** SQL Query +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales WHERE 1=0 +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** SQL Query +**Query/Method:** SELECT NULL as null_col, 'test' as string_col, 123 as int_col +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 0 (null_col) type_code mismatch: string vs null + +============================ + +**Query Type:** SQL Query +**Query/Method:** + SELECT + CAST(123 AS TINYINT) as tiny_col, + CAST(456 AS SMALLINT) as small_col, + CAST(789 AS INT) as int_col, + CAST(123456789 AS BIGINT) as big_col, + CAST(123.45 AS FLOAT) as float_col, + CAST(678.90 AS DOUBLE) as double_col, + CAST(123.456 AS DECIMAL(10,3)) as decimal_col, + 'test_string' as string_col, + TRUE as bool_col, + CAST('2023-01-01' AS DATE) as date_col, + CAST('2023-01-01 12:34:56' AS TIMESTAMP) as timestamp_col, + ARRAY(1,2,3) as array_col, + STRUCT(1 as a, 'b' as b) as struct_col, + MAP('key1', 'value1', 'key2', 'value2') as map_col + +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 0 (tiny_col) type_code mismatch: tinyint vs byte + - Column 1 (small_col) type_code mismatch: smallint vs short + - Column 3 (big_col) type_code mismatch: bigint vs long + +**Data Differences:** +----------------- +Row Data: + - Exception: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all() + +============================ + +**Query Type:** SQL Query +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 1000 +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** Parameterized Queries +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales WHERE cs_sold_date_sk = :date_sk LIMIT 5 +**Method Arguments:** date_sk +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** Parameterized Queries +**Query/Method:** SELECT * FROM main.tpcds_sf100_delta.catalog_sales WHERE cs_sold_date_sk = ? AND cs_sold_time_sk = ? LIMIT 5 +**Method Arguments:** 2451088, 48000 +============================ + +**Metadata Differences:** +--------------------- +Column Metadata: + - Column 17 (cs_order_number) type_code mismatch: bigint vs long + +============================ + +**Query Type:** Batch Operations +**Query/Method:** INSERT INTO main.default.comparator_test_table VALUES (?, ?) +**Method Arguments:** (1, 'one'), (2, 'two'), (3, 'three') +============================ + +**Result:** PASSED +**Execution Time:** Thrift: 2.7094s, SEA: 3.1544s + +============================ + diff --git a/examples/experimental/python-connector-comparison-report.txt b/examples/experimental/python-connector-comparison-report.txt deleted file mode 100644 index 87df81fe9..000000000 --- a/examples/experimental/python-connector-comparison-report.txt +++ /dev/null @@ -1,60 +0,0 @@ -Python Connector Comparison Report -================================= - -Date: 2025-07-21 16:21:27 -Server: benchmarking-staging-aws-us-west-2.staging.cloud.databricks.com -HTTP Path: /sql/1.0/warehouses/17661fca65a0e4fc - -Summary: 0/5 tests passed - -Total execution time - Thrift: 2.3825s, SEA: 3.0076s -SEA is 26.24% slower than Thrift - -Test Details ------------- - -Test 1: TPC-DS query - fetchall -FAILED -Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 -Thrift time: 0.4903s, SEA time: 0.6477s -Differences: - - Column 17 (cs_order_number) type_code mismatch - Thrift: bigint - SEA: long - -Test 2: TPC-DS query - fetchmany -FAILED -Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 -Thrift time: 0.4870s, SEA time: 0.5810s -Differences: - - Column 17 (cs_order_number) type_code mismatch - Thrift: bigint - SEA: long - -Test 3: TPC-DS query - fetchone -FAILED -Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 -Thrift time: 0.4784s, SEA time: 0.5651s -Differences: - - Column 17 (cs_order_number) type_code mismatch - Thrift: bigint - SEA: long - -Test 4: TPC-DS query - fetchall_arrow -FAILED -Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 -Thrift time: 0.4549s, SEA time: 0.6195s -Differences: - - Column 17 (cs_order_number) type_code mismatch - Thrift: bigint - SEA: long - -Test 5: TPC-DS query - fetchmany_arrow -FAILED -Query: SELECT * FROM main.tpcds_sf100_delta.catalog_sales LIMIT 5 -Thrift time: 0.4720s, SEA time: 0.5943s -Differences: - - Column 17 (cs_order_number) type_code mismatch - Thrift: bigint - SEA: long - From a9b900662bb048dc172fc4acbbeb1a4b3ea2a3bb Mon Sep 17 00:00:00 2001 From: varun-edachali-dbx Date: Thu, 24 Jul 2025 10:49:29 +0000 Subject: [PATCH 5/7] safer comparision Signed-off-by: varun-edachali-dbx --- examples/experimental/comparator.py | 57 ++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/examples/experimental/comparator.py b/examples/experimental/comparator.py index 62b08c7f0..5c516e116 100755 --- a/examples/experimental/comparator.py +++ b/examples/experimental/comparator.py @@ -233,6 +233,41 @@ def compare_cursor_description( sea_val, ) + def _safe_compare(self, val1, val2): + """ + Safely compare two values, handling lists, dicts, and complex types. + + Returns True if values are equal, False otherwise. + """ + try: + # Handle None values + if val1 is None and val2 is None: + return True + if val1 is None or val2 is None: + return False + + # For lists, tuples, and other sequences (but not strings) + if isinstance(val1, (list, tuple)) and isinstance(val2, (list, tuple)): + if len(val1) != len(val2): + return False + return all(self._safe_compare(v1, v2) for v1, v2 in zip(val1, val2)) + + # For dictionaries + if isinstance(val1, dict) and isinstance(val2, dict): + if set(val1.keys()) != set(val2.keys()): + return False + return all(self._safe_compare(val1[k], val2[k]) for k in val1.keys()) + + # For Row objects (which are tuples with special properties) + if hasattr(val1, 'asDict') and hasattr(val2, 'asDict'): + return self._safe_compare(val1.asDict(recursive=True), val2.asDict(recursive=True)) + + # Default comparison + return val1 == val2 + except (ValueError, TypeError) as e: + # If comparison fails (e.g., numpy arrays), convert to string + return str(val1) == str(val2) + def compare_rows( self, thrift_rows: List[Row], sea_rows: List[Row], result: ComparisonResult ): @@ -264,9 +299,19 @@ def compare_rows( thrift_dict = thrift_row.asDict(recursive=True) sea_dict = sea_row.asDict(recursive=True) - if thrift_dict != sea_dict: - # Find which fields differ - all_fields = set(thrift_dict.keys()) | set(sea_dict.keys()) + # Check if dictionaries are different by comparing all fields + all_fields = set(thrift_dict.keys()) | set(sea_dict.keys()) + dicts_differ = False + + for field in all_fields: + if field not in thrift_dict or field not in sea_dict: + dicts_differ = True + break + elif not self._safe_compare(thrift_dict.get(field), sea_dict.get(field)): + dicts_differ = True + break + + if dicts_differ: for field in all_fields: thrift_value = thrift_dict.get(field) @@ -276,7 +321,7 @@ def compare_rows( fields_missing_in_thrift.add(field) elif field not in sea_dict: fields_missing_in_sea.add(field) - elif thrift_value != sea_value: + elif not self._safe_compare(thrift_value, sea_value): if field not in field_value_mismatches: field_value_mismatches[field] = [] field_value_mismatches[field].append( @@ -308,8 +353,8 @@ def compare_rows( thrift_values = [m[1] for m in mismatches] sea_values = [m[2] for m in mismatches] - if all(v == thrift_values[0] for v in thrift_values) and all( - v == sea_values[0] for v in sea_values + if all(self._safe_compare(v, thrift_values[0]) for v in thrift_values) and all( + self._safe_compare(v, sea_values[0]) for v in sea_values ): result.add_difference( f"Field '{field}' value mismatch in all rows", From ba36ebe4596eb85543b0cc14a619e64ab2dcbc01 Mon Sep 17 00:00:00 2001 From: varun-edachali-dbx Date: Tue, 29 Jul 2025 19:12:53 +0530 Subject: [PATCH 6/7] safer comparision Signed-off-by: varun-edachali-dbx --- examples/experimental/comparator.py | 56 ++++++++++++++++++----------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/examples/experimental/comparator.py b/examples/experimental/comparator.py index 5c516e116..2ba118bbb 100755 --- a/examples/experimental/comparator.py +++ b/examples/experimental/comparator.py @@ -235,8 +235,8 @@ def compare_cursor_description( def _safe_compare(self, val1, val2): """ - Safely compare two values, handling lists, dicts, and complex types. - + Safely compare two values, handling Row objects and PyArrow tables. + Returns True if values are equal, False otherwise. """ try: @@ -245,28 +245,40 @@ def _safe_compare(self, val1, val2): return True if val1 is None or val2 is None: return False - - # For lists, tuples, and other sequences (but not strings) + + # For Row objects, convert to dictionaries + if hasattr(val1, "asDict") and hasattr(val2, "asDict"): + return self._safe_compare( + val1.asDict(recursive=True), val2.asDict(recursive=True) + ) + + # For PyArrow arrays/tables + if hasattr(val1, "to_pylist") and hasattr(val2, "to_pylist"): + return val1.to_pylist() == val2.to_pylist() + + # For lists and tuples if isinstance(val1, (list, tuple)) and isinstance(val2, (list, tuple)): if len(val1) != len(val2): return False return all(self._safe_compare(v1, v2) for v1, v2 in zip(val1, val2)) - + # For dictionaries if isinstance(val1, dict) and isinstance(val2, dict): if set(val1.keys()) != set(val2.keys()): return False return all(self._safe_compare(val1[k], val2[k]) for k in val1.keys()) - - # For Row objects (which are tuples with special properties) - if hasattr(val1, 'asDict') and hasattr(val2, 'asDict'): - return self._safe_compare(val1.asDict(recursive=True), val2.asDict(recursive=True)) - - # Default comparison - return val1 == val2 - except (ValueError, TypeError) as e: - # If comparison fails (e.g., numpy arrays), convert to string - return str(val1) == str(val2) + + # Default comparison - ensure we always return a boolean + result = val1 == val2 + # If result is not a simple boolean, use bool() to convert it + return bool(result) + + except (ValueError, TypeError): + # Fallback to string comparison for problematic types + try: + return str(val1) == str(val2) + except: + return False def compare_rows( self, thrift_rows: List[Row], sea_rows: List[Row], result: ComparisonResult @@ -302,15 +314,17 @@ def compare_rows( # Check if dictionaries are different by comparing all fields all_fields = set(thrift_dict.keys()) | set(sea_dict.keys()) dicts_differ = False - + for field in all_fields: if field not in thrift_dict or field not in sea_dict: dicts_differ = True break - elif not self._safe_compare(thrift_dict.get(field), sea_dict.get(field)): + elif not self._safe_compare( + thrift_dict.get(field), sea_dict.get(field) + ): dicts_differ = True break - + if dicts_differ: for field in all_fields: @@ -353,9 +367,9 @@ def compare_rows( thrift_values = [m[1] for m in mismatches] sea_values = [m[2] for m in mismatches] - if all(self._safe_compare(v, thrift_values[0]) for v in thrift_values) and all( - self._safe_compare(v, sea_values[0]) for v in sea_values - ): + if all( + self._safe_compare(v, thrift_values[0]) for v in thrift_values + ) and all(self._safe_compare(v, sea_values[0]) for v in sea_values): result.add_difference( f"Field '{field}' value mismatch in all rows", thrift_values[0], From 75973e06a6bdbb8e476db783bf45c24eadf11993 Mon Sep 17 00:00:00 2001 From: varun-edachali-dbx Date: Tue, 29 Jul 2025 19:20:25 +0530 Subject: [PATCH 7/7] updated report Signed-off-by: varun-edachali-dbx --- .../python-connector-comparison-report.md | 77 +++++++++---------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/examples/experimental/python-connector-comparison-report.md b/examples/experimental/python-connector-comparison-report.md index 1cb926707..3d7bda6d5 100644 --- a/examples/experimental/python-connector-comparison-report.md +++ b/examples/experimental/python-connector-comparison-report.md @@ -1,6 +1,6 @@ # Python Connector Comparison Report -**Date:** 2025-07-24 06:52:12 +**Date:** 2025-07-29 19:19:02 **Server:** benchmarking-staging-aws-us-west-2.staging.cloud.databricks.com **HTTP Path:** /sql/1.0/warehouses/17661fca65a0e4fc @@ -12,9 +12,9 @@ ## Performance Summary -- **Total Thrift Execution Time:** 9.5053s -- **Total SEA Execution Time:** 12.2712s -- **SEA Performance:** +29.10% slower than Thrift +- **Total Thrift Execution Time:** 30.1946s +- **Total SEA Execution Time:** 23.1179s +- **SEA Performance:** -23.44% faster than Thrift ## Test Results @@ -181,8 +181,8 @@ Row Data: --------------------- Column Metadata: - Description length mismatch: Thrift has 10 columns, SEA has 7: [('TABLE_CAT', 'string', None, None, None, None, None), ('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_NAME', 'string', None, None, None, None, None), ('TABLE_TYPE', 'string', None, None, None, None, None), ('REMARKS', 'string', None, None, None, None, None), ('TYPE_CAT', 'string', None, None, None, None, None), ('TYPE_SCHEM', 'string', None, None, None, None, None), ('TYPE_NAME', 'string', None, None, None, None, None), ('SELF_REFERENCING_COL_NAME', 'string', None, None, None, None, None), ('REF_GENERATION', 'string', None, None, None, None, None)] vs [('namespace', 'string', None, None, None, None, None), ('tableName', 'string', None, None, None, None, None), ('isTemporary', 'boolean', None, None, None, None, None), ('information', 'string', None, None, None, None, None), ('catalogName', 'string', None, None, None, None, None), ('tableType', 'string', None, None, None, None, None), ('remarks', 'string', None, None, None, None, None)] - - Field 'tableType' missing in all Thrift rows - Field 'catalogName' missing in all Thrift rows + - Field 'tableType' missing in all Thrift rows - Field 'TYPE_CAT' missing in all SEA rows - Field 'TYPE_NAME' missing in all SEA rows - Field 'TABLE_TYPE' missing in all SEA rows @@ -192,16 +192,16 @@ Column Metadata: ----------------- Row Data: - Field 'tableName' missing in all Thrift rows + - Field 'isTemporary' missing in all Thrift rows - Field 'remarks' missing in all Thrift rows - Field 'namespace' missing in all Thrift rows - - Field 'isTemporary' missing in all Thrift rows - Field 'information' missing in all Thrift rows - - Field 'SELF_REFERENCING_COL_NAME' missing in all SEA rows - - Field 'TABLE_SCHEM' missing in all SEA rows - - Field 'TABLE_CAT' missing in all SEA rows - Field 'REMARKS' missing in all SEA rows + - Field 'TABLE_CAT' missing in all SEA rows - Field 'TABLE_NAME' missing in all SEA rows - Field 'REF_GENERATION' missing in all SEA rows + - Field 'SELF_REFERENCING_COL_NAME' missing in all SEA rows + - Field 'TABLE_SCHEM' missing in all SEA rows ============================ @@ -229,8 +229,8 @@ Row Data: --------------------- Column Metadata: - Description length mismatch: Thrift has 10 columns, SEA has 7: [('TABLE_CAT', 'string', None, None, None, None, None), ('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_NAME', 'string', None, None, None, None, None), ('TABLE_TYPE', 'string', None, None, None, None, None), ('REMARKS', 'string', None, None, None, None, None), ('TYPE_CAT', 'string', None, None, None, None, None), ('TYPE_SCHEM', 'string', None, None, None, None, None), ('TYPE_NAME', 'string', None, None, None, None, None), ('SELF_REFERENCING_COL_NAME', 'string', None, None, None, None, None), ('REF_GENERATION', 'string', None, None, None, None, None)] vs [('namespace', 'string', None, None, None, None, None), ('tableName', 'string', None, None, None, None, None), ('isTemporary', 'boolean', None, None, None, None, None), ('information', 'string', None, None, None, None, None), ('catalogName', 'string', None, None, None, None, None), ('tableType', 'string', None, None, None, None, None), ('remarks', 'string', None, None, None, None, None)] - - Field 'tableType' missing in all Thrift rows - Field 'catalogName' missing in all Thrift rows + - Field 'tableType' missing in all Thrift rows - Field 'TYPE_CAT' missing in all SEA rows - Field 'TYPE_NAME' missing in all SEA rows - Field 'TABLE_TYPE' missing in all SEA rows @@ -240,16 +240,16 @@ Column Metadata: ----------------- Row Data: - Field 'tableName' missing in all Thrift rows + - Field 'isTemporary' missing in all Thrift rows - Field 'remarks' missing in all Thrift rows - Field 'namespace' missing in all Thrift rows - - Field 'isTemporary' missing in all Thrift rows - Field 'information' missing in all Thrift rows - - Field 'SELF_REFERENCING_COL_NAME' missing in all SEA rows - - Field 'TABLE_SCHEM' missing in all SEA rows - - Field 'TABLE_CAT' missing in all SEA rows - Field 'REMARKS' missing in all SEA rows + - Field 'TABLE_CAT' missing in all SEA rows - Field 'TABLE_NAME' missing in all SEA rows - Field 'REF_GENERATION' missing in all SEA rows + - Field 'SELF_REFERENCING_COL_NAME' missing in all SEA rows + - Field 'TABLE_SCHEM' missing in all SEA rows ============================ @@ -261,46 +261,46 @@ Row Data: --------------------- Column Metadata: - Description length mismatch: Thrift has 23 columns, SEA has 13: [('TABLE_CAT', 'string', None, None, None, None, None), ('TABLE_SCHEM', 'string', None, None, None, None, None), ('TABLE_NAME', 'string', None, None, None, None, None), ('COLUMN_NAME', 'string', None, None, None, None, None), ('DATA_TYPE', 'int', None, None, None, None, None), ('TYPE_NAME', 'string', None, None, None, None, None), ('COLUMN_SIZE', 'int', None, None, None, None, None), ('BUFFER_LENGTH', 'tinyint', None, None, None, None, None), ('DECIMAL_DIGITS', 'int', None, None, None, None, None), ('NUM_PREC_RADIX', 'int', None, None, None, None, None), ('NULLABLE', 'int', None, None, None, None, None), ('REMARKS', 'string', None, None, None, None, None), ('COLUMN_DEF', 'string', None, None, None, None, None), ('SQL_DATA_TYPE', 'int', None, None, None, None, None), ('SQL_DATETIME_SUB', 'int', None, None, None, None, None), ('CHAR_OCTET_LENGTH', 'int', None, None, None, None, None), ('ORDINAL_POSITION', 'int', None, None, None, None, None), ('IS_NULLABLE', 'string', None, None, None, None, None), ('SCOPE_CATALOG', 'string', None, None, None, None, None), ('SCOPE_SCHEMA', 'string', None, None, None, None, None), ('SCOPE_TABLE', 'string', None, None, None, None, None), ('SOURCE_DATA_TYPE', 'smallint', None, None, None, None, None), ('IS_AUTO_INCREMENT', 'string', None, None, None, None, None)] vs [('col_name', 'string', None, None, None, None, None), ('catalogName', 'string', None, None, None, None, None), ('namespace', 'string', None, None, None, None, None), ('tableName', 'string', None, None, None, None, None), ('columnType', 'string', None, None, None, None, None), ('columnSize', 'int', None, None, None, None, None), ('decimalDigits', 'int', None, None, None, None, None), ('radix', 'int', None, None, None, None, None), ('isNullable', 'string', None, None, None, None, None), ('remarks', 'string', None, None, None, None, None), ('ordinalPosition', 'int', None, None, None, None, None), ('isAutoIncrement', 'string', None, None, None, None, None), ('isGenerated', 'string', None, None, None, None, None)] - - Field 'columnType' missing in all Thrift rows - Field 'columnSize' missing in all Thrift rows - Field 'catalogName' missing in all Thrift rows + - Field 'columnType' missing in all Thrift rows + - Field 'SCOPE_CATALOG' missing in all SEA rows + - Field 'SCOPE_SCHEMA' missing in all SEA rows - Field 'COLUMN_SIZE' missing in all SEA rows + - Field 'SQL_DATA_TYPE' missing in all SEA rows - Field 'COLUMN_DEF' missing in all SEA rows - - Field 'TYPE_NAME' missing in all SEA rows - Field 'SOURCE_DATA_TYPE' missing in all SEA rows - - Field 'SQL_DATA_TYPE' missing in all SEA rows - - Field 'SCOPE_CATALOG' missing in all SEA rows - - Field 'COLUMN_NAME' missing in all SEA rows - - Field 'SCOPE_SCHEMA' missing in all SEA rows - Field 'DATA_TYPE' missing in all SEA rows + - Field 'TYPE_NAME' missing in all SEA rows + - Field 'COLUMN_NAME' missing in all SEA rows **Data Differences:** ----------------- Row Data: - - Field 'isAutoIncrement' missing in all Thrift rows - Field 'tableName' missing in all Thrift rows - - Field 'remarks' missing in all Thrift rows - - Field 'namespace' missing in all Thrift rows - - Field 'radix' missing in all Thrift rows - Field 'isGenerated' missing in all Thrift rows - - Field 'ordinalPosition' missing in all Thrift rows + - Field 'isAutoIncrement' missing in all Thrift rows + - Field 'radix' missing in all Thrift rows - Field 'isNullable' missing in all Thrift rows - - Field 'col_name' missing in all Thrift rows - Field 'decimalDigits' missing in all Thrift rows - - Field 'NULLABLE' missing in all SEA rows - - Field 'BUFFER_LENGTH' missing in all SEA rows + - Field 'ordinalPosition' missing in all Thrift rows + - Field 'remarks' missing in all Thrift rows + - Field 'col_name' missing in all Thrift rows + - Field 'namespace' missing in all Thrift rows + - Field 'IS_AUTO_INCREMENT' missing in all SEA rows + - Field 'REMARKS' missing in all SEA rows - Field 'TABLE_NAME' missing in all SEA rows - - Field 'NUM_PREC_RADIX' missing in all SEA rows - - Field 'DECIMAL_DIGITS' missing in all SEA rows + - Field 'ORDINAL_POSITION' missing in all SEA rows - Field 'SQL_DATETIME_SUB' missing in all SEA rows - - Field 'IS_NULLABLE' missing in all SEA rows - - Field 'TABLE_SCHEM' missing in all SEA rows - Field 'SCOPE_TABLE' missing in all SEA rows - - Field 'IS_AUTO_INCREMENT' missing in all SEA rows + - Field 'IS_NULLABLE' missing in all SEA rows - Field 'TABLE_CAT' missing in all SEA rows - - Field 'ORDINAL_POSITION' missing in all SEA rows + - Field 'BUFFER_LENGTH' missing in all SEA rows + - Field 'NUM_PREC_RADIX' missing in all SEA rows - Field 'CHAR_OCTET_LENGTH' missing in all SEA rows - - Field 'REMARKS' missing in all SEA rows + - Field 'NULLABLE' missing in all SEA rows + - Field 'DECIMAL_DIGITS' missing in all SEA rows + - Field 'TABLE_SCHEM' missing in all SEA rows ============================ @@ -389,11 +389,6 @@ Column Metadata: - Column 1 (small_col) type_code mismatch: smallint vs short - Column 3 (big_col) type_code mismatch: bigint vs long -**Data Differences:** ------------------ -Row Data: - - Exception: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all() - ============================ **Query Type:** SQL Query @@ -437,7 +432,7 @@ Column Metadata: ============================ **Result:** PASSED -**Execution Time:** Thrift: 2.7094s, SEA: 3.1544s +**Execution Time:** Thrift: 4.7163s, SEA: 4.5337s ============================