Skip to content

Introduce row_limit param #607

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: sea-migration
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/databricks/sql/backend/databricks_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def execute_command(
parameters: List,
async_op: bool,
enforce_embedded_schema_correctness: bool,
row_limit: Optional[int] = None,
) -> Union["ResultSet", None]:
"""
Executes a SQL command or query within the specified session.
Expand All @@ -103,6 +104,7 @@ def execute_command(
parameters: List of parameters to bind to the query
async_op: Whether to execute the command asynchronously
enforce_embedded_schema_correctness: Whether to enforce schema correctness
row_limit: Maximum number of rows in the operation result.

Returns:
If async_op is False, returns a ResultSet object containing the
Expand Down
3 changes: 2 additions & 1 deletion src/databricks/sql/backend/sea/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ def execute_command(
parameters: List[Dict[str, Any]],
async_op: bool,
enforce_embedded_schema_correctness: bool,
row_limit: Optional[int] = None,
) -> Union["ResultSet", None]:
"""
Execute a SQL command using the SEA backend.
Expand Down Expand Up @@ -462,7 +463,7 @@ def execute_command(
format=format,
wait_timeout=(WaitTimeout.ASYNC if async_op else WaitTimeout.SYNC).value,
on_wait_timeout="CONTINUE",
row_limit=max_rows,
row_limit=row_limit,
parameters=sea_parameters if sea_parameters else None,
result_compression=result_compression,
)
Expand Down
4 changes: 3 additions & 1 deletion src/databricks/sql/backend/thrift_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import math
import time
import threading
from typing import List, Union, Any, TYPE_CHECKING
from typing import List, Optional, Union, Any, TYPE_CHECKING

if TYPE_CHECKING:
from databricks.sql.client import Cursor
Expand Down Expand Up @@ -929,6 +929,7 @@ def execute_command(
parameters=[],
async_op=False,
enforce_embedded_schema_correctness=False,
row_limit: Optional[int] = None,
) -> Union["ResultSet", None]:
thrift_handle = session_id.to_thrift_handle()
if not thrift_handle:
Expand Down Expand Up @@ -969,6 +970,7 @@ def execute_command(
useArrowNativeTypes=spark_arrow_types,
parameters=parameters,
enforceEmbeddedSchemaCorrectness=enforce_embedded_schema_correctness,
resultRowLimit=row_limit,
)
resp = self.make_request(self._client.ExecuteStatement, req)

Expand Down
23 changes: 15 additions & 8 deletions src/databricks/sql/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ def cursor(
self,
arraysize: int = DEFAULT_ARRAY_SIZE,
buffer_size_bytes: int = DEFAULT_RESULT_BUFFER_SIZE_BYTES,
row_limit: Optional[int] = None,
) -> "Cursor":
"""
Return a new Cursor object using the connection.
Expand All @@ -355,6 +356,7 @@ def cursor(
self.session.backend,
arraysize=arraysize,
result_buffer_size_bytes=buffer_size_bytes,
row_limit=row_limit,
)
self._cursors.append(cursor)
return cursor
Expand Down Expand Up @@ -388,6 +390,7 @@ def __init__(
backend: DatabricksClient,
result_buffer_size_bytes: int = DEFAULT_RESULT_BUFFER_SIZE_BYTES,
arraysize: int = DEFAULT_ARRAY_SIZE,
row_limit: Optional[int] = None,
) -> None:
"""
These objects represent a database cursor, which is used to manage the context of a fetch
Expand All @@ -397,16 +400,18 @@ def __init__(
visible by other cursors or connections.
"""

self.connection = connection
self.rowcount = -1 # Return -1 as this is not supported
self.buffer_size_bytes = result_buffer_size_bytes
self.connection: Connection = connection

self.rowcount: int = -1 # Return -1 as this is not supported
self.buffer_size_bytes: int = result_buffer_size_bytes
self.active_result_set: Union[ResultSet, None] = None
self.arraysize = arraysize
self.arraysize: int = arraysize
self.row_limit: Optional[int] = row_limit
# Note that Cursor closed => active result set closed, but not vice versa
self.open = True
self.executing_command_id = None
self.backend = backend
self.active_command_id = None
self.open: bool = True
self.executing_command_id: Optional[CommandId] = None
self.backend: DatabricksClient = backend
self.active_command_id: Optional[CommandId] = None
self.escaper = ParamEscaper()
self.lastrowid = None

Expand Down Expand Up @@ -792,6 +797,7 @@ def execute(
parameters=prepared_params,
async_op=False,
enforce_embedded_schema_correctness=enforce_embedded_schema_correctness,
row_limit=self.row_limit,
)

if self.active_result_set and self.active_result_set.is_staging_operation:
Expand Down Expand Up @@ -848,6 +854,7 @@ def execute_async(
parameters=prepared_params,
async_op=True,
enforce_embedded_schema_correctness=enforce_embedded_schema_correctness,
row_limit=self.row_limit,
)

return self
Expand Down
60 changes: 58 additions & 2 deletions tests/e2e/test_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,12 @@ def connection(self, extra_params=()):
conn.close()

@contextmanager
def cursor(self, extra_params=()):
def cursor(self, extra_params=(), extra_cursor_params=()):
with self.connection(extra_params) as conn:
cursor = conn.cursor(
arraysize=self.arraysize, buffer_size_bytes=self.buffer_size_bytes
arraysize=self.arraysize,
buffer_size_bytes=self.buffer_size_bytes,
**dict(extra_cursor_params),
)
try:
yield cursor
Expand Down Expand Up @@ -945,6 +947,60 @@ def test_result_set_close(self):
finally:
cursor.close()

def test_row_limit_with_larger_result(self):
    """A cursor opened with row_limit must cap the rows fetched from a larger query result."""
    row_limit = 1000
    limit_params = {"row_limit": row_limit}
    with self.cursor(extra_cursor_params=limit_params) as limited_cursor:
        # range(2000) yields more rows than the configured limit allows
        limited_cursor.execute("SELECT * FROM range(2000)")
        rows = limited_cursor.fetchall()

        # The fetched row count must equal the limit, not the full range size
        assert len(rows) == row_limit, f"Expected {row_limit} rows, got {len(rows)}"

def test_row_limit_with_smaller_result(self):
    """A row_limit above the query's row count must leave the fetched result untouched."""
    expected_rows = 50
    row_limit = 100
    limit_params = {"row_limit": row_limit}
    with self.cursor(extra_cursor_params=limit_params) as limited_cursor:
        # The query produces fewer rows than the limit permits
        limited_cursor.execute(f"SELECT * FROM range({expected_rows})")
        rows = limited_cursor.fetchall()

        # Every row must come back; the limit should not truncate anything
        assert (
            len(rows) == expected_rows
        ), f"Expected {expected_rows} rows, got {len(rows)}"

@skipUnless(pysql_supports_arrow(), "arrow test needs arrow support")
def test_row_limit_with_arrow_larger_result(self):
    """A cursor opened with row_limit must cap the Arrow table fetched from a larger query result."""
    row_limit = 800
    limit_params = {"row_limit": row_limit}
    with self.cursor(extra_cursor_params=limit_params) as limited_cursor:
        # range(1500) yields more rows than the configured limit allows
        limited_cursor.execute("SELECT * FROM range(1500)")
        arrow_table = limited_cursor.fetchall_arrow()

        # The Arrow table's row count must equal the limit, not the full range size
        assert (
            arrow_table.num_rows == row_limit
        ), f"Expected {row_limit} rows, got {arrow_table.num_rows}"

@skipUnless(pysql_supports_arrow(), "arrow test needs arrow support")
def test_row_limit_with_arrow_smaller_result(self):
    """A row_limit above the query's row count must leave the fetched Arrow table untouched."""
    expected_rows = 100
    row_limit = 200
    limit_params = {"row_limit": row_limit}
    with self.cursor(extra_cursor_params=limit_params) as limited_cursor:
        # The query produces fewer rows than the limit permits
        limited_cursor.execute(f"SELECT * FROM range({expected_rows})")
        arrow_table = limited_cursor.fetchall_arrow()

        # Every row must come back; the limit should not truncate anything
        assert (
            arrow_table.num_rows == expected_rows
        ), f"Expected {expected_rows} rows, got {arrow_table.num_rows}"


# Use a RetrySuite to encapsulate these tests, which we'll typically want to run together; however, keep
# the 429/503 subsuites separate since they execute under different circumstances.
Expand Down
Loading