Skip to content

Partial fix for #1078 — [Add Dataframe display config] #1086

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 52 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
41e6ad2
feat: Add configurable display options for PyDataFrame
kosiew Mar 28, 2025
17d54cd
feat: Enhance DisplayConfig for DataFrame with customizable options
kosiew Mar 28, 2025
fd8f5a1
feat: Add display configuration methods to DataFrame class
kosiew Mar 28, 2025
5aae267
feat: Add display configuration tests for DataFrame
kosiew Mar 28, 2025
bb4516f
feat: Validate display configuration values in DataFrame
kosiew Mar 28, 2025
ca908f0
collect_record_batches_to_display without debug
kosiew Mar 28, 2025
727914d
Add tests for display_config
kosiew Mar 28, 2025
52091ce
fix: Update record batch display logic to use min_table_rows from config
kosiew Mar 28, 2025
da116bf
reuse _create_numeric_test_df
kosiew Mar 28, 2025
ee1de81
feat: Add max_table_rows_in_repr to control row display in DataFrame
kosiew Mar 28, 2025
929563a
tidy up comments, tests
kosiew Mar 28, 2025
cae89b0
Fix ruff errors
kosiew Mar 28, 2025
1bfa8b1
Trigger CI
kosiew Mar 28, 2025
f34a331
Fix ruff errors
kosiew Mar 28, 2025
cb151e3
fix: Simplify error handling in display_config method
kosiew Mar 28, 2025
0d5e900
refactor: Update display configuration handling in DataFrame
kosiew Mar 31, 2025
ba5acc4
Revert "refactor: Update display configuration handling in DataFrame"
kosiew Mar 31, 2025
0e30af3
Refactor PyDataFrame: Simplify methods and improve performance
kosiew Mar 31, 2025
a5d224f
Revert "Refactor PyDataFrame: Simplify methods and improve performance"
kosiew Mar 31, 2025
30c9d99
revert to before DisplayConfig in PyDataFrame
kosiew Apr 2, 2025
028f0ab
feat: Add DataframeDisplayConfig for customizable DataFrame display o…
kosiew Apr 2, 2025
b401e1a
feat: Add method to configure DataFrame display options in PySessionC…
kosiew Apr 2, 2025
d2a1dc9
feat: Add method to configure DataFrame display options in SessionCon…
kosiew Apr 2, 2025
07d7cf6
rename to PyDataframeDisplayConfig
kosiew Apr 2, 2025
625a1f2
feat: Add DataframeDisplayConfig class for customizable DataFrame dis…
kosiew Apr 2, 2025
5dfb9ce
Fix ruff errors
kosiew Apr 2, 2025
065fa40
feat: Enhance PyDataFrame to support customizable display options
kosiew Apr 2, 2025
7fa2c7c
Amend PyDataFrame to use display_config instead of constants
kosiew Apr 2, 2025
cbc4759
refactor: Simplify PySessionConfig and PySessionContext by removing u…
kosiew Apr 2, 2025
1737973
refactor: Update PyDataFrame methods to consistently use display_conf…
kosiew Apr 2, 2025
354ff45
feat: Add display configuration options to SessionContext for DataFra…
kosiew Apr 2, 2025
984b906
fix: Add validation for display configuration properties in Dataframe…
kosiew Apr 2, 2025
1326d71
feat: Integrate DataframeDisplayConfig into SessionContext initializa…
kosiew Apr 2, 2025
0c4eaa6
test: Add tests for DataframeDisplayConfig initialization and Session…
kosiew Apr 2, 2025
eef0a36
debug: Add logging to collect_record_batches_to_display for better tr…
kosiew Apr 2, 2025
815690b
test: Add display configuration tests for DataFrame representation an…
kosiew Apr 2, 2025
a5e16a3
refactor: Remove debug print statements from display configuration tests
kosiew Apr 2, 2025
efc041c
refactor: Extract validation logic into a separate method in Datafram…
kosiew Apr 3, 2025
d30c641
refactor: Enhance DataframeDisplayConfig initialization with value va…
kosiew Apr 3, 2025
b467100
test: Add fixture for test data and refactor tests to use it
kosiew Apr 3, 2025
2993854
fix: Update loop condition in collect_record_batches_to_display for c…
kosiew Apr 3, 2025
71c64b9
fix ruff errors
kosiew Apr 3, 2025
a878ed4
Merge branch 'main' into dataframe-display-config
kosiew Apr 3, 2025
ec7033a
fix ruff errors
kosiew Apr 3, 2025
ad83fc5
feat: Add optional display_config parameter to SessionContext constru…
kosiew Apr 3, 2025
fb90fbc
fix: Update test data size and improve display config tests
kosiew Apr 3, 2025
73edc6a
fix: Remove unused import of 'dis' in test_dataframe.py
kosiew Apr 3, 2025
f08c070
feat: Add display_config parameter to SessionContext constructor
kosiew Apr 3, 2025
2751759
fix: Increase test data size in data fixture for better coverage
kosiew Apr 3, 2025
c109ad2
docs: Add docstring to normalize_uuid function for clarity in testing
kosiew Apr 3, 2025
f3cdfbe
fix ruff errors
kosiew Apr 3, 2025
2fcc2c1
fix clippy errors
kosiew Apr 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions python/datafusion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,16 @@
# The following imports are okay to remain as opaque to the user.
from ._internal import Config
from .catalog import Catalog, Database, Table
from .common import (
DFSchema,
)
from .common import DFSchema
from .context import (
DataframeDisplayConfig,
RuntimeEnvBuilder,
SessionConfig,
SessionContext,
SQLOptions,
)
from .dataframe import DataFrame
from .expr import (
Expr,
WindowFrame,
)
from .expr import Expr, WindowFrame
from .io import read_avro, read_csv, read_json, read_parquet
from .plan import ExecutionPlan, LogicalPlan
from .record_batch import RecordBatch, RecordBatchStream
Expand All @@ -60,6 +56,7 @@
"DFSchema",
"DataFrame",
"Database",
"DataframeDisplayConfig",
"ExecutionPlan",
"Expr",
"LogicalPlan",
Expand Down
154 changes: 147 additions & 7 deletions python/datafusion/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Protocol
from typing import TYPE_CHECKING, Any, Optional, Protocol

try:
from warnings import deprecated # Python 3.13+
Expand All @@ -32,6 +32,7 @@
from datafusion.record_batch import RecordBatchStream
from datafusion.udf import AggregateUDF, ScalarUDF, WindowUDF

from ._internal import DataframeDisplayConfig as DataframeDisplayConfigInternal
from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal
from ._internal import SessionConfig as SessionConfigInternal
from ._internal import SessionContext as SessionContextInternal
Expand Down Expand Up @@ -78,6 +79,106 @@ class TableProviderExportable(Protocol):
def __datafusion_table_provider__(self) -> object: ... # noqa: D105


class DataframeDisplayConfig:
    """Display settings used when a DataFrame is rendered in Python.

    The values themselves are held by the wrapped internal configuration
    object, exposed as ``config_internal``; this class validates every
    value before handing it over.
    """

    def __init__(
        self,
        max_table_bytes: Optional[int] = None,
        min_table_rows: Optional[int] = None,
        max_cell_length: Optional[int] = None,
        max_table_rows_in_repr: Optional[int] = None,
    ) -> None:
        """Build a :py:class:`DataframeDisplayConfig`.

        Any option left as ``None`` is passed through unchanged to the
        internal configuration object.

        Args:
            max_table_bytes: Maximum bytes to display for table presentation
                (default: 2MB)
            min_table_rows: Minimum number of table rows to display
                (default: 20)
            max_cell_length: Maximum length of a cell before it gets minimized
                (default: 25)
            max_table_rows_in_repr: Maximum number of rows to display in repr
                string output (default: 10)

        Raises:
            ValueError: If a supplied option is not greater than 0.
        """
        # Insertion order matters: options are validated in declaration
        # order, so the first offending option is the one reported.
        options = {
            "max_table_bytes": max_table_bytes,
            "min_table_rows": min_table_rows,
            "max_cell_length": max_cell_length,
            "max_table_rows_in_repr": max_table_rows_in_repr,
        }
        for option_name, option_value in options.items():
            if option_value is None:
                continue
            self._validate_positive(option_value, option_name)
        self.config_internal = DataframeDisplayConfigInternal(**options)

    def _validate_positive(self, value: int, name: str) -> None:
        """Reject values that are zero or negative.

        Args:
            value: The value to check
            name: Parameter name used in the error message

        Raises:
            ValueError: If ``value`` is not greater than 0
        """
        if value <= 0:
            complaint = f"{name} must be greater than 0"
            raise ValueError(complaint)

    @property
    def max_table_bytes(self) -> int:
        """Maximum bytes to display for table presentation."""
        internal = self.config_internal
        return internal.max_table_bytes

    @max_table_bytes.setter
    def max_table_bytes(self, value: int) -> None:
        """Validate and store the maximum bytes for table presentation."""
        self._validate_positive(value, "max_table_bytes")
        internal = self.config_internal
        internal.max_table_bytes = value

    @property
    def min_table_rows(self) -> int:
        """Minimum number of table rows to display."""
        internal = self.config_internal
        return internal.min_table_rows

    @min_table_rows.setter
    def min_table_rows(self, value: int) -> None:
        """Validate and store the minimum number of table rows to display."""
        self._validate_positive(value, "min_table_rows")
        internal = self.config_internal
        internal.min_table_rows = value

    @property
    def max_cell_length(self) -> int:
        """Maximum length of a cell before it gets minimized."""
        internal = self.config_internal
        return internal.max_cell_length

    @max_cell_length.setter
    def max_cell_length(self, value: int) -> None:
        """Validate and store the maximum cell length."""
        self._validate_positive(value, "max_cell_length")
        internal = self.config_internal
        internal.max_cell_length = value

    @property
    def max_table_rows_in_repr(self) -> int:
        """Maximum number of rows to display in repr string output."""
        internal = self.config_internal
        return internal.max_table_rows_in_repr

    @max_table_rows_in_repr.setter
    def max_table_rows_in_repr(self, value: int) -> None:
        """Validate and store the maximum number of rows shown in repr output."""
        self._validate_positive(value, "max_table_rows_in_repr")
        internal = self.config_internal
        internal.max_table_rows_in_repr = value


class SessionConfig:
"""Session configuration options."""

Expand Down Expand Up @@ -470,6 +571,7 @@ def __init__(
self,
config: SessionConfig | None = None,
runtime: RuntimeEnvBuilder | None = None,
display_config: DataframeDisplayConfig | None = None,
) -> None:
"""Main interface for executing queries with DataFusion.

Expand All @@ -480,7 +582,7 @@ def __init__(
Args:
config: Session configuration options.
runtime: Runtime configuration options.

display_config: DataFrame display configuration options.
Example usage:

The following example demonstrates how to use the context to execute
Expand All @@ -493,8 +595,10 @@ def __init__(
"""
config = config.config_internal if config is not None else None
runtime = runtime.config_internal if runtime is not None else None

self.ctx = SessionContextInternal(config, runtime)
display_config = (
display_config.config_internal if display_config is not None else None
)
self.ctx = SessionContextInternal(config, runtime, display_config)

@classmethod
def global_ctx(cls) -> SessionContext:
Expand All @@ -508,6 +612,40 @@ def global_ctx(cls) -> SessionContext:
wrapper.ctx = internal_ctx
return wrapper

def with_display_config(
    self,
    max_table_bytes: Optional[int] = None,
    min_table_rows: Optional[int] = None,
    max_cell_length: Optional[int] = None,
    max_table_rows_in_repr: Optional[int] = None,
) -> SessionContext:
    """Return a context whose DataFrames use the given display options.

    The options are first wrapped in a :py:class:`DataframeDisplayConfig`,
    which validates them, and are then handed to the underlying context.

    Args:
        max_table_bytes: Maximum bytes to display for table presentation
            (default: 2MB)
        min_table_rows: Minimum number of table rows to display
            (default: 20)
        max_cell_length: Maximum length of a cell before it gets minimized
            (default: 25)
        max_table_rows_in_repr: Maximum number of rows to display in repr
            string output (default: 10)

    Returns:
        A new :py:class:`SessionContext` object with the updated display settings.
    """
    settings = DataframeDisplayConfig(
        max_table_bytes=max_table_bytes,
        min_table_rows=min_table_rows,
        max_cell_length=max_cell_length,
        max_table_rows_in_repr=max_table_rows_in_repr,
    )

    # Build the wrapper without running ``__init__`` so that no second
    # internal context is created; only ``ctx`` is populated.
    updated = self.__class__.__new__(self.__class__)
    updated.ctx = self.ctx.with_display_config(settings.config_internal)
    return updated

def enable_url_table(self) -> SessionContext:
"""Control if local files can be queried as tables.

Expand Down Expand Up @@ -806,9 +944,11 @@ def register_parquet(
file_extension,
skip_metadata,
schema,
[sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order]
if file_sort_order is not None
else None,
(
[sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order]
if file_sort_order is not None
else None
),
)

def register_csv(
Expand Down
Loading