From 41e6ad290d7d2c0d4328b2ea12bc16bbb63fbb90 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Fri, 28 Mar 2025 12:45:14 +0800
Subject: [PATCH 01/51] feat: Add configurable display options for PyDataFrame

- Introduced DisplayConfig struct to manage display settings such as
  max_table_bytes, min_table_rows, and max_cell_length.
- Updated PyDataFrame to utilize DisplayConfig for rendering and displaying
  DataFrames.
- Added methods to configure and reset display settings, allowing users to
  customize their DataFrame presentation in Python.
---
 src/dataframe.rs | 82 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 71 insertions(+), 11 deletions(-)

diff --git a/src/dataframe.rs b/src/dataframe.rs
index be10b8c28..623a2e05f 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -72,9 +72,27 @@ impl PyTableProvider {
         PyTable::new(table_provider)
     }
 }
-const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB
-const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20;
-const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25;
+
+/// Configuration for DataFrame display in Python environment
+#[derive(Debug, Clone)]
+pub struct DisplayConfig {
+    /// Maximum bytes to display for table presentation (default: 2MB)
+    pub max_table_bytes: usize,
+    /// Minimum number of table rows to display (default: 20)
+    pub min_table_rows: usize,
+    /// Maximum length of a cell before it gets minimized (default: 25)
+    pub max_cell_length: usize,
+}
+
+impl Default for DisplayConfig {
+    fn default() -> Self {
+        Self {
+            max_table_bytes: 2 * 1024 * 1024, // 2 MB
+            min_table_rows: 20,
+            max_cell_length: 25,
+        }
+    }
+}
 
 /// A PyDataFrame is a representation of a logical plan and an API to compose statements.
 /// Use it to build a plan and `.collect()` to execute the plan and collect the result.
@@ -83,12 +101,16 @@
 #[derive(Clone)]
 pub struct PyDataFrame {
     df: Arc<DataFrame>,
+    config: Arc<DisplayConfig>,
 }
 
 impl PyDataFrame {
     /// creates a new PyDataFrame
     pub fn new(df: DataFrame) -> Self {
-        Self { df: Arc::new(df) }
+        Self {
+            df: Arc::new(df),
+            config: Arc::new(DisplayConfig::default()),
+        }
     }
 }
@@ -118,7 +140,7 @@
     fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
         let (batches, has_more) = wait_for_future(
             py,
-            collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10),
+            collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10, &self.config),
         )?;
         if batches.is_empty() {
             // This should not be reached, but do it for safety since we index into the vector below
@@ -141,8 +163,9 @@
             py,
             collect_record_batches_to_display(
                 self.df.as_ref().clone(),
-                MIN_TABLE_ROWS_TO_DISPLAY,
+                self.config.min_table_rows,
                 usize::MAX,
+                &self.config,
             ),
         )?;
         if batches.is_empty() {
@@ -218,8 +241,8 @@
         for (col, formatter) in batch_formatter.iter().enumerate() {
             let cell_data = formatter.value(batch_row).to_string();
             // From testing, primitive data types do not typically get larger than 21 characters
-            if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE {
-                let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE];
+            if cell_data.len() > self.config.max_cell_length {
+                let short_cell_data = &cell_data[0..self.config.max_cell_length];
                 cells.push(format!("
@@ -797,6 +820,42 @@ impl PyDataFrame { fn count(&self, py: Python) -> PyDataFusionResult { Ok(wait_for_future(py, self.df.as_ref().clone().count())?) } + + /// Get the current display configuration + #[getter] + fn display_config(&self) -> DisplayConfig { + (*self.config).clone() + } + + /// Update display configuration + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None))] + fn configure_display( + &mut self, + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + ) { + let mut new_config = (*self.config).clone(); + + if let Some(bytes) = max_table_bytes { + new_config.max_table_bytes = bytes; + } + + if let Some(rows) = min_table_rows { + new_config.min_table_rows = rows; + } + + if let Some(length) = max_cell_length { + new_config.max_cell_length = length; + } + + self.config = Arc::new(new_config); + } + + /// Reset display configuration to default values + fn reset_display_config(&mut self) { + self.config = Arc::new(DisplayConfig::default()); + } } /// Print DataFrame @@ -886,6 +945,7 @@ async fn collect_record_batches_to_display( df: DataFrame, min_rows: usize, max_rows: usize, + config: &DisplayConfig, ) -> Result<(Vec, bool), DataFusionError> { let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); @@ -894,7 +954,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) + while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { @@ -909,8 +969,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { - let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > config.max_table_bytes { + let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; From 17d54cdb3faf2ac19888987154d4805c9eb3bf40 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 12:53:47 +0800 Subject: [PATCH 02/51] feat: Enhance DisplayConfig for DataFrame with customizable options - Added DisplayConfig struct for configuring DataFrame display in Python. - Introduced fields: max_table_bytes, min_table_rows, and max_cell_length with default values. - Implemented a constructor for DisplayConfig to allow optional customization. - Updated display_config method in PyDataFrame to return a Python object of DisplayConfig. 
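- Example (an illustrative sketch of the Python-facing API these two patches
  add; names as introduced above, exact module paths may differ):

      from datafusion import SessionContext

      ctx = SessionContext()
      df = ctx.sql("SELECT 1 AS a")

      # Tune how tables render in __repr__ and notebook output.
      df.configure_display(
          max_table_bytes=1024 * 1024,  # cap rendered data at 1 MB
          min_table_rows=10,            # show at least 10 rows
          max_cell_length=50,           # truncate cells beyond 50 chars
      )
      print(df.display_config.min_table_rows)  # -> 10

      # Restore the defaults (2 MB / 20 rows / 25 chars).
      df.reset_display_config()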
--- src/dataframe.rs | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index 623a2e05f..cb9ae9e18 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -74,16 +74,38 @@ impl PyTableProvider { } /// Configuration for DataFrame display in Python environment +#[pyclass(name = "DisplayConfig", module = "datafusion")] #[derive(Debug, Clone)] pub struct DisplayConfig { /// Maximum bytes to display for table presentation (default: 2MB) + #[pyo3(get, set)] pub max_table_bytes: usize, /// Minimum number of table rows to display (default: 20) + #[pyo3(get, set)] pub min_table_rows: usize, /// Maximum length of a cell before it gets minimized (default: 25) + #[pyo3(get, set)] pub max_cell_length: usize, } +#[pymethods] +impl DisplayConfig { + #[new] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None))] + fn new( + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + ) -> Self { + let default = DisplayConfig::default(); + Self { + max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), + min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), + max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), + } + } +} + impl Default for DisplayConfig { fn default() -> Self { Self { @@ -823,8 +845,15 @@ impl PyDataFrame { /// Get the current display configuration #[getter] - fn display_config(&self) -> DisplayConfig { - (*self.config).clone() + fn display_config(&self) -> PyResult> { + Python::with_gil(|py| { + let config = DisplayConfig { + max_table_bytes: self.config.max_table_bytes, + min_table_rows: self.config.min_table_rows, + max_cell_length: self.config.max_cell_length, + }; + Py::new(py, config).map_err(PyErr::from) + }) } /// Update display configuration From fd8f5a1a8762dc359947ad52a1cdb77f1edd3059 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 13:04:41 +0800 Subject: [PATCH 03/51] feat: Add display configuration methods to DataFrame class - Introduced `configure_display` method to set customizable display options for DataFrame representation, including maximum bytes, minimum rows, and maximum cell length. - Added `reset_display_config` method to restore default display settings. - Implemented `display_config` property to retrieve current display configuration. --- python/datafusion/dataframe.py | 27 +++++++++++++++++++++++++++ src/dataframe.rs | 1 + 2 files changed, 28 insertions(+) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 26fe8f453..f7d964820 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -813,6 +813,33 @@ def count(self) -> int: """ return self.df.count() + def configure_display( + self, + max_table_bytes: Optional[int] = None, + min_table_rows: Optional[int] = None, + max_cell_length: Optional[int] = None, + ) -> None: + """Configure display options for DataFrame representation. + + Args: + max_table_bytes: Maximum bytes to display for table presentation (default: 2MB). + Set to lower value for large tables to limit memory usage. + min_table_rows: Minimum number of table rows to display (default: 20). + This is used for initial display and in notebooks. + max_cell_length: Maximum length of a cell before it gets minimized (default: 25). + Longer cells will be truncated with an expand button. 
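        Example (illustrative; assumes ``df`` is an existing DataFrame):
            df.configure_display(max_cell_length=10)
            html = df._repr_html_()  # long cells render truncated, with an expand button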
+ """ + self.df.configure_display(max_table_bytes, min_table_rows, max_cell_length) + + def reset_display_config(self) -> None: + """Reset display configuration to default values.""" + self.df.reset_display_config() + + @property + def display_config(self): + """Get the current display configuration.""" + return self.df.display_config + @deprecated("Use :py:func:`unnest_columns` instead.") def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame: """See :py:func:`unnest_columns`.""" diff --git a/src/dataframe.rs b/src/dataframe.rs index cb9ae9e18..e71fb6424 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -882,6 +882,7 @@ impl PyDataFrame { } /// Reset display configuration to default values + #[pyo3(text_signature = "($self)")] fn reset_display_config(&mut self) { self.config = Arc::new(DisplayConfig::default()); } From 5aae267695115d52afc35bbeebf0ea3762be11de Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 13:10:18 +0800 Subject: [PATCH 04/51] feat: Add display configuration tests for DataFrame - Implemented tests for accessing and modifying display configuration properties in the DataFrame class. - Added `test_display_config` to verify default values of display settings. - Created `test_configure_display` to test setting and partially updating display configuration. - Introduced `test_reset_display_config` to ensure resetting configuration restores default values. --- python/tests/test_dataframe.py | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eda13930d..17ddde2ae 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1261,3 +1261,55 @@ def test_dataframe_repr_html(df) -> None: body_lines = [f"{v}" for inner in body_data for v in inner] body_pattern = "(.*?)".join(body_lines) assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 + + +def test_display_config(df): + """Test the display configuration properties are accessible.""" + config = df.display_config + + # Verify default values + assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB + assert config.min_table_rows == 20 + assert config.max_cell_length == 25 + + +def test_configure_display(df): + """Test setting display configuration properties.""" + # Modify the display configuration + df.configure_display( + max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50 # 1 MB + ) + + # Verify the changes took effect + config = df.display_config + assert config.max_table_bytes == 1024 * 1024 # 1 MB + assert config.min_table_rows == 10 + assert config.max_cell_length == 50 + + # Test partial update (only changing one property) + df.configure_display(min_table_rows=5) + config = df.display_config + assert config.max_table_bytes == 1024 * 1024 # previous value retained + assert config.min_table_rows == 5 # only this value changed + assert config.max_cell_length == 50 # previous value retained + + +def test_reset_display_config(df): + """Test resetting display configuration to defaults.""" + # First modify the configuration + df.configure_display( + max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50 + ) + + # Verify changes took effect + config = df.display_config + assert config.max_table_bytes == 1024 * 1024 + + # Now reset to defaults + df.reset_display_config() + + # Verify defaults are restored + config = df.display_config + assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB + assert config.min_table_rows == 20 + 
assert config.max_cell_length == 25 From bb4516f6a088cf4cd40f79caa412a65e3eea0a30 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 14:24:01 +0800 Subject: [PATCH 05/51] feat: Validate display configuration values in DataFrame - Added validation to ensure max_table_bytes, min_table_rows, and max_cell_length are greater than 0 in the configure_display method of DataFrame class. - Updated test cases to cover scenarios for zero and negative values, ensuring proper error handling. - Enhanced existing tests to validate extreme values and confirm expected behavior for display configurations. --- python/datafusion/dataframe.py | 9 ++ python/tests/test_dataframe.py | 201 +++++++++++++++++++++++++++++++++ src/dataframe.rs | 114 +++++++++++++++++-- 3 files changed, 315 insertions(+), 9 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index f7d964820..a0688819b 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -828,7 +828,16 @@ def configure_display( This is used for initial display and in notebooks. max_cell_length: Maximum length of a cell before it gets minimized (default: 25). Longer cells will be truncated with an expand button. + + Raises: + ValueError: If any of the provided values are less than or equal to 0. """ + if any( + value is not None and value <= 0 + for value in (max_table_bytes, min_table_rows, max_cell_length) + ): + raise ValueError("All values must be greater than 0.") + self.df.configure_display(max_table_bytes, min_table_rows, max_cell_length) def reset_display_config(self) -> None: diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 17ddde2ae..5b7bc7098 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1293,6 +1293,35 @@ def test_configure_display(df): assert config.min_table_rows == 5 # only this value changed assert config.max_cell_length == 50 # previous value retained + # Test with extreme values (still valid, but potentially problematic) + # Zero values + with pytest.raises(ValueError, match=r".*must be greater than 0.*"): + df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0) + + # Very large values + df.configure_display( + max_table_bytes=10**12, min_table_rows=10**6, max_cell_length=10**4 + ) + config = df.display_config + assert config.max_table_bytes == 10**12 # 1 TB + assert config.min_table_rows == 10**6 # 1 million rows + assert config.max_cell_length == 10**4 # 10,000 chars per cell + + # Test with negative values + # This tests for expected behavior when users accidentally pass negative values + # Since these are usize in Rust, we expect a Python TypeError when trying to pass negative values + with pytest.raises(ValueError, match=r".*must be greater than 0.*"): + df.configure_display(max_table_bytes=-1) + + with pytest.raises(ValueError, match=r".*must be greater than 0.*"): + df.configure_display(min_table_rows=-5) + + with pytest.raises(ValueError, match=r".*must be greater than 0.*"): + df.configure_display(max_cell_length=-10) + + # Reset for next tests + df.reset_display_config() + def test_reset_display_config(df): """Test resetting display configuration to defaults.""" @@ -1313,3 +1342,175 @@ def test_reset_display_config(df): assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB assert config.min_table_rows == 20 assert config.max_cell_length == 25 + + +def test_min_table_rows_display(ctx): + """Test that at least min_table_rows rows are displayed.""" + # Create a 
dataframe with more rows than the default min_table_rows + rows = 100 + data = list(range(rows)) + batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) + df = ctx.create_dataframe([[batch]]) + + # Set min_table_rows to a specific value + custom_min_rows = 30 + df.configure_display(min_table_rows=custom_min_rows) + + # Get HTML representation + html_output = df._repr_html_() + + # Count table rows in the HTML (excluding header row) + # Each row has a tag + row_count = html_output.count("") - 1 # subtract 1 for the header row + + # Verify at least min_table_rows rows are displayed + assert ( + row_count >= custom_min_rows + ), f"Expected at least {custom_min_rows} rows, got {row_count}" + + # If data was truncated, "Data truncated" message should be present + if row_count < rows: + assert "Data truncated" in html_output + + +def test_max_table_bytes_display(ctx): + """Test that reducing max_table_bytes limits the amount of data displayed.""" + # Create a dataframe with large string values to consume memory + # Each string is approximately 1000 bytes + large_strings = ["x" * 1000 for _ in range(50)] + batch = pa.RecordBatch.from_arrays([pa.array(large_strings)], names=["large_data"]) + df = ctx.create_dataframe([[batch]]) + + # First test with default settings + default_html = df._repr_html_() + default_row_count = default_html.count("") - 1 # subtract header row + + # Now set a very small max_table_bytes + df.configure_display(max_table_bytes=5000) # 5KB should only fit a few rows + limited_html = df._repr_html_() + limited_row_count = limited_html.count("") - 1 + + # Verify fewer rows are displayed with the byte limit + assert ( + limited_row_count < default_row_count + ), f"Expected fewer rows with byte limit. Default: {default_row_count}, Limited: {limited_row_count}" + + # "Data truncated" should be present when limited + assert "Data truncated" in limited_html + + +def test_max_cell_length_display(ctx): + """Test that cells longer than max_cell_length are truncated in display.""" + # Create a dataframe with long string values + long_strings = [ + "short", + "medium text", + "this is a very long string that should be truncated", + ] + batch = pa.RecordBatch.from_arrays([pa.array(long_strings)], names=["text"]) + df = ctx.create_dataframe([[batch]]) + + # Set a small max_cell_length + max_length = 10 + df.configure_display(max_cell_length=max_length) + + # Get HTML representation + html_output = df._repr_html_() + + # Check for expand button for long text + assert "expandable-container" in html_output + + # Check that expandable class is used for long text + assert 'class="expandable"' in html_output + + # Look for the truncated text and expand button + long_text = long_strings[2] + assert long_text[:max_length] in html_output # Truncated text should be present + assert "expand-btn" in html_output # Expand button should be present + assert long_text in html_output # Full text should also be in the HTML (hidden) + + +def test_display_config_repr_string(ctx): + """Test that __repr__ respects display configuration.""" + # Create a dataframe with more rows than we want to show + rows = 30 + data = list(range(rows)) + batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) + df = ctx.create_dataframe([[batch]]) + + # Configure to show only 5 rows in string representation + df.configure_display(min_table_rows=5) + + # Get the string representation + repr_str = df.__repr__() + + # The string should contain "Data truncated" + assert "Data truncated" in repr_str + 
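    # (Illustrative note: the "Data truncated" marker comes from the Rust
    # repr path when collect_record_batches_to_display reports has_more=True,
    # i.e. when batches were sliced to fit the limits or more data remained.)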
+ # Count the number of rows (each value should be on a separate line) + # This is an approximation since we don't parse the actual ASCII table + value_lines = 0 + for i in range(rows): + if str(i) in repr_str: + value_lines += 1 + + # Should be fewer rows than the total + assert value_lines < rows + + # Now set min_rows higher and see if more rows appear + df.configure_display(min_table_rows=20) + repr_str_more = df.__repr__() + + value_lines_more = 0 + for i in range(rows): + if str(i) in repr_str_more: + value_lines_more += 1 + + assert value_lines_more > value_lines + + +def test_display_config_integrated(ctx): + """Test all display config options together in an integrated test.""" + # Create a dataframe with: + # - Many rows (to test min_table_rows) + # - Large data (to test max_table_bytes) + # - Long strings (to test max_cell_length) + rows = 50 + ids = list(range(rows)) + # Generate strings of increasing length + texts = [f"{'A' * i}" for i in range(1, rows + 1)] + + batch = pa.RecordBatch.from_arrays( + [pa.array(ids), pa.array(texts)], names=["id", "text"] + ) + + df = ctx.create_dataframe([[batch]]) + + # Set custom display configuration + df.configure_display( + max_table_bytes=2000, # Limit bytes to display + min_table_rows=15, # Show at least 15 rows + max_cell_length=10, # Truncate cells longer than 10 chars + ) + + # Get HTML representation + html_output = df._repr_html_() + + # Check row count + row_count = html_output.count("") - 1 # subtract header + assert row_count >= 15, f"Should display at least 15 rows, got {row_count}" + + # Check for truncation + assert "expandable-container" in html_output + assert "expand-btn" in html_output + + # Should be truncated (not all rows displayed) + assert "Data truncated" in html_output + + # Now with default settings + df.reset_display_config() + default_html = df._repr_html_() + default_row_count = default_html.count("") - 1 + + # Default settings should show more data + assert default_row_count > row_count diff --git a/src/dataframe.rs b/src/dataframe.rs index e71fb6424..9c53b7671 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -847,11 +847,7 @@ impl PyDataFrame { #[getter] fn display_config(&self) -> PyResult> { Python::with_gil(|py| { - let config = DisplayConfig { - max_table_bytes: self.config.max_table_bytes, - min_table_rows: self.config.min_table_rows, - max_cell_length: self.config.max_cell_length, - }; + let config = (*self.config).clone(); Py::new(py, config).map_err(PyErr::from) }) } @@ -924,7 +920,7 @@ fn record_batch_into_schema( ) -> Result { let schema = Arc::new(schema.clone()); let base_schema = record_batch.schema(); - if base_schema.fields().len() == 0 { + if (base_schema.fields().len() == 0) { // Nothing to project return Ok(RecordBatch::new_empty(schema)); } @@ -984,11 +980,36 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; + println!( + "==> Starting loop with min_rows: {}, max_rows: {}, max_table_bytes: {}", + min_rows, max_rows, config.max_table_bytes + ); + while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { + println!( + "==> Loop condition: size_estimate_so_far ({}) < max_table_bytes ({})? {}", + size_estimate_so_far, + config.max_table_bytes, + size_estimate_so_far < config.max_table_bytes + ); + println!( + "==> Loop condition: rows_so_far ({}) < max_rows ({})? 
{}", + rows_so_far, + max_rows, + rows_so_far < max_rows + ); + println!( + "==> Loop condition: rows_so_far ({}) < min_rows ({})? {}", + rows_so_far, + min_rows, + rows_so_far < min_rows + ); + let mut rb = match stream.next().await { None => { + println!("==> Exiting loop: stream.next() returned None (no more data)"); break; } Some(Ok(r)) => r, @@ -996,48 +1017,123 @@ async fn collect_record_batches_to_display( }; let mut rows_in_rb = rb.num_rows(); + println!("==> Received batch with {} rows", rows_in_rb); + if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); + println!("==> New size_estimate_so_far: {}", size_estimate_so_far); if size_estimate_so_far > config.max_table_bytes { + println!( + "==> Size limit reached: {} > {}", + size_estimate_so_far, config.max_table_bytes + ); let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; if reduced_row_num < min_rows { reduced_row_num = min_rows.min(total_rows); + println!( + "==> Adjusted reduced_row_num to {} to meet min_rows", + reduced_row_num + ); } let limited_rows_this_rb = reduced_row_num - rows_so_far; + println!( + "==> Limiting to {} rows in this batch (reduced_row_num: {}, rows_so_far: {})", + limited_rows_this_rb, reduced_row_num, rows_so_far + ); + if limited_rows_this_rb < rows_in_rb { rows_in_rb = limited_rows_this_rb; rb = rb.slice(0, limited_rows_this_rb); has_more = true; + println!("==> Sliced batch to {} rows", limited_rows_this_rb); } } if rows_in_rb + rows_so_far > max_rows { + println!( + "==> Row limit reached: {} + {} > {}", + rows_in_rb, rows_so_far, max_rows + ); rb = rb.slice(0, max_rows - rows_so_far); has_more = true; + println!( + "==> Sliced batch to {} rows to meet max_rows", + max_rows - rows_so_far + ); } rows_so_far += rb.num_rows(); record_batches.push(rb); + println!( + "==> Added batch: size_estimate_so_far: {}, rows_so_far: {}", + size_estimate_so_far, rows_so_far + ); + } else { + println!("==> Skipping empty batch"); } } + println!("==> Exited while loop: size_estimate_so_far: {}, rows_so_far: {}, min_rows: {}, max_rows: {}", + size_estimate_so_far, rows_so_far, min_rows, max_rows); + println!("==> Loop condition evaluation at exit:"); + println!( + "==> size_estimate_so_far < config.max_table_bytes: {} < {} = {}", + size_estimate_so_far, + config.max_table_bytes, + size_estimate_so_far < config.max_table_bytes + ); + println!( + "==> rows_so_far < max_rows: {} < {} = {}", + rows_so_far, + max_rows, + rows_so_far < max_rows + ); + println!( + "==> rows_so_far < min_rows: {} < {} = {}", + rows_so_far, + min_rows, + rows_so_far < min_rows + ); + println!( + "==> Combined condition: {} || {} = {}", + (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows), + rows_so_far < min_rows, + (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) + || rows_so_far < min_rows + ); + if record_batches.is_empty() { + println!("==> No record batches collected"); return Ok((Vec::default(), false)); } if !has_more { // Data was not already truncated, so check to see if more record batches remain has_more = match stream.try_next().await { - Ok(None) => false, // reached end - Ok(Some(_)) => true, - Err(_) => false, // Stream disconnected + Ok(None) => { + println!("==> No more record batches in stream"); + false + } // reached end + Ok(Some(_)) => { + println!("==> More record batches available in stream"); + true + } + Err(_) => { + 
println!("==> Stream error or disconnected"); + false + } // Stream disconnected }; } + println!( + "==> Returning {} record batches, has_more: {}", + record_batches.len(), + has_more + ); Ok((record_batches, has_more)) } From ca908f05d0fe39635522831c149a63295f5b8402 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 14:25:59 +0800 Subject: [PATCH 06/51] collect_record_batches_to_display without debug --- src/dataframe.rs | 108 ++--------------------------------------------- 1 file changed, 4 insertions(+), 104 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index 9c53b7671..9381d8407 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -920,7 +920,7 @@ fn record_batch_into_schema( ) -> Result { let schema = Arc::new(schema.clone()); let base_schema = record_batch.schema(); - if (base_schema.fields().len() == 0) { + if base_schema.fields().len() == 0 { // Nothing to project return Ok(RecordBatch::new_empty(schema)); } @@ -980,36 +980,11 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - println!( - "==> Starting loop with min_rows: {}, max_rows: {}, max_table_bytes: {}", - min_rows, max_rows, config.max_table_bytes - ); - while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { - println!( - "==> Loop condition: size_estimate_so_far ({}) < max_table_bytes ({})? {}", - size_estimate_so_far, - config.max_table_bytes, - size_estimate_so_far < config.max_table_bytes - ); - println!( - "==> Loop condition: rows_so_far ({}) < max_rows ({})? {}", - rows_so_far, - max_rows, - rows_so_far < max_rows - ); - println!( - "==> Loop condition: rows_so_far ({}) < min_rows ({})? {}", - rows_so_far, - min_rows, - rows_so_far < min_rows - ); - let mut rb = match stream.next().await { None => { - println!("==> Exiting loop: stream.next() returned None (no more data)"); break; } Some(Ok(r)) => r, @@ -1017,123 +992,48 @@ async fn collect_record_batches_to_display( }; let mut rows_in_rb = rb.num_rows(); - println!("==> Received batch with {} rows", rows_in_rb); - if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - println!("==> New size_estimate_so_far: {}", size_estimate_so_far); if size_estimate_so_far > config.max_table_bytes { - println!( - "==> Size limit reached: {} > {}", - size_estimate_so_far, config.max_table_bytes - ); let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; if reduced_row_num < min_rows { reduced_row_num = min_rows.min(total_rows); - println!( - "==> Adjusted reduced_row_num to {} to meet min_rows", - reduced_row_num - ); } let limited_rows_this_rb = reduced_row_num - rows_so_far; - println!( - "==> Limiting to {} rows in this batch (reduced_row_num: {}, rows_so_far: {})", - limited_rows_this_rb, reduced_row_num, rows_so_far - ); - if limited_rows_this_rb < rows_in_rb { rows_in_rb = limited_rows_this_rb; rb = rb.slice(0, limited_rows_this_rb); has_more = true; - println!("==> Sliced batch to {} rows", limited_rows_this_rb); } } if rows_in_rb + rows_so_far > max_rows { - println!( - "==> Row limit reached: {} + {} > {}", - rows_in_rb, rows_so_far, max_rows - ); rb = rb.slice(0, max_rows - rows_so_far); has_more = true; - println!( - "==> Sliced batch to {} rows to meet max_rows", - max_rows - rows_so_far - ); } rows_so_far += rb.num_rows(); record_batches.push(rb); - println!( - "==> 
Added batch: size_estimate_so_far: {}, rows_so_far: {}", - size_estimate_so_far, rows_so_far - ); - } else { - println!("==> Skipping empty batch"); } } - println!("==> Exited while loop: size_estimate_so_far: {}, rows_so_far: {}, min_rows: {}, max_rows: {}", - size_estimate_so_far, rows_so_far, min_rows, max_rows); - println!("==> Loop condition evaluation at exit:"); - println!( - "==> size_estimate_so_far < config.max_table_bytes: {} < {} = {}", - size_estimate_so_far, - config.max_table_bytes, - size_estimate_so_far < config.max_table_bytes - ); - println!( - "==> rows_so_far < max_rows: {} < {} = {}", - rows_so_far, - max_rows, - rows_so_far < max_rows - ); - println!( - "==> rows_so_far < min_rows: {} < {} = {}", - rows_so_far, - min_rows, - rows_so_far < min_rows - ); - println!( - "==> Combined condition: {} || {} = {}", - (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows), - rows_so_far < min_rows, - (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) - || rows_so_far < min_rows - ); - if record_batches.is_empty() { - println!("==> No record batches collected"); return Ok((Vec::default(), false)); } if !has_more { // Data was not already truncated, so check to see if more record batches remain has_more = match stream.try_next().await { - Ok(None) => { - println!("==> No more record batches in stream"); - false - } // reached end - Ok(Some(_)) => { - println!("==> More record batches available in stream"); - true - } - Err(_) => { - println!("==> Stream error or disconnected"); - false - } // Stream disconnected + Ok(None) => false, // reached end + Ok(Some(_)) => true, + Err(_) => false, // Stream disconnected }; } - println!( - "==> Returning {} record batches, has_more: {}", - record_batches.len(), - has_more - ); Ok((record_batches, has_more)) } From 727914d63e0ce8b081f8d288dfba4bfb445830cd Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 15:11:43 +0800 Subject: [PATCH 07/51] Add tests for display_config --- python/tests/test_dataframe.py | 76 ++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 5b7bc7098..99c125178 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1433,40 +1433,59 @@ def test_max_cell_length_display(ctx): def test_display_config_repr_string(ctx): """Test that __repr__ respects display configuration.""" # Create a dataframe with more rows than we want to show - rows = 30 - data = list(range(rows)) - batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) - df = ctx.create_dataframe([[batch]]) + # df.__repr__ returns max 10 rows only, so we start test with 7 rows + rows = 7 + df = _create_numeric_test_df(ctx, rows) # Configure to show only 5 rows in string representation - df.configure_display(min_table_rows=5) + min_table_rows_in_display = 5 + df.configure_display(min_table_rows=min_table_rows_in_display) # Get the string representation repr_str = df.__repr__() - # The string should contain "Data truncated" - assert "Data truncated" in repr_str - - # Count the number of rows (each value should be on a separate line) - # This is an approximation since we don't parse the actual ASCII table - value_lines = 0 - for i in range(rows): - if str(i) in repr_str: - value_lines += 1 + # Count the number of rows using helper function + lines_count = _count_lines_in_str(repr_str) # Should be fewer rows than the total - assert value_lines < rows + 
assert lines_count <= rows + assert lines_count >= min_table_rows_in_display # Now set min_rows higher and see if more rows appear - df.configure_display(min_table_rows=20) + min_table_rows_in_display = 7 + rows = 11 + df = _create_numeric_test_df(ctx, rows) # Recreate to reset the state + df.configure_display(min_table_rows=min_table_rows_in_display) + repr_str_more = df.__repr__() + # The string should contain "Data truncated" + assert "Data truncated" in repr_str_more + + # Count lines again + lines_count2 = _count_lines_in_str(repr_str_more) + + # Should show more rows now + assert lines_count2 > lines_count + assert lines_count2 >= min_table_rows_in_display - value_lines_more = 0 - for i in range(rows): - if str(i) in repr_str_more: - value_lines_more += 1 - assert value_lines_more > value_lines +def _count_lines_in_str(repr_str): + """Count the number of rows displayed in a string representation. + + Args: + repr_str: String representation of the DataFrame. + + Returns: + Number of rows that appear in the string representation. + """ + # Find all lines that match the pattern of a number at the beginning of a row + # This is more robust than checking for specific numbers + value_lines = 0 + for line in repr_str.split("\n"): + # Look for lines that contain numeric values (row data) + if re.search(r"^\s*\d+\s", line): + value_lines += 1 + return value_lines def test_display_config_integrated(ctx): @@ -1514,3 +1533,18 @@ def test_display_config_integrated(ctx): # Default settings should show more data assert default_row_count > row_count + + +def _create_numeric_test_df(ctx, rows): + """Create a test dataframe with numeric values from 0 to rows-1. + + Args: + ctx: SessionContext to use for creating the dataframe. + rows: Number of rows to create. + + Returns: + DataFrame with a single column "values" containing numbers 0 to rows-1. + """ + data = list(range(rows)) + batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) + return ctx.create_dataframe([[batch]]) From 52091cee8160e56148ec77dfc1039a9f4ceb026a Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 15:13:10 +0800 Subject: [PATCH 08/51] fix: Update record batch display logic to use min_table_rows from config --- python/tests/test_dataframe.py | 55 +++------------------------------- src/dataframe.rs | 7 ++++- 2 files changed, 10 insertions(+), 52 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 99c125178..c1717beb5 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1478,63 +1478,16 @@ def _count_lines_in_str(repr_str): Returns: Number of rows that appear in the string representation. """ - # Find all lines that match the pattern of a number at the beginning of a row - # This is more robust than checking for specific numbers + # DataFrame tables are formatted with | value | patterns + # Count lines that match actual data rows (not headers or separators) value_lines = 0 for line in repr_str.split("\n"): - # Look for lines that contain numeric values (row data) - if re.search(r"^\s*\d+\s", line): + # Look for lines like "| 0 |", "| 1 |", etc. 
+ if re.search(r"\|\s*\d+\s*\|", line): value_lines += 1 return value_lines -def test_display_config_integrated(ctx): - """Test all display config options together in an integrated test.""" - # Create a dataframe with: - # - Many rows (to test min_table_rows) - # - Large data (to test max_table_bytes) - # - Long strings (to test max_cell_length) - rows = 50 - ids = list(range(rows)) - # Generate strings of increasing length - texts = [f"{'A' * i}" for i in range(1, rows + 1)] - - batch = pa.RecordBatch.from_arrays( - [pa.array(ids), pa.array(texts)], names=["id", "text"] - ) - - df = ctx.create_dataframe([[batch]]) - - # Set custom display configuration - df.configure_display( - max_table_bytes=2000, # Limit bytes to display - min_table_rows=15, # Show at least 15 rows - max_cell_length=10, # Truncate cells longer than 10 chars - ) - - # Get HTML representation - html_output = df._repr_html_() - - # Check row count - row_count = html_output.count("") - 1 # subtract header - assert row_count >= 15, f"Should display at least 15 rows, got {row_count}" - - # Check for truncation - assert "expandable-container" in html_output - assert "expand-btn" in html_output - - # Should be truncated (not all rows displayed) - assert "Data truncated" in html_output - - # Now with default settings - df.reset_display_config() - default_html = df._repr_html_() - default_row_count = default_html.count("") - 1 - - # Default settings should show more data - assert default_row_count > row_count - - def _create_numeric_test_df(ctx, rows): """Create a test dataframe with numeric values from 0 to rows-1. diff --git a/src/dataframe.rs b/src/dataframe.rs index 9381d8407..33eecb5bf 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -162,7 +162,12 @@ impl PyDataFrame { fn __repr__(&self, py: Python) -> PyDataFusionResult { let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10, &self.config), + collect_record_batches_to_display( + self.df.as_ref().clone(), + self.config.min_table_rows, + 10, + &self.config, + ), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below From da116bf1fbecb21dd5f9ad55692f9a8775096c84 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 15:45:17 +0800 Subject: [PATCH 09/51] reuse _create_numeric_test_df --- python/tests/test_dataframe.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index c1717beb5..04385d88c 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1298,18 +1298,9 @@ def test_configure_display(df): with pytest.raises(ValueError, match=r".*must be greater than 0.*"): df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0) - # Very large values - df.configure_display( - max_table_bytes=10**12, min_table_rows=10**6, max_cell_length=10**4 - ) - config = df.display_config - assert config.max_table_bytes == 10**12 # 1 TB - assert config.min_table_rows == 10**6 # 1 million rows - assert config.max_cell_length == 10**4 # 10,000 chars per cell - # Test with negative values # This tests for expected behavior when users accidentally pass negative values - # Since these are usize in Rust, we expect a Python TypeError when trying to pass negative values + # Since these are usize in Rust, we expect a Python ValueError when trying to pass negative values with pytest.raises(ValueError, match=r".*must 
be greater than 0.*"): df.configure_display(max_table_bytes=-1) @@ -1348,9 +1339,7 @@ def test_min_table_rows_display(ctx): """Test that at least min_table_rows rows are displayed.""" # Create a dataframe with more rows than the default min_table_rows rows = 100 - data = list(range(rows)) - batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) - df = ctx.create_dataframe([[batch]]) + df = _create_numeric_test_df(ctx, rows) # Set min_table_rows to a specific value custom_min_rows = 30 @@ -1433,7 +1422,7 @@ def test_max_cell_length_display(ctx): def test_display_config_repr_string(ctx): """Test that __repr__ respects display configuration.""" # Create a dataframe with more rows than we want to show - # df.__repr__ returns max 10 rows only, so we start test with 7 rows + # df.__repr__ returns max 10 rows, so we start test with 7 rows rows = 7 df = _create_numeric_test_df(ctx, rows) @@ -1469,7 +1458,7 @@ def test_display_config_repr_string(ctx): assert lines_count2 >= min_table_rows_in_display -def _count_lines_in_str(repr_str): +def _count_lines_in_str(repr_str: str) -> int: """Count the number of rows displayed in a string representation. Args: @@ -1488,7 +1477,7 @@ def _count_lines_in_str(repr_str): return value_lines -def _create_numeric_test_df(ctx, rows): +def _create_numeric_test_df(ctx, rows) -> DataFrame: """Create a test dataframe with numeric values from 0 to rows-1. Args: From ee1de817075e306045a7e688a527808e6e4566cc Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 15:59:03 +0800 Subject: [PATCH 10/51] feat: Add max_table_rows_in_repr to control row display in DataFrame - Updated DataFrame class to include max_table_rows_in_repr parameter for display configuration. - Enhanced configure_display method to accept max_table_rows_in_repr. - Modified DisplayConfig struct to include max_table_rows_in_repr with a default value of 10. - Added tests to verify the functionality of max_table_rows_in_repr in both configuration and display output. --- python/datafusion/dataframe.py | 14 ++++++++-- python/tests/test_dataframe.py | 49 +++++++++++++++++++++++++++++++--- src/dataframe.rs | 23 +++++++++++++--- 3 files changed, 78 insertions(+), 8 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index a0688819b..b01bafd1f 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -818,6 +818,7 @@ def configure_display( max_table_bytes: Optional[int] = None, min_table_rows: Optional[int] = None, max_cell_length: Optional[int] = None, + max_table_rows_in_repr: Optional[int] = None, ) -> None: """Configure display options for DataFrame representation. @@ -828,17 +829,26 @@ def configure_display( This is used for initial display and in notebooks. max_cell_length: Maximum length of a cell before it gets minimized (default: 25). Longer cells will be truncated with an expand button. + max_table_rows_in_repr: Maximum number of rows to display in string representation + (default: 10). Raises: ValueError: If any of the provided values are less than or equal to 0. 
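        Example (illustrative; assumes ``df`` is an existing DataFrame):
            df.configure_display(max_table_rows_in_repr=15)
            print(df)  # repr now shows up to 15 rows before truncating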
""" if any( value is not None and value <= 0 - for value in (max_table_bytes, min_table_rows, max_cell_length) + for value in ( + max_table_bytes, + min_table_rows, + max_cell_length, + max_table_rows_in_repr, + ) ): raise ValueError("All values must be greater than 0.") - self.df.configure_display(max_table_bytes, min_table_rows, max_cell_length) + self.df.configure_display( + max_table_bytes, min_table_rows, max_cell_length, max_table_rows_in_repr + ) def reset_display_config(self) -> None: """Reset display configuration to default values.""" diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 04385d88c..18f0e07cd 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1271,13 +1271,17 @@ def test_display_config(df): assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB assert config.min_table_rows == 20 assert config.max_cell_length == 25 + assert config.max_table_rows_in_repr == 10 # Verify the new property def test_configure_display(df): """Test setting display configuration properties.""" # Modify the display configuration df.configure_display( - max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50 # 1 MB + max_table_bytes=1024 * 1024, + min_table_rows=10, + max_cell_length=50, + max_table_rows_in_repr=15, # Add test for the new property ) # Verify the changes took effect @@ -1285,13 +1289,15 @@ def test_configure_display(df): assert config.max_table_bytes == 1024 * 1024 # 1 MB assert config.min_table_rows == 10 assert config.max_cell_length == 50 + assert config.max_table_rows_in_repr == 15 # Test partial update (only changing one property) - df.configure_display(min_table_rows=5) + df.configure_display(max_table_rows_in_repr=5) config = df.display_config assert config.max_table_bytes == 1024 * 1024 # previous value retained - assert config.min_table_rows == 5 # only this value changed + assert config.min_table_rows == 10 # previous value retained assert config.max_cell_length == 50 # previous value retained + assert config.max_table_rows_in_repr == 5 # only this value changed # Test with extreme values (still valid, but potentially problematic) # Zero values @@ -1490,3 +1496,40 @@ def _create_numeric_test_df(ctx, rows) -> DataFrame: data = list(range(rows)) batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) return ctx.create_dataframe([[batch]]) + + +def test_max_table_rows_in_repr(ctx): + """Test that max_table_rows_in_repr controls the number of rows in string representation.""" + # Create a dataframe with more rows than the default max_table_rows_in_repr (10) + rows = 20 + df = _create_numeric_test_df(ctx, rows) + + # First test with default setting (should limit to 10 rows) + repr_str = df.__repr__() + lines_default = _count_lines_in_str(repr_str) + + # Default should be 10 rows max + assert lines_default <= 10 + assert "Data truncated" in repr_str + + # Now set a custom max_table_rows_in_repr value + custom_max_rows = 15 + df.configure_display(max_table_rows_in_repr=custom_max_rows) + + # Get the string representation with new configuration + repr_str_more = df.__repr__() + lines_custom = _count_lines_in_str(repr_str_more) + + # Should show more rows than default but not more than configured max + assert lines_custom > lines_default + assert lines_custom <= custom_max_rows + assert "Data truncated" in repr_str_more + + # Now set max_rows higher than total rows - should show all rows + df.configure_display(max_table_rows_in_repr=25) + repr_str_all = df.__repr__() + lines_all = 
_count_lines_in_str(repr_str_all) + + # Should show all rows (20) + assert lines_all == rows + assert "Data truncated" not in repr_str_all diff --git a/src/dataframe.rs b/src/dataframe.rs index 33eecb5bf..db93d65bf 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -86,22 +86,28 @@ pub struct DisplayConfig { /// Maximum length of a cell before it gets minimized (default: 25) #[pyo3(get, set)] pub max_cell_length: usize, + /// Maximum number of rows to display in repr string output (default: 10) + #[pyo3(get, set)] + pub max_table_rows_in_repr: usize, } #[pymethods] impl DisplayConfig { #[new] - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None))] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] fn new( max_table_bytes: Option, min_table_rows: Option, max_cell_length: Option, + max_table_rows_in_repr: Option, ) -> Self { let default = DisplayConfig::default(); Self { max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), + max_table_rows_in_repr: max_table_rows_in_repr + .unwrap_or(default.max_table_rows_in_repr), } } } @@ -112,6 +118,7 @@ impl Default for DisplayConfig { max_table_bytes: 2 * 1024 * 1024, // 2 MB min_table_rows: 20, max_cell_length: 25, + max_table_rows_in_repr: 10, } } } @@ -165,7 +172,7 @@ impl PyDataFrame { collect_record_batches_to_display( self.df.as_ref().clone(), self.config.min_table_rows, - 10, + self.config.max_table_rows_in_repr, &self.config, ), )?; @@ -858,12 +865,18 @@ impl PyDataFrame { } /// Update display configuration - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None))] + #[pyo3(signature = ( + max_table_bytes=None, + min_table_rows=None, + max_cell_length=None, + max_table_rows_in_repr=None + ))] fn configure_display( &mut self, max_table_bytes: Option, min_table_rows: Option, max_cell_length: Option, + max_table_rows_in_repr: Option, ) { let mut new_config = (*self.config).clone(); @@ -879,6 +892,10 @@ impl PyDataFrame { new_config.max_cell_length = length; } + if let Some(rows) = max_table_rows_in_repr { + new_config.max_table_rows_in_repr = rows; + } + self.config = Arc::new(new_config); } From 929563a8aa05037b5d5fd5f817d995e6d70bbe9d Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 16:25:04 +0800 Subject: [PATCH 11/51] tidy up comments, tests --- python/tests/test_dataframe.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 18f0e07cd..2d5623034 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1271,7 +1271,7 @@ def test_display_config(df): assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB assert config.min_table_rows == 20 assert config.max_cell_length == 25 - assert config.max_table_rows_in_repr == 10 # Verify the new property + assert config.max_table_rows_in_repr == 10 def test_configure_display(df): @@ -1281,7 +1281,7 @@ def test_configure_display(df): max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50, - max_table_rows_in_repr=15, # Add test for the new property + max_table_rows_in_repr=15, ) # Verify the changes took effect @@ -1299,7 +1299,7 @@ def test_configure_display(df): assert config.max_cell_length == 50 # previous value retained assert config.max_table_rows_in_repr 
== 5 # only this value changed - # Test with extreme values (still valid, but potentially problematic) + # Test with extreme values # Zero values with pytest.raises(ValueError, match=r".*must be greater than 0.*"): df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0) @@ -1324,12 +1324,18 @@ def test_reset_display_config(df): """Test resetting display configuration to defaults.""" # First modify the configuration df.configure_display( - max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50 + max_table_bytes=1024 * 1024, + min_table_rows=10, + max_cell_length=50, + max_table_rows_in_repr=15, ) # Verify changes took effect config = df.display_config assert config.max_table_bytes == 1024 * 1024 + assert config.min_table_rows == 10 + assert config.max_cell_length == 50 + assert config.max_table_rows_in_repr == 15 # Now reset to defaults df.reset_display_config() @@ -1339,6 +1345,7 @@ def test_reset_display_config(df): assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB assert config.min_table_rows == 20 assert config.max_cell_length == 25 + assert config.max_table_rows_in_repr == 10 def test_min_table_rows_display(ctx): @@ -1428,11 +1435,11 @@ def test_max_cell_length_display(ctx): def test_display_config_repr_string(ctx): """Test that __repr__ respects display configuration.""" # Create a dataframe with more rows than we want to show - # df.__repr__ returns max 10 rows, so we start test with 7 rows + # df.__repr__ returns max 10 rows by default, so we start test with 7 rows rows = 7 df = _create_numeric_test_df(ctx, rows) - # Configure to show only 5 rows in string representation + # Configure to show at least 5 rows in string representation min_table_rows_in_display = 5 df.configure_display(min_table_rows=min_table_rows_in_display) @@ -1442,8 +1449,6 @@ def test_display_config_repr_string(ctx): # Count the number of rows using helper function lines_count = _count_lines_in_str(repr_str) - # Should be fewer rows than the total - assert lines_count <= rows assert lines_count >= min_table_rows_in_display # Now set min_rows higher and see if more rows appear From cae89b026b62df25517b203a693dcaaee5b798fb Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 16:59:31 +0800 Subject: [PATCH 12/51] Fix ruff errors --- python/datafusion/dataframe.py | 23 ++++++++++++++++------- python/tests/test_dataframe.py | 14 +++++++++----- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index b01bafd1f..3b2382502 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -49,6 +49,7 @@ import pyarrow as pa from datafusion._internal import DataFrame as DataFrameInternal + from datafusion._internal import DisplayConfig from datafusion._internal import expr as expr_internal from enum import Enum @@ -823,14 +824,17 @@ def configure_display( """Configure display options for DataFrame representation. Args: - max_table_bytes: Maximum bytes to display for table presentation (default: 2MB). + max_table_bytes: Maximum bytes to display for table presentation + (default: 2MB). Set to lower value for large tables to limit memory usage. min_table_rows: Minimum number of table rows to display (default: 20). This is used for initial display and in notebooks. - max_cell_length: Maximum length of a cell before it gets minimized (default: 25). + max_cell_length: Maximum length of a cell before it gets minimized + (default: 25). 
Longer cells will be truncated with an expand button. - max_table_rows_in_repr: Maximum number of rows to display in string representation - (default: 10). + max_table_rows_in_repr: Maximum number of rows to display in string + representation + (default: 10). Raises: ValueError: If any of the provided values are less than or equal to 0. @@ -844,7 +848,8 @@ def configure_display( max_table_rows_in_repr, ) ): - raise ValueError("All values must be greater than 0.") + error_msg = "All values must be greater than 0." + raise ValueError(error_msg) self.df.configure_display( max_table_bytes, min_table_rows, max_cell_length, max_table_rows_in_repr @@ -855,8 +860,12 @@ def reset_display_config(self) -> None: self.df.reset_display_config() @property - def display_config(self): - """Get the current display configuration.""" + def display_config(self) -> DisplayConfig: + """Get the current display configuration. + + Returns: + DisplayConfig: The current display configuration settings + """ return self.df.display_config @deprecated("Use :py:func:`unnest_columns` instead.") diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 2d5623034..7002d26cc 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1306,7 +1306,8 @@ def test_configure_display(df): # Test with negative values # This tests for expected behavior when users accidentally pass negative values - # Since these are usize in Rust, we expect a Python ValueError when trying to pass negative values + # Since these are usize in Rust, we expect a Python ValueError when trying to pass + # negative values. with pytest.raises(ValueError, match=r".*must be greater than 0.*"): df.configure_display(max_table_bytes=-1) @@ -1393,9 +1394,10 @@ def test_max_table_bytes_display(ctx): limited_row_count = limited_html.count("") - 1 # Verify fewer rows are displayed with the byte limit - assert ( - limited_row_count < default_row_count - ), f"Expected fewer rows with byte limit. Default: {default_row_count}, Limited: {limited_row_count}" + assert limited_row_count < default_row_count, ( + f"Expected fewer rows with byte limit. " + f"Default: {default_row_count}, Limited: {limited_row_count}" + ) # "Data truncated" should be present when limited assert "Data truncated" in limited_html @@ -1504,7 +1506,9 @@ def _create_numeric_test_df(ctx, rows) -> DataFrame: def test_max_table_rows_in_repr(ctx): - """Test that max_table_rows_in_repr controls the number of rows in string representation.""" + """Test that max_table_rows_in_repr controls the number of rows in string + representation. 
+ """ # Create a dataframe with more rows than the default max_table_rows_in_repr (10) rows = 20 df = _create_numeric_test_df(ctx, rows) From 1bfa8b14d3c978ff688557dccc1e63adb0f0fb50 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 17:19:54 +0800 Subject: [PATCH 13/51] Trigger CI From f34a331949630c0501fc233d6b2c33acba102dfe Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 17:22:34 +0800 Subject: [PATCH 14/51] Fix ruff errors --- python/tests/test_dataframe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 7002d26cc..51cdc173d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1367,9 +1367,9 @@ def test_min_table_rows_display(ctx): row_count = html_output.count("") - 1 # subtract 1 for the header row # Verify at least min_table_rows rows are displayed - assert ( - row_count >= custom_min_rows - ), f"Expected at least {custom_min_rows} rows, got {row_count}" + assert row_count >= custom_min_rows, ( + f"Expected at least {custom_min_rows} rows, got {row_count}" + ) # If data was truncated, "Data truncated" message should be present if row_count < rows: From cb151e35368f5f1e83bd18757ac3c034cb8c9dab Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 17:57:46 +0800 Subject: [PATCH 15/51] fix: Simplify error handling in display_config method --- src/dataframe.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index db93d65bf..cda4dd690 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -860,7 +860,7 @@ impl PyDataFrame { fn display_config(&self) -> PyResult> { Python::with_gil(|py| { let config = (*self.config).clone(); - Py::new(py, config).map_err(PyErr::from) + Py::new(py, config) }) } From 0d5e900d7f5863683ad65fa30af49e3f6a1409b6 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 31 Mar 2025 11:02:25 +0800 Subject: [PATCH 16/51] refactor: Update display configuration handling in DataFrame - Enhanced the DataFrame class to set display configuration at the session context level, ensuring that changes to one DataFrame's display settings affect all DataFrames created from the same context. - Modified the PyDataFrame struct to accept a display configuration during initialization and updated methods to reference the new display_config field instead of the previous config field. - Added tests to verify that display configurations are shared across DataFrames in the same context and remain independent across different contexts. --- python/datafusion/dataframe.py | 4 +++ python/tests/test_dataframe.py | 66 ++++++++++++++++++++++++++++++++-- src/dataframe.rs | 15 ++++---- 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 3b2382502..ed58beb9d 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -823,6 +823,10 @@ def configure_display( ) -> None: """Configure display options for DataFrame representation. + Note: The display configuration is now set at the session context level, + so changes to one DataFrame's display configuration will affect all + DataFrames created from the same context. + Args: max_table_bytes: Maximum bytes to display for table presentation (default: 2MB). 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 51cdc173d..10b772a88 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1367,9 +1367,9 @@ def test_min_table_rows_display(ctx): row_count = html_output.count("") - 1 # subtract 1 for the header row # Verify at least min_table_rows rows are displayed - assert row_count >= custom_min_rows, ( - f"Expected at least {custom_min_rows} rows, got {row_count}" - ) + assert ( + row_count >= custom_min_rows + ), f"Expected at least {custom_min_rows} rows, got {row_count}" # If data was truncated, "Data truncated" message should be present if row_count < rows: @@ -1542,3 +1542,63 @@ def test_max_table_rows_in_repr(ctx): # Should show all rows (20) assert lines_all == rows assert "Data truncated" not in repr_str_all + + +def test_session_context_display_config(ctx): + """Test that display configuration is shared at session context level.""" + # Create two dataframes from the same context + batch1 = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df1 = ctx.create_dataframe([[batch1]]) + + batch2 = pa.RecordBatch.from_arrays( + [pa.array([7, 8, 9]), pa.array([10, 11, 12])], + names=["c", "d"], + ) + df2 = ctx.create_dataframe([[batch2]]) + + # Set display config on first dataframe + custom_max_rows = 25 + df1.configure_display(max_table_rows_in_repr=custom_max_rows) + + # Check that both dataframes have the same config + assert df1.display_config.max_table_rows_in_repr == custom_max_rows + assert df2.display_config.max_table_rows_in_repr == custom_max_rows + + # Change config on second dataframe + df2.configure_display(max_cell_length=40) + + # Both dataframes should reflect the change + assert df1.display_config.max_cell_length == 40 + assert df2.display_config.max_cell_length == 40 + + +def test_session_context_display_config_independence(ctx): + """Test that display configurations in different contexts are independent.""" + # Create two contexts with different configurations + ctx1 = SessionContext() + ctx2 = SessionContext() + + # Create dataframes from each context + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df1 = ctx1.create_dataframe([[batch]]) + df2 = ctx2.create_dataframe([[batch]]) + + # Set different display configurations + df1.configure_display(max_table_rows_in_repr=15) + df2.configure_display(max_table_rows_in_repr=30) + + # Verify configurations are independent + assert df1.display_config.max_table_rows_in_repr == 15 + assert df2.display_config.max_table_rows_in_repr == 30 + + # Create another dataframe from first context + df3 = ctx1.create_dataframe([[batch]]) + + # It should have the same config as the first dataframe + assert df3.display_config.max_table_rows_in_repr == 15 diff --git a/src/dataframe.rs b/src/dataframe.rs index cda4dd690..798c9b844 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -135,10 +135,10 @@ pub struct PyDataFrame { impl PyDataFrame { /// creates a new PyDataFrame - pub fn new(df: DataFrame) -> Self { + pub fn new(df: DataFrame, display_config: Arc) -> Self { Self { df: Arc::new(df), - config: Arc::new(DisplayConfig::default()), + config: display_config, } } } @@ -858,10 +858,7 @@ impl PyDataFrame { /// Get the current display configuration #[getter] fn display_config(&self) -> PyResult> { - Python::with_gil(|py| { - let config = (*self.config).clone(); - Py::new(py, config) - }) + Python::with_gil(|py| Py::new(py, 
(*self.display_config).clone())) } /// Update display configuration @@ -878,7 +875,7 @@ impl PyDataFrame { max_cell_length: Option, max_table_rows_in_repr: Option, ) { - let mut new_config = (*self.config).clone(); + let mut new_config = (*self.display_config).clone(); if let Some(bytes) = max_table_bytes { new_config.max_table_bytes = bytes; @@ -896,13 +893,13 @@ impl PyDataFrame { new_config.max_table_rows_in_repr = rows; } - self.config = Arc::new(new_config); + self.display_config = Arc::new(new_config); } /// Reset display configuration to default values #[pyo3(text_signature = "($self)")] fn reset_display_config(&mut self) { - self.config = Arc::new(DisplayConfig::default()); + self.display_config = Arc::new(DisplayConfig::default()); } } From ba5acc43fc2d82d09397fb60168c95601a12c388 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 31 Mar 2025 11:09:38 +0800 Subject: [PATCH 17/51] Revert "refactor: Update display configuration handling in DataFrame" This reverts commit 0d5e900d7f5863683ad65fa30af49e3f6a1409b6. --- python/datafusion/dataframe.py | 4 --- python/tests/test_dataframe.py | 66 ++-------------------------------- src/dataframe.rs | 15 ++++---- 3 files changed, 12 insertions(+), 73 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index ed58beb9d..3b2382502 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -823,10 +823,6 @@ def configure_display( ) -> None: """Configure display options for DataFrame representation. - Note: The display configuration is now set at the session context level, - so changes to one DataFrame's display configuration will affect all - DataFrames created from the same context. - Args: max_table_bytes: Maximum bytes to display for table presentation (default: 2MB). 
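The semantic point of this revert shows up in the tests deleted below: with context-level configuration, configuring one DataFrame leaked into every DataFrame created from the same SessionContext, whereas with per-DataFrame configuration siblings stay independent. A sketch of the difference, reusing the test setup (df1 and df2 created from the same ctx; 25 is the default max_cell_length):

    df1 = ctx.create_dataframe([[batch1]])
    df2 = ctx.create_dataframe([[batch2]])

    df1.configure_display(max_cell_length=40)

    # Context-level model (introduced above, reverted here):
    #   df2.display_config.max_cell_length == 40
    # Per-DataFrame model (restored by this revert):
    #   df2.display_config.max_cell_length == 25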
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 10b772a88..51cdc173d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1367,9 +1367,9 @@ def test_min_table_rows_display(ctx): row_count = html_output.count("") - 1 # subtract 1 for the header row # Verify at least min_table_rows rows are displayed - assert ( - row_count >= custom_min_rows - ), f"Expected at least {custom_min_rows} rows, got {row_count}" + assert row_count >= custom_min_rows, ( + f"Expected at least {custom_min_rows} rows, got {row_count}" + ) # If data was truncated, "Data truncated" message should be present if row_count < rows: @@ -1542,63 +1542,3 @@ def test_max_table_rows_in_repr(ctx): # Should show all rows (20) assert lines_all == rows assert "Data truncated" not in repr_str_all - - -def test_session_context_display_config(ctx): - """Test that display configuration is shared at session context level.""" - # Create two dataframes from the same context - batch1 = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df1 = ctx.create_dataframe([[batch1]]) - - batch2 = pa.RecordBatch.from_arrays( - [pa.array([7, 8, 9]), pa.array([10, 11, 12])], - names=["c", "d"], - ) - df2 = ctx.create_dataframe([[batch2]]) - - # Set display config on first dataframe - custom_max_rows = 25 - df1.configure_display(max_table_rows_in_repr=custom_max_rows) - - # Check that both dataframes have the same config - assert df1.display_config.max_table_rows_in_repr == custom_max_rows - assert df2.display_config.max_table_rows_in_repr == custom_max_rows - - # Change config on second dataframe - df2.configure_display(max_cell_length=40) - - # Both dataframes should reflect the change - assert df1.display_config.max_cell_length == 40 - assert df2.display_config.max_cell_length == 40 - - -def test_session_context_display_config_independence(ctx): - """Test that display configurations in different contexts are independent.""" - # Create two contexts with different configurations - ctx1 = SessionContext() - ctx2 = SessionContext() - - # Create dataframes from each context - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df1 = ctx1.create_dataframe([[batch]]) - df2 = ctx2.create_dataframe([[batch]]) - - # Set different display configurations - df1.configure_display(max_table_rows_in_repr=15) - df2.configure_display(max_table_rows_in_repr=30) - - # Verify configurations are independent - assert df1.display_config.max_table_rows_in_repr == 15 - assert df2.display_config.max_table_rows_in_repr == 30 - - # Create another dataframe from first context - df3 = ctx1.create_dataframe([[batch]]) - - # It should have the same config as the first dataframe - assert df3.display_config.max_table_rows_in_repr == 15 diff --git a/src/dataframe.rs b/src/dataframe.rs index 798c9b844..cda4dd690 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -135,10 +135,10 @@ pub struct PyDataFrame { impl PyDataFrame { /// creates a new PyDataFrame - pub fn new(df: DataFrame, display_config: Arc) -> Self { + pub fn new(df: DataFrame) -> Self { Self { df: Arc::new(df), - config: display_config, + config: Arc::new(DisplayConfig::default()), } } } @@ -858,7 +858,10 @@ impl PyDataFrame { /// Get the current display configuration #[getter] fn display_config(&self) -> PyResult> { - Python::with_gil(|py| Py::new(py, (*self.display_config).clone())) + Python::with_gil(|py| { + let config = (*self.config).clone(); + 
Py::new(py, config) + }) } /// Update display configuration @@ -875,7 +878,7 @@ impl PyDataFrame { max_cell_length: Option, max_table_rows_in_repr: Option, ) { - let mut new_config = (*self.display_config).clone(); + let mut new_config = (*self.config).clone(); if let Some(bytes) = max_table_bytes { new_config.max_table_bytes = bytes; @@ -893,13 +896,13 @@ impl PyDataFrame { new_config.max_table_rows_in_repr = rows; } - self.display_config = Arc::new(new_config); + self.config = Arc::new(new_config); } /// Reset display configuration to default values #[pyo3(text_signature = "($self)")] fn reset_display_config(&mut self) { - self.display_config = Arc::new(DisplayConfig::default()); + self.config = Arc::new(DisplayConfig::default()); } } From 0e30af3409a82a4924fd450e63c613b738fec0c9 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 31 Mar 2025 12:07:28 +0800 Subject: [PATCH 18/51] Refactor PyDataFrame: Simplify methods and improve performance - Removed unnecessary cloning of DataFrame in various methods to enhance performance. - Consolidated display configuration handling by removing the DisplayConfig struct and related methods. - Updated methods to use direct references to DataFrame where applicable. - Improved the implementation of select, filter, with_column, and other methods to work with mutable references. - Added a new to_string method for better string representation of DataFrame. - Cleaned up unused imports and commented-out code for better readability. --- src/context.rs | 279 ++++-------------- src/dataframe.rs | 738 +++++++++-------------------------------------- 2 files changed, 187 insertions(+), 830 deletions(-) diff --git a/src/context.rs b/src/context.rs index 0db0f4d7e..6d5e078d3 100644 --- a/src/context.rs +++ b/src/context.rs @@ -72,24 +72,59 @@ use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; use tokio::task::JoinHandle; +/// Display configuration for DataFrames +#[pyclass(name = "DisplayConfig", module = "datafusion", subclass)] +#[derive(Clone, Debug)] +pub struct DisplayConfig { + #[pyo3(get, set)] + pub max_width: usize, + #[pyo3(get, set)] + pub max_rows: Option, + #[pyo3(get, set)] + pub show_nulls: bool, +} + +#[pymethods] +impl DisplayConfig { + #[new] + pub fn new( + max_width: Option, + max_rows: Option, + show_nulls: Option, + ) -> Self { + Self { + max_width: max_width.unwrap_or(80), + max_rows, + show_nulls: show_nulls.unwrap_or(false), + } + } +} + /// Configuration options for a SessionContext #[pyclass(name = "SessionConfig", module = "datafusion", subclass)] #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, + pub display_config: DisplayConfig, } impl From for PySessionConfig { fn from(config: SessionConfig) -> Self { - Self { config } + Self { + config, + display_config: DisplayConfig::new(Some(80), None, Some(false)), + } } } #[pymethods] impl PySessionConfig { - #[pyo3(signature = (config_options=None))] + #[pyo3(signature = (config_options=None, display_config=None))] #[new] - fn new(config_options: Option>) -> Self { + fn new( + config_options: Option>, + display_config: Option, + ) -> Self { let mut config = SessionConfig::new(); if let Some(hash_map) = config_options { for (k, v) in &hash_map { @@ -97,7 +132,23 @@ impl PySessionConfig { } } - Self { config } + Self { + config, + display_config: display_config + .unwrap_or_else(|| DisplayConfig::new(Some(80), None, Some(false))), + } + } + + // Get the display 
configuration + pub fn get_display_config(&self) -> DisplayConfig { + self.display_config.clone() + } + + // Set the display configuration + pub fn with_display_config(&self, display_config: DisplayConfig) -> Self { + let mut new_config = self.clone(); + new_config.display_config = display_config; + new_config } fn with_create_default_catalog_and_schema(&self, enabled: bool) -> Self { @@ -675,226 +726,6 @@ impl PySessionContext { ))); } - let mut options = CsvReadOptions::new() - .has_header(has_header) - .delimiter(delimiter[0]) - .schema_infer_max_records(schema_infer_max_records) - .file_extension(file_extension) - .file_compression_type(parse_file_compression_type(file_compression_type)?); - options.schema = schema.as_ref().map(|x| &x.0); - - if path.is_instance_of::() { - let paths = path.extract::>()?; - let result = self.register_csv_from_multiple_paths(name, paths, options); - wait_for_future(py, result)?; - } else { - let path = path.extract::()?; - let result = self.ctx.register_csv(name, &path, options); - wait_for_future(py, result)?; - } - - Ok(()) - } - - #[allow(clippy::too_many_arguments)] - #[pyo3(signature = (name, - path, - schema=None, - schema_infer_max_records=1000, - file_extension=".json", - table_partition_cols=vec![], - file_compression_type=None))] - pub fn register_json( - &mut self, - name: &str, - path: PathBuf, - schema: Option>, - schema_infer_max_records: usize, - file_extension: &str, - table_partition_cols: Vec<(String, String)>, - file_compression_type: Option, - py: Python, - ) -> PyDataFusionResult<()> { - let path = path - .to_str() - .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; - - let mut options = NdJsonReadOptions::default() - .file_compression_type(parse_file_compression_type(file_compression_type)?) 
- .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); - options.schema_infer_max_records = schema_infer_max_records; - options.file_extension = file_extension; - options.schema = schema.as_ref().map(|x| &x.0); - - let result = self.ctx.register_json(name, path, options); - wait_for_future(py, result)?; - - Ok(()) - } - - #[allow(clippy::too_many_arguments)] - #[pyo3(signature = (name, - path, - schema=None, - file_extension=".avro", - table_partition_cols=vec![]))] - pub fn register_avro( - &mut self, - name: &str, - path: PathBuf, - schema: Option>, - file_extension: &str, - table_partition_cols: Vec<(String, String)>, - py: Python, - ) -> PyDataFusionResult<()> { - let path = path - .to_str() - .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; - - let mut options = AvroReadOptions::default() - .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); - options.file_extension = file_extension; - options.schema = schema.as_ref().map(|x| &x.0); - - let result = self.ctx.register_avro(name, path, options); - wait_for_future(py, result)?; - - Ok(()) - } - - // Registers a PyArrow.Dataset - pub fn register_dataset( - &self, - name: &str, - dataset: &Bound<'_, PyAny>, - py: Python, - ) -> PyDataFusionResult<()> { - let table: Arc = Arc::new(Dataset::new(dataset, py)?); - - self.ctx.register_table(name, table)?; - - Ok(()) - } - - pub fn register_udf(&mut self, udf: PyScalarUDF) -> PyResult<()> { - self.ctx.register_udf(udf.function); - Ok(()) - } - - pub fn register_udaf(&mut self, udaf: PyAggregateUDF) -> PyResult<()> { - self.ctx.register_udaf(udaf.function); - Ok(()) - } - - pub fn register_udwf(&mut self, udwf: PyWindowUDF) -> PyResult<()> { - self.ctx.register_udwf(udwf.function); - Ok(()) - } - - #[pyo3(signature = (name="datafusion"))] - pub fn catalog(&self, name: &str) -> PyResult { - match self.ctx.catalog(name) { - Some(catalog) => Ok(PyCatalog::new(catalog)), - None => Err(PyKeyError::new_err(format!( - "Catalog with name {} doesn't exist.", - &name, - ))), - } - } - - pub fn tables(&self) -> HashSet { - self.ctx - .catalog_names() - .into_iter() - .filter_map(|name| self.ctx.catalog(&name)) - .flat_map(move |catalog| { - catalog - .schema_names() - .into_iter() - .filter_map(move |name| catalog.schema(&name)) - }) - .flat_map(|schema| schema.table_names()) - .collect() - } - - pub fn table(&self, name: &str, py: Python) -> PyResult { - let x = wait_for_future(py, self.ctx.table(name)) - .map_err(|e| PyKeyError::new_err(e.to_string()))?; - Ok(PyDataFrame::new(x)) - } - - pub fn table_exist(&self, name: &str) -> PyDataFusionResult { - Ok(self.ctx.table_exist(name)?) - } - - pub fn empty_table(&self) -> PyDataFusionResult { - Ok(PyDataFrame::new(self.ctx.read_empty()?)) - } - - pub fn session_id(&self) -> String { - self.ctx.session_id() - } - - #[allow(clippy::too_many_arguments)] - #[pyo3(signature = (path, schema=None, schema_infer_max_records=1000, file_extension=".json", table_partition_cols=vec![], file_compression_type=None))] - pub fn read_json( - &mut self, - path: PathBuf, - schema: Option>, - schema_infer_max_records: usize, - file_extension: &str, - table_partition_cols: Vec<(String, String)>, - file_compression_type: Option, - py: Python, - ) -> PyDataFusionResult { - let path = path - .to_str() - .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; - let mut options = NdJsonReadOptions::default() - .table_partition_cols(convert_table_partition_cols(table_partition_cols)?) 
- .file_compression_type(parse_file_compression_type(file_compression_type)?); - options.schema_infer_max_records = schema_infer_max_records; - options.file_extension = file_extension; - let df = if let Some(schema) = schema { - options.schema = Some(&schema.0); - let result = self.ctx.read_json(path, options); - wait_for_future(py, result)? - } else { - let result = self.ctx.read_json(path, options); - wait_for_future(py, result)? - }; - Ok(PyDataFrame::new(df)) - } - - #[allow(clippy::too_many_arguments)] - #[pyo3(signature = ( - path, - schema=None, - has_header=true, - delimiter=",", - schema_infer_max_records=1000, - file_extension=".csv", - table_partition_cols=vec![], - file_compression_type=None))] - pub fn read_csv( - &self, - path: &Bound<'_, PyAny>, - schema: Option>, - has_header: bool, - delimiter: &str, - schema_infer_max_records: usize, - file_extension: &str, - table_partition_cols: Vec<(String, String)>, - file_compression_type: Option, - py: Python, - ) -> PyDataFusionResult { - let delimiter = delimiter.as_bytes(); - if delimiter.len() != 1 { - return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( - "Delimiter must be a single character", - ))); - }; - let mut options = CsvReadOptions::new() .has_header(has_header) .delimiter(delimiter[0]) diff --git a/src/dataframe.rs b/src/dataframe.rs index cda4dd690..50227c3a6 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -15,412 +15,106 @@ // specific language governing permissions and limitations // under the License. -use std::ffi::CString; +use std::collections::HashMap; use std::sync::Arc; -use arrow::array::{new_null_array, RecordBatch, RecordBatchIterator, RecordBatchReader}; -use arrow::compute::can_cast_types; -use arrow::error::ArrowError; -use arrow::ffi::FFI_ArrowSchema; -use arrow::ffi_stream::FFI_ArrowArrayStream; -use arrow::util::display::{ArrayFormatter, FormatOptions}; -use datafusion::arrow::datatypes::Schema; -use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; -use datafusion::arrow::util::pretty; -use datafusion::common::UnnestOptions; -use datafusion::config::{CsvOptions, TableParquetOptions}; -use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; -use datafusion::datasource::TableProvider; -use datafusion::error::DataFusionError; -use datafusion::execution::SendableRecordBatchStream; -use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; -use datafusion::prelude::*; -use futures::{StreamExt, TryStreamExt}; -use pyo3::exceptions::PyValueError; +use datafusion::arrow::csv::WriterBuilder; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::arrow::pyarrow::FromPyArrow; +use datafusion::arrow::pyarrow::PyArrowType; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::TableReference; +use datafusion::prelude::DataFrame; + +use pyo3::exceptions::PyTypeError; use pyo3::prelude::*; -use pyo3::pybacked::PyBackedStr; -use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; -use tokio::task::JoinHandle; +use pyo3::types::{PyList, PyString, PyTuple}; -use crate::catalog::PyTable; -use crate::errors::{py_datafusion_err, PyDataFusionError}; -use crate::expr::sort_expr::to_sort_expressions; +use crate::errors::{py_datafusion_err, PyDataFusionError, PyDataFusionResult}; +use crate::expr::expr::PyExpr; +use crate::expr::window_expr::PyWindowExpr; use crate::physical_plan::PyExecutionPlan; -use crate::record_batch::PyRecordBatchStream; +use crate::record_batch::{PyRecordBatch, TableData}; use crate::sql::logical::PyLogicalPlan; 
-use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; -use crate::{ - errors::PyDataFusionResult, - expr::{sort_expr::PySortExpr, PyExpr}, -}; - -// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 -// - we have not decided on the table_provider approach yet -// this is an interim implementation -#[pyclass(name = "TableProvider", module = "datafusion")] -pub struct PyTableProvider { - provider: Arc, -} - -impl PyTableProvider { - pub fn new(provider: Arc) -> Self { - Self { provider } - } - - pub fn as_table(&self) -> PyTable { - let table_provider: Arc = self.provider.clone(); - PyTable::new(table_provider) - } -} - -/// Configuration for DataFrame display in Python environment -#[pyclass(name = "DisplayConfig", module = "datafusion")] -#[derive(Debug, Clone)] -pub struct DisplayConfig { - /// Maximum bytes to display for table presentation (default: 2MB) - #[pyo3(get, set)] - pub max_table_bytes: usize, - /// Minimum number of table rows to display (default: 20) - #[pyo3(get, set)] - pub min_table_rows: usize, - /// Maximum length of a cell before it gets minimized (default: 25) - #[pyo3(get, set)] - pub max_cell_length: usize, - /// Maximum number of rows to display in repr string output (default: 10) - #[pyo3(get, set)] - pub max_table_rows_in_repr: usize, -} - -#[pymethods] -impl DisplayConfig { - #[new] - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] - fn new( - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) -> Self { - let default = DisplayConfig::default(); - Self { - max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), - min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), - max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), - max_table_rows_in_repr: max_table_rows_in_repr - .unwrap_or(default.max_table_rows_in_repr), - } - } -} - -impl Default for DisplayConfig { - fn default() -> Self { - Self { - max_table_bytes: 2 * 1024 * 1024, // 2 MB - min_table_rows: 20, - max_cell_length: 25, - max_table_rows_in_repr: 10, - } - } -} +use crate::utils::{get_tokio_runtime, wait_for_future}; +use crate::Dataset; -/// A PyDataFrame is a representation of a logical plan and an API to compose statements. -/// Use it to build a plan and `.collect()` to execute the plan and collect the result. -/// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. +/// Represents a DataFrame in DataFusion. 
#[pyclass(name = "DataFrame", module = "datafusion", subclass)] #[derive(Clone)] pub struct PyDataFrame { df: Arc, - config: Arc, -} - -impl PyDataFrame { - /// creates a new PyDataFrame - pub fn new(df: DataFrame) -> Self { - Self { - df: Arc::new(df), - config: Arc::new(DisplayConfig::default()), - } - } } #[pymethods] impl PyDataFrame { - /// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]` - fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult { - if let Ok(key) = key.extract::() { - // df[col] - self.select_columns(vec![key]) - } else if let Ok(tuple) = key.downcast::() { - // df[col1, col2, col3] - let keys = tuple - .iter() - .map(|item| item.extract::()) - .collect::>>()?; - self.select_columns(keys) - } else if let Ok(keys) = key.extract::>() { - // df[[col1, col2, col3]] - self.select_columns(keys) - } else { - let message = "DataFrame can only be indexed by string index or indices".to_string(); - Err(PyDataFusionError::Common(message)) - } - } - - fn __repr__(&self, py: Python) -> PyDataFusionResult { - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - self.config.min_table_rows, - self.config.max_table_rows_in_repr, - &self.config, - ), - )?; - if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); - } - - let batches_as_displ = - pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?; - - let additional_str = match has_more { - true => "\nData truncated.", - false => "", - }; - - Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) - } - - fn _repr_html_(&self, py: Python) -> PyDataFusionResult { - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - self.config.min_table_rows, - usize::MAX, - &self.config, - ), - )?; - if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); - } - - let table_uuid = uuid::Uuid::new_v4().to_string(); - - let mut html_str = " - - -
- - \n".to_string(); - - let schema = batches[0].schema(); - - let mut header = Vec::new(); - for field in schema.fields() { - header.push(format!("", field.name())); - } - let header_str = header.join(""); - html_str.push_str(&format!("{}\n", header_str)); - - let batch_formatters = batches - .iter() - .map(|batch| { - batch - .columns() - .iter() - .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) - .map(|c| { - c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) - }) - .collect::, _>>() - }) - .collect::, _>>()?; - - let rows_per_batch = batches.iter().map(|batch| batch.num_rows()); - - // We need to build up row by row for html - let mut table_row = 0; - for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) { - for batch_row in 0..num_rows_in_batch { - table_row += 1; - let mut cells = Vec::new(); - for (col, formatter) in batch_formatter.iter().enumerate() { - let cell_data = formatter.value(batch_row).to_string(); - // From testing, primitive data types do not typically get larger than 21 characters - if cell_data.len() > self.config.max_cell_length { - let short_cell_data = &cell_data[0..self.config.max_cell_length]; - cells.push(format!(" - ")); - } else { - cells.push(format!("", formatter.value(batch_row))); - } - } - let row_str = cells.join(""); - html_str.push_str(&format!("{}\n", row_str)); - } - } - html_str.push_str("
{}
-
- {short_cell_data} - {cell_data} - -
-
{}
\n"); - - html_str.push_str(" - - "); - - if has_more { - html_str.push_str("Data truncated due to size."); - } - - Ok(html_str) - } - - /// Calculate summary statistics for a DataFrame - fn describe(&self, py: Python) -> PyDataFusionResult { - let df = self.df.as_ref().clone(); - let stat_df = wait_for_future(py, df.describe())?; - Ok(Self::new(stat_df)) - } - - /// Returns the schema from the logical plan - fn schema(&self) -> PyArrowType { - PyArrowType(self.df.schema().into()) - } - - /// Convert this DataFrame into a Table that can be used in register_table - /// By convention, into_... methods consume self and return the new object. - /// Disabling the clippy lint, so we can use &self - /// because we're working with Python bindings - /// where objects are shared - /// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 - /// - we have not decided on the table_provider approach yet - #[allow(clippy::wrong_self_convention)] - fn into_view(&self) -> PyDataFusionResult { - // Call the underlying Rust DataFrame::into_view method. - // Note that the Rust method consumes self; here we clone the inner Arc - // so that we don’t invalidate this PyDataFrame. - let table_provider = self.df.as_ref().clone().into_view(); - let table_provider = PyTableProvider::new(table_provider); - - Ok(table_provider.as_table()) - } - - #[pyo3(signature = (*args))] - fn select_columns(&self, args: Vec) -> PyDataFusionResult { - let args = args.iter().map(|s| s.as_ref()).collect::>(); - let df = self.df.as_ref().clone().select_columns(&args)?; + fn select(&mut self, expr: Vec) -> PyDataFusion { + let expr = expr.into_iter().map(|e| e.into()).collect::>(); + let df = self.df.select(expr).map_err(py_datafusion_err)?; Ok(Self::new(df)) } - #[pyo3(signature = (*args))] - fn select(&self, args: Vec) -> PyDataFusionResult { - let expr = args.into_iter().map(|e| e.into()).collect(); - let df = self.df.as_ref().clone().select(expr)?; - Ok(Self::new(df)) - } - - #[pyo3(signature = (*args))] - fn drop(&self, args: Vec) -> PyDataFusionResult { - let cols = args.iter().map(|s| s.as_ref()).collect::>(); - let df = self.df.as_ref().clone().drop_columns(&cols)?; - Ok(Self::new(df)) - } - - fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { - let df = self.df.as_ref().clone().filter(predicate.into())?; + fn filter(&mut self, predicate: PyExpr) -> PyDataFusionResult { + let df = self + .df + .filter(predicate.into()) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } - fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { - let df = self.df.as_ref().clone().with_column(name, expr.into())?; + fn with_column(&mut self, name: &str, expr: PyExpr) -> PyDataFusionResult { + let df = self + .df + .with_column(name, expr.into()) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } - fn with_columns(&self, exprs: Vec) -> PyDataFusionResult { - let mut df = self.df.as_ref().clone(); + fn with_columns(&mut self, exprs: Vec) -> PyDataFusionResult { + let mut df = self.df.clone(); for expr in exprs { let expr: Expr = expr.into(); let name = format!("{}", expr.schema_name()); - df = df.with_column(name.as_str(), expr)? + df = df + .with_column(name.as_str(), expr) + .map_err(py_datafusion_err)? } Ok(Self::new(df)) } /// Rename one column by applying a new projection. This is a no-op if the column to be /// renamed does not exist. 
- fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyDataFusionResult { + fn with_column_renamed(&mut self, old_name: &str, new_name: &str) -> PyDataFusionResult { let df = self .df - .as_ref() - .clone() - .with_column_renamed(old_name, new_name)?; + .with_column_renamed(old_name, new_name) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } - fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { + fn aggregate(&mut self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); - let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; + let df = self + .df + .aggregate(group_by, aggs) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } #[pyo3(signature = (*exprs))] - fn sort(&self, exprs: Vec) -> PyDataFusionResult { + fn sort(&mut self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); - let df = self.df.as_ref().clone().sort(exprs)?; + let df = self.df.sort(exprs).map_err(py_datafusion_err)?; Ok(Self::new(df)) } #[pyo3(signature = (count, offset=0))] - fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { - let df = self.df.as_ref().clone().limit(offset, Some(count))?; + fn limit(&mut self, count: usize, offset: usize) -> PyDataFusionResult { + let df = self + .df + .limit(offset, Some(count)) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } @@ -428,23 +122,23 @@ impl PyDataFrame { /// Unless some order is specified in the plan, there is no /// guarantee of the order of the result. fn collect(&self, py: Python) -> PyResult> { - let batches = wait_for_future(py, self.df.as_ref().clone().collect()) - .map_err(PyDataFusionError::from)?; + let batches = + wait_for_future(py, self.df.clone().collect()).map_err(PyDataFusionError::from)?; // cannot use PyResult> return type due to // https://github.com/PyO3/pyo3/issues/1813 batches.into_iter().map(|rb| rb.to_pyarrow(py)).collect() } /// Cache DataFrame. - fn cache(&self, py: Python) -> PyDataFusionResult { - let df = wait_for_future(py, self.df.as_ref().clone().cache())?; + fn cache(&mut self, py: Python) -> PyDataFusionResult { + let df = wait_for_future(py, self.df.clone().cache())?; Ok(Self::new(df)) } /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch /// maintaining the input partitioning. 
fn collect_partitioned(&self, py: Python) -> PyResult>> { - let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned()) + let batches = wait_for_future(py, self.df.clone().collect_partitioned()) .map_err(PyDataFusionError::from)?; batches @@ -456,18 +150,22 @@ impl PyDataFrame { /// Print the result, 20 lines by default #[pyo3(signature = (num=20))] fn show(&self, py: Python, num: usize) -> PyDataFusionResult<()> { - let df = self.df.as_ref().clone().limit(0, Some(num))?; + let df = self + .df + .clone() + .limit(0, Some(num)) + .map_err(py_datafusion_err)?; print_dataframe(py, df) } /// Filter out duplicate rows - fn distinct(&self) -> PyDataFusionResult { - let df = self.df.as_ref().clone().distinct()?; + fn distinct(&mut self) -> PyDataFusionResult { + let df = self.df.clone().distinct().map_err(py_datafusion_err)?; Ok(Self::new(df)) } fn join( - &self, + &mut self, right: PyDataFrame, how: &str, left_on: Vec, @@ -490,18 +188,14 @@ impl PyDataFrame { let left_keys = left_on.iter().map(|s| s.as_ref()).collect::>(); let right_keys = right_on.iter().map(|s| s.as_ref()).collect::>(); - let df = self.df.as_ref().clone().join( - right.df.as_ref().clone(), - join_type, - &left_keys, - &right_keys, - None, - )?; + let df = self + .df + .join(right.df.clone(), join_type, &left_keys, &right_keys, None)?; Ok(Self::new(df)) } fn join_on( - &self, + &mut self, right: PyDataFrame, on_exprs: Vec, how: &str, @@ -523,32 +217,34 @@ impl PyDataFrame { let df = self .df - .as_ref() - .clone() - .join_on(right.df.as_ref().clone(), join_type, exprs)?; + .join_on(right.df.clone(), join_type, exprs) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } /// Print the query plan #[pyo3(signature = (verbose=false, analyze=false))] fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyDataFusionResult<()> { - let df = self.df.as_ref().clone().explain(verbose, analyze)?; + let df = self + .df + .explain(verbose, analyze) + .map_err(py_datafusion_err)?; print_dataframe(py, df) } /// Get the logical plan for this `DataFrame` fn logical_plan(&self) -> PyResult { - Ok(self.df.as_ref().clone().logical_plan().clone().into()) + Ok(self.df.logical_plan().clone().into()) } /// Get the optimized logical plan for this `DataFrame` fn optimized_logical_plan(&self) -> PyDataFusionResult { - Ok(self.df.as_ref().clone().into_optimized_plan()?.into()) + Ok(self.df.clone().into_optimized_plan()?.into()) } /// Get the execution plan for this `DataFrame` fn execution_plan(&self, py: Python) -> PyDataFusionResult { - let plan = wait_for_future(py, self.df.as_ref().clone().create_physical_plan())?; + let plan = wait_for_future(py, self.df.clone().create_physical_plan())?; Ok(plan.into()) } @@ -556,9 +252,8 @@ impl PyDataFrame { fn repartition(&self, num: usize) -> PyDataFusionResult { let new_df = self .df - .as_ref() - .clone() - .repartition(Partitioning::RoundRobinBatch(num))?; + .repartition(Partitioning::RoundRobinBatch(num)) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } @@ -568,9 +263,8 @@ impl PyDataFrame { let expr = args.into_iter().map(|py_expr| py_expr.into()).collect(); let new_df = self .df - .as_ref() - .clone() - .repartition(Partitioning::Hash(expr, num))?; + .repartition(Partitioning::Hash(expr, num)) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } @@ -580,11 +274,13 @@ impl PyDataFrame { fn union(&self, py_df: PyDataFrame, distinct: bool) -> PyDataFusionResult { let new_df = if distinct { self.df - .as_ref() - .clone() - .union_distinct(py_df.df.as_ref().clone())? 
+ .union_distinct(py_df.df.clone()) + .map_err(py_datafusion_err)? } else { - self.df.as_ref().clone().union(py_df.df.as_ref().clone())? + self.df + .clone() + .union(py_df.df.clone()) + .map_err(py_datafusion_err)? }; Ok(Self::new(new_df)) @@ -595,9 +291,8 @@ impl PyDataFrame { fn union_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df - .as_ref() - .clone() - .union_distinct(py_df.df.as_ref().clone())?; + .union_distinct(py_df.df.clone()) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } @@ -608,9 +303,8 @@ impl PyDataFrame { let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); let df = self .df - .as_ref() - .clone() - .unnest_columns_with_options(&[column], unnest_options)?; + .unnest_columns_with_options(&[column], unnest_options) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } @@ -626,9 +320,8 @@ impl PyDataFrame { let cols = columns.iter().map(|s| s.as_ref()).collect::>(); let df = self .df - .as_ref() - .clone() - .unnest_columns_with_options(&cols, unnest_options)?; + .unnest_columns_with_options(&cols, unnest_options) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } @@ -636,15 +329,18 @@ impl PyDataFrame { fn intersect(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df - .as_ref() - .clone() - .intersect(py_df.df.as_ref().clone())?; + .intersect(py_df.df.clone()) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { - let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; + let new_df = self + .df + .clone() + .except(py_df.df.clone()) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } @@ -656,11 +352,9 @@ impl PyDataFrame { }; wait_for_future( py, - self.df.as_ref().clone().write_csv( - path, - DataFrameWriteOptions::new(), - Some(csv_options), - ), + self.df + .clone() + .write_csv(path, DataFrameWriteOptions::new(), Some(csv_options)), )?; Ok(()) } @@ -717,7 +411,7 @@ impl PyDataFrame { wait_for_future( py, - self.df.as_ref().clone().write_parquet( + self.df.clone().write_parquet( path, DataFrameWriteOptions::new(), Option::from(options), @@ -731,7 +425,6 @@ impl PyDataFrame { wait_for_future( py, self.df - .as_ref() .clone() .write_json(path, DataFrameWriteOptions::new(), None), )?; @@ -757,7 +450,7 @@ impl PyDataFrame { py: Python<'py>, requested_schema: Option>, ) -> PyDataFusionResult> { - let mut batches = wait_for_future(py, self.df.as_ref().clone().collect())?; + let mut batches = wait_for_future(py, self.df.clone().collect())?; let mut schema: Schema = self.df.schema().to_owned().into(); if let Some(schema_capsule) = requested_schema { @@ -787,7 +480,7 @@ impl PyDataFrame { fn execute_stream(&self, py: Python) -> PyDataFusionResult { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; - let df = self.df.as_ref().clone(); + let df = self.df.clone(); let fut: JoinHandle> = rt.spawn(async move { df.execute_stream().await }); let stream = wait_for_future(py, fut).map_err(py_datafusion_err)?; @@ -797,7 +490,7 @@ impl PyDataFrame { fn execute_stream_partitioned(&self, py: Python) -> PyResult> { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; - let df = self.df.as_ref().clone(); + let df = self.df.clone(); let fut: JoinHandle>> = rt.spawn(async move { df.execute_stream_partitioned().await }); let stream = wait_for_future(py, 
fut).map_err(py_datafusion_err)?; @@ -852,210 +545,43 @@ impl PyDataFrame { // Executes this DataFrame to get the total number of rows. fn count(&self, py: Python) -> PyDataFusionResult { - Ok(wait_for_future(py, self.df.as_ref().clone().count())?) - } - - /// Get the current display configuration - #[getter] - fn display_config(&self) -> PyResult> { - Python::with_gil(|py| { - let config = (*self.config).clone(); - Py::new(py, config) - }) - } - - /// Update display configuration - #[pyo3(signature = ( - max_table_bytes=None, - min_table_rows=None, - max_cell_length=None, - max_table_rows_in_repr=None - ))] - fn configure_display( - &mut self, - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) { - let mut new_config = (*self.config).clone(); - - if let Some(bytes) = max_table_bytes { - new_config.max_table_bytes = bytes; - } - - if let Some(rows) = min_table_rows { - new_config.min_table_rows = rows; - } - - if let Some(length) = max_cell_length { - new_config.max_cell_length = length; - } - - if let Some(rows) = max_table_rows_in_repr { - new_config.max_table_rows_in_repr = rows; - } - - self.config = Arc::new(new_config); + Ok(wait_for_future(py, self.df.clone().count())?) } - /// Reset display configuration to default values - #[pyo3(text_signature = "($self)")] - fn reset_display_config(&mut self) { - self.config = Arc::new(DisplayConfig::default()); - } -} - -/// Print DataFrame -fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> { - // Get string representation of record batches - let batches = wait_for_future(py, df.collect())?; - let batches_as_string = pretty::pretty_format_batches(&batches); - let result = match batches_as_string { - Ok(batch) => format!("DataFrame()\n{batch}"), - Err(err) => format!("Error: {:?}", err.to_string()), - }; - - // Import the Python 'builtins' module to access the print function - // Note that println! 
does not print to the Python debug console and is not visible in notebooks for instance - let print = py.import("builtins")?.getattr("print")?; - print.call1((result,))?; - Ok(()) -} + #[pyo3(signature = (max_width=None, max_rows=None, show_nulls=None))] + pub fn to_string( + &self, + max_width: Option, + max_rows: Option, + show_nulls: Option, + py: Python, + ) -> PyDataFusionResult { + let batches = wait_for_future(py, self.df.clone().collect())?; -fn project_schema(from_schema: Schema, to_schema: Schema) -> Result { - let merged_schema = Schema::try_merge(vec![from_schema, to_schema.clone()])?; + let mut table = TableData::new(&batches)?; - let project_indices: Vec = to_schema - .fields - .iter() - .map(|field| field.name()) - .filter_map(|field_name| merged_schema.index_of(field_name).ok()) - .collect(); + // Use the display configuration provided or default values + let max_width = max_width.unwrap_or(80); + let max_rows = max_rows; + let show_nulls = show_nulls.unwrap_or(false); - merged_schema.project(&project_indices) -} + table.set_display_options(max_width, max_rows, show_nulls); -fn record_batch_into_schema( - record_batch: RecordBatch, - schema: &Schema, -) -> Result { - let schema = Arc::new(schema.clone()); - let base_schema = record_batch.schema(); - if base_schema.fields().len() == 0 { - // Nothing to project - return Ok(RecordBatch::new_empty(schema)); + Ok(table.to_string()) } - let array_size = record_batch.column(0).len(); - let mut data_arrays = Vec::with_capacity(schema.fields().len()); - - for field in schema.fields() { - let desired_data_type = field.data_type(); - if let Some(original_data) = record_batch.column_by_name(field.name()) { - let original_data_type = original_data.data_type(); - - if can_cast_types(original_data_type, desired_data_type) { - data_arrays.push(arrow::compute::kernels::cast( - original_data, - desired_data_type, - )?); - } else if field.is_nullable() { - data_arrays.push(new_null_array(desired_data_type, array_size)); - } else { - return Err(ArrowError::CastError(format!("Attempting to cast to non-nullable and non-castable field {} during schema projection.", field.name()))); - } - } else { - if !field.is_nullable() { - return Err(ArrowError::CastError(format!( - "Attempting to set null to non-nullable field {} during schema projection.", - field.name() - ))); - } - data_arrays.push(new_null_array(desired_data_type, array_size)); - } + pub fn __repr__(&self, py: Python) -> PyDataFusionResult { + // Use default display configuration + self.to_string(None, None, None, py) } - - RecordBatch::try_new(schema, data_arrays) } -/// This is a helper function to return the first non-empty record batch from executing a DataFrame. -/// It additionally returns a bool, which indicates if there are more record batches available. -/// We do this so we can determine if we should indicate to the user that the data has been -/// truncated. This collects until we have achived both of these two conditions -/// -/// - We have collected our minimum number of rows -/// - We have reached our limit, either data size or maximum number of rows -/// -/// Otherwise it will return when the stream has exhausted. If you want a specific number of -/// rows, set min_rows == max_rows. 
-async fn collect_record_batches_to_display( - df: DataFrame, - min_rows: usize, - max_rows: usize, - config: &DisplayConfig, -) -> Result<(Vec, bool), DataFusionError> { - let partitioned_stream = df.execute_stream_partitioned().await?; - let mut stream = futures::stream::iter(partitioned_stream).flatten(); - let mut size_estimate_so_far = 0; - let mut rows_so_far = 0; - let mut record_batches = Vec::default(); - let mut has_more = false; - - while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) - || rows_so_far < min_rows - { - let mut rb = match stream.next().await { - None => { - break; - } - Some(Ok(r)) => r, - Some(Err(e)) => return Err(e), - }; - - let mut rows_in_rb = rb.num_rows(); - if rows_in_rb > 0 { - size_estimate_so_far += rb.get_array_memory_size(); - - if size_estimate_so_far > config.max_table_bytes { - let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; - let total_rows = rows_in_rb + rows_so_far; - - let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; - if reduced_row_num < min_rows { - reduced_row_num = min_rows.min(total_rows); - } - - let limited_rows_this_rb = reduced_row_num - rows_so_far; - if limited_rows_this_rb < rows_in_rb { - rows_in_rb = limited_rows_this_rb; - rb = rb.slice(0, limited_rows_this_rb); - has_more = true; - } - } - - if rows_in_rb + rows_so_far > max_rows { - rb = rb.slice(0, max_rows - rows_so_far); - has_more = true; - } - - rows_so_far += rb.num_rows(); - record_batches.push(rb); - } - } - - if record_batches.is_empty() { - return Ok((Vec::default(), false)); +impl PyDataFrame { + pub fn new(df: DataFrame) -> Self { + Self { df: Arc::new(df) } } - if !has_more { - // Data was not already truncated, so check to see if more record batches remain - has_more = match stream.try_next().await { - Ok(None) => false, // reached end - Ok(Some(_)) => true, - Err(_) => false, // Stream disconnected - }; + pub fn dataframe(&self) -> Arc { + self.df.clone() } - - Ok((record_batches, has_more)) } From a5d224f4b388059fc11ffb4091d5f85d7f3b52d1 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 31 Mar 2025 19:11:29 +0800 Subject: [PATCH 19/51] Revert "Refactor PyDataFrame: Simplify methods and improve performance" This reverts commit 0e30af3409a82a4924fd450e63c613b738fec0c9. 
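The function deleted just above (and restored by this revert) trims output to a byte budget by scaling the row count. In isolation, its slicing arithmetic behaves like the following standalone sketch; the function name and the example numbers are illustrative, but the formula is the one from the deleted code:

    def rows_to_keep(total_rows, size_estimate, max_table_bytes, min_rows):
        # Shrink the row count in proportion to how far the size estimate
        # overshoots the byte budget, but never below min_rows.
        if size_estimate <= max_table_bytes:
            return total_rows
        ratio = max_table_bytes / size_estimate
        reduced = round(total_rows * ratio)
        return max(reduced, min(min_rows, total_rows))

    # 1000 rows estimated at 8 MB against the default 2 MB budget:
    assert rows_to_keep(1000, 8 * 2**20, 2 * 2**20, 20) == 250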
--- src/context.rs | 279 ++++++++++++++---- src/dataframe.rs | 738 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 830 insertions(+), 187 deletions(-) diff --git a/src/context.rs b/src/context.rs index 6d5e078d3..0db0f4d7e 100644 --- a/src/context.rs +++ b/src/context.rs @@ -72,59 +72,24 @@ use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; use tokio::task::JoinHandle; -/// Display configuration for DataFrames -#[pyclass(name = "DisplayConfig", module = "datafusion", subclass)] -#[derive(Clone, Debug)] -pub struct DisplayConfig { - #[pyo3(get, set)] - pub max_width: usize, - #[pyo3(get, set)] - pub max_rows: Option, - #[pyo3(get, set)] - pub show_nulls: bool, -} - -#[pymethods] -impl DisplayConfig { - #[new] - pub fn new( - max_width: Option, - max_rows: Option, - show_nulls: Option, - ) -> Self { - Self { - max_width: max_width.unwrap_or(80), - max_rows, - show_nulls: show_nulls.unwrap_or(false), - } - } -} - /// Configuration options for a SessionContext #[pyclass(name = "SessionConfig", module = "datafusion", subclass)] #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, - pub display_config: DisplayConfig, } impl From for PySessionConfig { fn from(config: SessionConfig) -> Self { - Self { - config, - display_config: DisplayConfig::new(Some(80), None, Some(false)), - } + Self { config } } } #[pymethods] impl PySessionConfig { - #[pyo3(signature = (config_options=None, display_config=None))] + #[pyo3(signature = (config_options=None))] #[new] - fn new( - config_options: Option>, - display_config: Option, - ) -> Self { + fn new(config_options: Option>) -> Self { let mut config = SessionConfig::new(); if let Some(hash_map) = config_options { for (k, v) in &hash_map { @@ -132,23 +97,7 @@ impl PySessionConfig { } } - Self { - config, - display_config: display_config - .unwrap_or_else(|| DisplayConfig::new(Some(80), None, Some(false))), - } - } - - // Get the display configuration - pub fn get_display_config(&self) -> DisplayConfig { - self.display_config.clone() - } - - // Set the display configuration - pub fn with_display_config(&self, display_config: DisplayConfig) -> Self { - let mut new_config = self.clone(); - new_config.display_config = display_config; - new_config + Self { config } } fn with_create_default_catalog_and_schema(&self, enabled: bool) -> Self { @@ -726,6 +675,226 @@ impl PySessionContext { ))); } + let mut options = CsvReadOptions::new() + .has_header(has_header) + .delimiter(delimiter[0]) + .schema_infer_max_records(schema_infer_max_records) + .file_extension(file_extension) + .file_compression_type(parse_file_compression_type(file_compression_type)?); + options.schema = schema.as_ref().map(|x| &x.0); + + if path.is_instance_of::() { + let paths = path.extract::>()?; + let result = self.register_csv_from_multiple_paths(name, paths, options); + wait_for_future(py, result)?; + } else { + let path = path.extract::()?; + let result = self.ctx.register_csv(name, &path, options); + wait_for_future(py, result)?; + } + + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = (name, + path, + schema=None, + schema_infer_max_records=1000, + file_extension=".json", + table_partition_cols=vec![], + file_compression_type=None))] + pub fn register_json( + &mut self, + name: &str, + path: PathBuf, + schema: Option>, + schema_infer_max_records: usize, + file_extension: &str, + table_partition_cols: Vec<(String, String)>, + 
file_compression_type: Option, + py: Python, + ) -> PyDataFusionResult<()> { + let path = path + .to_str() + .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; + + let mut options = NdJsonReadOptions::default() + .file_compression_type(parse_file_compression_type(file_compression_type)?) + .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); + options.schema_infer_max_records = schema_infer_max_records; + options.file_extension = file_extension; + options.schema = schema.as_ref().map(|x| &x.0); + + let result = self.ctx.register_json(name, path, options); + wait_for_future(py, result)?; + + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = (name, + path, + schema=None, + file_extension=".avro", + table_partition_cols=vec![]))] + pub fn register_avro( + &mut self, + name: &str, + path: PathBuf, + schema: Option>, + file_extension: &str, + table_partition_cols: Vec<(String, String)>, + py: Python, + ) -> PyDataFusionResult<()> { + let path = path + .to_str() + .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; + + let mut options = AvroReadOptions::default() + .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); + options.file_extension = file_extension; + options.schema = schema.as_ref().map(|x| &x.0); + + let result = self.ctx.register_avro(name, path, options); + wait_for_future(py, result)?; + + Ok(()) + } + + // Registers a PyArrow.Dataset + pub fn register_dataset( + &self, + name: &str, + dataset: &Bound<'_, PyAny>, + py: Python, + ) -> PyDataFusionResult<()> { + let table: Arc = Arc::new(Dataset::new(dataset, py)?); + + self.ctx.register_table(name, table)?; + + Ok(()) + } + + pub fn register_udf(&mut self, udf: PyScalarUDF) -> PyResult<()> { + self.ctx.register_udf(udf.function); + Ok(()) + } + + pub fn register_udaf(&mut self, udaf: PyAggregateUDF) -> PyResult<()> { + self.ctx.register_udaf(udaf.function); + Ok(()) + } + + pub fn register_udwf(&mut self, udwf: PyWindowUDF) -> PyResult<()> { + self.ctx.register_udwf(udwf.function); + Ok(()) + } + + #[pyo3(signature = (name="datafusion"))] + pub fn catalog(&self, name: &str) -> PyResult { + match self.ctx.catalog(name) { + Some(catalog) => Ok(PyCatalog::new(catalog)), + None => Err(PyKeyError::new_err(format!( + "Catalog with name {} doesn't exist.", + &name, + ))), + } + } + + pub fn tables(&self) -> HashSet { + self.ctx + .catalog_names() + .into_iter() + .filter_map(|name| self.ctx.catalog(&name)) + .flat_map(move |catalog| { + catalog + .schema_names() + .into_iter() + .filter_map(move |name| catalog.schema(&name)) + }) + .flat_map(|schema| schema.table_names()) + .collect() + } + + pub fn table(&self, name: &str, py: Python) -> PyResult { + let x = wait_for_future(py, self.ctx.table(name)) + .map_err(|e| PyKeyError::new_err(e.to_string()))?; + Ok(PyDataFrame::new(x)) + } + + pub fn table_exist(&self, name: &str) -> PyDataFusionResult { + Ok(self.ctx.table_exist(name)?) 
+ } + + pub fn empty_table(&self) -> PyDataFusionResult { + Ok(PyDataFrame::new(self.ctx.read_empty()?)) + } + + pub fn session_id(&self) -> String { + self.ctx.session_id() + } + + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = (path, schema=None, schema_infer_max_records=1000, file_extension=".json", table_partition_cols=vec![], file_compression_type=None))] + pub fn read_json( + &mut self, + path: PathBuf, + schema: Option>, + schema_infer_max_records: usize, + file_extension: &str, + table_partition_cols: Vec<(String, String)>, + file_compression_type: Option, + py: Python, + ) -> PyDataFusionResult { + let path = path + .to_str() + .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; + let mut options = NdJsonReadOptions::default() + .table_partition_cols(convert_table_partition_cols(table_partition_cols)?) + .file_compression_type(parse_file_compression_type(file_compression_type)?); + options.schema_infer_max_records = schema_infer_max_records; + options.file_extension = file_extension; + let df = if let Some(schema) = schema { + options.schema = Some(&schema.0); + let result = self.ctx.read_json(path, options); + wait_for_future(py, result)? + } else { + let result = self.ctx.read_json(path, options); + wait_for_future(py, result)? + }; + Ok(PyDataFrame::new(df)) + } + + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = ( + path, + schema=None, + has_header=true, + delimiter=",", + schema_infer_max_records=1000, + file_extension=".csv", + table_partition_cols=vec![], + file_compression_type=None))] + pub fn read_csv( + &self, + path: &Bound<'_, PyAny>, + schema: Option>, + has_header: bool, + delimiter: &str, + schema_infer_max_records: usize, + file_extension: &str, + table_partition_cols: Vec<(String, String)>, + file_compression_type: Option, + py: Python, + ) -> PyDataFusionResult { + let delimiter = delimiter.as_bytes(); + if delimiter.len() != 1 { + return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( + "Delimiter must be a single character", + ))); + }; + let mut options = CsvReadOptions::new() .has_header(has_header) .delimiter(delimiter[0]) diff --git a/src/dataframe.rs b/src/dataframe.rs index 50227c3a6..cda4dd690 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -15,106 +15,412 @@ // specific language governing permissions and limitations // under the License. 
-use std::collections::HashMap; +use std::ffi::CString; use std::sync::Arc; -use datafusion::arrow::csv::WriterBuilder; -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::pyarrow::FromPyArrow; -use datafusion::arrow::pyarrow::PyArrowType; -use datafusion::arrow::record_batch::RecordBatch; -use datafusion::common::TableReference; -use datafusion::prelude::DataFrame; - -use pyo3::exceptions::PyTypeError; +use arrow::array::{new_null_array, RecordBatch, RecordBatchIterator, RecordBatchReader}; +use arrow::compute::can_cast_types; +use arrow::error::ArrowError; +use arrow::ffi::FFI_ArrowSchema; +use arrow::ffi_stream::FFI_ArrowArrayStream; +use arrow::util::display::{ArrayFormatter, FormatOptions}; +use datafusion::arrow::datatypes::Schema; +use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; +use datafusion::arrow::util::pretty; +use datafusion::common::UnnestOptions; +use datafusion::config::{CsvOptions, TableParquetOptions}; +use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; +use datafusion::datasource::TableProvider; +use datafusion::error::DataFusionError; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; +use datafusion::prelude::*; +use futures::{StreamExt, TryStreamExt}; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -use pyo3::types::{PyList, PyString, PyTuple}; +use pyo3::pybacked::PyBackedStr; +use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; +use tokio::task::JoinHandle; -use crate::errors::{py_datafusion_err, PyDataFusionError, PyDataFusionResult}; -use crate::expr::expr::PyExpr; -use crate::expr::window_expr::PyWindowExpr; +use crate::catalog::PyTable; +use crate::errors::{py_datafusion_err, PyDataFusionError}; +use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; -use crate::record_batch::{PyRecordBatch, TableData}; +use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; -use crate::utils::{get_tokio_runtime, wait_for_future}; -use crate::Dataset; +use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; +use crate::{ + errors::PyDataFusionResult, + expr::{sort_expr::PySortExpr, PyExpr}, +}; + +// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 +// - we have not decided on the table_provider approach yet +// this is an interim implementation +#[pyclass(name = "TableProvider", module = "datafusion")] +pub struct PyTableProvider { + provider: Arc, +} + +impl PyTableProvider { + pub fn new(provider: Arc) -> Self { + Self { provider } + } + + pub fn as_table(&self) -> PyTable { + let table_provider: Arc = self.provider.clone(); + PyTable::new(table_provider) + } +} + +/// Configuration for DataFrame display in Python environment +#[pyclass(name = "DisplayConfig", module = "datafusion")] +#[derive(Debug, Clone)] +pub struct DisplayConfig { + /// Maximum bytes to display for table presentation (default: 2MB) + #[pyo3(get, set)] + pub max_table_bytes: usize, + /// Minimum number of table rows to display (default: 20) + #[pyo3(get, set)] + pub min_table_rows: usize, + /// Maximum length of a cell before it gets minimized (default: 25) + #[pyo3(get, set)] + pub max_cell_length: usize, + /// Maximum number of rows to display in repr string output (default: 10) + #[pyo3(get, set)] + pub max_table_rows_in_repr: usize, +} + +#[pymethods] +impl DisplayConfig { + #[new] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, 
max_cell_length=None, max_table_rows_in_repr=None))] + fn new( + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + max_table_rows_in_repr: Option, + ) -> Self { + let default = DisplayConfig::default(); + Self { + max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), + min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), + max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), + max_table_rows_in_repr: max_table_rows_in_repr + .unwrap_or(default.max_table_rows_in_repr), + } + } +} + +impl Default for DisplayConfig { + fn default() -> Self { + Self { + max_table_bytes: 2 * 1024 * 1024, // 2 MB + min_table_rows: 20, + max_cell_length: 25, + max_table_rows_in_repr: 10, + } + } +} -/// Represents a DataFrame in DataFusion. +/// A PyDataFrame is a representation of a logical plan and an API to compose statements. +/// Use it to build a plan and `.collect()` to execute the plan and collect the result. +/// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. #[pyclass(name = "DataFrame", module = "datafusion", subclass)] #[derive(Clone)] pub struct PyDataFrame { df: Arc, + config: Arc, +} + +impl PyDataFrame { + /// creates a new PyDataFrame + pub fn new(df: DataFrame) -> Self { + Self { + df: Arc::new(df), + config: Arc::new(DisplayConfig::default()), + } + } } #[pymethods] impl PyDataFrame { - fn select(&mut self, expr: Vec) -> PyDataFusion { - let expr = expr.into_iter().map(|e| e.into()).collect::>(); - let df = self.df.select(expr).map_err(py_datafusion_err)?; + /// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]` + fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult { + if let Ok(key) = key.extract::() { + // df[col] + self.select_columns(vec![key]) + } else if let Ok(tuple) = key.downcast::() { + // df[col1, col2, col3] + let keys = tuple + .iter() + .map(|item| item.extract::()) + .collect::>>()?; + self.select_columns(keys) + } else if let Ok(keys) = key.extract::>() { + // df[[col1, col2, col3]] + self.select_columns(keys) + } else { + let message = "DataFrame can only be indexed by string index or indices".to_string(); + Err(PyDataFusionError::Common(message)) + } + } + + fn __repr__(&self, py: Python) -> PyDataFusionResult { + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + self.config.min_table_rows, + self.config.max_table_rows_in_repr, + &self.config, + ), + )?; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); + } + + let batches_as_displ = + pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?; + + let additional_str = match has_more { + true => "\nData truncated.", + false => "", + }; + + Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) + } + + fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + self.config.min_table_rows, + usize::MAX, + &self.config, + ), + )?; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); + } + + let table_uuid = uuid::Uuid::new_v4().to_string(); + + let mut html_str = " + + +
+            <style>
+            .expandable-container {
+                display: inline-block;
+                max-width: 200px;
+            }
+            .expandable {
+                white-space: nowrap;
+                overflow: hidden;
+                text-overflow: ellipsis;
+                display: block;
+            }
+            .full-text {
+                display: none;
+                white-space: normal;
+            }
+            .expand-btn {
+                cursor: pointer;
+                color: blue;
+                text-decoration: underline;
+                border: none;
+                background: none;
+            }
+            </style>
+
+            <div style=\"width: 100%; max-width: 1000px; max-height: 300px; overflow: auto; border: 1px solid #ccc;\">
+                <table style=\"border-collapse: collapse; min-width: 100%\">
+                    <thead>\n".to_string();
+
+        let schema = batches[0].schema();
+
+        let mut header = Vec::new();
+        for field in schema.fields() {
+            header.push(format!("<th style='border: 1px solid black; padding: 8px; text-align: left; white-space: nowrap;'>{}</th>", field.name()));
+        }
+        let header_str = header.join("");
+        html_str.push_str(&format!("<tr>{}</tr></thead><tbody>\n", header_str));
+
+        let batch_formatters = batches
+            .iter()
+            .map(|batch| {
+                batch
+                    .columns()
+                    .iter()
+                    .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default()))
+                    .map(|c| {
+                        c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string())))
+                    })
+                    .collect::<Result<Vec<_>, _>>()
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        let rows_per_batch = batches.iter().map(|batch| batch.num_rows());
+
+        // We need to build up row by row for html
+        let mut table_row = 0;
+        for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) {
+            for batch_row in 0..num_rows_in_batch {
+                table_row += 1;
+                let mut cells = Vec::new();
+                for (col, formatter) in batch_formatter.iter().enumerate() {
+                    let cell_data = formatter.value(batch_row).to_string();
+                    // From testing, primitive data types do not typically get larger than 21 characters
+                    if cell_data.len() > self.config.max_cell_length {
+                        let short_cell_data = &cell_data[0..self.config.max_cell_length];
+                        cells.push(format!("
+                            <td style='border: 1px solid black; padding: 8px; text-align: left; white-space: nowrap;'>
+                                <div class=\"expandable-container\">
+                                    <span class=\"expandable\" id=\"{table_uuid}-min-text-{table_row}-{col}\">{short_cell_data}</span>
+                                    <span class=\"full-text\" id=\"{table_uuid}-full-text-{table_row}-{col}\">{cell_data}</span>
+                                    <button class=\"expand-btn\" onclick=\"toggleDataFrameCellText('{table_uuid}',{table_row},{col})\">...</button>
+                                </div>
+                            </td>"));
+                    } else {
+                        cells.push(format!("<td style='border: 1px solid black; padding: 8px; text-align: left; white-space: nowrap;'>{}</td>", formatter.value(batch_row)));
+                    }
+                }
+                let row_str = cells.join("");
+                html_str.push_str(&format!("<tr>{}</tr>\n", row_str));
+            }
+        }
+        html_str.push_str("</tbody></table></div>\n");
\n"); + + html_str.push_str(" + + "); + + if has_more { + html_str.push_str("Data truncated due to size."); + } + + Ok(html_str) + } + + /// Calculate summary statistics for a DataFrame + fn describe(&self, py: Python) -> PyDataFusionResult { + let df = self.df.as_ref().clone(); + let stat_df = wait_for_future(py, df.describe())?; + Ok(Self::new(stat_df)) + } + + /// Returns the schema from the logical plan + fn schema(&self) -> PyArrowType { + PyArrowType(self.df.schema().into()) + } + + /// Convert this DataFrame into a Table that can be used in register_table + /// By convention, into_... methods consume self and return the new object. + /// Disabling the clippy lint, so we can use &self + /// because we're working with Python bindings + /// where objects are shared + /// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 + /// - we have not decided on the table_provider approach yet + #[allow(clippy::wrong_self_convention)] + fn into_view(&self) -> PyDataFusionResult { + // Call the underlying Rust DataFrame::into_view method. + // Note that the Rust method consumes self; here we clone the inner Arc + // so that we don’t invalidate this PyDataFrame. + let table_provider = self.df.as_ref().clone().into_view(); + let table_provider = PyTableProvider::new(table_provider); + + Ok(table_provider.as_table()) + } + + #[pyo3(signature = (*args))] + fn select_columns(&self, args: Vec) -> PyDataFusionResult { + let args = args.iter().map(|s| s.as_ref()).collect::>(); + let df = self.df.as_ref().clone().select_columns(&args)?; Ok(Self::new(df)) } - fn filter(&mut self, predicate: PyExpr) -> PyDataFusionResult { - let df = self - .df - .filter(predicate.into()) - .map_err(py_datafusion_err)?; + #[pyo3(signature = (*args))] + fn select(&self, args: Vec) -> PyDataFusionResult { + let expr = args.into_iter().map(|e| e.into()).collect(); + let df = self.df.as_ref().clone().select(expr)?; Ok(Self::new(df)) } - fn with_column(&mut self, name: &str, expr: PyExpr) -> PyDataFusionResult { - let df = self - .df - .with_column(name, expr.into()) - .map_err(py_datafusion_err)?; + #[pyo3(signature = (*args))] + fn drop(&self, args: Vec) -> PyDataFusionResult { + let cols = args.iter().map(|s| s.as_ref()).collect::>(); + let df = self.df.as_ref().clone().drop_columns(&cols)?; + Ok(Self::new(df)) + } + + fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { + let df = self.df.as_ref().clone().filter(predicate.into())?; Ok(Self::new(df)) } - fn with_columns(&mut self, exprs: Vec) -> PyDataFusionResult { - let mut df = self.df.clone(); + fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { + let df = self.df.as_ref().clone().with_column(name, expr.into())?; + Ok(Self::new(df)) + } + + fn with_columns(&self, exprs: Vec) -> PyDataFusionResult { + let mut df = self.df.as_ref().clone(); for expr in exprs { let expr: Expr = expr.into(); let name = format!("{}", expr.schema_name()); - df = df - .with_column(name.as_str(), expr) - .map_err(py_datafusion_err)? + df = df.with_column(name.as_str(), expr)? } Ok(Self::new(df)) } /// Rename one column by applying a new projection. This is a no-op if the column to be /// renamed does not exist. 
- fn with_column_renamed(&mut self, old_name: &str, new_name: &str) -> PyDataFusionResult { + fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyDataFusionResult { let df = self .df - .with_column_renamed(old_name, new_name) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .with_column_renamed(old_name, new_name)?; Ok(Self::new(df)) } - fn aggregate(&mut self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { + fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); - let df = self - .df - .aggregate(group_by, aggs) - .map_err(py_datafusion_err)?; + let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; Ok(Self::new(df)) } #[pyo3(signature = (*exprs))] - fn sort(&mut self, exprs: Vec) -> PyDataFusionResult { + fn sort(&self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); - let df = self.df.sort(exprs).map_err(py_datafusion_err)?; + let df = self.df.as_ref().clone().sort(exprs)?; Ok(Self::new(df)) } #[pyo3(signature = (count, offset=0))] - fn limit(&mut self, count: usize, offset: usize) -> PyDataFusionResult { - let df = self - .df - .limit(offset, Some(count)) - .map_err(py_datafusion_err)?; + fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { + let df = self.df.as_ref().clone().limit(offset, Some(count))?; Ok(Self::new(df)) } @@ -122,23 +428,23 @@ impl PyDataFrame { /// Unless some order is specified in the plan, there is no /// guarantee of the order of the result. fn collect(&self, py: Python) -> PyResult> { - let batches = - wait_for_future(py, self.df.clone().collect()).map_err(PyDataFusionError::from)?; + let batches = wait_for_future(py, self.df.as_ref().clone().collect()) + .map_err(PyDataFusionError::from)?; // cannot use PyResult> return type due to // https://github.com/PyO3/pyo3/issues/1813 batches.into_iter().map(|rb| rb.to_pyarrow(py)).collect() } /// Cache DataFrame. - fn cache(&mut self, py: Python) -> PyDataFusionResult { - let df = wait_for_future(py, self.df.clone().cache())?; + fn cache(&self, py: Python) -> PyDataFusionResult { + let df = wait_for_future(py, self.df.as_ref().clone().cache())?; Ok(Self::new(df)) } /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch /// maintaining the input partitioning. 
fn collect_partitioned(&self, py: Python) -> PyResult>> { - let batches = wait_for_future(py, self.df.clone().collect_partitioned()) + let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned()) .map_err(PyDataFusionError::from)?; batches @@ -150,22 +456,18 @@ impl PyDataFrame { /// Print the result, 20 lines by default #[pyo3(signature = (num=20))] fn show(&self, py: Python, num: usize) -> PyDataFusionResult<()> { - let df = self - .df - .clone() - .limit(0, Some(num)) - .map_err(py_datafusion_err)?; + let df = self.df.as_ref().clone().limit(0, Some(num))?; print_dataframe(py, df) } /// Filter out duplicate rows - fn distinct(&mut self) -> PyDataFusionResult { - let df = self.df.clone().distinct().map_err(py_datafusion_err)?; + fn distinct(&self) -> PyDataFusionResult { + let df = self.df.as_ref().clone().distinct()?; Ok(Self::new(df)) } fn join( - &mut self, + &self, right: PyDataFrame, how: &str, left_on: Vec, @@ -188,14 +490,18 @@ impl PyDataFrame { let left_keys = left_on.iter().map(|s| s.as_ref()).collect::>(); let right_keys = right_on.iter().map(|s| s.as_ref()).collect::>(); - let df = self - .df - .join(right.df.clone(), join_type, &left_keys, &right_keys, None)?; + let df = self.df.as_ref().clone().join( + right.df.as_ref().clone(), + join_type, + &left_keys, + &right_keys, + None, + )?; Ok(Self::new(df)) } fn join_on( - &mut self, + &self, right: PyDataFrame, on_exprs: Vec, how: &str, @@ -217,34 +523,32 @@ impl PyDataFrame { let df = self .df - .join_on(right.df.clone(), join_type, exprs) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .join_on(right.df.as_ref().clone(), join_type, exprs)?; Ok(Self::new(df)) } /// Print the query plan #[pyo3(signature = (verbose=false, analyze=false))] fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyDataFusionResult<()> { - let df = self - .df - .explain(verbose, analyze) - .map_err(py_datafusion_err)?; + let df = self.df.as_ref().clone().explain(verbose, analyze)?; print_dataframe(py, df) } /// Get the logical plan for this `DataFrame` fn logical_plan(&self) -> PyResult { - Ok(self.df.logical_plan().clone().into()) + Ok(self.df.as_ref().clone().logical_plan().clone().into()) } /// Get the optimized logical plan for this `DataFrame` fn optimized_logical_plan(&self) -> PyDataFusionResult { - Ok(self.df.clone().into_optimized_plan()?.into()) + Ok(self.df.as_ref().clone().into_optimized_plan()?.into()) } /// Get the execution plan for this `DataFrame` fn execution_plan(&self, py: Python) -> PyDataFusionResult { - let plan = wait_for_future(py, self.df.clone().create_physical_plan())?; + let plan = wait_for_future(py, self.df.as_ref().clone().create_physical_plan())?; Ok(plan.into()) } @@ -252,8 +556,9 @@ impl PyDataFrame { fn repartition(&self, num: usize) -> PyDataFusionResult { let new_df = self .df - .repartition(Partitioning::RoundRobinBatch(num)) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .repartition(Partitioning::RoundRobinBatch(num))?; Ok(Self::new(new_df)) } @@ -263,8 +568,9 @@ impl PyDataFrame { let expr = args.into_iter().map(|py_expr| py_expr.into()).collect(); let new_df = self .df - .repartition(Partitioning::Hash(expr, num)) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .repartition(Partitioning::Hash(expr, num))?; Ok(Self::new(new_df)) } @@ -274,13 +580,11 @@ impl PyDataFrame { fn union(&self, py_df: PyDataFrame, distinct: bool) -> PyDataFusionResult { let new_df = if distinct { self.df - .union_distinct(py_df.df.clone()) - .map_err(py_datafusion_err)? 
- } else { - self.df + .as_ref() .clone() - .union(py_df.df.clone()) - .map_err(py_datafusion_err)? + .union_distinct(py_df.df.as_ref().clone())? + } else { + self.df.as_ref().clone().union(py_df.df.as_ref().clone())? }; Ok(Self::new(new_df)) @@ -291,8 +595,9 @@ impl PyDataFrame { fn union_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df - .union_distinct(py_df.df.clone()) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .union_distinct(py_df.df.as_ref().clone())?; Ok(Self::new(new_df)) } @@ -303,8 +608,9 @@ impl PyDataFrame { let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); let df = self .df - .unnest_columns_with_options(&[column], unnest_options) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .unnest_columns_with_options(&[column], unnest_options)?; Ok(Self::new(df)) } @@ -320,8 +626,9 @@ impl PyDataFrame { let cols = columns.iter().map(|s| s.as_ref()).collect::>(); let df = self .df - .unnest_columns_with_options(&cols, unnest_options) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .unnest_columns_with_options(&cols, unnest_options)?; Ok(Self::new(df)) } @@ -329,18 +636,15 @@ impl PyDataFrame { fn intersect(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df - .intersect(py_df.df.clone()) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .intersect(py_df.df.as_ref().clone())?; Ok(Self::new(new_df)) } /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { - let new_df = self - .df - .clone() - .except(py_df.df.clone()) - .map_err(py_datafusion_err)?; + let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; Ok(Self::new(new_df)) } @@ -352,9 +656,11 @@ impl PyDataFrame { }; wait_for_future( py, - self.df - .clone() - .write_csv(path, DataFrameWriteOptions::new(), Some(csv_options)), + self.df.as_ref().clone().write_csv( + path, + DataFrameWriteOptions::new(), + Some(csv_options), + ), )?; Ok(()) } @@ -411,7 +717,7 @@ impl PyDataFrame { wait_for_future( py, - self.df.clone().write_parquet( + self.df.as_ref().clone().write_parquet( path, DataFrameWriteOptions::new(), Option::from(options), @@ -425,6 +731,7 @@ impl PyDataFrame { wait_for_future( py, self.df + .as_ref() .clone() .write_json(path, DataFrameWriteOptions::new(), None), )?; @@ -450,7 +757,7 @@ impl PyDataFrame { py: Python<'py>, requested_schema: Option>, ) -> PyDataFusionResult> { - let mut batches = wait_for_future(py, self.df.clone().collect())?; + let mut batches = wait_for_future(py, self.df.as_ref().clone().collect())?; let mut schema: Schema = self.df.schema().to_owned().into(); if let Some(schema_capsule) = requested_schema { @@ -480,7 +787,7 @@ impl PyDataFrame { fn execute_stream(&self, py: Python) -> PyDataFusionResult { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; - let df = self.df.clone(); + let df = self.df.as_ref().clone(); let fut: JoinHandle> = rt.spawn(async move { df.execute_stream().await }); let stream = wait_for_future(py, fut).map_err(py_datafusion_err)?; @@ -490,7 +797,7 @@ impl PyDataFrame { fn execute_stream_partitioned(&self, py: Python) -> PyResult> { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; - let df = self.df.clone(); + let df = self.df.as_ref().clone(); let fut: JoinHandle>> = rt.spawn(async move { df.execute_stream_partitioned().await }); let stream = wait_for_future(py, 
fut).map_err(py_datafusion_err)?; @@ -545,43 +852,210 @@ impl PyDataFrame { // Executes this DataFrame to get the total number of rows. fn count(&self, py: Python) -> PyDataFusionResult { - Ok(wait_for_future(py, self.df.clone().count())?) + Ok(wait_for_future(py, self.df.as_ref().clone().count())?) } - #[pyo3(signature = (max_width=None, max_rows=None, show_nulls=None))] - pub fn to_string( - &self, - max_width: Option, - max_rows: Option, - show_nulls: Option, - py: Python, - ) -> PyDataFusionResult { - let batches = wait_for_future(py, self.df.clone().collect())?; + /// Get the current display configuration + #[getter] + fn display_config(&self) -> PyResult> { + Python::with_gil(|py| { + let config = (*self.config).clone(); + Py::new(py, config) + }) + } - let mut table = TableData::new(&batches)?; + /// Update display configuration + #[pyo3(signature = ( + max_table_bytes=None, + min_table_rows=None, + max_cell_length=None, + max_table_rows_in_repr=None + ))] + fn configure_display( + &mut self, + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + max_table_rows_in_repr: Option, + ) { + let mut new_config = (*self.config).clone(); + + if let Some(bytes) = max_table_bytes { + new_config.max_table_bytes = bytes; + } + + if let Some(rows) = min_table_rows { + new_config.min_table_rows = rows; + } - // Use the display configuration provided or default values - let max_width = max_width.unwrap_or(80); - let max_rows = max_rows; - let show_nulls = show_nulls.unwrap_or(false); + if let Some(length) = max_cell_length { + new_config.max_cell_length = length; + } - table.set_display_options(max_width, max_rows, show_nulls); + if let Some(rows) = max_table_rows_in_repr { + new_config.max_table_rows_in_repr = rows; + } - Ok(table.to_string()) + self.config = Arc::new(new_config); } - pub fn __repr__(&self, py: Python) -> PyDataFusionResult { - // Use default display configuration - self.to_string(None, None, None, py) + /// Reset display configuration to default values + #[pyo3(text_signature = "($self)")] + fn reset_display_config(&mut self) { + self.config = Arc::new(DisplayConfig::default()); } } -impl PyDataFrame { - pub fn new(df: DataFrame) -> Self { - Self { df: Arc::new(df) } +/// Print DataFrame +fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> { + // Get string representation of record batches + let batches = wait_for_future(py, df.collect())?; + let batches_as_string = pretty::pretty_format_batches(&batches); + let result = match batches_as_string { + Ok(batch) => format!("DataFrame()\n{batch}"), + Err(err) => format!("Error: {:?}", err.to_string()), + }; + + // Import the Python 'builtins' module to access the print function + // Note that println! 
does not print to the Python debug console and is not visible in notebooks, for instance
+    let print = py.import("builtins")?.getattr("print")?;
+    print.call1((result,))?;
+    Ok(())
+}
+
+fn project_schema(from_schema: Schema, to_schema: Schema) -> Result<Schema, ArrowError> {
+    let merged_schema = Schema::try_merge(vec![from_schema, to_schema.clone()])?;
+
+    let project_indices: Vec<usize> = to_schema
+        .fields
+        .iter()
+        .map(|field| field.name())
+        .filter_map(|field_name| merged_schema.index_of(field_name).ok())
+        .collect();
+
+    merged_schema.project(&project_indices)
+}
+
+fn record_batch_into_schema(
+    record_batch: RecordBatch,
+    schema: &Schema,
+) -> Result<RecordBatch, ArrowError> {
+    let schema = Arc::new(schema.clone());
+    let base_schema = record_batch.schema();
+    if base_schema.fields().len() == 0 {
+        // Nothing to project
+        return Ok(RecordBatch::new_empty(schema));
+    }
+
+    let array_size = record_batch.column(0).len();
+    let mut data_arrays = Vec::with_capacity(schema.fields().len());
+
+    for field in schema.fields() {
+        let desired_data_type = field.data_type();
+        if let Some(original_data) = record_batch.column_by_name(field.name()) {
+            let original_data_type = original_data.data_type();
+
+            if can_cast_types(original_data_type, desired_data_type) {
+                data_arrays.push(arrow::compute::kernels::cast(
+                    original_data,
+                    desired_data_type,
+                )?);
+            } else if field.is_nullable() {
+                data_arrays.push(new_null_array(desired_data_type, array_size));
+            } else {
+                return Err(ArrowError::CastError(format!("Attempting to cast to non-nullable and non-castable field {} during schema projection.", field.name())));
+            }
+        } else {
+            if !field.is_nullable() {
+                return Err(ArrowError::CastError(format!(
+                    "Attempting to set null to non-nullable field {} during schema projection.",
+                    field.name()
+                )));
+            }
+            data_arrays.push(new_null_array(desired_data_type, array_size));
+        }
+    }
 
-    pub fn dataframe(&self) -> Arc<DataFrame> {
-        self.df.clone()
+    RecordBatch::try_new(schema, data_arrays)
 }
+
+/// This is a helper function to return the first non-empty record batch from executing a DataFrame.
+/// It additionally returns a bool, which indicates if there are more record batches available.
+/// We do this so we can determine if we should indicate to the user that the data has been
+/// truncated. This collects until we have achieved both of the following conditions:
+///
+/// - We have collected our minimum number of rows
+/// - We have reached our limit, either data size or maximum number of rows
+///
+/// Otherwise it will return when the stream is exhausted. If you want a specific number of
+/// rows, set min_rows == max_rows.
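+///
+/// Illustrative arithmetic (not taken from a real run, just the steps the loop
+/// below performs): with a 2 MB `max_table_bytes` budget, if the size estimate
+/// reaches 3 MB after 30 collected rows, the ratio is 2/3, so the result is cut
+/// back to `round(30 * 2/3) = 20` rows (clamped up to `min_rows` if that would
+/// fall below the minimum), and `has_more` is set to flag the truncation.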
+async fn collect_record_batches_to_display( + df: DataFrame, + min_rows: usize, + max_rows: usize, + config: &DisplayConfig, +) -> Result<(Vec, bool), DataFusionError> { + let partitioned_stream = df.execute_stream_partitioned().await?; + let mut stream = futures::stream::iter(partitioned_stream).flatten(); + let mut size_estimate_so_far = 0; + let mut rows_so_far = 0; + let mut record_batches = Vec::default(); + let mut has_more = false; + + while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) + || rows_so_far < min_rows + { + let mut rb = match stream.next().await { + None => { + break; + } + Some(Ok(r)) => r, + Some(Err(e)) => return Err(e), + }; + + let mut rows_in_rb = rb.num_rows(); + if rows_in_rb > 0 { + size_estimate_so_far += rb.get_array_memory_size(); + + if size_estimate_so_far > config.max_table_bytes { + let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; + let total_rows = rows_in_rb + rows_so_far; + + let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; + if reduced_row_num < min_rows { + reduced_row_num = min_rows.min(total_rows); + } + + let limited_rows_this_rb = reduced_row_num - rows_so_far; + if limited_rows_this_rb < rows_in_rb { + rows_in_rb = limited_rows_this_rb; + rb = rb.slice(0, limited_rows_this_rb); + has_more = true; + } + } + + if rows_in_rb + rows_so_far > max_rows { + rb = rb.slice(0, max_rows - rows_so_far); + has_more = true; + } + + rows_so_far += rb.num_rows(); + record_batches.push(rb); + } + } + + if record_batches.is_empty() { + return Ok((Vec::default(), false)); } + + if !has_more { + // Data was not already truncated, so check to see if more record batches remain + has_more = match stream.try_next().await { + Ok(None) => false, // reached end + Ok(Some(_)) => true, + Err(_) => false, // Stream disconnected + }; + } + + Ok((record_batches, has_more)) } From 30c9d99a5a05777163f70a561b4ccffade452122 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 14:52:04 +0800 Subject: [PATCH 20/51] revert to before DisplayConfig in PyDataFrame --- python/datafusion/dataframe.py | 55 ------- python/tests/test_dataframe.py | 281 --------------------------------- src/dataframe.rs | 130 ++------------- 3 files changed, 11 insertions(+), 455 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 3b2382502..26fe8f453 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -49,7 +49,6 @@ import pyarrow as pa from datafusion._internal import DataFrame as DataFrameInternal - from datafusion._internal import DisplayConfig from datafusion._internal import expr as expr_internal from enum import Enum @@ -814,60 +813,6 @@ def count(self) -> int: """ return self.df.count() - def configure_display( - self, - max_table_bytes: Optional[int] = None, - min_table_rows: Optional[int] = None, - max_cell_length: Optional[int] = None, - max_table_rows_in_repr: Optional[int] = None, - ) -> None: - """Configure display options for DataFrame representation. - - Args: - max_table_bytes: Maximum bytes to display for table presentation - (default: 2MB). - Set to lower value for large tables to limit memory usage. - min_table_rows: Minimum number of table rows to display (default: 20). - This is used for initial display and in notebooks. - max_cell_length: Maximum length of a cell before it gets minimized - (default: 25). - Longer cells will be truncated with an expand button. 
- max_table_rows_in_repr: Maximum number of rows to display in string - representation - (default: 10). - - Raises: - ValueError: If any of the provided values are less than or equal to 0. - """ - if any( - value is not None and value <= 0 - for value in ( - max_table_bytes, - min_table_rows, - max_cell_length, - max_table_rows_in_repr, - ) - ): - error_msg = "All values must be greater than 0." - raise ValueError(error_msg) - - self.df.configure_display( - max_table_bytes, min_table_rows, max_cell_length, max_table_rows_in_repr - ) - - def reset_display_config(self) -> None: - """Reset display configuration to default values.""" - self.df.reset_display_config() - - @property - def display_config(self) -> DisplayConfig: - """Get the current display configuration. - - Returns: - DisplayConfig: The current display configuration settings - """ - return self.df.display_config - @deprecated("Use :py:func:`unnest_columns` instead.") def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame: """See :py:func:`unnest_columns`.""" diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 51cdc173d..eda13930d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1261,284 +1261,3 @@ def test_dataframe_repr_html(df) -> None: body_lines = [f"{v}" for inner in body_data for v in inner] body_pattern = "(.*?)".join(body_lines) assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 - - -def test_display_config(df): - """Test the display configuration properties are accessible.""" - config = df.display_config - - # Verify default values - assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB - assert config.min_table_rows == 20 - assert config.max_cell_length == 25 - assert config.max_table_rows_in_repr == 10 - - -def test_configure_display(df): - """Test setting display configuration properties.""" - # Modify the display configuration - df.configure_display( - max_table_bytes=1024 * 1024, - min_table_rows=10, - max_cell_length=50, - max_table_rows_in_repr=15, - ) - - # Verify the changes took effect - config = df.display_config - assert config.max_table_bytes == 1024 * 1024 # 1 MB - assert config.min_table_rows == 10 - assert config.max_cell_length == 50 - assert config.max_table_rows_in_repr == 15 - - # Test partial update (only changing one property) - df.configure_display(max_table_rows_in_repr=5) - config = df.display_config - assert config.max_table_bytes == 1024 * 1024 # previous value retained - assert config.min_table_rows == 10 # previous value retained - assert config.max_cell_length == 50 # previous value retained - assert config.max_table_rows_in_repr == 5 # only this value changed - - # Test with extreme values - # Zero values - with pytest.raises(ValueError, match=r".*must be greater than 0.*"): - df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0) - - # Test with negative values - # This tests for expected behavior when users accidentally pass negative values - # Since these are usize in Rust, we expect a Python ValueError when trying to pass - # negative values. 
- with pytest.raises(ValueError, match=r".*must be greater than 0.*"): - df.configure_display(max_table_bytes=-1) - - with pytest.raises(ValueError, match=r".*must be greater than 0.*"): - df.configure_display(min_table_rows=-5) - - with pytest.raises(ValueError, match=r".*must be greater than 0.*"): - df.configure_display(max_cell_length=-10) - - # Reset for next tests - df.reset_display_config() - - -def test_reset_display_config(df): - """Test resetting display configuration to defaults.""" - # First modify the configuration - df.configure_display( - max_table_bytes=1024 * 1024, - min_table_rows=10, - max_cell_length=50, - max_table_rows_in_repr=15, - ) - - # Verify changes took effect - config = df.display_config - assert config.max_table_bytes == 1024 * 1024 - assert config.min_table_rows == 10 - assert config.max_cell_length == 50 - assert config.max_table_rows_in_repr == 15 - - # Now reset to defaults - df.reset_display_config() - - # Verify defaults are restored - config = df.display_config - assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB - assert config.min_table_rows == 20 - assert config.max_cell_length == 25 - assert config.max_table_rows_in_repr == 10 - - -def test_min_table_rows_display(ctx): - """Test that at least min_table_rows rows are displayed.""" - # Create a dataframe with more rows than the default min_table_rows - rows = 100 - df = _create_numeric_test_df(ctx, rows) - - # Set min_table_rows to a specific value - custom_min_rows = 30 - df.configure_display(min_table_rows=custom_min_rows) - - # Get HTML representation - html_output = df._repr_html_() - - # Count table rows in the HTML (excluding header row) - # Each row has a tag - row_count = html_output.count("") - 1 # subtract 1 for the header row - - # Verify at least min_table_rows rows are displayed - assert row_count >= custom_min_rows, ( - f"Expected at least {custom_min_rows} rows, got {row_count}" - ) - - # If data was truncated, "Data truncated" message should be present - if row_count < rows: - assert "Data truncated" in html_output - - -def test_max_table_bytes_display(ctx): - """Test that reducing max_table_bytes limits the amount of data displayed.""" - # Create a dataframe with large string values to consume memory - # Each string is approximately 1000 bytes - large_strings = ["x" * 1000 for _ in range(50)] - batch = pa.RecordBatch.from_arrays([pa.array(large_strings)], names=["large_data"]) - df = ctx.create_dataframe([[batch]]) - - # First test with default settings - default_html = df._repr_html_() - default_row_count = default_html.count("") - 1 # subtract header row - - # Now set a very small max_table_bytes - df.configure_display(max_table_bytes=5000) # 5KB should only fit a few rows - limited_html = df._repr_html_() - limited_row_count = limited_html.count("") - 1 - - # Verify fewer rows are displayed with the byte limit - assert limited_row_count < default_row_count, ( - f"Expected fewer rows with byte limit. 
" - f"Default: {default_row_count}, Limited: {limited_row_count}" - ) - - # "Data truncated" should be present when limited - assert "Data truncated" in limited_html - - -def test_max_cell_length_display(ctx): - """Test that cells longer than max_cell_length are truncated in display.""" - # Create a dataframe with long string values - long_strings = [ - "short", - "medium text", - "this is a very long string that should be truncated", - ] - batch = pa.RecordBatch.from_arrays([pa.array(long_strings)], names=["text"]) - df = ctx.create_dataframe([[batch]]) - - # Set a small max_cell_length - max_length = 10 - df.configure_display(max_cell_length=max_length) - - # Get HTML representation - html_output = df._repr_html_() - - # Check for expand button for long text - assert "expandable-container" in html_output - - # Check that expandable class is used for long text - assert 'class="expandable"' in html_output - - # Look for the truncated text and expand button - long_text = long_strings[2] - assert long_text[:max_length] in html_output # Truncated text should be present - assert "expand-btn" in html_output # Expand button should be present - assert long_text in html_output # Full text should also be in the HTML (hidden) - - -def test_display_config_repr_string(ctx): - """Test that __repr__ respects display configuration.""" - # Create a dataframe with more rows than we want to show - # df.__repr__ returns max 10 rows by default, so we start test with 7 rows - rows = 7 - df = _create_numeric_test_df(ctx, rows) - - # Configure to show at least 5 rows in string representation - min_table_rows_in_display = 5 - df.configure_display(min_table_rows=min_table_rows_in_display) - - # Get the string representation - repr_str = df.__repr__() - - # Count the number of rows using helper function - lines_count = _count_lines_in_str(repr_str) - - assert lines_count >= min_table_rows_in_display - - # Now set min_rows higher and see if more rows appear - min_table_rows_in_display = 7 - rows = 11 - df = _create_numeric_test_df(ctx, rows) # Recreate to reset the state - df.configure_display(min_table_rows=min_table_rows_in_display) - - repr_str_more = df.__repr__() - # The string should contain "Data truncated" - assert "Data truncated" in repr_str_more - - # Count lines again - lines_count2 = _count_lines_in_str(repr_str_more) - - # Should show more rows now - assert lines_count2 > lines_count - assert lines_count2 >= min_table_rows_in_display - - -def _count_lines_in_str(repr_str: str) -> int: - """Count the number of rows displayed in a string representation. - - Args: - repr_str: String representation of the DataFrame. - - Returns: - Number of rows that appear in the string representation. - """ - # DataFrame tables are formatted with | value | patterns - # Count lines that match actual data rows (not headers or separators) - value_lines = 0 - for line in repr_str.split("\n"): - # Look for lines like "| 0 |", "| 1 |", etc. - if re.search(r"\|\s*\d+\s*\|", line): - value_lines += 1 - return value_lines - - -def _create_numeric_test_df(ctx, rows) -> DataFrame: - """Create a test dataframe with numeric values from 0 to rows-1. - - Args: - ctx: SessionContext to use for creating the dataframe. - rows: Number of rows to create. - - Returns: - DataFrame with a single column "values" containing numbers 0 to rows-1. 
- """ - data = list(range(rows)) - batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) - return ctx.create_dataframe([[batch]]) - - -def test_max_table_rows_in_repr(ctx): - """Test that max_table_rows_in_repr controls the number of rows in string - representation. - """ - # Create a dataframe with more rows than the default max_table_rows_in_repr (10) - rows = 20 - df = _create_numeric_test_df(ctx, rows) - - # First test with default setting (should limit to 10 rows) - repr_str = df.__repr__() - lines_default = _count_lines_in_str(repr_str) - - # Default should be 10 rows max - assert lines_default <= 10 - assert "Data truncated" in repr_str - - # Now set a custom max_table_rows_in_repr value - custom_max_rows = 15 - df.configure_display(max_table_rows_in_repr=custom_max_rows) - - # Get the string representation with new configuration - repr_str_more = df.__repr__() - lines_custom = _count_lines_in_str(repr_str_more) - - # Should show more rows than default but not more than configured max - assert lines_custom > lines_default - assert lines_custom <= custom_max_rows - assert "Data truncated" in repr_str_more - - # Now set max_rows higher than total rows - should show all rows - df.configure_display(max_table_rows_in_repr=25) - repr_str_all = df.__repr__() - lines_all = _count_lines_in_str(repr_str_all) - - # Should show all rows (20) - assert lines_all == rows - assert "Data truncated" not in repr_str_all diff --git a/src/dataframe.rs b/src/dataframe.rs index cda4dd690..be10b8c28 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -72,56 +72,9 @@ impl PyTableProvider { PyTable::new(table_provider) } } - -/// Configuration for DataFrame display in Python environment -#[pyclass(name = "DisplayConfig", module = "datafusion")] -#[derive(Debug, Clone)] -pub struct DisplayConfig { - /// Maximum bytes to display for table presentation (default: 2MB) - #[pyo3(get, set)] - pub max_table_bytes: usize, - /// Minimum number of table rows to display (default: 20) - #[pyo3(get, set)] - pub min_table_rows: usize, - /// Maximum length of a cell before it gets minimized (default: 25) - #[pyo3(get, set)] - pub max_cell_length: usize, - /// Maximum number of rows to display in repr string output (default: 10) - #[pyo3(get, set)] - pub max_table_rows_in_repr: usize, -} - -#[pymethods] -impl DisplayConfig { - #[new] - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] - fn new( - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) -> Self { - let default = DisplayConfig::default(); - Self { - max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), - min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), - max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), - max_table_rows_in_repr: max_table_rows_in_repr - .unwrap_or(default.max_table_rows_in_repr), - } - } -} - -impl Default for DisplayConfig { - fn default() -> Self { - Self { - max_table_bytes: 2 * 1024 * 1024, // 2 MB - min_table_rows: 20, - max_cell_length: 25, - max_table_rows_in_repr: 10, - } - } -} +const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB +const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; +const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. 
@@ -130,16 +83,12 @@ impl Default for DisplayConfig { #[derive(Clone)] pub struct PyDataFrame { df: Arc, - config: Arc, } impl PyDataFrame { /// creates a new PyDataFrame pub fn new(df: DataFrame) -> Self { - Self { - df: Arc::new(df), - config: Arc::new(DisplayConfig::default()), - } + Self { df: Arc::new(df) } } } @@ -169,12 +118,7 @@ impl PyDataFrame { fn __repr__(&self, py: Python) -> PyDataFusionResult { let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - self.config.min_table_rows, - self.config.max_table_rows_in_repr, - &self.config, - ), + collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -197,9 +141,8 @@ impl PyDataFrame { py, collect_record_batches_to_display( self.df.as_ref().clone(), - self.config.min_table_rows, + MIN_TABLE_ROWS_TO_DISPLAY, usize::MAX, - &self.config, ), )?; if batches.is_empty() { @@ -275,8 +218,8 @@ impl PyDataFrame { for (col, formatter) in batch_formatter.iter().enumerate() { let cell_data = formatter.value(batch_row).to_string(); // From testing, primitive data types do not typically get larger than 21 characters - if cell_data.len() > self.config.max_cell_length { - let short_cell_data = &cell_data[0..self.config.max_cell_length]; + if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { + let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; cells.push(format!("
@@ -854,56 +797,6 @@ impl PyDataFrame { fn count(&self, py: Python) -> PyDataFusionResult { Ok(wait_for_future(py, self.df.as_ref().clone().count())?) } - - /// Get the current display configuration - #[getter] - fn display_config(&self) -> PyResult> { - Python::with_gil(|py| { - let config = (*self.config).clone(); - Py::new(py, config) - }) - } - - /// Update display configuration - #[pyo3(signature = ( - max_table_bytes=None, - min_table_rows=None, - max_cell_length=None, - max_table_rows_in_repr=None - ))] - fn configure_display( - &mut self, - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) { - let mut new_config = (*self.config).clone(); - - if let Some(bytes) = max_table_bytes { - new_config.max_table_bytes = bytes; - } - - if let Some(rows) = min_table_rows { - new_config.min_table_rows = rows; - } - - if let Some(length) = max_cell_length { - new_config.max_cell_length = length; - } - - if let Some(rows) = max_table_rows_in_repr { - new_config.max_table_rows_in_repr = rows; - } - - self.config = Arc::new(new_config); - } - - /// Reset display configuration to default values - #[pyo3(text_signature = "($self)")] - fn reset_display_config(&mut self) { - self.config = Arc::new(DisplayConfig::default()); - } } /// Print DataFrame @@ -993,7 +886,6 @@ async fn collect_record_batches_to_display( df: DataFrame, min_rows: usize, max_rows: usize, - config: &DisplayConfig, ) -> Result<(Vec, bool), DataFusionError> { let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); @@ -1002,7 +894,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) + while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { @@ -1017,8 +909,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > config.max_table_bytes { - let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { + let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; From 028f0ab091b105bd9ea6da48a04991c417de5ff3 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 14:59:50 +0800 Subject: [PATCH 21/51] feat: Add DataframeDisplayConfig for customizable DataFrame display options - Introduced `DataframeDisplayConfig` struct to manage display settings for DataFrames. - Added fields for maximum bytes, minimum rows, maximum cell length, and maximum rows in repr. - Implemented a constructor with default values for easy initialization. - Updated `PySessionConfig` to include `display_config` with default settings. 
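
A minimal usage sketch (hypothetical Python; the class is only exported to the
`datafusion` module in a later commit, and the values below are arbitrary):

    from datafusion import DataframeDisplayConfig, SessionConfig

    # Unspecified arguments keep the defaults (2 MB, 20 rows, 25 chars, 10 repr rows).
    display = DataframeDisplayConfig(max_cell_length=50, max_table_rows_in_repr=15)

    config = SessionConfig()
    config.display_config = display  # field exposed via #[pyo3(get, set)]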
--- src/context.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/context.rs b/src/context.rs index 0db0f4d7e..3a71362da 100644 --- a/src/context.rs +++ b/src/context.rs @@ -72,16 +72,71 @@ use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; use tokio::task::JoinHandle; +/// Configuration for displaying DataFrames +#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)] +#[derive(Clone)] +pub struct DataframeDisplayConfig { + /// Maximum bytes to display for table presentation (default: 2MB) + #[pyo3(get, set)] + pub max_table_bytes: usize, + /// Minimum number of table rows to display (default: 20) + #[pyo3(get, set)] + pub min_table_rows: usize, + /// Maximum length of a cell before it gets minimized (default: 25) + #[pyo3(get, set)] + pub max_cell_length: usize, + /// Maximum number of rows to display in repr string output (default: 10) + #[pyo3(get, set)] + pub max_table_rows_in_repr: usize, +} + +#[pymethods] +impl DataframeDisplayConfig { + #[new] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] + fn new( + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + max_table_rows_in_repr: Option, + ) -> Self { + let default = Self::default(); + Self { + max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), + min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), + max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), + max_table_rows_in_repr: max_table_rows_in_repr + .unwrap_or(default.max_table_rows_in_repr), + } + } +} + +impl Default for DataframeDisplayConfig { + fn default() -> Self { + Self { + max_table_bytes: 2 * 1024 * 1024, // 2 MB + min_table_rows: 20, + max_cell_length: 25, + max_table_rows_in_repr: 10, + } + } +} + /// Configuration options for a SessionContext #[pyclass(name = "SessionConfig", module = "datafusion", subclass)] #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, + #[pyo3(get, set)] + pub display_config: DataframeDisplayConfig, } impl From for PySessionConfig { fn from(config: SessionConfig) -> Self { - Self { config } + Self { + config, + display_config: DataframeDisplayConfig::default(), + } } } @@ -97,7 +152,10 @@ impl PySessionConfig { } } - Self { config } + Self { + config, + display_config: DataframeDisplayConfig::default(), + } } fn with_create_default_catalog_and_schema(&self, enabled: bool) -> Self { From b401e1a1a0f2698b4831cc00450ba69f292be9ca Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 15:03:06 +0800 Subject: [PATCH 22/51] feat: Add method to configure DataFrame display options in PySessionConfig --- src/context.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/context.rs b/src/context.rs index 3a71362da..7adaebada 100644 --- a/src/context.rs +++ b/src/context.rs @@ -214,6 +214,12 @@ impl PySessionConfig { Self::from(self.config.clone().with_repartition_file_min_size(size)) } + fn with_dataframe_display_config(&self, display_config: DataframeDisplayConfig) -> Self { + let mut config = self.clone(); + config.display_config = display_config; + config + } + fn with_parquet_pruning(&self, enabled: bool) -> Self { Self::from(self.config.clone().with_parquet_pruning(enabled)) } From d2a1dc92b4e93eb5a6a44df8b946894e998ba47e Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: 
Wed, 2 Apr 2025 15:07:14 +0800
Subject: [PATCH 23/51] feat: Add method to configure DataFrame display options
 in SessionConfig (python)

- Introduced `with_dataframe_display_config` method in `SessionConfig` to allow
  customization of DataFrame display settings.
- Parameters include `max_table_bytes`, `min_table_rows`, `max_cell_length`,
  and `max_table_rows_in_repr` for flexible display configurations.
- Utilizes `DataframeDisplayConfig` for internal management of display settings.

---
 python/datafusion/context.py | 40 +++++++++++++--
 src/context.rs               | 97 ++++++++++++++++++------------------
 src/lib.rs                   |  1 +
 3 files changed, 87 insertions(+), 51 deletions(-)

diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index 1429a4975..9adc2b654 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -36,6 +36,7 @@
 from ._internal import SessionConfig as SessionConfigInternal
 from ._internal import SessionContext as SessionContextInternal
 from ._internal import SQLOptions as SQLOptionsInternal
+from ._internal import DataframeDisplayConfig as DataframeDisplayConfigInternal
 
 if TYPE_CHECKING:
     import pathlib
@@ -89,6 +90,37 @@ def __init__(self, config_options: dict[str, str] | None = None) -> None:
         """
         self.config_internal = SessionConfigInternal(config_options)
 
+    def with_dataframe_display_config(
+        self,
+        max_table_bytes: int | None = None,
+        min_table_rows: int | None = None,
+        max_cell_length: int | None = None,
+        max_table_rows_in_repr: int | None = None,
+    ) -> SessionConfig:
+        """Configure the display options for DataFrames.
+
+        Args:
+            max_table_bytes: Maximum bytes to display for table presentation
+                (default: 2MB).
+            min_table_rows: Minimum number of table rows to display
+                (default: 20).
+            max_cell_length: Maximum length of a cell before it gets minimized
+                (default: 25).
+            max_table_rows_in_repr: Maximum number of rows to display in repr
+                string output (default: 10).
+
+        Returns:
+            A new :py:class:`SessionConfig` object with the updated display
+            settings.
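+
+        Example (an illustrative sketch; the argument values are arbitrary)::
+
+            config = SessionConfig().with_dataframe_display_config(
+                max_cell_length=50,
+                max_table_rows_in_repr=15,
+            )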
+ """ + + display_config = DataframeDisplayConfigInternal( + max_table_bytes=max_table_bytes, + min_table_rows=min_table_rows, + max_cell_length=max_cell_length, + max_table_rows_in_repr=max_table_rows_in_repr, + ) + + self.config_internal = self.config_internal.with_dataframe_display_config( + display_config + ) + return self + def with_create_default_catalog_and_schema( self, enabled: bool = True ) -> SessionConfig: @@ -806,9 +838,11 @@ def register_parquet( file_extension, skip_metadata, schema, - [sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order] - if file_sort_order is not None - else None, + ( + [sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order] + if file_sort_order is not None + else None + ), ) def register_csv( diff --git a/src/context.rs b/src/context.rs index 7adaebada..abf09b070 100644 --- a/src/context.rs +++ b/src/context.rs @@ -73,54 +73,6 @@ use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; use tokio::task::JoinHandle; /// Configuration for displaying DataFrames -#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)] -#[derive(Clone)] -pub struct DataframeDisplayConfig { - /// Maximum bytes to display for table presentation (default: 2MB) - #[pyo3(get, set)] - pub max_table_bytes: usize, - /// Minimum number of table rows to display (default: 20) - #[pyo3(get, set)] - pub min_table_rows: usize, - /// Maximum length of a cell before it gets minimized (default: 25) - #[pyo3(get, set)] - pub max_cell_length: usize, - /// Maximum number of rows to display in repr string output (default: 10) - #[pyo3(get, set)] - pub max_table_rows_in_repr: usize, -} - -#[pymethods] -impl DataframeDisplayConfig { - #[new] - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] - fn new( - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) -> Self { - let default = Self::default(); - Self { - max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), - min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), - max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), - max_table_rows_in_repr: max_table_rows_in_repr - .unwrap_or(default.max_table_rows_in_repr), - } - } -} - -impl Default for DataframeDisplayConfig { - fn default() -> Self { - Self { - max_table_bytes: 2 * 1024 * 1024, // 2 MB - min_table_rows: 20, - max_cell_length: 25, - max_table_rows_in_repr: 10, - } - } -} /// Configuration options for a SessionContext #[pyclass(name = "SessionConfig", module = "datafusion", subclass)] @@ -229,6 +181,55 @@ impl PySessionConfig { } } +#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)] +#[derive(Clone)] +pub struct DataframeDisplayConfig { + /// Maximum bytes to display for table presentation (default: 2MB) + #[pyo3(get, set)] + pub max_table_bytes: usize, + /// Minimum number of table rows to display (default: 20) + #[pyo3(get, set)] + pub min_table_rows: usize, + /// Maximum length of a cell before it gets minimized (default: 25) + #[pyo3(get, set)] + pub max_cell_length: usize, + /// Maximum number of rows to display in repr string output (default: 10) + #[pyo3(get, set)] + pub max_table_rows_in_repr: usize, +} + +#[pymethods] +impl DataframeDisplayConfig { + #[new] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] + fn new( + max_table_bytes: Option, + min_table_rows: Option, + 
max_cell_length: Option<usize>,
+        max_table_rows_in_repr: Option<usize>,
+    ) -> Self {
+        let default = Self::default();
+        Self {
+            max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes),
+            min_table_rows: min_table_rows.unwrap_or(default.min_table_rows),
+            max_cell_length: max_cell_length.unwrap_or(default.max_cell_length),
+            max_table_rows_in_repr: max_table_rows_in_repr
+                .unwrap_or(default.max_table_rows_in_repr),
+        }
+    }
+}
+
+impl Default for DataframeDisplayConfig {
+    fn default() -> Self {
+        Self {
+            max_table_bytes: 2 * 1024 * 1024, // 2 MB
+            min_table_rows: 20,
+            max_cell_length: 25,
+            max_table_rows_in_repr: 10,
+        }
+    }
+}
+
 /// Runtime options for a SessionContext
 #[pyclass(name = "RuntimeEnvBuilder", module = "datafusion", subclass)]
 #[derive(Clone)]
diff --git a/src/lib.rs b/src/lib.rs
index ce93ff0c3..61be65555 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -82,6 +82,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
+    m.add_class::<DataframeDisplayConfig>()?;
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;

From 07d7cf680a954e90561d79487fd9309d87752a27 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Wed, 2 Apr 2025 15:25:51 +0800
Subject: [PATCH 24/51] refactor: rename DataframeDisplayConfig to
 PyDataframeDisplayConfig

---
 src/context.rs | 105 ++++++++++++++++++++++++-------------------------
 src/lib.rs     |   2 +-
 2 files changed, 53 insertions(+), 54 deletions(-)

diff --git a/src/context.rs b/src/context.rs
index abf09b070..9dac3ced0 100644
--- a/src/context.rs
+++ b/src/context.rs
@@ -73,6 +73,54 @@ use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType};
 use tokio::task::JoinHandle;

 /// Configuration for displaying DataFrames
+#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)]
+#[derive(Clone)]
+pub struct PyDataframeDisplayConfig {
+    /// Maximum bytes to display for table presentation (default: 2MB)
+    #[pyo3(get, set)]
+    pub max_table_bytes: usize,
+    /// Minimum number of table rows to display (default: 20)
+    #[pyo3(get, set)]
+    pub min_table_rows: usize,
+    /// Maximum length of a cell before it gets minimized (default: 25)
+    #[pyo3(get, set)]
+    pub max_cell_length: usize,
+    /// Maximum number of rows to display in repr string output (default: 10)
+    #[pyo3(get, set)]
+    pub max_table_rows_in_repr: usize,
+}
+
+#[pymethods]
+impl PyDataframeDisplayConfig {
+    #[new]
+    #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))]
+    fn new(
+        max_table_bytes: Option<usize>,
+        min_table_rows: Option<usize>,
+        max_cell_length: Option<usize>,
+        max_table_rows_in_repr: Option<usize>,
+    ) -> Self {
+        let default = Self::default();
+        Self {
+            max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes),
+            min_table_rows: min_table_rows.unwrap_or(default.min_table_rows),
+            max_cell_length: max_cell_length.unwrap_or(default.max_cell_length),
+            max_table_rows_in_repr: max_table_rows_in_repr
+                .unwrap_or(default.max_table_rows_in_repr),
+        }
+    }
+}
+
+impl Default for PyDataframeDisplayConfig {
+    fn default() -> Self {
+        Self {
+            max_table_bytes: 2 * 1024 * 1024, // 2 MB
+            min_table_rows: 20,
+            max_cell_length: 25,
+            max_table_rows_in_repr: 10,
+        }
+    }
+}

 /// Configuration options for a SessionContext
 #[pyclass(name = "SessionConfig", module = "datafusion", subclass)]
@@ -80,14 +128,14 @@
 pub struct PySessionConfig {
     pub config: SessionConfig,
     #[pyo3(get, set)]
-    pub display_config: DataframeDisplayConfig,
+    pub display_config: PyDataframeDisplayConfig,
 }

 impl From<SessionConfig> for PySessionConfig {
     fn from(config: SessionConfig) -> Self {
         Self {
             config,
-            display_config: DataframeDisplayConfig::default(),
+            display_config: PyDataframeDisplayConfig::default(),
         }
     }
 }
@@ -106,7 +154,7 @@ impl PySessionConfig {

         Self {
             config,
-            display_config: DataframeDisplayConfig::default(),
+            display_config: PyDataframeDisplayConfig::default(),
         }
     }
@@ -166,7 +214,7 @@ impl PySessionConfig {
         Self::from(self.config.clone().with_repartition_file_min_size(size))
     }

-    fn with_dataframe_display_config(&self, display_config: DataframeDisplayConfig) -> Self {
+    fn with_dataframe_display_config(&self, display_config: PyDataframeDisplayConfig) -> Self {
         let mut config = self.clone();
         config.display_config = display_config;
         config
@@ -181,55 +229,6 @@ impl PySessionConfig {
     }
 }

-#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)]
-#[derive(Clone)]
-pub struct DataframeDisplayConfig {
-    /// Maximum bytes to display for table presentation (default: 2MB)
-    #[pyo3(get, set)]
-    pub max_table_bytes: usize,
-    /// Minimum number of table rows to display (default: 20)
-    #[pyo3(get, set)]
-    pub min_table_rows: usize,
-    /// Maximum length of a cell before it gets minimized (default: 25)
-    #[pyo3(get, set)]
-    pub max_cell_length: usize,
-    /// Maximum number of rows to display in repr string output (default: 10)
-    #[pyo3(get, set)]
-    pub max_table_rows_in_repr: usize,
-}
-
-#[pymethods]
-impl DataframeDisplayConfig {
-    #[new]
-    #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))]
-    fn new(
-        max_table_bytes: Option<usize>,
-        min_table_rows: Option<usize>,
-        max_cell_length: Option<usize>,
-        max_table_rows_in_repr: Option<usize>,
-    ) -> Self {
-        let default = Self::default();
-        Self {
-            max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes),
-            min_table_rows: min_table_rows.unwrap_or(default.min_table_rows),
-            max_cell_length: max_cell_length.unwrap_or(default.max_cell_length),
-            max_table_rows_in_repr: max_table_rows_in_repr
-                .unwrap_or(default.max_table_rows_in_repr),
-        }
-    }
-}
-
-impl Default for DataframeDisplayConfig {
-    fn default() -> Self {
-        Self {
-            max_table_bytes: 2 * 1024 * 1024, // 2 MB
-            min_table_rows: 20,
-            max_cell_length: 25,
-            max_table_rows_in_repr: 10,
-        }
-    }
-}
-
 /// Runtime options for a SessionContext
 #[pyclass(name = "RuntimeEnvBuilder", module = "datafusion", subclass)]
 #[derive(Clone)]
diff --git a/src/lib.rs b/src/lib.rs
index 61be65555..a88b3e18c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -82,7 +82,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
-    m.add_class::<DataframeDisplayConfig>()?;
+    m.add_class::<PyDataframeDisplayConfig>()?;
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;

From 625a1f28683a2bfc491cf2ffae2ee8289a9aab25 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Wed, 2 Apr 2025 15:36:41 +0800
Subject: [PATCH 25/51] feat: Add DataframeDisplayConfig class for customizable
 DataFrame display options

- Introduced DataframeDisplayConfig to manage display settings for DataFrames.
- Added properties for max_table_bytes, min_table_rows, max_cell_length, and
  max_table_rows_in_repr.
- Each property includes getter and setter methods for easy configuration.
- Default values provided for each parameter to enhance usability.
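An illustrative usage sketch, exercising only the API introduced in this
patch (the values are arbitrary):

    from datafusion import DataframeDisplayConfig

    config = DataframeDisplayConfig(max_cell_length=10)
    # Property setters proxy through to the internal Rust-backed config.
    config.min_table_rows = 5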
--- python/datafusion/__init__.py | 2 + python/datafusion/context.py | 69 +++++++++++++++++++++++++++++++++++ src/context.rs | 1 - 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index d871fdb71..a724d15a3 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -38,6 +38,7 @@ RuntimeEnvBuilder, SessionConfig, SessionContext, + DataframeDisplayConfig, SQLOptions, ) from .dataframe import DataFrame @@ -70,6 +71,7 @@ "ScalarUDF", "SessionConfig", "SessionContext", + "DataframeDisplayConfig", "Table", "WindowFrame", "WindowUDF", diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 9adc2b654..c14cd21dc 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -79,6 +79,75 @@ class TableProviderExportable(Protocol): def __datafusion_table_provider__(self) -> object: ... # noqa: D105 +class DataframeDisplayConfig: + """Configuration for displaying DataFrame results. + + This class allows you to control how DataFrames are displayed in Python. + """ + + def __init__( + self, + max_table_bytes: int = None, + min_table_rows: int = None, + max_cell_length: int = None, + max_table_rows_in_repr: int = None, + ) -> None: + """Create a new :py:class:`DataframeDisplayConfig` instance. + + Args: + max_table_bytes: Maximum bytes to display for table presentation (default: 2MB) + min_table_rows: Minimum number of table rows to display (default: 20) + max_cell_length: Maximum length of a cell before it gets minimized (default: 25) + max_table_rows_in_repr: Maximum number of rows to display in repr string output (default: 10) + """ + self.config_internal = DataframeDisplayConfigInternal( + max_table_bytes=max_table_bytes, + min_table_rows=min_table_rows, + max_cell_length=max_cell_length, + max_table_rows_in_repr=max_table_rows_in_repr, + ) + + @property + def max_table_bytes(self) -> int: + """Get the maximum bytes to display for table presentation.""" + return self.config_internal.max_table_bytes + + @max_table_bytes.setter + def max_table_bytes(self, value: int) -> None: + """Set the maximum bytes to display for table presentation.""" + self.config_internal.max_table_bytes = value + + @property + def min_table_rows(self) -> int: + """Get the minimum number of table rows to display.""" + return self.config_internal.min_table_rows + + @min_table_rows.setter + def min_table_rows(self, value: int) -> None: + """Set the minimum number of table rows to display.""" + self.config_internal.min_table_rows = value + + @property + def max_cell_length(self) -> int: + """Get the maximum length of a cell before it gets minimized.""" + return self.config_internal.max_cell_length + + @max_cell_length.setter + def max_cell_length(self, value: int) -> None: + """Set the maximum length of a cell before it gets minimized.""" + self.config_internal.max_cell_length = value + + @property + def max_table_rows_in_repr(self) -> int: + """Get the maximum number of rows to display in repr string output.""" + return self.config_internal.max_table_rows_in_repr + + @max_table_rows_in_repr.setter + def max_table_rows_in_repr(self, value: int) -> None: + """Set the maximum number of rows to display in repr string output.""" + self.config_internal.max_table_rows_in_repr = value + + class SessionConfig: """Session configuration options.""" diff --git a/src/context.rs b/src/context.rs index 9dac3ced0..7a4ebf466 100644 --- a/src/context.rs +++ b/src/context.rs @@ -127,7 +127,6 @@ impl Default for 
PyDataframeDisplayConfig { #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, - #[pyo3(get, set)] pub display_config: PyDataframeDisplayConfig, } From 5dfb9ce7268f5cc9d3df24684ddd6d2c217c6ddf Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 16:18:18 +0800 Subject: [PATCH 26/51] Fix ruff errors --- python/datafusion/__init__.py | 2 +- python/datafusion/context.py | 45 ++++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index a724d15a3..23f6c971d 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -61,6 +61,7 @@ "DFSchema", "DataFrame", "Database", + "DataframeDisplayConfig", "ExecutionPlan", "Expr", "LogicalPlan", @@ -71,7 +72,6 @@ "ScalarUDF", "SessionConfig", "SessionContext", - "DataframeDisplayConfig", "Table", "WindowFrame", "WindowUDF", diff --git a/python/datafusion/context.py b/python/datafusion/context.py index c14cd21dc..ad5744958 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -19,7 +19,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Protocol +from typing import TYPE_CHECKING, Any, Optional, Protocol try: from warnings import deprecated # Python 3.13+ @@ -87,18 +87,22 @@ class DataframeDisplayConfig: def __init__( self, - max_table_bytes: int = None, - min_table_rows: int = None, - max_cell_length: int = None, - max_table_rows_in_repr: int = None, + max_table_bytes: Optional[int] = None, + min_table_rows: Optional[int] = None, + max_cell_length: Optional[int] = None, + max_table_rows_in_repr: Optional[int] = None, ) -> None: """Create a new :py:class:`DataframeDisplayConfig` instance. Args: - max_table_bytes: Maximum bytes to display for table presentation (default: 2MB) - min_table_rows: Minimum number of table rows to display (default: 20) - max_cell_length: Maximum length of a cell before it gets minimized (default: 25) - max_table_rows_in_repr: Maximum number of rows to display in repr string output (default: 10) + max_table_bytes: Maximum bytes to display for table presentation + (default: 2MB) + min_table_rows: Minimum number of table rows to display + (default: 20) + max_cell_length: Maximum length of a cell before it gets minimized + (default: 25) + max_table_rows_in_repr: Maximum number of rows to display in repr + string output (default: 10) """ self.config_internal = DataframeDisplayConfigInternal( max_table_bytes=max_table_bytes, @@ -161,22 +165,31 @@ def __init__(self, config_options: dict[str, str] | None = None) -> None: def with_dataframe_display_config( self, - max_table_bytes: int = None, - min_table_rows: int = None, - max_cell_length: int = None, - max_table_rows_in_repr: int = None, + max_table_bytes: Optional[int] = None, + min_table_rows: Optional[int] = None, + max_cell_length: Optional[int] = None, + max_table_rows_in_repr: Optional[int] = None, ) -> SessionConfig: """Configure the display options for DataFrames. 
Args: - max_table_bytes: Maximum bytes to display for table presentation (default: 2MB) + max_table_bytes: Maximum bytes to display for table presentation + (default: 2MB) min_table_rows: Minimum number of table rows to display (default: 20) - max_cell_length: Maximum length of a cell before it gets minimized (default: 25) - max_table_rows_in_repr: Maximum number of rows to display in repr string output (default: 10) + max_cell_length: Maximum length of a cell before it gets minimized + (default: 25) + max_table_rows_in_repr: Maximum number of rows to display in repr string + output (default: 10) Returns: A new :py:class:`SessionConfig` object with the updated display settings. """ + display_config = DataframeDisplayConfigInternal( + max_table_bytes=max_table_bytes, + min_table_rows=min_table_rows, + max_cell_length=max_cell_length, + max_table_rows_in_repr=max_table_rows_in_repr, + ) display_config = DataframeDisplayConfigInternal( max_table_bytes=max_table_bytes, From 065fa407f473a2e1e6a6d6217cd74955d23cb10d Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 16:34:48 +0800 Subject: [PATCH 27/51] feat: Enhance PyDataFrame to support customizable display options - Updated `PyDataFrame` constructor to accept a `PyDataframeDisplayConfig` parameter for improved DataFrame display customization. - Modified multiple methods in `PySessionContext` to pass the display configuration when creating `PyDataFrame` instances, ensuring consistent display settings across different DataFrame operations. --- src/context.rs | 39 +++++++++++++++++++++++++++------------ src/dataframe.rs | 9 +++++++-- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/context.rs b/src/context.rs index 7a4ebf466..06f5ac8ec 100644 --- a/src/context.rs +++ b/src/context.rs @@ -457,7 +457,7 @@ impl PySessionContext { pub fn sql(&mut self, query: &str, py: Python) -> PyDataFusionResult { let result = self.ctx.sql(query); let df = wait_for_future(py, result)?; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } #[pyo3(signature = (query, options=None))] @@ -474,7 +474,7 @@ impl PySessionContext { }; let result = self.ctx.sql_with_options(query, options); let df = wait_for_future(py, result)?; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } #[pyo3(signature = (partitions, name=None, schema=None))] @@ -509,13 +509,16 @@ impl PySessionContext { let table = wait_for_future(py, self._table(&table_name))?; - let df = PyDataFrame::new(table); + let df = PyDataFrame::new(table, self.ctx.display_config.clone()); Ok(df) } /// Create a DataFrame from an existing logical plan pub fn create_dataframe_from_logical_plan(&mut self, plan: PyLogicalPlan) -> PyDataFrame { - PyDataFrame::new(DataFrame::new(self.ctx.state(), plan.plan.as_ref().clone())) + PyDataFrame::new( + DataFrame::new(self.ctx.state(), plan.plan.as_ref().clone()), + self.ctx.display_config.clone(), + ) } /// Construct datafusion dataframe from Python list @@ -883,7 +886,7 @@ impl PySessionContext { pub fn table(&self, name: &str, py: Python) -> PyResult { let x = wait_for_future(py, self.ctx.table(name)) .map_err(|e| PyKeyError::new_err(e.to_string()))?; - Ok(PyDataFrame::new(x)) + Ok(PyDataFrame::new(x, self.ctx.display_config.clone())) } pub fn table_exist(&self, name: &str) -> PyDataFusionResult { @@ -891,7 +894,10 @@ impl PySessionContext { } pub fn empty_table(&self) -> PyDataFusionResult { - Ok(PyDataFrame::new(self.ctx.read_empty()?)) + Ok( + 
PyDataFrame::new(self.ctx.read_empty()?), + self.ctx.display_config.clone(), + ) } pub fn session_id(&self) -> String { @@ -926,7 +932,7 @@ impl PySessionContext { let result = self.ctx.read_json(path, options); wait_for_future(py, result)? }; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } #[allow(clippy::too_many_arguments)] @@ -971,12 +977,18 @@ impl PySessionContext { let paths = path.extract::>()?; let paths = paths.iter().map(|p| p as &str).collect::>(); let result = self.ctx.read_csv(paths, options); - let df = PyDataFrame::new(wait_for_future(py, result)?); + let df = PyDataFrame::new( + wait_for_future(py, result)?, + self.ctx.display_config.clone(), + ); Ok(df) } else { let path = path.extract::()?; let result = self.ctx.read_csv(path, options); - let df = PyDataFrame::new(wait_for_future(py, result)?); + let df = PyDataFrame::new( + wait_for_future(py, result)?, + self.ctx.display_config.clone(), + ); Ok(df) } } @@ -1014,7 +1026,10 @@ impl PySessionContext { .collect(); let result = self.ctx.read_parquet(path, options); - let df = PyDataFrame::new(wait_for_future(py, result)?); + let df = PyDataFrame::new( + wait_for_future(py, result)?, + self.ctx.display_config.clone(), + ); Ok(df) } @@ -1039,12 +1054,12 @@ impl PySessionContext { let read_future = self.ctx.read_avro(path, options); wait_for_future(py, read_future)? }; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } pub fn read_table(&self, table: &PyTable) -> PyDataFusionResult { let df = self.ctx.read_table(table.table())?; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } fn __repr__(&self) -> PyResult { diff --git a/src/dataframe.rs b/src/dataframe.rs index be10b8c28..6e49a91ff 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -43,6 +43,7 @@ use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; use tokio::task::JoinHandle; use crate::catalog::PyTable; +use crate::context::PyDataframeDisplayConfig; use crate::errors::{py_datafusion_err, PyDataFusionError}; use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; @@ -83,12 +84,16 @@ const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; #[derive(Clone)] pub struct PyDataFrame { df: Arc, + display_config: PyDataframeDisplayConfig, } impl PyDataFrame { /// creates a new PyDataFrame - pub fn new(df: DataFrame) -> Self { - Self { df: Arc::new(df) } + pub fn new(df: DataFrame, display_config: PyDataframeDisplayConfig) -> Self { + Self { + df: Arc::new(df), + display_config, + } } } From 7fa2c7c4e86855b2b76a69cf020682229348ddd4 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 17:04:20 +0800 Subject: [PATCH 28/51] Amend PyDataFrame to use display_config instead of constants --- src/dataframe.rs | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index 6e49a91ff..c188207c6 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -73,9 +73,6 @@ impl PyTableProvider { PyTable::new(table_provider) } } -const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB -const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; -const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. 
@@ -84,7 +81,7 @@ const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; #[derive(Clone)] pub struct PyDataFrame { df: Arc, - display_config: PyDataframeDisplayConfig, + display_config: Arc, } impl PyDataFrame { @@ -92,7 +89,7 @@ impl PyDataFrame { pub fn new(df: DataFrame, display_config: PyDataframeDisplayConfig) -> Self { Self { df: Arc::new(df), - display_config, + display_config: Arc::new(display_config), } } } @@ -121,9 +118,23 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { + // Get display configuration values + let min_rows = self.display_config.min_table_rows; + let max_rows = self.display_config.max_table_rows_in_repr; + let max_bytes = self.display_config.max_table_bytes; + + // Collect record batches for display + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + self.display_config.min_table_rows, + self.display_config.max_table_rows_in_repr, + self.display_config.max_table_bytes, + ), let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + self.display_config.min_table_rows, self.display_config.max_table_rows_in_repr, self.display_config.max_table_bytes), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -146,8 +157,9 @@ impl PyDataFrame { py, collect_record_batches_to_display( self.df.as_ref().clone(), - MIN_TABLE_ROWS_TO_DISPLAY, - usize::MAX, + self.display_config.min_table_rows, + self.display_config.max_table_rows_in_repr, + self.display_config.max_table_bytes, ), )?; if batches.is_empty() { @@ -223,8 +235,8 @@ impl PyDataFrame { for (col, formatter) in batch_formatter.iter().enumerate() { let cell_data = formatter.value(batch_row).to_string(); // From testing, primitive data types do not typically get larger than 21 characters - if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { - let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; + if cell_data.len() > self.display_config.max_cell_length { + let short_cell_data = &cell_data[0..self.display_config.max_cell_length]; cells.push(format!("
@@ -891,6 +903,7 @@ async fn collect_record_batches_to_display( df: DataFrame, min_rows: usize, max_rows: usize, + max_bytes: usize, ) -> Result<(Vec, bool), DataFusionError> { let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); @@ -899,7 +912,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) + while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { @@ -914,8 +927,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { - let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > max_bytes { + let ratio = max_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; From cbc4759b3f731909f47ef3bb7b47e9d027b67dfe Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 17:35:58 +0800 Subject: [PATCH 29/51] refactor: Simplify PySessionConfig and PySessionContext by removing unnecessary display_config handling - Removed display_config from PySessionConfig, streamlining its structure. - Updated PySessionContext to directly manage display_config, ensuring consistent access across methods. - Adjusted methods in PySessionContext to utilize the new display_config handling, enhancing clarity and maintainability. - Cleaned up code in PyDataFrame to ensure it correctly references the updated display_config. 
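An illustrative call sequence after this refactor (a sketch against the
types introduced earlier in this series, not a definitive API):

    // Display settings now live on the context rather than on the config.
    let ctx = PySessionContext::new(None, None, None)?
        .with_display_config(PyDataframeDisplayConfig::default());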
--- src/context.rs | 74 ++++++++++++++++++++++-------------------------- src/dataframe.rs | 7 +---- 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/src/context.rs b/src/context.rs index 06f5ac8ec..f2aaf0626 100644 --- a/src/context.rs +++ b/src/context.rs @@ -127,15 +127,11 @@ impl Default for PyDataframeDisplayConfig { #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, - pub display_config: PyDataframeDisplayConfig, } impl From for PySessionConfig { fn from(config: SessionConfig) -> Self { - Self { - config, - display_config: PyDataframeDisplayConfig::default(), - } + Self { config } } } @@ -151,10 +147,7 @@ impl PySessionConfig { } } - Self { - config, - display_config: PyDataframeDisplayConfig::default(), - } + Self { config } } fn with_create_default_catalog_and_schema(&self, enabled: bool) -> Self { @@ -213,12 +206,6 @@ impl PySessionConfig { Self::from(self.config.clone().with_repartition_file_min_size(size)) } - fn with_dataframe_display_config(&self, display_config: PyDataframeDisplayConfig) -> Self { - let mut config = self.clone(); - config.display_config = display_config; - config - } - fn with_parquet_pruning(&self, enabled: bool) -> Self { Self::from(self.config.clone().with_parquet_pruning(enabled)) } @@ -332,6 +319,7 @@ impl PySQLOptions { #[derive(Clone)] pub struct PySessionContext { pub ctx: SessionContext, + pub display_config: PyDataframeDisplayConfig, } #[pymethods] @@ -341,6 +329,7 @@ impl PySessionContext { pub fn new( config: Option, runtime: Option, + display_config: Option, ) -> PyDataFusionResult { let config = if let Some(c) = config { c.config @@ -358,22 +347,33 @@ impl PySessionContext { .with_runtime_env(runtime) .with_default_features() .build(); + Ok(PySessionContext { ctx: SessionContext::new_with_state(session_state), + display_config: display_config.unwrap_or_default(), }) } pub fn enable_url_table(&self) -> PyResult { Ok(PySessionContext { ctx: self.ctx.clone().enable_url_table(), + display_config: self.display_config.clone(), }) } + pub fn with_display_config(&self, display_config: PyDataframeDisplayConfig) -> Self { + Self { + ctx: self.ctx.clone(), + display_config, + } + } + #[classmethod] #[pyo3(signature = ())] fn global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { Ok(Self { ctx: get_global_ctx().clone(), + display_config: PyDataframeDisplayConfig::default(), }) } @@ -457,7 +457,7 @@ impl PySessionContext { pub fn sql(&mut self, query: &str, py: Python) -> PyDataFusionResult { let result = self.ctx.sql(query); let df = wait_for_future(py, result)?; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } #[pyo3(signature = (query, options=None))] @@ -474,7 +474,7 @@ impl PySessionContext { }; let result = self.ctx.sql_with_options(query, options); let df = wait_for_future(py, result)?; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } #[pyo3(signature = (partitions, name=None, schema=None))] @@ -509,7 +509,7 @@ impl PySessionContext { let table = wait_for_future(py, self._table(&table_name))?; - let df = PyDataFrame::new(table, self.ctx.display_config.clone()); + let df = PyDataFrame::new(table, self.display_config.clone()); Ok(df) } @@ -517,7 +517,7 @@ impl PySessionContext { pub fn create_dataframe_from_logical_plan(&mut self, plan: PyLogicalPlan) -> PyDataFrame { PyDataFrame::new( DataFrame::new(self.ctx.state(), plan.plan.as_ref().clone()), - 
self.ctx.display_config.clone(), + self.display_config.clone(), ) } @@ -886,7 +886,7 @@ impl PySessionContext { pub fn table(&self, name: &str, py: Python) -> PyResult { let x = wait_for_future(py, self.ctx.table(name)) .map_err(|e| PyKeyError::new_err(e.to_string()))?; - Ok(PyDataFrame::new(x, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(x, self.display_config.clone())) } pub fn table_exist(&self, name: &str) -> PyDataFusionResult { @@ -894,10 +894,10 @@ impl PySessionContext { } pub fn empty_table(&self) -> PyDataFusionResult { - Ok( - PyDataFrame::new(self.ctx.read_empty()?), - self.ctx.display_config.clone(), - ) + Ok(PyDataFrame::new( + self.ctx.read_empty()?, + self.display_config.clone(), + )) } pub fn session_id(&self) -> String { @@ -932,7 +932,7 @@ impl PySessionContext { let result = self.ctx.read_json(path, options); wait_for_future(py, result)? }; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } #[allow(clippy::too_many_arguments)] @@ -977,18 +977,12 @@ impl PySessionContext { let paths = path.extract::>()?; let paths = paths.iter().map(|p| p as &str).collect::>(); let result = self.ctx.read_csv(paths, options); - let df = PyDataFrame::new( - wait_for_future(py, result)?, - self.ctx.display_config.clone(), - ); + let df = PyDataFrame::new(wait_for_future(py, result)?, self.display_config.clone()); Ok(df) } else { let path = path.extract::()?; let result = self.ctx.read_csv(path, options); - let df = PyDataFrame::new( - wait_for_future(py, result)?, - self.ctx.display_config.clone(), - ); + let df = PyDataFrame::new(wait_for_future(py, result)?, self.display_config.clone()); Ok(df) } } @@ -1026,10 +1020,7 @@ impl PySessionContext { .collect(); let result = self.ctx.read_parquet(path, options); - let df = PyDataFrame::new( - wait_for_future(py, result)?, - self.ctx.display_config.clone(), - ); + let df = PyDataFrame::new(wait_for_future(py, result)?, self.display_config.clone()); Ok(df) } @@ -1054,12 +1045,12 @@ impl PySessionContext { let read_future = self.ctx.read_avro(path, options); wait_for_future(py, read_future)? }; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } pub fn read_table(&self, table: &PyTable) -> PyDataFusionResult { let df = self.ctx.read_table(table.table())?; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } fn __repr__(&self) -> PyResult { @@ -1175,6 +1166,9 @@ impl From for SessionContext { impl From for PySessionContext { fn from(ctx: SessionContext) -> PySessionContext { - PySessionContext { ctx } + PySessionContext { + ctx, + display_config: PyDataframeDisplayConfig::default(), + } } } diff --git a/src/dataframe.rs b/src/dataframe.rs index c188207c6..c4b98ceb1 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -118,11 +118,6 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { - // Get display configuration values - let min_rows = self.display_config.min_table_rows; - let max_rows = self.display_config.max_table_rows_in_repr; - let max_bytes = self.display_config.max_table_bytes; - // Collect record batches for display let (batches, has_more) = wait_for_future( py, @@ -605,7 +600,7 @@ impl PyDataFrame { /// Calculate the exception of two `DataFrame`s. 
The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df. self.display_config)) } /// Write a `DataFrame` to a CSV file. From 17379736e5ee82851e1be3fe8cae73747c08fd12 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:00:46 +0800 Subject: [PATCH 30/51] refactor: Update PyDataFrame methods to consistently use display_config for DataFrame creation --- src/dataframe.rs | 54 ++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index c4b98ceb1..5c06df985 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -127,10 +127,8 @@ impl PyDataFrame { self.display_config.max_table_rows_in_repr, self.display_config.max_table_bytes, ), - let (batches, has_more) = wait_for_future( - py, - self.display_config.min_table_rows, self.display_config.max_table_rows_in_repr, self.display_config.max_table_bytes), )?; + if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below return Ok("No data to display".to_string()); @@ -281,7 +279,7 @@ impl PyDataFrame { fn describe(&self, py: Python) -> PyDataFusionResult { let df = self.df.as_ref().clone(); let stat_df = wait_for_future(py, df.describe())?; - Ok(Self::new(stat_df)) + Ok(Self::new(stat_df, self.display_config.as_ref().clone())) } /// Returns the schema from the logical plan @@ -311,31 +309,31 @@ impl PyDataFrame { fn select_columns(&self, args: Vec) -> PyDataFusionResult { let args = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().select_columns(&args)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (*args))] fn select(&self, args: Vec) -> PyDataFusionResult { let expr = args.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().select(expr)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (*args))] fn drop(&self, args: Vec) -> PyDataFusionResult { let cols = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().drop_columns(&cols)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().filter(predicate.into())?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().with_column(name, expr.into())?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn with_columns(&self, exprs: Vec) -> PyDataFusionResult { @@ -345,7 +343,7 @@ impl PyDataFrame { let name = format!("{}", expr.schema_name()); df = df.with_column(name.as_str(), expr)? } - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Rename one column by applying a new projection. 
This is a no-op if the column to be @@ -356,27 +354,27 @@ impl PyDataFrame { .as_ref() .clone() .with_column_renamed(old_name, new_name)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (*exprs))] fn sort(&self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); let df = self.df.as_ref().clone().sort(exprs)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (count, offset=0))] fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { let df = self.df.as_ref().clone().limit(offset, Some(count))?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Executes the plan, returning a list of `RecordBatch`es. @@ -393,7 +391,7 @@ impl PyDataFrame { /// Cache DataFrame. fn cache(&self, py: Python) -> PyDataFusionResult { let df = wait_for_future(py, self.df.as_ref().clone().cache())?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch @@ -418,7 +416,7 @@ impl PyDataFrame { /// Filter out duplicate rows fn distinct(&self) -> PyDataFusionResult { let df = self.df.as_ref().clone().distinct()?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn join( @@ -452,7 +450,7 @@ impl PyDataFrame { &right_keys, None, )?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn join_on( @@ -481,7 +479,7 @@ impl PyDataFrame { .as_ref() .clone() .join_on(right.df.as_ref().clone(), join_type, exprs)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Print the query plan @@ -514,7 +512,7 @@ impl PyDataFrame { .as_ref() .clone() .repartition(Partitioning::RoundRobinBatch(num))?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Repartition a `DataFrame` based on a logical partitioning scheme. @@ -526,7 +524,7 @@ impl PyDataFrame { .as_ref() .clone() .repartition(Partitioning::Hash(expr, num))?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Calculate the union of two `DataFrame`s, preserving duplicate rows.The @@ -542,7 +540,7 @@ impl PyDataFrame { self.df.as_ref().clone().union(py_df.df.as_ref().clone())? }; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Calculate the distinct union of two `DataFrame`s. 
The @@ -553,7 +551,7 @@ impl PyDataFrame { .as_ref() .clone() .union_distinct(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } #[pyo3(signature = (column, preserve_nulls=true))] @@ -566,7 +564,7 @@ impl PyDataFrame { .as_ref() .clone() .unnest_columns_with_options(&[column], unnest_options)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (columns, preserve_nulls=true))] @@ -584,7 +582,7 @@ impl PyDataFrame { .as_ref() .clone() .unnest_columns_with_options(&cols, unnest_options)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Calculate the intersection of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema @@ -594,13 +592,13 @@ impl PyDataFrame { .as_ref() .clone() .intersect(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df. self.display_config)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Write a `DataFrame` to a CSV file. @@ -907,9 +905,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) - || rows_so_far < min_rows - { + while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { break; From 354ff45be5d280e53d24f2a0ae53a1409f3ce9a1 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:10:03 +0800 Subject: [PATCH 31/51] feat: Add display configuration options to SessionContext for DataFrame presentation --- python/datafusion/context.py | 34 ++++++++++++++++++++++++++++++++++ src/context.rs | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index ad5744958..02b035c7f 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -622,6 +622,40 @@ def global_ctx(cls) -> SessionContext: wrapper.ctx = internal_ctx return wrapper + def with_display_config( + self, + max_table_bytes: Optional[int] = None, + min_table_rows: Optional[int] = None, + max_cell_length: Optional[int] = None, + max_table_rows_in_repr: Optional[int] = None, + ) -> SessionContext: + """Configure the display options for DataFrames. + + Args: + max_table_bytes: Maximum bytes to display for table presentation + (default: 2MB) + min_table_rows: Minimum number of table rows to display + (default: 20) + max_cell_length: Maximum length of a cell before it gets minimized + (default: 25) + max_table_rows_in_repr: Maximum number of rows to display in repr + string output (default: 10) + + Returns: + A new :py:class:`SessionContext` object with the updated display settings. 
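+
+        Example (an illustrative sketch; the limit value is arbitrary)::
+
+            ctx = SessionContext().with_display_config(max_cell_length=10)
+            df = ctx.sql("SELECT 1")  # DataFrames from ctx use these limits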
+ """ + display_config = DataframeDisplayConfig( + max_table_bytes=max_table_bytes, + min_table_rows=min_table_rows, + max_cell_length=max_cell_length, + max_table_rows_in_repr=max_table_rows_in_repr, + ) + + klass = self.__class__ + obj = klass.__new__(klass) + obj.ctx = self.ctx.with_display_config(display_config.config_internal) + return obj + def enable_url_table(self) -> SessionContext: """Control if local files can be queried as tables. diff --git a/src/context.rs b/src/context.rs index f2aaf0626..6147cceff 100644 --- a/src/context.rs +++ b/src/context.rs @@ -324,7 +324,7 @@ pub struct PySessionContext { #[pymethods] impl PySessionContext { - #[pyo3(signature = (config=None, runtime=None))] + #[pyo3(signature = (config=None, runtime=None, display_config=None))] #[new] pub fn new( config: Option, From 984b90637b55be6b9f6b8f981a30aff6a8c260e9 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:18:07 +0800 Subject: [PATCH 32/51] fix: Add validation for display configuration properties in DataframeDisplayConfig --- python/datafusion/context.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 02b035c7f..006c0cc32 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -119,6 +119,8 @@ def max_table_bytes(self) -> int: @max_table_bytes.setter def max_table_bytes(self, value: int) -> None: """Set the maximum bytes to display for table presentation.""" + if value <= 0: + raise ValueError("max_table_bytes must be greater than 0") self.config_internal.max_table_bytes = value @property @@ -129,6 +131,8 @@ def min_table_rows(self) -> int: @min_table_rows.setter def min_table_rows(self, value: int) -> None: """Set the minimum number of table rows to display.""" + if value <= 0: + raise ValueError("min_table_rows must be greater than 0") self.config_internal.min_table_rows = value @property @@ -139,6 +143,8 @@ def max_cell_length(self) -> int: @max_cell_length.setter def max_cell_length(self, value: int) -> None: """Set the maximum length of a cell before it gets minimized.""" + if value <= 0: + raise ValueError("max_cell_length must be greater than 0") self.config_internal.max_cell_length = value @property @@ -149,6 +155,8 @@ def max_table_rows_in_repr(self) -> int: @max_table_rows_in_repr.setter def max_table_rows_in_repr(self, value: int) -> None: """Set the maximum number of rows to display in repr string output.""" + if value <= 0: + raise ValueError("max_table_rows_in_repr must be greater than 0") self.config_internal.max_table_rows_in_repr = value From 1326d713fd61bd521a350949e1ce5ce44cc21866 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:25:30 +0800 Subject: [PATCH 33/51] feat: Integrate DataframeDisplayConfig into SessionContext initialization --- python/datafusion/context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 006c0cc32..28cd2c0f7 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -615,8 +615,8 @@ def __init__( """ config = config.config_internal if config is not None else None runtime = runtime.config_internal if runtime is not None else None - - self.ctx = SessionContextInternal(config, runtime) + display_config = DataframeDisplayConfigInternal() + self.ctx = SessionContextInternal(config, runtime, display_config) @classmethod def global_ctx(cls) -> SessionContext: From 0c4eaa61a7a68736d0ffad9cea22033f1c8c85f8 Mon Sep 17 
00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:25:45 +0800 Subject: [PATCH 34/51] test: Add tests for DataframeDisplayConfig initialization and SessionContext integration --- python/tests/test_dataframe.py | 97 ++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eda13930d..453d902df 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -31,6 +31,7 @@ from datafusion import functions as f from datafusion.expr import Window from pyarrow.csv import write_csv +from datafusion.context import DataframeDisplayConfig @pytest.fixture @@ -51,6 +52,102 @@ def df(): return ctx.from_arrow(batch) +def test_display_config(): + # Test display_config initialization + config = DataframeDisplayConfig( + max_table_bytes=1024, + min_table_rows=10, + max_cell_length=15, + max_table_rows_in_repr=5, + ) + + assert config.max_table_bytes == 1024 + assert config.min_table_rows == 10 + assert config.max_cell_length == 15 + assert config.max_table_rows_in_repr == 5 + + # Test property setters + config.max_table_bytes = 2048 + config.min_table_rows = 20 + config.max_cell_length = 30 + config.max_table_rows_in_repr = 10 + + assert config.max_table_bytes == 2048 + assert config.min_table_rows == 20 + assert config.max_cell_length == 30 + assert config.max_table_rows_in_repr == 10 + + # Test property setter validation + with pytest.raises(ValueError, match="max_table_bytes must be greater than 0"): + config.max_table_bytes = 0 + + with pytest.raises(ValueError, match="min_table_rows must be greater than 0"): + config.min_table_rows = -1 + + with pytest.raises(ValueError, match="max_cell_length must be greater than 0"): + config.max_cell_length = 0 + + with pytest.raises( + ValueError, match="max_table_rows_in_repr must be greater than 0" + ): + config.max_table_rows_in_repr = -5 + + +def test_session_with_display_config(): + # Test with_display_config returns a new context with updated config + ctx = SessionContext() + + # Verify the default values are used initially + df = ctx.from_pylist([{"a": 1, "b": "x" * 50, "c": 3}] * 100) + html_repr = df._repr_html_() + + # Create a new context with custom display config + ctx2 = ctx.with_display_config( + max_table_bytes=1024, + min_table_rows=5, + max_cell_length=10, + max_table_rows_in_repr=3, + ) + + # Create a dataframe with the same data but using the new context + df2 = ctx2.from_pylist([{"a": 1, "b": "x" * 50, "c": 3}] * 100) + html_repr2 = df2._repr_html_() + + # The HTML representation should be different with different display configs + assert html_repr != html_repr2 + + # Check that the second representation has the short cell data based on the configured length + assert f'' in html_repr2 -def test_display_config_in_init(): +def test_display_config_in_init(data): # Test providing display config directly in SessionContext constructor display_config = DataframeDisplayConfig( max_table_bytes=1024, @@ -131,7 +136,7 @@ def test_display_config_in_init(): ) ctx = SessionContext() - df1 = ctx.from_pylist([{"a": 1, "b": "x" * 50, "c": 3}] * 100) + df1 = ctx.from_pylist(data) html_repr1 = df1._repr_html_() # Create a context with custom display config through the with_display_config method @@ -141,7 +146,7 @@ def test_display_config_in_init(): max_cell_length=10, max_table_rows_in_repr=3, ) - df2 = ctx2.from_pylist([{"a": 1, "b": "x" * 50, "c": 3}] * 100) + df2 = ctx2.from_pylist(data) html_repr2 = df2._repr_html_() # Both methods should 
result in equivalent display configuration @@ -1360,7 +1365,7 @@ def test_dataframe_repr_html(df) -> None: assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 -def test_display_config_affects_repr(): +def test_display_config_affects_repr(data): max_table_rows_in_repr = 3 # Create a context with custom display config ctx = SessionContext().with_display_config( @@ -1368,71 +1373,62 @@ def test_display_config_affects_repr(): ) # Create a DataFrame with more rows than the display limit - data = [{"a": i, "b": f"value_{i}", "c": i * 10} for i in range(10)] df = ctx.from_pylist(data) - # Get the string representation - # +---+---------+----+ - # | a | b | c | - # +---+---------+----+ - # | 0 | value_0 | 0 | - # | 1 | value_1 | 10 | - # | 2 | value_2 | 20 | - # +---+---------+----+ - # Data truncated. repr_str = repr(df) # The representation should show truncated data (3 rows as specified) assert ( - repr_str.count("\n") <= max_table_rows_in_repr + 5 - ) # header row + separator lines + data rows + possibly truncation message + # 5 = 1 header row + 3 separator line + 1 truncation message + repr_str.count("\n") + <= max_table_rows_in_repr + 5 + ) assert "Data truncated" in repr_str # Create a context with larger display limit - ctx2 = SessionContext().with_display_config(max_table_rows_in_repr=15) + max_table_rows_in_repr = 100 + ctx2 = SessionContext().with_display_config( + max_table_rows_in_repr=max_table_rows_in_repr + ) df2 = ctx2.from_pylist(data) repr_str2 = repr(df2) # Should show all data without truncation message - assert repr_str2.count("\n") >= 10 # All rows should be shown + assert ( + # 4 = 1 header row + 3 separator lines + repr_str2.count("\n") + == max_table_rows_in_repr + 4 + ) # All rows should be shown assert "Data truncated" not in repr_str2 -def test_display_config_affects_html_repr(): +def test_display_config_affects_html_repr(data): # Create a context with custom display config to show only a small cell length ctx = SessionContext().with_display_config(max_cell_length=5) # Create a DataFrame with a column containing long strings - data = [ - {"a": 1, "b": "This is a very long string that should be truncated", "c": 100} - ] df = ctx.from_pylist(data) # Get the HTML representation html_str = df._repr_html_() # The cell should be truncated to 5 characters and have expansion button - assert ">This " in html_str # 5 character limit - assert "expandable" in html_str - assert "expand-btn" in html_str + assert ">xxxxx" in html_str # 5 character limit + expandable_class = 'class="expandable-container"' + assert expandable_class in html_str # Create a context with larger cell length limit - ctx2 = SessionContext().with_display_config(max_cell_length=50) + ctx2 = SessionContext().with_display_config(max_cell_length=60) df2 = ctx2.from_pylist(data) html_str2 = df2._repr_html_() # String shouldn't be truncated (or at least not in the same way) - if "expandable" in html_str2: - # If it still has an expandable div, it should contain more characters - assert ">This is a very long string that" in html_str2 - else: - # Or it might not need expansion at all - assert "This is a very long string that should be truncated" in html_str2 + assert expandable_class not in html_str2 -def test_display_config_rows_limit_in_html(): +def test_display_config_rows_limit_in_html(data): max_table_rows = 5 # Create a context with custom display config to limit rows ctx = SessionContext().with_display_config( @@ -1440,7 +1436,6 @@ def test_display_config_rows_limit_in_html(): ) # Create a DataFrame 
with 10 rows - data = [{"a": i, "b": f"value_{i}", "c": i * 10} for i in range(10)] df = ctx.from_pylist(data) # Get the HTML representation @@ -1452,7 +1447,7 @@ def test_display_config_rows_limit_in_html(): assert "Data truncated" in html_str # Create a context with larger row limit - max_table_rows = 20 + max_table_rows = 100 ctx2 = SessionContext().with_display_config( max_table_rows_in_repr=max_table_rows ) # Show more rows @@ -1462,11 +1457,11 @@ def test_display_config_rows_limit_in_html(): # Should show all rows row_count2 = html_str2.count("") - 1 # Subtract 1 for header row - assert row_count2 == 10 # Should show all 10 rows + assert row_count2 == max_table_rows assert "Data truncated" not in html_str2 -def test_display_config_max_bytes_limit(): +def test_display_config_max_bytes_limit(data): min_table_rows = 10 max_table_rows = 20 # Create a context with custom display config with very small byte limit @@ -1477,12 +1472,6 @@ def test_display_config_max_bytes_limit(): ) # Very small limit # Create a DataFrame with large content - # Generate some data with long strings to hit the byte limit quickly - large_string = "x" * 50 - data = [ - {"a": i, "b": large_string, "c": large_string} - for i in range(20) # 20 rows with long strings - ] df = ctx.from_pylist(data) # Get the HTML representation diff --git a/src/dataframe.rs b/src/dataframe.rs index 10247a79c..5d62ad2bd 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -944,9 +944,6 @@ async fn collect_record_batches_to_display( record_batches.push(rb); } } - println!( - "==> after while, size_estimate_so_far: {size_estimate_so_far}, rows_so_far: {rows_so_far}" - ); if record_batches.is_empty() { return Ok((Vec::default(), false)); From 2993854faf736c824c50edca6d0f3166b0ed8dd6 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 3 Apr 2025 14:31:18 +0800 Subject: [PATCH 41/51] fix: Update loop condition in collect_record_batches_to_display for correct row handling --- src/dataframe.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index 5d62ad2bd..5c06df985 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -905,7 +905,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while size_estimate_so_far < max_bytes && rows_so_far < max_rows && rows_so_far < min_rows { + while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { break; From 71c64b9c8718a0d941ba8f0e32de813548480819 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 3 Apr 2025 15:03:44 +0800 Subject: [PATCH 42/51] fix ruff errors --- python/datafusion/__init__.py | 16 +++++----------- python/datafusion/context.py | 24 ++++++++++++------------ python/tests/test_dataframe.py | 31 +++++++++++++++---------------- 3 files changed, 32 insertions(+), 39 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 23f6c971d..436c30e52 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -26,26 +26,20 @@ except ImportError: import importlib_metadata +# Local module imports from . import functions, object_store, substrait - -# The following imports are okay to remain as opaque to the user. 
from ._internal import Config from .catalog import Catalog, Database, Table -from .common import ( - DFSchema, -) +from .common import DFSchema from .context import ( + DataframeDisplayConfig, RuntimeEnvBuilder, + SQLOptions, SessionConfig, SessionContext, - DataframeDisplayConfig, - SQLOptions, ) from .dataframe import DataFrame -from .expr import ( - Expr, - WindowFrame, -) +from .expr import Expr, WindowFrame from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 4258c01fb..728bae458 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -26,26 +26,25 @@ except ImportError: from typing_extensions import deprecated # Python 3.12 +if TYPE_CHECKING: + import pandas as pd + import pathlib + import polars as pl + import pyarrow as pa + + from datafusion.plan import ExecutionPlan, LogicalPlan + from datafusion.catalog import Catalog, Table from datafusion.dataframe import DataFrame from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list from datafusion.record_batch import RecordBatchStream from datafusion.udf import AggregateUDF, ScalarUDF, WindowUDF +from ._internal import DataframeDisplayConfig as DataframeDisplayConfigInternal from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal +from ._internal import SQLOptions as SQLOptionsInternal from ._internal import SessionConfig as SessionConfigInternal from ._internal import SessionContext as SessionContextInternal -from ._internal import SQLOptions as SQLOptionsInternal -from ._internal import DataframeDisplayConfig as DataframeDisplayConfigInternal - -if TYPE_CHECKING: - import pathlib - - import pandas as pd - import polars as pl - import pyarrow as pa - - from datafusion.plan import ExecutionPlan, LogicalPlan class ArrowStreamExportable(Protocol): @@ -131,7 +130,8 @@ def _validate_positive(self, value: int, name: str) -> None: ValueError: If the value is not positive """ if value <= 0: - raise ValueError(f"{name} must be greater than 0") + error_message = f"{name} must be greater than 0" + raise ValueError(error_message) @property def max_table_bytes(self) -> int: diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index f1c74f25a..52b0dc4f1 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -29,9 +29,9 @@ literal, ) from datafusion import functions as f +from datafusion.context import DataframeDisplayConfig from datafusion.expr import Window from pyarrow.csv import write_csv -from datafusion.context import DataframeDisplayConfig @pytest.fixture @@ -57,6 +57,11 @@ def data(): return [{"a": 1, "b": "x" * 50, "c": 3}] * 100 +@pytest.fixture +def span_expandable_class(): + return '" in html_repr2 def test_display_config_in_init(data): - # Test providing display config directly in SessionContext constructor - display_config = DataframeDisplayConfig( - max_table_bytes=1024, - min_table_rows=5, - max_cell_length=10, - max_table_rows_in_repr=3, - ) + # Test default display config directly in SessionContext constructor ctx = SessionContext() df1 = ctx.from_pylist(data) @@ -1403,7 +1403,7 @@ def test_display_config_affects_repr(data): assert "Data truncated" not in repr_str2 -def test_display_config_affects_html_repr(data): +def test_display_config_affects_html_repr(data, span_expandable_class): # Create a context with custom display config to show only a 
From ec7033a223b8d0ac8c7024084e3b4be4470be974 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 3 Apr 2025 15:13:36 +0800
Subject: [PATCH 43/51] fix ruff errors

---
 python/datafusion/__init__.py |  2 +-
 python/datafusion/context.py  | 19 ++++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py
index c5ed77057..728b9c390 100644
--- a/python/datafusion/__init__.py
+++ b/python/datafusion/__init__.py
@@ -35,9 +35,9 @@
 from .context import (
     DataframeDisplayConfig,
     RuntimeEnvBuilder,
-    SQLOptions,
     SessionConfig,
     SessionContext,
+    SQLOptions,
 )
 from .dataframe import DataFrame
 from .expr import Expr, WindowFrame
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index 728bae458..73f7cbd09 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -26,14 +26,6 @@
 except ImportError:
     from typing_extensions import deprecated  # Python 3.12
 
-if TYPE_CHECKING:
-    import pandas as pd
-    import pathlib
-    import polars as pl
-    import pyarrow as pa
-
-    from datafusion.plan import ExecutionPlan, LogicalPlan
-
 from datafusion.catalog import Catalog, Table
 from datafusion.dataframe import DataFrame
 from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list
@@ -42,9 +34,18 @@
 from ._internal import DataframeDisplayConfig as DataframeDisplayConfigInternal
 from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal
-from ._internal import SQLOptions as SQLOptionsInternal
 from ._internal import SessionConfig as SessionConfigInternal
 from ._internal import SessionContext as SessionContextInternal
+from ._internal import SQLOptions as SQLOptionsInternal
+
+if TYPE_CHECKING:
+    import pathlib
+
+    import pandas as pd
+    import polars as pl
+    import pyarrow as pa
+
+    from datafusion.plan import ExecutionPlan, LogicalPlan
 
 
 class ArrowStreamExportable(Protocol):

From ad83fc5ebd19e93dd50519fc8e8c4529550470c2 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 3 Apr 2025 15:40:54 +0800
Subject: [PATCH 44/51] feat: Add optional display_config parameter to SessionContext constructor

---
 python/datafusion/context.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index 73f7cbd09..83fd046b3 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -571,6 +571,7 @@
     def __init__(
         self,
         config: SessionConfig | None = None,
         runtime: RuntimeEnvBuilder | None = None,
+        display_config: DataframeDisplayConfig | None = None,
     ) -> None:
         """Main interface for executing queries with DataFusion.
@@ -594,7 +595,9 @@
         """
         config = config.config_internal if config is not None else None
         runtime = runtime.config_internal if runtime is not None else None
-        display_config = DataframeDisplayConfigInternal()
+        display_config = (
+            display_config.config_internal if display_config is not None else None
+        )
         self.ctx = SessionContextInternal(config, runtime, display_config)
 
     @classmethod
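From Python, the new keyword composes with the existing config and runtime parameters. A minimal usage sketch, assuming the DataframeDisplayConfig constructor keywords exercised by the tests in this series (the values are illustrative):

    from datafusion import DataframeDisplayConfig, SessionContext

    # Leaving display_config unset (None) now passes None through to the
    # internal constructor, presumably falling back to the Rust-side defaults
    # instead of always materializing DataframeDisplayConfigInternal eagerly.
    display_config = DataframeDisplayConfig(
        max_table_bytes=1024,
        min_table_rows=5,
        max_cell_length=10,
        max_table_rows_in_repr=3,
    )
    ctx = SessionContext(display_config=display_config)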
From fb90fbc3dcef9cbef5eb0961767dcd6d222d0378 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 3 Apr 2025 15:41:55 +0800
Subject: [PATCH 45/51] fix: Update test data size and improve display config tests

- Reduced the size of test data in the `data` fixture from 100 to 10 entries
  for efficiency.
- Added a `normalize_uuid` function to standardize UUIDs in HTML
  representations for consistent testing.
- Modified `test_display_config_in_init` to use a custom display configuration
  and updated assertions to compare normalized HTML outputs.
- Enhanced readability of assertions in `test_display_config_affects_repr` by
  formatting conditions.
---
 python/tests/test_dataframe.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index a72d21b84..88e96e27d 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import dis
 import os
 import re
 from typing import Any
@@ -54,7 +55,7 @@ def df():
 
 @pytest.fixture
 def data():
-    return [{"a": 1, "b": "x" * 50, "c": 3}] * 100
+    return [{"a": 1, "b": "x" * 50, "c": 3}] * 10
 
 
 @pytest.fixture
@@ -62,6 +63,14 @@ def span_expandable_class():
     return '