From 41e6ad290d7d2c0d4328b2ea12bc16bbb63fbb90 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Fri, 28 Mar 2025 12:45:14 +0800
Subject: [PATCH 01/51] feat: Add configurable display options for PyDataFrame

- Introduced DisplayConfig struct to manage display settings such as
  max_table_bytes, min_table_rows, and max_cell_length.
- Updated PyDataFrame to utilize DisplayConfig for rendering and displaying
  DataFrames.
- Added methods to configure and reset display settings, allowing users to
  customize their DataFrame presentation in Python.
---
 src/dataframe.rs | 82 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 71 insertions(+), 11 deletions(-)

diff --git a/src/dataframe.rs b/src/dataframe.rs
index be10b8c28..623a2e05f 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -72,9 +72,27 @@ impl PyTableProvider {
         PyTable::new(table_provider)
     }
 }
-const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB
-const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20;
-const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25;
+
+/// Configuration for DataFrame display in Python environment
+#[derive(Debug, Clone)]
+pub struct DisplayConfig {
+    /// Maximum bytes to display for table presentation (default: 2MB)
+    pub max_table_bytes: usize,
+    /// Minimum number of table rows to display (default: 20)
+    pub min_table_rows: usize,
+    /// Maximum length of a cell before it gets minimized (default: 25)
+    pub max_cell_length: usize,
+}
+
+impl Default for DisplayConfig {
+    fn default() -> Self {
+        Self {
+            max_table_bytes: 2 * 1024 * 1024, // 2 MB
+            min_table_rows: 20,
+            max_cell_length: 25,
+        }
+    }
+}
 
 /// A PyDataFrame is a representation of a logical plan and an API to compose statements.
 /// Use it to build a plan and `.collect()` to execute the plan and collect the result.
@@ -83,12 +101,16 @@
 #[derive(Clone)]
 pub struct PyDataFrame {
     df: Arc<DataFrame>,
+    config: Arc<DisplayConfig>,
 }
 
 impl PyDataFrame {
     /// creates a new PyDataFrame
     pub fn new(df: DataFrame) -> Self {
-        Self { df: Arc::new(df) }
+        Self {
+            df: Arc::new(df),
+            config: Arc::new(DisplayConfig::default()),
+        }
     }
 }
@@ -118,7 +140,7 @@
     fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
         let (batches, has_more) = wait_for_future(
             py,
-            collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10),
+            collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10, &self.config),
         )?;
         if batches.is_empty() {
             // This should not be reached, but do it for safety since we index into the vector below
@@ -141,8 +163,9 @@
             py,
             collect_record_batches_to_display(
                 self.df.as_ref().clone(),
-                MIN_TABLE_ROWS_TO_DISPLAY,
+                self.config.min_table_rows,
                 usize::MAX,
+                &self.config,
             ),
         )?;
         if batches.is_empty() {
@@ -218,8 +241,8 @@
         for (col, formatter) in batch_formatter.iter().enumerate() {
             let cell_data = formatter.value(batch_row).to_string();
             // From testing, primitive data types do not typically get larger than 21 characters
-            if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE {
-                let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE];
+            if cell_data.len() > self.config.max_cell_length {
+                let short_cell_data = &cell_data[0..self.config.max_cell_length];
                 cells.push(format!("
@@ -797,6 +820,42 @@ impl PyDataFrame { fn count(&self, py: Python) -> PyDataFusionResult { Ok(wait_for_future(py, self.df.as_ref().clone().count())?) } + + /// Get the current display configuration + #[getter] + fn display_config(&self) -> DisplayConfig { + (*self.config).clone() + } + + /// Update display configuration + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None))] + fn configure_display( + &mut self, + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + ) { + let mut new_config = (*self.config).clone(); + + if let Some(bytes) = max_table_bytes { + new_config.max_table_bytes = bytes; + } + + if let Some(rows) = min_table_rows { + new_config.min_table_rows = rows; + } + + if let Some(length) = max_cell_length { + new_config.max_cell_length = length; + } + + self.config = Arc::new(new_config); + } + + /// Reset display configuration to default values + fn reset_display_config(&mut self) { + self.config = Arc::new(DisplayConfig::default()); + } } /// Print DataFrame @@ -886,6 +945,7 @@ async fn collect_record_batches_to_display( df: DataFrame, min_rows: usize, max_rows: usize, + config: &DisplayConfig, ) -> Result<(Vec, bool), DataFusionError> { let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); @@ -894,7 +954,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) + while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { @@ -909,8 +969,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { - let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > config.max_table_bytes { + let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; From 17d54cdb3faf2ac19888987154d4805c9eb3bf40 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 12:53:47 +0800 Subject: [PATCH 02/51] feat: Enhance DisplayConfig for DataFrame with customizable options - Added DisplayConfig struct for configuring DataFrame display in Python. - Introduced fields: max_table_bytes, min_table_rows, and max_cell_length with default values. - Implemented a constructor for DisplayConfig to allow optional customization. - Updated display_config method in PyDataFrame to return a Python object of DisplayConfig. 
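- Example (an illustrative sketch of the Python-facing API these two patches
  add; names as introduced above, exact module paths may differ):

      from datafusion import SessionContext

      ctx = SessionContext()
      df = ctx.sql("SELECT 1 AS a")

      # Tune how tables render in __repr__ and notebook output.
      df.configure_display(
          max_table_bytes=1024 * 1024,  # cap rendered data at 1 MB
          min_table_rows=10,            # show at least 10 rows
          max_cell_length=50,           # truncate cells beyond 50 chars
      )
      print(df.display_config.min_table_rows)  # -> 10

      # Restore the defaults (2 MB / 20 rows / 25 chars).
      df.reset_display_config()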
--- src/dataframe.rs | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index 623a2e05f..cb9ae9e18 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -74,16 +74,38 @@ impl PyTableProvider { } /// Configuration for DataFrame display in Python environment +#[pyclass(name = "DisplayConfig", module = "datafusion")] #[derive(Debug, Clone)] pub struct DisplayConfig { /// Maximum bytes to display for table presentation (default: 2MB) + #[pyo3(get, set)] pub max_table_bytes: usize, /// Minimum number of table rows to display (default: 20) + #[pyo3(get, set)] pub min_table_rows: usize, /// Maximum length of a cell before it gets minimized (default: 25) + #[pyo3(get, set)] pub max_cell_length: usize, } +#[pymethods] +impl DisplayConfig { + #[new] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None))] + fn new( + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + ) -> Self { + let default = DisplayConfig::default(); + Self { + max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), + min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), + max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), + } + } +} + impl Default for DisplayConfig { fn default() -> Self { Self { @@ -823,8 +845,15 @@ impl PyDataFrame { /// Get the current display configuration #[getter] - fn display_config(&self) -> DisplayConfig { - (*self.config).clone() + fn display_config(&self) -> PyResult> { + Python::with_gil(|py| { + let config = DisplayConfig { + max_table_bytes: self.config.max_table_bytes, + min_table_rows: self.config.min_table_rows, + max_cell_length: self.config.max_cell_length, + }; + Py::new(py, config).map_err(PyErr::from) + }) } /// Update display configuration From fd8f5a1a8762dc359947ad52a1cdb77f1edd3059 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 13:04:41 +0800 Subject: [PATCH 03/51] feat: Add display configuration methods to DataFrame class - Introduced `configure_display` method to set customizable display options for DataFrame representation, including maximum bytes, minimum rows, and maximum cell length. - Added `reset_display_config` method to restore default display settings. - Implemented `display_config` property to retrieve current display configuration. --- python/datafusion/dataframe.py | 27 +++++++++++++++++++++++++++ src/dataframe.rs | 1 + 2 files changed, 28 insertions(+) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 26fe8f453..f7d964820 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -813,6 +813,33 @@ def count(self) -> int: """ return self.df.count() + def configure_display( + self, + max_table_bytes: Optional[int] = None, + min_table_rows: Optional[int] = None, + max_cell_length: Optional[int] = None, + ) -> None: + """Configure display options for DataFrame representation. + + Args: + max_table_bytes: Maximum bytes to display for table presentation (default: 2MB). + Set to lower value for large tables to limit memory usage. + min_table_rows: Minimum number of table rows to display (default: 20). + This is used for initial display and in notebooks. + max_cell_length: Maximum length of a cell before it gets minimized (default: 25). + Longer cells will be truncated with an expand button. 
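        Example (illustrative; assumes ``df`` is an existing DataFrame):
            df.configure_display(max_cell_length=10)
            html = df._repr_html_()  # long cells render truncated, with an expand button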
+ """ + self.df.configure_display(max_table_bytes, min_table_rows, max_cell_length) + + def reset_display_config(self) -> None: + """Reset display configuration to default values.""" + self.df.reset_display_config() + + @property + def display_config(self): + """Get the current display configuration.""" + return self.df.display_config + @deprecated("Use :py:func:`unnest_columns` instead.") def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame: """See :py:func:`unnest_columns`.""" diff --git a/src/dataframe.rs b/src/dataframe.rs index cb9ae9e18..e71fb6424 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -882,6 +882,7 @@ impl PyDataFrame { } /// Reset display configuration to default values + #[pyo3(text_signature = "($self)")] fn reset_display_config(&mut self) { self.config = Arc::new(DisplayConfig::default()); } From 5aae267695115d52afc35bbeebf0ea3762be11de Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 13:10:18 +0800 Subject: [PATCH 04/51] feat: Add display configuration tests for DataFrame - Implemented tests for accessing and modifying display configuration properties in the DataFrame class. - Added `test_display_config` to verify default values of display settings. - Created `test_configure_display` to test setting and partially updating display configuration. - Introduced `test_reset_display_config` to ensure resetting configuration restores default values. --- python/tests/test_dataframe.py | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eda13930d..17ddde2ae 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1261,3 +1261,55 @@ def test_dataframe_repr_html(df) -> None: body_lines = [f"{v}" for inner in body_data for v in inner] body_pattern = "(.*?)".join(body_lines) assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 + + +def test_display_config(df): + """Test the display configuration properties are accessible.""" + config = df.display_config + + # Verify default values + assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB + assert config.min_table_rows == 20 + assert config.max_cell_length == 25 + + +def test_configure_display(df): + """Test setting display configuration properties.""" + # Modify the display configuration + df.configure_display( + max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50 # 1 MB + ) + + # Verify the changes took effect + config = df.display_config + assert config.max_table_bytes == 1024 * 1024 # 1 MB + assert config.min_table_rows == 10 + assert config.max_cell_length == 50 + + # Test partial update (only changing one property) + df.configure_display(min_table_rows=5) + config = df.display_config + assert config.max_table_bytes == 1024 * 1024 # previous value retained + assert config.min_table_rows == 5 # only this value changed + assert config.max_cell_length == 50 # previous value retained + + +def test_reset_display_config(df): + """Test resetting display configuration to defaults.""" + # First modify the configuration + df.configure_display( + max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50 + ) + + # Verify changes took effect + config = df.display_config + assert config.max_table_bytes == 1024 * 1024 + + # Now reset to defaults + df.reset_display_config() + + # Verify defaults are restored + config = df.display_config + assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB + assert config.min_table_rows == 20 + 
assert config.max_cell_length == 25 From bb4516f6a088cf4cd40f79caa412a65e3eea0a30 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 14:24:01 +0800 Subject: [PATCH 05/51] feat: Validate display configuration values in DataFrame - Added validation to ensure max_table_bytes, min_table_rows, and max_cell_length are greater than 0 in the configure_display method of DataFrame class. - Updated test cases to cover scenarios for zero and negative values, ensuring proper error handling. - Enhanced existing tests to validate extreme values and confirm expected behavior for display configurations. --- python/datafusion/dataframe.py | 9 ++ python/tests/test_dataframe.py | 201 +++++++++++++++++++++++++++++++++ src/dataframe.rs | 114 +++++++++++++++++-- 3 files changed, 315 insertions(+), 9 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index f7d964820..a0688819b 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -828,7 +828,16 @@ def configure_display( This is used for initial display and in notebooks. max_cell_length: Maximum length of a cell before it gets minimized (default: 25). Longer cells will be truncated with an expand button. + + Raises: + ValueError: If any of the provided values are less than or equal to 0. """ + if any( + value is not None and value <= 0 + for value in (max_table_bytes, min_table_rows, max_cell_length) + ): + raise ValueError("All values must be greater than 0.") + self.df.configure_display(max_table_bytes, min_table_rows, max_cell_length) def reset_display_config(self) -> None: diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 17ddde2ae..5b7bc7098 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1293,6 +1293,35 @@ def test_configure_display(df): assert config.min_table_rows == 5 # only this value changed assert config.max_cell_length == 50 # previous value retained + # Test with extreme values (still valid, but potentially problematic) + # Zero values + with pytest.raises(ValueError, match=r".*must be greater than 0.*"): + df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0) + + # Very large values + df.configure_display( + max_table_bytes=10**12, min_table_rows=10**6, max_cell_length=10**4 + ) + config = df.display_config + assert config.max_table_bytes == 10**12 # 1 TB + assert config.min_table_rows == 10**6 # 1 million rows + assert config.max_cell_length == 10**4 # 10,000 chars per cell + + # Test with negative values + # This tests for expected behavior when users accidentally pass negative values + # Since these are usize in Rust, we expect a Python TypeError when trying to pass negative values + with pytest.raises(ValueError, match=r".*must be greater than 0.*"): + df.configure_display(max_table_bytes=-1) + + with pytest.raises(ValueError, match=r".*must be greater than 0.*"): + df.configure_display(min_table_rows=-5) + + with pytest.raises(ValueError, match=r".*must be greater than 0.*"): + df.configure_display(max_cell_length=-10) + + # Reset for next tests + df.reset_display_config() + def test_reset_display_config(df): """Test resetting display configuration to defaults.""" @@ -1313,3 +1342,175 @@ def test_reset_display_config(df): assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB assert config.min_table_rows == 20 assert config.max_cell_length == 25 + + +def test_min_table_rows_display(ctx): + """Test that at least min_table_rows rows are displayed.""" + # Create a 
dataframe with more rows than the default min_table_rows + rows = 100 + data = list(range(rows)) + batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) + df = ctx.create_dataframe([[batch]]) + + # Set min_table_rows to a specific value + custom_min_rows = 30 + df.configure_display(min_table_rows=custom_min_rows) + + # Get HTML representation + html_output = df._repr_html_() + + # Count table rows in the HTML (excluding header row) + # Each row has a tag + row_count = html_output.count("") - 1 # subtract 1 for the header row + + # Verify at least min_table_rows rows are displayed + assert ( + row_count >= custom_min_rows + ), f"Expected at least {custom_min_rows} rows, got {row_count}" + + # If data was truncated, "Data truncated" message should be present + if row_count < rows: + assert "Data truncated" in html_output + + +def test_max_table_bytes_display(ctx): + """Test that reducing max_table_bytes limits the amount of data displayed.""" + # Create a dataframe with large string values to consume memory + # Each string is approximately 1000 bytes + large_strings = ["x" * 1000 for _ in range(50)] + batch = pa.RecordBatch.from_arrays([pa.array(large_strings)], names=["large_data"]) + df = ctx.create_dataframe([[batch]]) + + # First test with default settings + default_html = df._repr_html_() + default_row_count = default_html.count("") - 1 # subtract header row + + # Now set a very small max_table_bytes + df.configure_display(max_table_bytes=5000) # 5KB should only fit a few rows + limited_html = df._repr_html_() + limited_row_count = limited_html.count("") - 1 + + # Verify fewer rows are displayed with the byte limit + assert ( + limited_row_count < default_row_count + ), f"Expected fewer rows with byte limit. Default: {default_row_count}, Limited: {limited_row_count}" + + # "Data truncated" should be present when limited + assert "Data truncated" in limited_html + + +def test_max_cell_length_display(ctx): + """Test that cells longer than max_cell_length are truncated in display.""" + # Create a dataframe with long string values + long_strings = [ + "short", + "medium text", + "this is a very long string that should be truncated", + ] + batch = pa.RecordBatch.from_arrays([pa.array(long_strings)], names=["text"]) + df = ctx.create_dataframe([[batch]]) + + # Set a small max_cell_length + max_length = 10 + df.configure_display(max_cell_length=max_length) + + # Get HTML representation + html_output = df._repr_html_() + + # Check for expand button for long text + assert "expandable-container" in html_output + + # Check that expandable class is used for long text + assert 'class="expandable"' in html_output + + # Look for the truncated text and expand button + long_text = long_strings[2] + assert long_text[:max_length] in html_output # Truncated text should be present + assert "expand-btn" in html_output # Expand button should be present + assert long_text in html_output # Full text should also be in the HTML (hidden) + + +def test_display_config_repr_string(ctx): + """Test that __repr__ respects display configuration.""" + # Create a dataframe with more rows than we want to show + rows = 30 + data = list(range(rows)) + batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) + df = ctx.create_dataframe([[batch]]) + + # Configure to show only 5 rows in string representation + df.configure_display(min_table_rows=5) + + # Get the string representation + repr_str = df.__repr__() + + # The string should contain "Data truncated" + assert "Data truncated" in repr_str + 
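    # (Illustrative note: the "Data truncated" marker comes from the Rust
    # repr path when collect_record_batches_to_display reports has_more=True,
    # i.e. when batches were sliced to fit the limits or more data remained.)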
+ # Count the number of rows (each value should be on a separate line) + # This is an approximation since we don't parse the actual ASCII table + value_lines = 0 + for i in range(rows): + if str(i) in repr_str: + value_lines += 1 + + # Should be fewer rows than the total + assert value_lines < rows + + # Now set min_rows higher and see if more rows appear + df.configure_display(min_table_rows=20) + repr_str_more = df.__repr__() + + value_lines_more = 0 + for i in range(rows): + if str(i) in repr_str_more: + value_lines_more += 1 + + assert value_lines_more > value_lines + + +def test_display_config_integrated(ctx): + """Test all display config options together in an integrated test.""" + # Create a dataframe with: + # - Many rows (to test min_table_rows) + # - Large data (to test max_table_bytes) + # - Long strings (to test max_cell_length) + rows = 50 + ids = list(range(rows)) + # Generate strings of increasing length + texts = [f"{'A' * i}" for i in range(1, rows + 1)] + + batch = pa.RecordBatch.from_arrays( + [pa.array(ids), pa.array(texts)], names=["id", "text"] + ) + + df = ctx.create_dataframe([[batch]]) + + # Set custom display configuration + df.configure_display( + max_table_bytes=2000, # Limit bytes to display + min_table_rows=15, # Show at least 15 rows + max_cell_length=10, # Truncate cells longer than 10 chars + ) + + # Get HTML representation + html_output = df._repr_html_() + + # Check row count + row_count = html_output.count("") - 1 # subtract header + assert row_count >= 15, f"Should display at least 15 rows, got {row_count}" + + # Check for truncation + assert "expandable-container" in html_output + assert "expand-btn" in html_output + + # Should be truncated (not all rows displayed) + assert "Data truncated" in html_output + + # Now with default settings + df.reset_display_config() + default_html = df._repr_html_() + default_row_count = default_html.count("") - 1 + + # Default settings should show more data + assert default_row_count > row_count diff --git a/src/dataframe.rs b/src/dataframe.rs index e71fb6424..9c53b7671 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -847,11 +847,7 @@ impl PyDataFrame { #[getter] fn display_config(&self) -> PyResult> { Python::with_gil(|py| { - let config = DisplayConfig { - max_table_bytes: self.config.max_table_bytes, - min_table_rows: self.config.min_table_rows, - max_cell_length: self.config.max_cell_length, - }; + let config = (*self.config).clone(); Py::new(py, config).map_err(PyErr::from) }) } @@ -924,7 +920,7 @@ fn record_batch_into_schema( ) -> Result { let schema = Arc::new(schema.clone()); let base_schema = record_batch.schema(); - if base_schema.fields().len() == 0 { + if (base_schema.fields().len() == 0) { // Nothing to project return Ok(RecordBatch::new_empty(schema)); } @@ -984,11 +980,36 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; + println!( + "==> Starting loop with min_rows: {}, max_rows: {}, max_table_bytes: {}", + min_rows, max_rows, config.max_table_bytes + ); + while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { + println!( + "==> Loop condition: size_estimate_so_far ({}) < max_table_bytes ({})? {}", + size_estimate_so_far, + config.max_table_bytes, + size_estimate_so_far < config.max_table_bytes + ); + println!( + "==> Loop condition: rows_so_far ({}) < max_rows ({})? 
{}", + rows_so_far, + max_rows, + rows_so_far < max_rows + ); + println!( + "==> Loop condition: rows_so_far ({}) < min_rows ({})? {}", + rows_so_far, + min_rows, + rows_so_far < min_rows + ); + let mut rb = match stream.next().await { None => { + println!("==> Exiting loop: stream.next() returned None (no more data)"); break; } Some(Ok(r)) => r, @@ -996,48 +1017,123 @@ async fn collect_record_batches_to_display( }; let mut rows_in_rb = rb.num_rows(); + println!("==> Received batch with {} rows", rows_in_rb); + if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); + println!("==> New size_estimate_so_far: {}", size_estimate_so_far); if size_estimate_so_far > config.max_table_bytes { + println!( + "==> Size limit reached: {} > {}", + size_estimate_so_far, config.max_table_bytes + ); let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; if reduced_row_num < min_rows { reduced_row_num = min_rows.min(total_rows); + println!( + "==> Adjusted reduced_row_num to {} to meet min_rows", + reduced_row_num + ); } let limited_rows_this_rb = reduced_row_num - rows_so_far; + println!( + "==> Limiting to {} rows in this batch (reduced_row_num: {}, rows_so_far: {})", + limited_rows_this_rb, reduced_row_num, rows_so_far + ); + if limited_rows_this_rb < rows_in_rb { rows_in_rb = limited_rows_this_rb; rb = rb.slice(0, limited_rows_this_rb); has_more = true; + println!("==> Sliced batch to {} rows", limited_rows_this_rb); } } if rows_in_rb + rows_so_far > max_rows { + println!( + "==> Row limit reached: {} + {} > {}", + rows_in_rb, rows_so_far, max_rows + ); rb = rb.slice(0, max_rows - rows_so_far); has_more = true; + println!( + "==> Sliced batch to {} rows to meet max_rows", + max_rows - rows_so_far + ); } rows_so_far += rb.num_rows(); record_batches.push(rb); + println!( + "==> Added batch: size_estimate_so_far: {}, rows_so_far: {}", + size_estimate_so_far, rows_so_far + ); + } else { + println!("==> Skipping empty batch"); } } + println!("==> Exited while loop: size_estimate_so_far: {}, rows_so_far: {}, min_rows: {}, max_rows: {}", + size_estimate_so_far, rows_so_far, min_rows, max_rows); + println!("==> Loop condition evaluation at exit:"); + println!( + "==> size_estimate_so_far < config.max_table_bytes: {} < {} = {}", + size_estimate_so_far, + config.max_table_bytes, + size_estimate_so_far < config.max_table_bytes + ); + println!( + "==> rows_so_far < max_rows: {} < {} = {}", + rows_so_far, + max_rows, + rows_so_far < max_rows + ); + println!( + "==> rows_so_far < min_rows: {} < {} = {}", + rows_so_far, + min_rows, + rows_so_far < min_rows + ); + println!( + "==> Combined condition: {} || {} = {}", + (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows), + rows_so_far < min_rows, + (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) + || rows_so_far < min_rows + ); + if record_batches.is_empty() { + println!("==> No record batches collected"); return Ok((Vec::default(), false)); } if !has_more { // Data was not already truncated, so check to see if more record batches remain has_more = match stream.try_next().await { - Ok(None) => false, // reached end - Ok(Some(_)) => true, - Err(_) => false, // Stream disconnected + Ok(None) => { + println!("==> No more record batches in stream"); + false + } // reached end + Ok(Some(_)) => { + println!("==> More record batches available in stream"); + true + } + Err(_) => { + 
println!("==> Stream error or disconnected"); + false + } // Stream disconnected }; } + println!( + "==> Returning {} record batches, has_more: {}", + record_batches.len(), + has_more + ); Ok((record_batches, has_more)) } From ca908f05d0fe39635522831c149a63295f5b8402 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 14:25:59 +0800 Subject: [PATCH 06/51] collect_record_batches_to_display without debug --- src/dataframe.rs | 108 ++--------------------------------------------- 1 file changed, 4 insertions(+), 104 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index 9c53b7671..9381d8407 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -920,7 +920,7 @@ fn record_batch_into_schema( ) -> Result { let schema = Arc::new(schema.clone()); let base_schema = record_batch.schema(); - if (base_schema.fields().len() == 0) { + if base_schema.fields().len() == 0 { // Nothing to project return Ok(RecordBatch::new_empty(schema)); } @@ -980,36 +980,11 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - println!( - "==> Starting loop with min_rows: {}, max_rows: {}, max_table_bytes: {}", - min_rows, max_rows, config.max_table_bytes - ); - while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { - println!( - "==> Loop condition: size_estimate_so_far ({}) < max_table_bytes ({})? {}", - size_estimate_so_far, - config.max_table_bytes, - size_estimate_so_far < config.max_table_bytes - ); - println!( - "==> Loop condition: rows_so_far ({}) < max_rows ({})? {}", - rows_so_far, - max_rows, - rows_so_far < max_rows - ); - println!( - "==> Loop condition: rows_so_far ({}) < min_rows ({})? {}", - rows_so_far, - min_rows, - rows_so_far < min_rows - ); - let mut rb = match stream.next().await { None => { - println!("==> Exiting loop: stream.next() returned None (no more data)"); break; } Some(Ok(r)) => r, @@ -1017,123 +992,48 @@ async fn collect_record_batches_to_display( }; let mut rows_in_rb = rb.num_rows(); - println!("==> Received batch with {} rows", rows_in_rb); - if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - println!("==> New size_estimate_so_far: {}", size_estimate_so_far); if size_estimate_so_far > config.max_table_bytes { - println!( - "==> Size limit reached: {} > {}", - size_estimate_so_far, config.max_table_bytes - ); let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; if reduced_row_num < min_rows { reduced_row_num = min_rows.min(total_rows); - println!( - "==> Adjusted reduced_row_num to {} to meet min_rows", - reduced_row_num - ); } let limited_rows_this_rb = reduced_row_num - rows_so_far; - println!( - "==> Limiting to {} rows in this batch (reduced_row_num: {}, rows_so_far: {})", - limited_rows_this_rb, reduced_row_num, rows_so_far - ); - if limited_rows_this_rb < rows_in_rb { rows_in_rb = limited_rows_this_rb; rb = rb.slice(0, limited_rows_this_rb); has_more = true; - println!("==> Sliced batch to {} rows", limited_rows_this_rb); } } if rows_in_rb + rows_so_far > max_rows { - println!( - "==> Row limit reached: {} + {} > {}", - rows_in_rb, rows_so_far, max_rows - ); rb = rb.slice(0, max_rows - rows_so_far); has_more = true; - println!( - "==> Sliced batch to {} rows to meet max_rows", - max_rows - rows_so_far - ); } rows_so_far += rb.num_rows(); record_batches.push(rb); - println!( - "==> 
Added batch: size_estimate_so_far: {}, rows_so_far: {}", - size_estimate_so_far, rows_so_far - ); - } else { - println!("==> Skipping empty batch"); } } - println!("==> Exited while loop: size_estimate_so_far: {}, rows_so_far: {}, min_rows: {}, max_rows: {}", - size_estimate_so_far, rows_so_far, min_rows, max_rows); - println!("==> Loop condition evaluation at exit:"); - println!( - "==> size_estimate_so_far < config.max_table_bytes: {} < {} = {}", - size_estimate_so_far, - config.max_table_bytes, - size_estimate_so_far < config.max_table_bytes - ); - println!( - "==> rows_so_far < max_rows: {} < {} = {}", - rows_so_far, - max_rows, - rows_so_far < max_rows - ); - println!( - "==> rows_so_far < min_rows: {} < {} = {}", - rows_so_far, - min_rows, - rows_so_far < min_rows - ); - println!( - "==> Combined condition: {} || {} = {}", - (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows), - rows_so_far < min_rows, - (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) - || rows_so_far < min_rows - ); - if record_batches.is_empty() { - println!("==> No record batches collected"); return Ok((Vec::default(), false)); } if !has_more { // Data was not already truncated, so check to see if more record batches remain has_more = match stream.try_next().await { - Ok(None) => { - println!("==> No more record batches in stream"); - false - } // reached end - Ok(Some(_)) => { - println!("==> More record batches available in stream"); - true - } - Err(_) => { - println!("==> Stream error or disconnected"); - false - } // Stream disconnected + Ok(None) => false, // reached end + Ok(Some(_)) => true, + Err(_) => false, // Stream disconnected }; } - println!( - "==> Returning {} record batches, has_more: {}", - record_batches.len(), - has_more - ); Ok((record_batches, has_more)) } From 727914d63e0ce8b081f8d288dfba4bfb445830cd Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 15:11:43 +0800 Subject: [PATCH 07/51] Add tests for display_config --- python/tests/test_dataframe.py | 76 ++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 5b7bc7098..99c125178 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1433,40 +1433,59 @@ def test_max_cell_length_display(ctx): def test_display_config_repr_string(ctx): """Test that __repr__ respects display configuration.""" # Create a dataframe with more rows than we want to show - rows = 30 - data = list(range(rows)) - batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) - df = ctx.create_dataframe([[batch]]) + # df.__repr__ returns max 10 rows only, so we start test with 7 rows + rows = 7 + df = _create_numeric_test_df(ctx, rows) # Configure to show only 5 rows in string representation - df.configure_display(min_table_rows=5) + min_table_rows_in_display = 5 + df.configure_display(min_table_rows=min_table_rows_in_display) # Get the string representation repr_str = df.__repr__() - # The string should contain "Data truncated" - assert "Data truncated" in repr_str - - # Count the number of rows (each value should be on a separate line) - # This is an approximation since we don't parse the actual ASCII table - value_lines = 0 - for i in range(rows): - if str(i) in repr_str: - value_lines += 1 + # Count the number of rows using helper function + lines_count = _count_lines_in_str(repr_str) # Should be fewer rows than the total - assert value_lines < rows + 
assert lines_count <= rows + assert lines_count >= min_table_rows_in_display # Now set min_rows higher and see if more rows appear - df.configure_display(min_table_rows=20) + min_table_rows_in_display = 7 + rows = 11 + df = _create_numeric_test_df(ctx, rows) # Recreate to reset the state + df.configure_display(min_table_rows=min_table_rows_in_display) + repr_str_more = df.__repr__() + # The string should contain "Data truncated" + assert "Data truncated" in repr_str_more + + # Count lines again + lines_count2 = _count_lines_in_str(repr_str_more) + + # Should show more rows now + assert lines_count2 > lines_count + assert lines_count2 >= min_table_rows_in_display - value_lines_more = 0 - for i in range(rows): - if str(i) in repr_str_more: - value_lines_more += 1 - assert value_lines_more > value_lines +def _count_lines_in_str(repr_str): + """Count the number of rows displayed in a string representation. + + Args: + repr_str: String representation of the DataFrame. + + Returns: + Number of rows that appear in the string representation. + """ + # Find all lines that match the pattern of a number at the beginning of a row + # This is more robust than checking for specific numbers + value_lines = 0 + for line in repr_str.split("\n"): + # Look for lines that contain numeric values (row data) + if re.search(r"^\s*\d+\s", line): + value_lines += 1 + return value_lines def test_display_config_integrated(ctx): @@ -1514,3 +1533,18 @@ def test_display_config_integrated(ctx): # Default settings should show more data assert default_row_count > row_count + + +def _create_numeric_test_df(ctx, rows): + """Create a test dataframe with numeric values from 0 to rows-1. + + Args: + ctx: SessionContext to use for creating the dataframe. + rows: Number of rows to create. + + Returns: + DataFrame with a single column "values" containing numbers 0 to rows-1. + """ + data = list(range(rows)) + batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) + return ctx.create_dataframe([[batch]]) From 52091cee8160e56148ec77dfc1039a9f4ceb026a Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 15:13:10 +0800 Subject: [PATCH 08/51] fix: Update record batch display logic to use min_table_rows from config --- python/tests/test_dataframe.py | 55 +++------------------------------- src/dataframe.rs | 7 ++++- 2 files changed, 10 insertions(+), 52 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 99c125178..c1717beb5 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1478,63 +1478,16 @@ def _count_lines_in_str(repr_str): Returns: Number of rows that appear in the string representation. """ - # Find all lines that match the pattern of a number at the beginning of a row - # This is more robust than checking for specific numbers + # DataFrame tables are formatted with | value | patterns + # Count lines that match actual data rows (not headers or separators) value_lines = 0 for line in repr_str.split("\n"): - # Look for lines that contain numeric values (row data) - if re.search(r"^\s*\d+\s", line): + # Look for lines like "| 0 |", "| 1 |", etc. 
+ if re.search(r"\|\s*\d+\s*\|", line): value_lines += 1 return value_lines -def test_display_config_integrated(ctx): - """Test all display config options together in an integrated test.""" - # Create a dataframe with: - # - Many rows (to test min_table_rows) - # - Large data (to test max_table_bytes) - # - Long strings (to test max_cell_length) - rows = 50 - ids = list(range(rows)) - # Generate strings of increasing length - texts = [f"{'A' * i}" for i in range(1, rows + 1)] - - batch = pa.RecordBatch.from_arrays( - [pa.array(ids), pa.array(texts)], names=["id", "text"] - ) - - df = ctx.create_dataframe([[batch]]) - - # Set custom display configuration - df.configure_display( - max_table_bytes=2000, # Limit bytes to display - min_table_rows=15, # Show at least 15 rows - max_cell_length=10, # Truncate cells longer than 10 chars - ) - - # Get HTML representation - html_output = df._repr_html_() - - # Check row count - row_count = html_output.count("") - 1 # subtract header - assert row_count >= 15, f"Should display at least 15 rows, got {row_count}" - - # Check for truncation - assert "expandable-container" in html_output - assert "expand-btn" in html_output - - # Should be truncated (not all rows displayed) - assert "Data truncated" in html_output - - # Now with default settings - df.reset_display_config() - default_html = df._repr_html_() - default_row_count = default_html.count("") - 1 - - # Default settings should show more data - assert default_row_count > row_count - - def _create_numeric_test_df(ctx, rows): """Create a test dataframe with numeric values from 0 to rows-1. diff --git a/src/dataframe.rs b/src/dataframe.rs index 9381d8407..33eecb5bf 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -162,7 +162,12 @@ impl PyDataFrame { fn __repr__(&self, py: Python) -> PyDataFusionResult { let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10, &self.config), + collect_record_batches_to_display( + self.df.as_ref().clone(), + self.config.min_table_rows, + 10, + &self.config, + ), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below From da116bf1fbecb21dd5f9ad55692f9a8775096c84 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 15:45:17 +0800 Subject: [PATCH 09/51] reuse _create_numeric_test_df --- python/tests/test_dataframe.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index c1717beb5..04385d88c 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1298,18 +1298,9 @@ def test_configure_display(df): with pytest.raises(ValueError, match=r".*must be greater than 0.*"): df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0) - # Very large values - df.configure_display( - max_table_bytes=10**12, min_table_rows=10**6, max_cell_length=10**4 - ) - config = df.display_config - assert config.max_table_bytes == 10**12 # 1 TB - assert config.min_table_rows == 10**6 # 1 million rows - assert config.max_cell_length == 10**4 # 10,000 chars per cell - # Test with negative values # This tests for expected behavior when users accidentally pass negative values - # Since these are usize in Rust, we expect a Python TypeError when trying to pass negative values + # Since these are usize in Rust, we expect a Python ValueError when trying to pass negative values with pytest.raises(ValueError, match=r".*must 
be greater than 0.*"): df.configure_display(max_table_bytes=-1) @@ -1348,9 +1339,7 @@ def test_min_table_rows_display(ctx): """Test that at least min_table_rows rows are displayed.""" # Create a dataframe with more rows than the default min_table_rows rows = 100 - data = list(range(rows)) - batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) - df = ctx.create_dataframe([[batch]]) + df = _create_numeric_test_df(ctx, rows) # Set min_table_rows to a specific value custom_min_rows = 30 @@ -1433,7 +1422,7 @@ def test_max_cell_length_display(ctx): def test_display_config_repr_string(ctx): """Test that __repr__ respects display configuration.""" # Create a dataframe with more rows than we want to show - # df.__repr__ returns max 10 rows only, so we start test with 7 rows + # df.__repr__ returns max 10 rows, so we start test with 7 rows rows = 7 df = _create_numeric_test_df(ctx, rows) @@ -1469,7 +1458,7 @@ def test_display_config_repr_string(ctx): assert lines_count2 >= min_table_rows_in_display -def _count_lines_in_str(repr_str): +def _count_lines_in_str(repr_str: str) -> int: """Count the number of rows displayed in a string representation. Args: @@ -1488,7 +1477,7 @@ def _count_lines_in_str(repr_str): return value_lines -def _create_numeric_test_df(ctx, rows): +def _create_numeric_test_df(ctx, rows) -> DataFrame: """Create a test dataframe with numeric values from 0 to rows-1. Args: From ee1de817075e306045a7e688a527808e6e4566cc Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 15:59:03 +0800 Subject: [PATCH 10/51] feat: Add max_table_rows_in_repr to control row display in DataFrame - Updated DataFrame class to include max_table_rows_in_repr parameter for display configuration. - Enhanced configure_display method to accept max_table_rows_in_repr. - Modified DisplayConfig struct to include max_table_rows_in_repr with a default value of 10. - Added tests to verify the functionality of max_table_rows_in_repr in both configuration and display output. --- python/datafusion/dataframe.py | 14 ++++++++-- python/tests/test_dataframe.py | 49 +++++++++++++++++++++++++++++++--- src/dataframe.rs | 23 +++++++++++++--- 3 files changed, 78 insertions(+), 8 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index a0688819b..b01bafd1f 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -818,6 +818,7 @@ def configure_display( max_table_bytes: Optional[int] = None, min_table_rows: Optional[int] = None, max_cell_length: Optional[int] = None, + max_table_rows_in_repr: Optional[int] = None, ) -> None: """Configure display options for DataFrame representation. @@ -828,17 +829,26 @@ def configure_display( This is used for initial display and in notebooks. max_cell_length: Maximum length of a cell before it gets minimized (default: 25). Longer cells will be truncated with an expand button. + max_table_rows_in_repr: Maximum number of rows to display in string representation + (default: 10). Raises: ValueError: If any of the provided values are less than or equal to 0. 
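        Example (illustrative; assumes ``df`` is an existing DataFrame):
            df.configure_display(max_table_rows_in_repr=15)
            print(df)  # repr now shows up to 15 rows before truncating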
""" if any( value is not None and value <= 0 - for value in (max_table_bytes, min_table_rows, max_cell_length) + for value in ( + max_table_bytes, + min_table_rows, + max_cell_length, + max_table_rows_in_repr, + ) ): raise ValueError("All values must be greater than 0.") - self.df.configure_display(max_table_bytes, min_table_rows, max_cell_length) + self.df.configure_display( + max_table_bytes, min_table_rows, max_cell_length, max_table_rows_in_repr + ) def reset_display_config(self) -> None: """Reset display configuration to default values.""" diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 04385d88c..18f0e07cd 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1271,13 +1271,17 @@ def test_display_config(df): assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB assert config.min_table_rows == 20 assert config.max_cell_length == 25 + assert config.max_table_rows_in_repr == 10 # Verify the new property def test_configure_display(df): """Test setting display configuration properties.""" # Modify the display configuration df.configure_display( - max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50 # 1 MB + max_table_bytes=1024 * 1024, + min_table_rows=10, + max_cell_length=50, + max_table_rows_in_repr=15, # Add test for the new property ) # Verify the changes took effect @@ -1285,13 +1289,15 @@ def test_configure_display(df): assert config.max_table_bytes == 1024 * 1024 # 1 MB assert config.min_table_rows == 10 assert config.max_cell_length == 50 + assert config.max_table_rows_in_repr == 15 # Test partial update (only changing one property) - df.configure_display(min_table_rows=5) + df.configure_display(max_table_rows_in_repr=5) config = df.display_config assert config.max_table_bytes == 1024 * 1024 # previous value retained - assert config.min_table_rows == 5 # only this value changed + assert config.min_table_rows == 10 # previous value retained assert config.max_cell_length == 50 # previous value retained + assert config.max_table_rows_in_repr == 5 # only this value changed # Test with extreme values (still valid, but potentially problematic) # Zero values @@ -1490,3 +1496,40 @@ def _create_numeric_test_df(ctx, rows) -> DataFrame: data = list(range(rows)) batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) return ctx.create_dataframe([[batch]]) + + +def test_max_table_rows_in_repr(ctx): + """Test that max_table_rows_in_repr controls the number of rows in string representation.""" + # Create a dataframe with more rows than the default max_table_rows_in_repr (10) + rows = 20 + df = _create_numeric_test_df(ctx, rows) + + # First test with default setting (should limit to 10 rows) + repr_str = df.__repr__() + lines_default = _count_lines_in_str(repr_str) + + # Default should be 10 rows max + assert lines_default <= 10 + assert "Data truncated" in repr_str + + # Now set a custom max_table_rows_in_repr value + custom_max_rows = 15 + df.configure_display(max_table_rows_in_repr=custom_max_rows) + + # Get the string representation with new configuration + repr_str_more = df.__repr__() + lines_custom = _count_lines_in_str(repr_str_more) + + # Should show more rows than default but not more than configured max + assert lines_custom > lines_default + assert lines_custom <= custom_max_rows + assert "Data truncated" in repr_str_more + + # Now set max_rows higher than total rows - should show all rows + df.configure_display(max_table_rows_in_repr=25) + repr_str_all = df.__repr__() + lines_all = 
_count_lines_in_str(repr_str_all) + + # Should show all rows (20) + assert lines_all == rows + assert "Data truncated" not in repr_str_all diff --git a/src/dataframe.rs b/src/dataframe.rs index 33eecb5bf..db93d65bf 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -86,22 +86,28 @@ pub struct DisplayConfig { /// Maximum length of a cell before it gets minimized (default: 25) #[pyo3(get, set)] pub max_cell_length: usize, + /// Maximum number of rows to display in repr string output (default: 10) + #[pyo3(get, set)] + pub max_table_rows_in_repr: usize, } #[pymethods] impl DisplayConfig { #[new] - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None))] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] fn new( max_table_bytes: Option, min_table_rows: Option, max_cell_length: Option, + max_table_rows_in_repr: Option, ) -> Self { let default = DisplayConfig::default(); Self { max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), + max_table_rows_in_repr: max_table_rows_in_repr + .unwrap_or(default.max_table_rows_in_repr), } } } @@ -112,6 +118,7 @@ impl Default for DisplayConfig { max_table_bytes: 2 * 1024 * 1024, // 2 MB min_table_rows: 20, max_cell_length: 25, + max_table_rows_in_repr: 10, } } } @@ -165,7 +172,7 @@ impl PyDataFrame { collect_record_batches_to_display( self.df.as_ref().clone(), self.config.min_table_rows, - 10, + self.config.max_table_rows_in_repr, &self.config, ), )?; @@ -858,12 +865,18 @@ impl PyDataFrame { } /// Update display configuration - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None))] + #[pyo3(signature = ( + max_table_bytes=None, + min_table_rows=None, + max_cell_length=None, + max_table_rows_in_repr=None + ))] fn configure_display( &mut self, max_table_bytes: Option, min_table_rows: Option, max_cell_length: Option, + max_table_rows_in_repr: Option, ) { let mut new_config = (*self.config).clone(); @@ -879,6 +892,10 @@ impl PyDataFrame { new_config.max_cell_length = length; } + if let Some(rows) = max_table_rows_in_repr { + new_config.max_table_rows_in_repr = rows; + } + self.config = Arc::new(new_config); } From 929563a8aa05037b5d5fd5f817d995e6d70bbe9d Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 16:25:04 +0800 Subject: [PATCH 11/51] tidy up comments, tests --- python/tests/test_dataframe.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 18f0e07cd..2d5623034 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1271,7 +1271,7 @@ def test_display_config(df): assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB assert config.min_table_rows == 20 assert config.max_cell_length == 25 - assert config.max_table_rows_in_repr == 10 # Verify the new property + assert config.max_table_rows_in_repr == 10 def test_configure_display(df): @@ -1281,7 +1281,7 @@ def test_configure_display(df): max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50, - max_table_rows_in_repr=15, # Add test for the new property + max_table_rows_in_repr=15, ) # Verify the changes took effect @@ -1299,7 +1299,7 @@ def test_configure_display(df): assert config.max_cell_length == 50 # previous value retained assert config.max_table_rows_in_repr 
== 5 # only this value changed - # Test with extreme values (still valid, but potentially problematic) + # Test with extreme values # Zero values with pytest.raises(ValueError, match=r".*must be greater than 0.*"): df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0) @@ -1324,12 +1324,18 @@ def test_reset_display_config(df): """Test resetting display configuration to defaults.""" # First modify the configuration df.configure_display( - max_table_bytes=1024 * 1024, min_table_rows=10, max_cell_length=50 + max_table_bytes=1024 * 1024, + min_table_rows=10, + max_cell_length=50, + max_table_rows_in_repr=15, ) # Verify changes took effect config = df.display_config assert config.max_table_bytes == 1024 * 1024 + assert config.min_table_rows == 10 + assert config.max_cell_length == 50 + assert config.max_table_rows_in_repr == 15 # Now reset to defaults df.reset_display_config() @@ -1339,6 +1345,7 @@ def test_reset_display_config(df): assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB assert config.min_table_rows == 20 assert config.max_cell_length == 25 + assert config.max_table_rows_in_repr == 10 def test_min_table_rows_display(ctx): @@ -1428,11 +1435,11 @@ def test_max_cell_length_display(ctx): def test_display_config_repr_string(ctx): """Test that __repr__ respects display configuration.""" # Create a dataframe with more rows than we want to show - # df.__repr__ returns max 10 rows, so we start test with 7 rows + # df.__repr__ returns max 10 rows by default, so we start test with 7 rows rows = 7 df = _create_numeric_test_df(ctx, rows) - # Configure to show only 5 rows in string representation + # Configure to show at least 5 rows in string representation min_table_rows_in_display = 5 df.configure_display(min_table_rows=min_table_rows_in_display) @@ -1442,8 +1449,6 @@ def test_display_config_repr_string(ctx): # Count the number of rows using helper function lines_count = _count_lines_in_str(repr_str) - # Should be fewer rows than the total - assert lines_count <= rows assert lines_count >= min_table_rows_in_display # Now set min_rows higher and see if more rows appear From cae89b026b62df25517b203a693dcaaee5b798fb Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 16:59:31 +0800 Subject: [PATCH 12/51] Fix ruff errors --- python/datafusion/dataframe.py | 23 ++++++++++++++++------- python/tests/test_dataframe.py | 14 +++++++++----- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index b01bafd1f..3b2382502 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -49,6 +49,7 @@ import pyarrow as pa from datafusion._internal import DataFrame as DataFrameInternal + from datafusion._internal import DisplayConfig from datafusion._internal import expr as expr_internal from enum import Enum @@ -823,14 +824,17 @@ def configure_display( """Configure display options for DataFrame representation. Args: - max_table_bytes: Maximum bytes to display for table presentation (default: 2MB). + max_table_bytes: Maximum bytes to display for table presentation + (default: 2MB). Set to lower value for large tables to limit memory usage. min_table_rows: Minimum number of table rows to display (default: 20). This is used for initial display and in notebooks. - max_cell_length: Maximum length of a cell before it gets minimized (default: 25). + max_cell_length: Maximum length of a cell before it gets minimized + (default: 25). 
Longer cells will be truncated with an expand button. - max_table_rows_in_repr: Maximum number of rows to display in string representation - (default: 10). + max_table_rows_in_repr: Maximum number of rows to display in string + representation + (default: 10). Raises: ValueError: If any of the provided values are less than or equal to 0. @@ -844,7 +848,8 @@ def configure_display( max_table_rows_in_repr, ) ): - raise ValueError("All values must be greater than 0.") + error_msg = "All values must be greater than 0." + raise ValueError(error_msg) self.df.configure_display( max_table_bytes, min_table_rows, max_cell_length, max_table_rows_in_repr @@ -855,8 +860,12 @@ def reset_display_config(self) -> None: self.df.reset_display_config() @property - def display_config(self): - """Get the current display configuration.""" + def display_config(self) -> DisplayConfig: + """Get the current display configuration. + + Returns: + DisplayConfig: The current display configuration settings + """ return self.df.display_config @deprecated("Use :py:func:`unnest_columns` instead.") diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 2d5623034..7002d26cc 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1306,7 +1306,8 @@ def test_configure_display(df): # Test with negative values # This tests for expected behavior when users accidentally pass negative values - # Since these are usize in Rust, we expect a Python ValueError when trying to pass negative values + # Since these are usize in Rust, we expect a Python ValueError when trying to pass + # negative values. with pytest.raises(ValueError, match=r".*must be greater than 0.*"): df.configure_display(max_table_bytes=-1) @@ -1393,9 +1394,10 @@ def test_max_table_bytes_display(ctx): limited_row_count = limited_html.count("") - 1 # Verify fewer rows are displayed with the byte limit - assert ( - limited_row_count < default_row_count - ), f"Expected fewer rows with byte limit. Default: {default_row_count}, Limited: {limited_row_count}" + assert limited_row_count < default_row_count, ( + f"Expected fewer rows with byte limit. " + f"Default: {default_row_count}, Limited: {limited_row_count}" + ) # "Data truncated" should be present when limited assert "Data truncated" in limited_html @@ -1504,7 +1506,9 @@ def _create_numeric_test_df(ctx, rows) -> DataFrame: def test_max_table_rows_in_repr(ctx): - """Test that max_table_rows_in_repr controls the number of rows in string representation.""" + """Test that max_table_rows_in_repr controls the number of rows in string + representation. 
+ """ # Create a dataframe with more rows than the default max_table_rows_in_repr (10) rows = 20 df = _create_numeric_test_df(ctx, rows) From 1bfa8b14d3c978ff688557dccc1e63adb0f0fb50 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 17:19:54 +0800 Subject: [PATCH 13/51] Trigger CI From f34a331949630c0501fc233d6b2c33acba102dfe Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 17:22:34 +0800 Subject: [PATCH 14/51] Fix ruff errors --- python/tests/test_dataframe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 7002d26cc..51cdc173d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1367,9 +1367,9 @@ def test_min_table_rows_display(ctx): row_count = html_output.count("") - 1 # subtract 1 for the header row # Verify at least min_table_rows rows are displayed - assert ( - row_count >= custom_min_rows - ), f"Expected at least {custom_min_rows} rows, got {row_count}" + assert row_count >= custom_min_rows, ( + f"Expected at least {custom_min_rows} rows, got {row_count}" + ) # If data was truncated, "Data truncated" message should be present if row_count < rows: From cb151e35368f5f1e83bd18757ac3c034cb8c9dab Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 28 Mar 2025 17:57:46 +0800 Subject: [PATCH 15/51] fix: Simplify error handling in display_config method --- src/dataframe.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index db93d65bf..cda4dd690 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -860,7 +860,7 @@ impl PyDataFrame { fn display_config(&self) -> PyResult> { Python::with_gil(|py| { let config = (*self.config).clone(); - Py::new(py, config).map_err(PyErr::from) + Py::new(py, config) }) } From 0d5e900d7f5863683ad65fa30af49e3f6a1409b6 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 31 Mar 2025 11:02:25 +0800 Subject: [PATCH 16/51] refactor: Update display configuration handling in DataFrame - Enhanced the DataFrame class to set display configuration at the session context level, ensuring that changes to one DataFrame's display settings affect all DataFrames created from the same context. - Modified the PyDataFrame struct to accept a display configuration during initialization and updated methods to reference the new display_config field instead of the previous config field. - Added tests to verify that display configurations are shared across DataFrames in the same context and remain independent across different contexts. --- python/datafusion/dataframe.py | 4 +++ python/tests/test_dataframe.py | 66 ++++++++++++++++++++++++++++++++-- src/dataframe.rs | 15 ++++---- 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 3b2382502..ed58beb9d 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -823,6 +823,10 @@ def configure_display( ) -> None: """Configure display options for DataFrame representation. + Note: The display configuration is now set at the session context level, + so changes to one DataFrame's display configuration will affect all + DataFrames created from the same context. + Args: max_table_bytes: Maximum bytes to display for table presentation (default: 2MB). 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 51cdc173d..10b772a88 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1367,9 +1367,9 @@ def test_min_table_rows_display(ctx): row_count = html_output.count("") - 1 # subtract 1 for the header row # Verify at least min_table_rows rows are displayed - assert row_count >= custom_min_rows, ( - f"Expected at least {custom_min_rows} rows, got {row_count}" - ) + assert ( + row_count >= custom_min_rows + ), f"Expected at least {custom_min_rows} rows, got {row_count}" # If data was truncated, "Data truncated" message should be present if row_count < rows: @@ -1542,3 +1542,63 @@ def test_max_table_rows_in_repr(ctx): # Should show all rows (20) assert lines_all == rows assert "Data truncated" not in repr_str_all + + +def test_session_context_display_config(ctx): + """Test that display configuration is shared at session context level.""" + # Create two dataframes from the same context + batch1 = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df1 = ctx.create_dataframe([[batch1]]) + + batch2 = pa.RecordBatch.from_arrays( + [pa.array([7, 8, 9]), pa.array([10, 11, 12])], + names=["c", "d"], + ) + df2 = ctx.create_dataframe([[batch2]]) + + # Set display config on first dataframe + custom_max_rows = 25 + df1.configure_display(max_table_rows_in_repr=custom_max_rows) + + # Check that both dataframes have the same config + assert df1.display_config.max_table_rows_in_repr == custom_max_rows + assert df2.display_config.max_table_rows_in_repr == custom_max_rows + + # Change config on second dataframe + df2.configure_display(max_cell_length=40) + + # Both dataframes should reflect the change + assert df1.display_config.max_cell_length == 40 + assert df2.display_config.max_cell_length == 40 + + +def test_session_context_display_config_independence(ctx): + """Test that display configurations in different contexts are independent.""" + # Create two contexts with different configurations + ctx1 = SessionContext() + ctx2 = SessionContext() + + # Create dataframes from each context + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df1 = ctx1.create_dataframe([[batch]]) + df2 = ctx2.create_dataframe([[batch]]) + + # Set different display configurations + df1.configure_display(max_table_rows_in_repr=15) + df2.configure_display(max_table_rows_in_repr=30) + + # Verify configurations are independent + assert df1.display_config.max_table_rows_in_repr == 15 + assert df2.display_config.max_table_rows_in_repr == 30 + + # Create another dataframe from first context + df3 = ctx1.create_dataframe([[batch]]) + + # It should have the same config as the first dataframe + assert df3.display_config.max_table_rows_in_repr == 15 diff --git a/src/dataframe.rs b/src/dataframe.rs index cda4dd690..798c9b844 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -135,10 +135,10 @@ pub struct PyDataFrame { impl PyDataFrame { /// creates a new PyDataFrame - pub fn new(df: DataFrame) -> Self { + pub fn new(df: DataFrame, display_config: Arc) -> Self { Self { df: Arc::new(df), - config: Arc::new(DisplayConfig::default()), + config: display_config, } } } @@ -858,10 +858,7 @@ impl PyDataFrame { /// Get the current display configuration #[getter] fn display_config(&self) -> PyResult> { - Python::with_gil(|py| { - let config = (*self.config).clone(); - Py::new(py, config) - }) + Python::with_gil(|py| Py::new(py, 
(*self.display_config).clone())) } /// Update display configuration @@ -878,7 +875,7 @@ impl PyDataFrame { max_cell_length: Option, max_table_rows_in_repr: Option, ) { - let mut new_config = (*self.config).clone(); + let mut new_config = (*self.display_config).clone(); if let Some(bytes) = max_table_bytes { new_config.max_table_bytes = bytes; @@ -896,13 +893,13 @@ impl PyDataFrame { new_config.max_table_rows_in_repr = rows; } - self.config = Arc::new(new_config); + self.display_config = Arc::new(new_config); } /// Reset display configuration to default values #[pyo3(text_signature = "($self)")] fn reset_display_config(&mut self) { - self.config = Arc::new(DisplayConfig::default()); + self.display_config = Arc::new(DisplayConfig::default()); } } From ba5acc43fc2d82d09397fb60168c95601a12c388 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 31 Mar 2025 11:09:38 +0800 Subject: [PATCH 17/51] Revert "refactor: Update display configuration handling in DataFrame" This reverts commit 0d5e900d7f5863683ad65fa30af49e3f6a1409b6. --- python/datafusion/dataframe.py | 4 --- python/tests/test_dataframe.py | 66 ++-------------------------------- src/dataframe.rs | 15 ++++---- 3 files changed, 12 insertions(+), 73 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index ed58beb9d..3b2382502 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -823,10 +823,6 @@ def configure_display( ) -> None: """Configure display options for DataFrame representation. - Note: The display configuration is now set at the session context level, - so changes to one DataFrame's display configuration will affect all - DataFrames created from the same context. - Args: max_table_bytes: Maximum bytes to display for table presentation (default: 2MB). 
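The semantic point of this revert shows up in the tests deleted below: with context-level configuration, configuring one DataFrame leaked into every DataFrame created from the same SessionContext, whereas with per-DataFrame configuration siblings stay independent. A sketch of the difference, reusing the test setup (df1 and df2 created from the same ctx; 25 is the default max_cell_length):

    df1 = ctx.create_dataframe([[batch1]])
    df2 = ctx.create_dataframe([[batch2]])

    df1.configure_display(max_cell_length=40)

    # Context-level model (introduced above, reverted here):
    #   df2.display_config.max_cell_length == 40
    # Per-DataFrame model (restored by this revert):
    #   df2.display_config.max_cell_length == 25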
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 10b772a88..51cdc173d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1367,9 +1367,9 @@ def test_min_table_rows_display(ctx): row_count = html_output.count("") - 1 # subtract 1 for the header row # Verify at least min_table_rows rows are displayed - assert ( - row_count >= custom_min_rows - ), f"Expected at least {custom_min_rows} rows, got {row_count}" + assert row_count >= custom_min_rows, ( + f"Expected at least {custom_min_rows} rows, got {row_count}" + ) # If data was truncated, "Data truncated" message should be present if row_count < rows: @@ -1542,63 +1542,3 @@ def test_max_table_rows_in_repr(ctx): # Should show all rows (20) assert lines_all == rows assert "Data truncated" not in repr_str_all - - -def test_session_context_display_config(ctx): - """Test that display configuration is shared at session context level.""" - # Create two dataframes from the same context - batch1 = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df1 = ctx.create_dataframe([[batch1]]) - - batch2 = pa.RecordBatch.from_arrays( - [pa.array([7, 8, 9]), pa.array([10, 11, 12])], - names=["c", "d"], - ) - df2 = ctx.create_dataframe([[batch2]]) - - # Set display config on first dataframe - custom_max_rows = 25 - df1.configure_display(max_table_rows_in_repr=custom_max_rows) - - # Check that both dataframes have the same config - assert df1.display_config.max_table_rows_in_repr == custom_max_rows - assert df2.display_config.max_table_rows_in_repr == custom_max_rows - - # Change config on second dataframe - df2.configure_display(max_cell_length=40) - - # Both dataframes should reflect the change - assert df1.display_config.max_cell_length == 40 - assert df2.display_config.max_cell_length == 40 - - -def test_session_context_display_config_independence(ctx): - """Test that display configurations in different contexts are independent.""" - # Create two contexts with different configurations - ctx1 = SessionContext() - ctx2 = SessionContext() - - # Create dataframes from each context - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df1 = ctx1.create_dataframe([[batch]]) - df2 = ctx2.create_dataframe([[batch]]) - - # Set different display configurations - df1.configure_display(max_table_rows_in_repr=15) - df2.configure_display(max_table_rows_in_repr=30) - - # Verify configurations are independent - assert df1.display_config.max_table_rows_in_repr == 15 - assert df2.display_config.max_table_rows_in_repr == 30 - - # Create another dataframe from first context - df3 = ctx1.create_dataframe([[batch]]) - - # It should have the same config as the first dataframe - assert df3.display_config.max_table_rows_in_repr == 15 diff --git a/src/dataframe.rs b/src/dataframe.rs index 798c9b844..cda4dd690 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -135,10 +135,10 @@ pub struct PyDataFrame { impl PyDataFrame { /// creates a new PyDataFrame - pub fn new(df: DataFrame, display_config: Arc) -> Self { + pub fn new(df: DataFrame) -> Self { Self { df: Arc::new(df), - config: display_config, + config: Arc::new(DisplayConfig::default()), } } } @@ -858,7 +858,10 @@ impl PyDataFrame { /// Get the current display configuration #[getter] fn display_config(&self) -> PyResult> { - Python::with_gil(|py| Py::new(py, (*self.display_config).clone())) + Python::with_gil(|py| { + let config = (*self.config).clone(); + 
Py::new(py, config) + }) } /// Update display configuration @@ -875,7 +878,7 @@ impl PyDataFrame { max_cell_length: Option, max_table_rows_in_repr: Option, ) { - let mut new_config = (*self.display_config).clone(); + let mut new_config = (*self.config).clone(); if let Some(bytes) = max_table_bytes { new_config.max_table_bytes = bytes; @@ -893,13 +896,13 @@ impl PyDataFrame { new_config.max_table_rows_in_repr = rows; } - self.display_config = Arc::new(new_config); + self.config = Arc::new(new_config); } /// Reset display configuration to default values #[pyo3(text_signature = "($self)")] fn reset_display_config(&mut self) { - self.display_config = Arc::new(DisplayConfig::default()); + self.config = Arc::new(DisplayConfig::default()); } } From 0e30af3409a82a4924fd450e63c613b738fec0c9 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 31 Mar 2025 12:07:28 +0800 Subject: [PATCH 18/51] Refactor PyDataFrame: Simplify methods and improve performance - Removed unnecessary cloning of DataFrame in various methods to enhance performance. - Consolidated display configuration handling by removing the DisplayConfig struct and related methods. - Updated methods to use direct references to DataFrame where applicable. - Improved the implementation of select, filter, with_column, and other methods to work with mutable references. - Added a new to_string method for better string representation of DataFrame. - Cleaned up unused imports and commented-out code for better readability. --- src/context.rs | 279 ++++-------------- src/dataframe.rs | 738 +++++++++-------------------------------------- 2 files changed, 187 insertions(+), 830 deletions(-) diff --git a/src/context.rs b/src/context.rs index 0db0f4d7e..6d5e078d3 100644 --- a/src/context.rs +++ b/src/context.rs @@ -72,24 +72,59 @@ use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; use tokio::task::JoinHandle; +/// Display configuration for DataFrames +#[pyclass(name = "DisplayConfig", module = "datafusion", subclass)] +#[derive(Clone, Debug)] +pub struct DisplayConfig { + #[pyo3(get, set)] + pub max_width: usize, + #[pyo3(get, set)] + pub max_rows: Option, + #[pyo3(get, set)] + pub show_nulls: bool, +} + +#[pymethods] +impl DisplayConfig { + #[new] + pub fn new( + max_width: Option, + max_rows: Option, + show_nulls: Option, + ) -> Self { + Self { + max_width: max_width.unwrap_or(80), + max_rows, + show_nulls: show_nulls.unwrap_or(false), + } + } +} + /// Configuration options for a SessionContext #[pyclass(name = "SessionConfig", module = "datafusion", subclass)] #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, + pub display_config: DisplayConfig, } impl From for PySessionConfig { fn from(config: SessionConfig) -> Self { - Self { config } + Self { + config, + display_config: DisplayConfig::new(Some(80), None, Some(false)), + } } } #[pymethods] impl PySessionConfig { - #[pyo3(signature = (config_options=None))] + #[pyo3(signature = (config_options=None, display_config=None))] #[new] - fn new(config_options: Option>) -> Self { + fn new( + config_options: Option>, + display_config: Option, + ) -> Self { let mut config = SessionConfig::new(); if let Some(hash_map) = config_options { for (k, v) in &hash_map { @@ -97,7 +132,23 @@ impl PySessionConfig { } } - Self { config } + Self { + config, + display_config: display_config + .unwrap_or_else(|| DisplayConfig::new(Some(80), None, Some(false))), + } + } + + // Get the display 
configuration + pub fn get_display_config(&self) -> DisplayConfig { + self.display_config.clone() + } + + // Set the display configuration + pub fn with_display_config(&self, display_config: DisplayConfig) -> Self { + let mut new_config = self.clone(); + new_config.display_config = display_config; + new_config } fn with_create_default_catalog_and_schema(&self, enabled: bool) -> Self { @@ -675,226 +726,6 @@ impl PySessionContext { ))); } - let mut options = CsvReadOptions::new() - .has_header(has_header) - .delimiter(delimiter[0]) - .schema_infer_max_records(schema_infer_max_records) - .file_extension(file_extension) - .file_compression_type(parse_file_compression_type(file_compression_type)?); - options.schema = schema.as_ref().map(|x| &x.0); - - if path.is_instance_of::() { - let paths = path.extract::>()?; - let result = self.register_csv_from_multiple_paths(name, paths, options); - wait_for_future(py, result)?; - } else { - let path = path.extract::()?; - let result = self.ctx.register_csv(name, &path, options); - wait_for_future(py, result)?; - } - - Ok(()) - } - - #[allow(clippy::too_many_arguments)] - #[pyo3(signature = (name, - path, - schema=None, - schema_infer_max_records=1000, - file_extension=".json", - table_partition_cols=vec![], - file_compression_type=None))] - pub fn register_json( - &mut self, - name: &str, - path: PathBuf, - schema: Option>, - schema_infer_max_records: usize, - file_extension: &str, - table_partition_cols: Vec<(String, String)>, - file_compression_type: Option, - py: Python, - ) -> PyDataFusionResult<()> { - let path = path - .to_str() - .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; - - let mut options = NdJsonReadOptions::default() - .file_compression_type(parse_file_compression_type(file_compression_type)?) 
- .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); - options.schema_infer_max_records = schema_infer_max_records; - options.file_extension = file_extension; - options.schema = schema.as_ref().map(|x| &x.0); - - let result = self.ctx.register_json(name, path, options); - wait_for_future(py, result)?; - - Ok(()) - } - - #[allow(clippy::too_many_arguments)] - #[pyo3(signature = (name, - path, - schema=None, - file_extension=".avro", - table_partition_cols=vec![]))] - pub fn register_avro( - &mut self, - name: &str, - path: PathBuf, - schema: Option>, - file_extension: &str, - table_partition_cols: Vec<(String, String)>, - py: Python, - ) -> PyDataFusionResult<()> { - let path = path - .to_str() - .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; - - let mut options = AvroReadOptions::default() - .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); - options.file_extension = file_extension; - options.schema = schema.as_ref().map(|x| &x.0); - - let result = self.ctx.register_avro(name, path, options); - wait_for_future(py, result)?; - - Ok(()) - } - - // Registers a PyArrow.Dataset - pub fn register_dataset( - &self, - name: &str, - dataset: &Bound<'_, PyAny>, - py: Python, - ) -> PyDataFusionResult<()> { - let table: Arc = Arc::new(Dataset::new(dataset, py)?); - - self.ctx.register_table(name, table)?; - - Ok(()) - } - - pub fn register_udf(&mut self, udf: PyScalarUDF) -> PyResult<()> { - self.ctx.register_udf(udf.function); - Ok(()) - } - - pub fn register_udaf(&mut self, udaf: PyAggregateUDF) -> PyResult<()> { - self.ctx.register_udaf(udaf.function); - Ok(()) - } - - pub fn register_udwf(&mut self, udwf: PyWindowUDF) -> PyResult<()> { - self.ctx.register_udwf(udwf.function); - Ok(()) - } - - #[pyo3(signature = (name="datafusion"))] - pub fn catalog(&self, name: &str) -> PyResult { - match self.ctx.catalog(name) { - Some(catalog) => Ok(PyCatalog::new(catalog)), - None => Err(PyKeyError::new_err(format!( - "Catalog with name {} doesn't exist.", - &name, - ))), - } - } - - pub fn tables(&self) -> HashSet { - self.ctx - .catalog_names() - .into_iter() - .filter_map(|name| self.ctx.catalog(&name)) - .flat_map(move |catalog| { - catalog - .schema_names() - .into_iter() - .filter_map(move |name| catalog.schema(&name)) - }) - .flat_map(|schema| schema.table_names()) - .collect() - } - - pub fn table(&self, name: &str, py: Python) -> PyResult { - let x = wait_for_future(py, self.ctx.table(name)) - .map_err(|e| PyKeyError::new_err(e.to_string()))?; - Ok(PyDataFrame::new(x)) - } - - pub fn table_exist(&self, name: &str) -> PyDataFusionResult { - Ok(self.ctx.table_exist(name)?) - } - - pub fn empty_table(&self) -> PyDataFusionResult { - Ok(PyDataFrame::new(self.ctx.read_empty()?)) - } - - pub fn session_id(&self) -> String { - self.ctx.session_id() - } - - #[allow(clippy::too_many_arguments)] - #[pyo3(signature = (path, schema=None, schema_infer_max_records=1000, file_extension=".json", table_partition_cols=vec![], file_compression_type=None))] - pub fn read_json( - &mut self, - path: PathBuf, - schema: Option>, - schema_infer_max_records: usize, - file_extension: &str, - table_partition_cols: Vec<(String, String)>, - file_compression_type: Option, - py: Python, - ) -> PyDataFusionResult { - let path = path - .to_str() - .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; - let mut options = NdJsonReadOptions::default() - .table_partition_cols(convert_table_partition_cols(table_partition_cols)?) 
- .file_compression_type(parse_file_compression_type(file_compression_type)?); - options.schema_infer_max_records = schema_infer_max_records; - options.file_extension = file_extension; - let df = if let Some(schema) = schema { - options.schema = Some(&schema.0); - let result = self.ctx.read_json(path, options); - wait_for_future(py, result)? - } else { - let result = self.ctx.read_json(path, options); - wait_for_future(py, result)? - }; - Ok(PyDataFrame::new(df)) - } - - #[allow(clippy::too_many_arguments)] - #[pyo3(signature = ( - path, - schema=None, - has_header=true, - delimiter=",", - schema_infer_max_records=1000, - file_extension=".csv", - table_partition_cols=vec![], - file_compression_type=None))] - pub fn read_csv( - &self, - path: &Bound<'_, PyAny>, - schema: Option>, - has_header: bool, - delimiter: &str, - schema_infer_max_records: usize, - file_extension: &str, - table_partition_cols: Vec<(String, String)>, - file_compression_type: Option, - py: Python, - ) -> PyDataFusionResult { - let delimiter = delimiter.as_bytes(); - if delimiter.len() != 1 { - return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( - "Delimiter must be a single character", - ))); - }; - let mut options = CsvReadOptions::new() .has_header(has_header) .delimiter(delimiter[0]) diff --git a/src/dataframe.rs b/src/dataframe.rs index cda4dd690..50227c3a6 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -15,412 +15,106 @@ // specific language governing permissions and limitations // under the License. -use std::ffi::CString; +use std::collections::HashMap; use std::sync::Arc; -use arrow::array::{new_null_array, RecordBatch, RecordBatchIterator, RecordBatchReader}; -use arrow::compute::can_cast_types; -use arrow::error::ArrowError; -use arrow::ffi::FFI_ArrowSchema; -use arrow::ffi_stream::FFI_ArrowArrayStream; -use arrow::util::display::{ArrayFormatter, FormatOptions}; -use datafusion::arrow::datatypes::Schema; -use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; -use datafusion::arrow::util::pretty; -use datafusion::common::UnnestOptions; -use datafusion::config::{CsvOptions, TableParquetOptions}; -use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; -use datafusion::datasource::TableProvider; -use datafusion::error::DataFusionError; -use datafusion::execution::SendableRecordBatchStream; -use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; -use datafusion::prelude::*; -use futures::{StreamExt, TryStreamExt}; -use pyo3::exceptions::PyValueError; +use datafusion::arrow::csv::WriterBuilder; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::arrow::pyarrow::FromPyArrow; +use datafusion::arrow::pyarrow::PyArrowType; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::TableReference; +use datafusion::prelude::DataFrame; + +use pyo3::exceptions::PyTypeError; use pyo3::prelude::*; -use pyo3::pybacked::PyBackedStr; -use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; -use tokio::task::JoinHandle; +use pyo3::types::{PyList, PyString, PyTuple}; -use crate::catalog::PyTable; -use crate::errors::{py_datafusion_err, PyDataFusionError}; -use crate::expr::sort_expr::to_sort_expressions; +use crate::errors::{py_datafusion_err, PyDataFusionError, PyDataFusionResult}; +use crate::expr::expr::PyExpr; +use crate::expr::window_expr::PyWindowExpr; use crate::physical_plan::PyExecutionPlan; -use crate::record_batch::PyRecordBatchStream; +use crate::record_batch::{PyRecordBatch, TableData}; use crate::sql::logical::PyLogicalPlan; 
-use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; -use crate::{ - errors::PyDataFusionResult, - expr::{sort_expr::PySortExpr, PyExpr}, -}; - -// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 -// - we have not decided on the table_provider approach yet -// this is an interim implementation -#[pyclass(name = "TableProvider", module = "datafusion")] -pub struct PyTableProvider { - provider: Arc, -} - -impl PyTableProvider { - pub fn new(provider: Arc) -> Self { - Self { provider } - } - - pub fn as_table(&self) -> PyTable { - let table_provider: Arc = self.provider.clone(); - PyTable::new(table_provider) - } -} - -/// Configuration for DataFrame display in Python environment -#[pyclass(name = "DisplayConfig", module = "datafusion")] -#[derive(Debug, Clone)] -pub struct DisplayConfig { - /// Maximum bytes to display for table presentation (default: 2MB) - #[pyo3(get, set)] - pub max_table_bytes: usize, - /// Minimum number of table rows to display (default: 20) - #[pyo3(get, set)] - pub min_table_rows: usize, - /// Maximum length of a cell before it gets minimized (default: 25) - #[pyo3(get, set)] - pub max_cell_length: usize, - /// Maximum number of rows to display in repr string output (default: 10) - #[pyo3(get, set)] - pub max_table_rows_in_repr: usize, -} - -#[pymethods] -impl DisplayConfig { - #[new] - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] - fn new( - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) -> Self { - let default = DisplayConfig::default(); - Self { - max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), - min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), - max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), - max_table_rows_in_repr: max_table_rows_in_repr - .unwrap_or(default.max_table_rows_in_repr), - } - } -} - -impl Default for DisplayConfig { - fn default() -> Self { - Self { - max_table_bytes: 2 * 1024 * 1024, // 2 MB - min_table_rows: 20, - max_cell_length: 25, - max_table_rows_in_repr: 10, - } - } -} +use crate::utils::{get_tokio_runtime, wait_for_future}; +use crate::Dataset; -/// A PyDataFrame is a representation of a logical plan and an API to compose statements. -/// Use it to build a plan and `.collect()` to execute the plan and collect the result. -/// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. +/// Represents a DataFrame in DataFusion. 
#[pyclass(name = "DataFrame", module = "datafusion", subclass)] #[derive(Clone)] pub struct PyDataFrame { df: Arc, - config: Arc, -} - -impl PyDataFrame { - /// creates a new PyDataFrame - pub fn new(df: DataFrame) -> Self { - Self { - df: Arc::new(df), - config: Arc::new(DisplayConfig::default()), - } - } } #[pymethods] impl PyDataFrame { - /// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]` - fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult { - if let Ok(key) = key.extract::() { - // df[col] - self.select_columns(vec![key]) - } else if let Ok(tuple) = key.downcast::() { - // df[col1, col2, col3] - let keys = tuple - .iter() - .map(|item| item.extract::()) - .collect::>>()?; - self.select_columns(keys) - } else if let Ok(keys) = key.extract::>() { - // df[[col1, col2, col3]] - self.select_columns(keys) - } else { - let message = "DataFrame can only be indexed by string index or indices".to_string(); - Err(PyDataFusionError::Common(message)) - } - } - - fn __repr__(&self, py: Python) -> PyDataFusionResult { - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - self.config.min_table_rows, - self.config.max_table_rows_in_repr, - &self.config, - ), - )?; - if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); - } - - let batches_as_displ = - pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?; - - let additional_str = match has_more { - true => "\nData truncated.", - false => "", - }; - - Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) - } - - fn _repr_html_(&self, py: Python) -> PyDataFusionResult { - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - self.config.min_table_rows, - usize::MAX, - &self.config, - ), - )?; - if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); - } - - let table_uuid = uuid::Uuid::new_v4().to_string(); - - let mut html_str = " - - -
- - \n".to_string(); - - let schema = batches[0].schema(); - - let mut header = Vec::new(); - for field in schema.fields() { - header.push(format!("", field.name())); - } - let header_str = header.join(""); - html_str.push_str(&format!("{}\n", header_str)); - - let batch_formatters = batches - .iter() - .map(|batch| { - batch - .columns() - .iter() - .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) - .map(|c| { - c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) - }) - .collect::, _>>() - }) - .collect::, _>>()?; - - let rows_per_batch = batches.iter().map(|batch| batch.num_rows()); - - // We need to build up row by row for html - let mut table_row = 0; - for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) { - for batch_row in 0..num_rows_in_batch { - table_row += 1; - let mut cells = Vec::new(); - for (col, formatter) in batch_formatter.iter().enumerate() { - let cell_data = formatter.value(batch_row).to_string(); - // From testing, primitive data types do not typically get larger than 21 characters - if cell_data.len() > self.config.max_cell_length { - let short_cell_data = &cell_data[0..self.config.max_cell_length]; - cells.push(format!(" - ")); - } else { - cells.push(format!("", formatter.value(batch_row))); - } - } - let row_str = cells.join(""); - html_str.push_str(&format!("{}\n", row_str)); - } - } - html_str.push_str("
{}
-
- {short_cell_data} - {cell_data} - -
-
{}
\n"); - - html_str.push_str(" - - "); - - if has_more { - html_str.push_str("Data truncated due to size."); - } - - Ok(html_str) - } - - /// Calculate summary statistics for a DataFrame - fn describe(&self, py: Python) -> PyDataFusionResult { - let df = self.df.as_ref().clone(); - let stat_df = wait_for_future(py, df.describe())?; - Ok(Self::new(stat_df)) - } - - /// Returns the schema from the logical plan - fn schema(&self) -> PyArrowType { - PyArrowType(self.df.schema().into()) - } - - /// Convert this DataFrame into a Table that can be used in register_table - /// By convention, into_... methods consume self and return the new object. - /// Disabling the clippy lint, so we can use &self - /// because we're working with Python bindings - /// where objects are shared - /// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 - /// - we have not decided on the table_provider approach yet - #[allow(clippy::wrong_self_convention)] - fn into_view(&self) -> PyDataFusionResult { - // Call the underlying Rust DataFrame::into_view method. - // Note that the Rust method consumes self; here we clone the inner Arc - // so that we don’t invalidate this PyDataFrame. - let table_provider = self.df.as_ref().clone().into_view(); - let table_provider = PyTableProvider::new(table_provider); - - Ok(table_provider.as_table()) - } - - #[pyo3(signature = (*args))] - fn select_columns(&self, args: Vec) -> PyDataFusionResult { - let args = args.iter().map(|s| s.as_ref()).collect::>(); - let df = self.df.as_ref().clone().select_columns(&args)?; + fn select(&mut self, expr: Vec) -> PyDataFusion { + let expr = expr.into_iter().map(|e| e.into()).collect::>(); + let df = self.df.select(expr).map_err(py_datafusion_err)?; Ok(Self::new(df)) } - #[pyo3(signature = (*args))] - fn select(&self, args: Vec) -> PyDataFusionResult { - let expr = args.into_iter().map(|e| e.into()).collect(); - let df = self.df.as_ref().clone().select(expr)?; - Ok(Self::new(df)) - } - - #[pyo3(signature = (*args))] - fn drop(&self, args: Vec) -> PyDataFusionResult { - let cols = args.iter().map(|s| s.as_ref()).collect::>(); - let df = self.df.as_ref().clone().drop_columns(&cols)?; - Ok(Self::new(df)) - } - - fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { - let df = self.df.as_ref().clone().filter(predicate.into())?; + fn filter(&mut self, predicate: PyExpr) -> PyDataFusionResult { + let df = self + .df + .filter(predicate.into()) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } - fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { - let df = self.df.as_ref().clone().with_column(name, expr.into())?; + fn with_column(&mut self, name: &str, expr: PyExpr) -> PyDataFusionResult { + let df = self + .df + .with_column(name, expr.into()) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } - fn with_columns(&self, exprs: Vec) -> PyDataFusionResult { - let mut df = self.df.as_ref().clone(); + fn with_columns(&mut self, exprs: Vec) -> PyDataFusionResult { + let mut df = self.df.clone(); for expr in exprs { let expr: Expr = expr.into(); let name = format!("{}", expr.schema_name()); - df = df.with_column(name.as_str(), expr)? + df = df + .with_column(name.as_str(), expr) + .map_err(py_datafusion_err)? } Ok(Self::new(df)) } /// Rename one column by applying a new projection. This is a no-op if the column to be /// renamed does not exist. 
- fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyDataFusionResult { + fn with_column_renamed(&mut self, old_name: &str, new_name: &str) -> PyDataFusionResult { let df = self .df - .as_ref() - .clone() - .with_column_renamed(old_name, new_name)?; + .with_column_renamed(old_name, new_name) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } - fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { + fn aggregate(&mut self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); - let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; + let df = self + .df + .aggregate(group_by, aggs) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } #[pyo3(signature = (*exprs))] - fn sort(&self, exprs: Vec) -> PyDataFusionResult { + fn sort(&mut self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); - let df = self.df.as_ref().clone().sort(exprs)?; + let df = self.df.sort(exprs).map_err(py_datafusion_err)?; Ok(Self::new(df)) } #[pyo3(signature = (count, offset=0))] - fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { - let df = self.df.as_ref().clone().limit(offset, Some(count))?; + fn limit(&mut self, count: usize, offset: usize) -> PyDataFusionResult { + let df = self + .df + .limit(offset, Some(count)) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } @@ -428,23 +122,23 @@ impl PyDataFrame { /// Unless some order is specified in the plan, there is no /// guarantee of the order of the result. fn collect(&self, py: Python) -> PyResult> { - let batches = wait_for_future(py, self.df.as_ref().clone().collect()) - .map_err(PyDataFusionError::from)?; + let batches = + wait_for_future(py, self.df.clone().collect()).map_err(PyDataFusionError::from)?; // cannot use PyResult> return type due to // https://github.com/PyO3/pyo3/issues/1813 batches.into_iter().map(|rb| rb.to_pyarrow(py)).collect() } /// Cache DataFrame. - fn cache(&self, py: Python) -> PyDataFusionResult { - let df = wait_for_future(py, self.df.as_ref().clone().cache())?; + fn cache(&mut self, py: Python) -> PyDataFusionResult { + let df = wait_for_future(py, self.df.clone().cache())?; Ok(Self::new(df)) } /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch /// maintaining the input partitioning. 
fn collect_partitioned(&self, py: Python) -> PyResult>> { - let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned()) + let batches = wait_for_future(py, self.df.clone().collect_partitioned()) .map_err(PyDataFusionError::from)?; batches @@ -456,18 +150,22 @@ impl PyDataFrame { /// Print the result, 20 lines by default #[pyo3(signature = (num=20))] fn show(&self, py: Python, num: usize) -> PyDataFusionResult<()> { - let df = self.df.as_ref().clone().limit(0, Some(num))?; + let df = self + .df + .clone() + .limit(0, Some(num)) + .map_err(py_datafusion_err)?; print_dataframe(py, df) } /// Filter out duplicate rows - fn distinct(&self) -> PyDataFusionResult { - let df = self.df.as_ref().clone().distinct()?; + fn distinct(&mut self) -> PyDataFusionResult { + let df = self.df.clone().distinct().map_err(py_datafusion_err)?; Ok(Self::new(df)) } fn join( - &self, + &mut self, right: PyDataFrame, how: &str, left_on: Vec, @@ -490,18 +188,14 @@ impl PyDataFrame { let left_keys = left_on.iter().map(|s| s.as_ref()).collect::>(); let right_keys = right_on.iter().map(|s| s.as_ref()).collect::>(); - let df = self.df.as_ref().clone().join( - right.df.as_ref().clone(), - join_type, - &left_keys, - &right_keys, - None, - )?; + let df = self + .df + .join(right.df.clone(), join_type, &left_keys, &right_keys, None)?; Ok(Self::new(df)) } fn join_on( - &self, + &mut self, right: PyDataFrame, on_exprs: Vec, how: &str, @@ -523,32 +217,34 @@ impl PyDataFrame { let df = self .df - .as_ref() - .clone() - .join_on(right.df.as_ref().clone(), join_type, exprs)?; + .join_on(right.df.clone(), join_type, exprs) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } /// Print the query plan #[pyo3(signature = (verbose=false, analyze=false))] fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyDataFusionResult<()> { - let df = self.df.as_ref().clone().explain(verbose, analyze)?; + let df = self + .df + .explain(verbose, analyze) + .map_err(py_datafusion_err)?; print_dataframe(py, df) } /// Get the logical plan for this `DataFrame` fn logical_plan(&self) -> PyResult { - Ok(self.df.as_ref().clone().logical_plan().clone().into()) + Ok(self.df.logical_plan().clone().into()) } /// Get the optimized logical plan for this `DataFrame` fn optimized_logical_plan(&self) -> PyDataFusionResult { - Ok(self.df.as_ref().clone().into_optimized_plan()?.into()) + Ok(self.df.clone().into_optimized_plan()?.into()) } /// Get the execution plan for this `DataFrame` fn execution_plan(&self, py: Python) -> PyDataFusionResult { - let plan = wait_for_future(py, self.df.as_ref().clone().create_physical_plan())?; + let plan = wait_for_future(py, self.df.clone().create_physical_plan())?; Ok(plan.into()) } @@ -556,9 +252,8 @@ impl PyDataFrame { fn repartition(&self, num: usize) -> PyDataFusionResult { let new_df = self .df - .as_ref() - .clone() - .repartition(Partitioning::RoundRobinBatch(num))?; + .repartition(Partitioning::RoundRobinBatch(num)) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } @@ -568,9 +263,8 @@ impl PyDataFrame { let expr = args.into_iter().map(|py_expr| py_expr.into()).collect(); let new_df = self .df - .as_ref() - .clone() - .repartition(Partitioning::Hash(expr, num))?; + .repartition(Partitioning::Hash(expr, num)) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } @@ -580,11 +274,13 @@ impl PyDataFrame { fn union(&self, py_df: PyDataFrame, distinct: bool) -> PyDataFusionResult { let new_df = if distinct { self.df - .as_ref() - .clone() - .union_distinct(py_df.df.as_ref().clone())? 
+ .union_distinct(py_df.df.clone()) + .map_err(py_datafusion_err)? } else { - self.df.as_ref().clone().union(py_df.df.as_ref().clone())? + self.df + .clone() + .union(py_df.df.clone()) + .map_err(py_datafusion_err)? }; Ok(Self::new(new_df)) @@ -595,9 +291,8 @@ impl PyDataFrame { fn union_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df - .as_ref() - .clone() - .union_distinct(py_df.df.as_ref().clone())?; + .union_distinct(py_df.df.clone()) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } @@ -608,9 +303,8 @@ impl PyDataFrame { let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); let df = self .df - .as_ref() - .clone() - .unnest_columns_with_options(&[column], unnest_options)?; + .unnest_columns_with_options(&[column], unnest_options) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } @@ -626,9 +320,8 @@ impl PyDataFrame { let cols = columns.iter().map(|s| s.as_ref()).collect::>(); let df = self .df - .as_ref() - .clone() - .unnest_columns_with_options(&cols, unnest_options)?; + .unnest_columns_with_options(&cols, unnest_options) + .map_err(py_datafusion_err)?; Ok(Self::new(df)) } @@ -636,15 +329,18 @@ impl PyDataFrame { fn intersect(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df - .as_ref() - .clone() - .intersect(py_df.df.as_ref().clone())?; + .intersect(py_df.df.clone()) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { - let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; + let new_df = self + .df + .clone() + .except(py_df.df.clone()) + .map_err(py_datafusion_err)?; Ok(Self::new(new_df)) } @@ -656,11 +352,9 @@ impl PyDataFrame { }; wait_for_future( py, - self.df.as_ref().clone().write_csv( - path, - DataFrameWriteOptions::new(), - Some(csv_options), - ), + self.df + .clone() + .write_csv(path, DataFrameWriteOptions::new(), Some(csv_options)), )?; Ok(()) } @@ -717,7 +411,7 @@ impl PyDataFrame { wait_for_future( py, - self.df.as_ref().clone().write_parquet( + self.df.clone().write_parquet( path, DataFrameWriteOptions::new(), Option::from(options), @@ -731,7 +425,6 @@ impl PyDataFrame { wait_for_future( py, self.df - .as_ref() .clone() .write_json(path, DataFrameWriteOptions::new(), None), )?; @@ -757,7 +450,7 @@ impl PyDataFrame { py: Python<'py>, requested_schema: Option>, ) -> PyDataFusionResult> { - let mut batches = wait_for_future(py, self.df.as_ref().clone().collect())?; + let mut batches = wait_for_future(py, self.df.clone().collect())?; let mut schema: Schema = self.df.schema().to_owned().into(); if let Some(schema_capsule) = requested_schema { @@ -787,7 +480,7 @@ impl PyDataFrame { fn execute_stream(&self, py: Python) -> PyDataFusionResult { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; - let df = self.df.as_ref().clone(); + let df = self.df.clone(); let fut: JoinHandle> = rt.spawn(async move { df.execute_stream().await }); let stream = wait_for_future(py, fut).map_err(py_datafusion_err)?; @@ -797,7 +490,7 @@ impl PyDataFrame { fn execute_stream_partitioned(&self, py: Python) -> PyResult> { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; - let df = self.df.as_ref().clone(); + let df = self.df.clone(); let fut: JoinHandle>> = rt.spawn(async move { df.execute_stream_partitioned().await }); let stream = wait_for_future(py, 
fut).map_err(py_datafusion_err)?; @@ -852,210 +545,43 @@ impl PyDataFrame { // Executes this DataFrame to get the total number of rows. fn count(&self, py: Python) -> PyDataFusionResult { - Ok(wait_for_future(py, self.df.as_ref().clone().count())?) - } - - /// Get the current display configuration - #[getter] - fn display_config(&self) -> PyResult> { - Python::with_gil(|py| { - let config = (*self.config).clone(); - Py::new(py, config) - }) - } - - /// Update display configuration - #[pyo3(signature = ( - max_table_bytes=None, - min_table_rows=None, - max_cell_length=None, - max_table_rows_in_repr=None - ))] - fn configure_display( - &mut self, - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) { - let mut new_config = (*self.config).clone(); - - if let Some(bytes) = max_table_bytes { - new_config.max_table_bytes = bytes; - } - - if let Some(rows) = min_table_rows { - new_config.min_table_rows = rows; - } - - if let Some(length) = max_cell_length { - new_config.max_cell_length = length; - } - - if let Some(rows) = max_table_rows_in_repr { - new_config.max_table_rows_in_repr = rows; - } - - self.config = Arc::new(new_config); + Ok(wait_for_future(py, self.df.clone().count())?) } - /// Reset display configuration to default values - #[pyo3(text_signature = "($self)")] - fn reset_display_config(&mut self) { - self.config = Arc::new(DisplayConfig::default()); - } -} - -/// Print DataFrame -fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> { - // Get string representation of record batches - let batches = wait_for_future(py, df.collect())?; - let batches_as_string = pretty::pretty_format_batches(&batches); - let result = match batches_as_string { - Ok(batch) => format!("DataFrame()\n{batch}"), - Err(err) => format!("Error: {:?}", err.to_string()), - }; - - // Import the Python 'builtins' module to access the print function - // Note that println! 
does not print to the Python debug console and is not visible in notebooks for instance - let print = py.import("builtins")?.getattr("print")?; - print.call1((result,))?; - Ok(()) -} + #[pyo3(signature = (max_width=None, max_rows=None, show_nulls=None))] + pub fn to_string( + &self, + max_width: Option, + max_rows: Option, + show_nulls: Option, + py: Python, + ) -> PyDataFusionResult { + let batches = wait_for_future(py, self.df.clone().collect())?; -fn project_schema(from_schema: Schema, to_schema: Schema) -> Result { - let merged_schema = Schema::try_merge(vec![from_schema, to_schema.clone()])?; + let mut table = TableData::new(&batches)?; - let project_indices: Vec = to_schema - .fields - .iter() - .map(|field| field.name()) - .filter_map(|field_name| merged_schema.index_of(field_name).ok()) - .collect(); + // Use the display configuration provided or default values + let max_width = max_width.unwrap_or(80); + let max_rows = max_rows; + let show_nulls = show_nulls.unwrap_or(false); - merged_schema.project(&project_indices) -} + table.set_display_options(max_width, max_rows, show_nulls); -fn record_batch_into_schema( - record_batch: RecordBatch, - schema: &Schema, -) -> Result { - let schema = Arc::new(schema.clone()); - let base_schema = record_batch.schema(); - if base_schema.fields().len() == 0 { - // Nothing to project - return Ok(RecordBatch::new_empty(schema)); + Ok(table.to_string()) } - let array_size = record_batch.column(0).len(); - let mut data_arrays = Vec::with_capacity(schema.fields().len()); - - for field in schema.fields() { - let desired_data_type = field.data_type(); - if let Some(original_data) = record_batch.column_by_name(field.name()) { - let original_data_type = original_data.data_type(); - - if can_cast_types(original_data_type, desired_data_type) { - data_arrays.push(arrow::compute::kernels::cast( - original_data, - desired_data_type, - )?); - } else if field.is_nullable() { - data_arrays.push(new_null_array(desired_data_type, array_size)); - } else { - return Err(ArrowError::CastError(format!("Attempting to cast to non-nullable and non-castable field {} during schema projection.", field.name()))); - } - } else { - if !field.is_nullable() { - return Err(ArrowError::CastError(format!( - "Attempting to set null to non-nullable field {} during schema projection.", - field.name() - ))); - } - data_arrays.push(new_null_array(desired_data_type, array_size)); - } + pub fn __repr__(&self, py: Python) -> PyDataFusionResult { + // Use default display configuration + self.to_string(None, None, None, py) } - - RecordBatch::try_new(schema, data_arrays) } -/// This is a helper function to return the first non-empty record batch from executing a DataFrame. -/// It additionally returns a bool, which indicates if there are more record batches available. -/// We do this so we can determine if we should indicate to the user that the data has been -/// truncated. This collects until we have achived both of these two conditions -/// -/// - We have collected our minimum number of rows -/// - We have reached our limit, either data size or maximum number of rows -/// -/// Otherwise it will return when the stream has exhausted. If you want a specific number of -/// rows, set min_rows == max_rows. 
-async fn collect_record_batches_to_display( - df: DataFrame, - min_rows: usize, - max_rows: usize, - config: &DisplayConfig, -) -> Result<(Vec, bool), DataFusionError> { - let partitioned_stream = df.execute_stream_partitioned().await?; - let mut stream = futures::stream::iter(partitioned_stream).flatten(); - let mut size_estimate_so_far = 0; - let mut rows_so_far = 0; - let mut record_batches = Vec::default(); - let mut has_more = false; - - while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) - || rows_so_far < min_rows - { - let mut rb = match stream.next().await { - None => { - break; - } - Some(Ok(r)) => r, - Some(Err(e)) => return Err(e), - }; - - let mut rows_in_rb = rb.num_rows(); - if rows_in_rb > 0 { - size_estimate_so_far += rb.get_array_memory_size(); - - if size_estimate_so_far > config.max_table_bytes { - let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; - let total_rows = rows_in_rb + rows_so_far; - - let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; - if reduced_row_num < min_rows { - reduced_row_num = min_rows.min(total_rows); - } - - let limited_rows_this_rb = reduced_row_num - rows_so_far; - if limited_rows_this_rb < rows_in_rb { - rows_in_rb = limited_rows_this_rb; - rb = rb.slice(0, limited_rows_this_rb); - has_more = true; - } - } - - if rows_in_rb + rows_so_far > max_rows { - rb = rb.slice(0, max_rows - rows_so_far); - has_more = true; - } - - rows_so_far += rb.num_rows(); - record_batches.push(rb); - } - } - - if record_batches.is_empty() { - return Ok((Vec::default(), false)); +impl PyDataFrame { + pub fn new(df: DataFrame) -> Self { + Self { df: Arc::new(df) } } - if !has_more { - // Data was not already truncated, so check to see if more record batches remain - has_more = match stream.try_next().await { - Ok(None) => false, // reached end - Ok(Some(_)) => true, - Err(_) => false, // Stream disconnected - }; + pub fn dataframe(&self) -> Arc { + self.df.clone() } - - Ok((record_batches, has_more)) } From a5d224f4b388059fc11ffb4091d5f85d7f3b52d1 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 31 Mar 2025 19:11:29 +0800 Subject: [PATCH 19/51] Revert "Refactor PyDataFrame: Simplify methods and improve performance" This reverts commit 0e30af3409a82a4924fd450e63c613b738fec0c9. 
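The function deleted just above (and restored by this revert) trims output to a byte budget by scaling the row count. In isolation, its slicing arithmetic behaves like the following standalone sketch; the function name and the example numbers are illustrative, but the formula is the one from the deleted code:

    def rows_to_keep(total_rows, size_estimate, max_table_bytes, min_rows):
        # Shrink the row count in proportion to how far the size estimate
        # overshoots the byte budget, but never below min_rows.
        if size_estimate <= max_table_bytes:
            return total_rows
        ratio = max_table_bytes / size_estimate
        reduced = round(total_rows * ratio)
        return max(reduced, min(min_rows, total_rows))

    # 1000 rows estimated at 8 MB against the default 2 MB budget:
    assert rows_to_keep(1000, 8 * 2**20, 2 * 2**20, 20) == 250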
--- src/context.rs | 279 ++++++++++++++---- src/dataframe.rs | 738 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 830 insertions(+), 187 deletions(-) diff --git a/src/context.rs b/src/context.rs index 6d5e078d3..0db0f4d7e 100644 --- a/src/context.rs +++ b/src/context.rs @@ -72,59 +72,24 @@ use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; use tokio::task::JoinHandle; -/// Display configuration for DataFrames -#[pyclass(name = "DisplayConfig", module = "datafusion", subclass)] -#[derive(Clone, Debug)] -pub struct DisplayConfig { - #[pyo3(get, set)] - pub max_width: usize, - #[pyo3(get, set)] - pub max_rows: Option, - #[pyo3(get, set)] - pub show_nulls: bool, -} - -#[pymethods] -impl DisplayConfig { - #[new] - pub fn new( - max_width: Option, - max_rows: Option, - show_nulls: Option, - ) -> Self { - Self { - max_width: max_width.unwrap_or(80), - max_rows, - show_nulls: show_nulls.unwrap_or(false), - } - } -} - /// Configuration options for a SessionContext #[pyclass(name = "SessionConfig", module = "datafusion", subclass)] #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, - pub display_config: DisplayConfig, } impl From for PySessionConfig { fn from(config: SessionConfig) -> Self { - Self { - config, - display_config: DisplayConfig::new(Some(80), None, Some(false)), - } + Self { config } } } #[pymethods] impl PySessionConfig { - #[pyo3(signature = (config_options=None, display_config=None))] + #[pyo3(signature = (config_options=None))] #[new] - fn new( - config_options: Option>, - display_config: Option, - ) -> Self { + fn new(config_options: Option>) -> Self { let mut config = SessionConfig::new(); if let Some(hash_map) = config_options { for (k, v) in &hash_map { @@ -132,23 +97,7 @@ impl PySessionConfig { } } - Self { - config, - display_config: display_config - .unwrap_or_else(|| DisplayConfig::new(Some(80), None, Some(false))), - } - } - - // Get the display configuration - pub fn get_display_config(&self) -> DisplayConfig { - self.display_config.clone() - } - - // Set the display configuration - pub fn with_display_config(&self, display_config: DisplayConfig) -> Self { - let mut new_config = self.clone(); - new_config.display_config = display_config; - new_config + Self { config } } fn with_create_default_catalog_and_schema(&self, enabled: bool) -> Self { @@ -726,6 +675,226 @@ impl PySessionContext { ))); } + let mut options = CsvReadOptions::new() + .has_header(has_header) + .delimiter(delimiter[0]) + .schema_infer_max_records(schema_infer_max_records) + .file_extension(file_extension) + .file_compression_type(parse_file_compression_type(file_compression_type)?); + options.schema = schema.as_ref().map(|x| &x.0); + + if path.is_instance_of::() { + let paths = path.extract::>()?; + let result = self.register_csv_from_multiple_paths(name, paths, options); + wait_for_future(py, result)?; + } else { + let path = path.extract::()?; + let result = self.ctx.register_csv(name, &path, options); + wait_for_future(py, result)?; + } + + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = (name, + path, + schema=None, + schema_infer_max_records=1000, + file_extension=".json", + table_partition_cols=vec![], + file_compression_type=None))] + pub fn register_json( + &mut self, + name: &str, + path: PathBuf, + schema: Option>, + schema_infer_max_records: usize, + file_extension: &str, + table_partition_cols: Vec<(String, String)>, + 
file_compression_type: Option, + py: Python, + ) -> PyDataFusionResult<()> { + let path = path + .to_str() + .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; + + let mut options = NdJsonReadOptions::default() + .file_compression_type(parse_file_compression_type(file_compression_type)?) + .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); + options.schema_infer_max_records = schema_infer_max_records; + options.file_extension = file_extension; + options.schema = schema.as_ref().map(|x| &x.0); + + let result = self.ctx.register_json(name, path, options); + wait_for_future(py, result)?; + + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = (name, + path, + schema=None, + file_extension=".avro", + table_partition_cols=vec![]))] + pub fn register_avro( + &mut self, + name: &str, + path: PathBuf, + schema: Option>, + file_extension: &str, + table_partition_cols: Vec<(String, String)>, + py: Python, + ) -> PyDataFusionResult<()> { + let path = path + .to_str() + .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; + + let mut options = AvroReadOptions::default() + .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); + options.file_extension = file_extension; + options.schema = schema.as_ref().map(|x| &x.0); + + let result = self.ctx.register_avro(name, path, options); + wait_for_future(py, result)?; + + Ok(()) + } + + // Registers a PyArrow.Dataset + pub fn register_dataset( + &self, + name: &str, + dataset: &Bound<'_, PyAny>, + py: Python, + ) -> PyDataFusionResult<()> { + let table: Arc = Arc::new(Dataset::new(dataset, py)?); + + self.ctx.register_table(name, table)?; + + Ok(()) + } + + pub fn register_udf(&mut self, udf: PyScalarUDF) -> PyResult<()> { + self.ctx.register_udf(udf.function); + Ok(()) + } + + pub fn register_udaf(&mut self, udaf: PyAggregateUDF) -> PyResult<()> { + self.ctx.register_udaf(udaf.function); + Ok(()) + } + + pub fn register_udwf(&mut self, udwf: PyWindowUDF) -> PyResult<()> { + self.ctx.register_udwf(udwf.function); + Ok(()) + } + + #[pyo3(signature = (name="datafusion"))] + pub fn catalog(&self, name: &str) -> PyResult { + match self.ctx.catalog(name) { + Some(catalog) => Ok(PyCatalog::new(catalog)), + None => Err(PyKeyError::new_err(format!( + "Catalog with name {} doesn't exist.", + &name, + ))), + } + } + + pub fn tables(&self) -> HashSet { + self.ctx + .catalog_names() + .into_iter() + .filter_map(|name| self.ctx.catalog(&name)) + .flat_map(move |catalog| { + catalog + .schema_names() + .into_iter() + .filter_map(move |name| catalog.schema(&name)) + }) + .flat_map(|schema| schema.table_names()) + .collect() + } + + pub fn table(&self, name: &str, py: Python) -> PyResult { + let x = wait_for_future(py, self.ctx.table(name)) + .map_err(|e| PyKeyError::new_err(e.to_string()))?; + Ok(PyDataFrame::new(x)) + } + + pub fn table_exist(&self, name: &str) -> PyDataFusionResult { + Ok(self.ctx.table_exist(name)?) 
+ } + + pub fn empty_table(&self) -> PyDataFusionResult { + Ok(PyDataFrame::new(self.ctx.read_empty()?)) + } + + pub fn session_id(&self) -> String { + self.ctx.session_id() + } + + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = (path, schema=None, schema_infer_max_records=1000, file_extension=".json", table_partition_cols=vec![], file_compression_type=None))] + pub fn read_json( + &mut self, + path: PathBuf, + schema: Option>, + schema_infer_max_records: usize, + file_extension: &str, + table_partition_cols: Vec<(String, String)>, + file_compression_type: Option, + py: Python, + ) -> PyDataFusionResult { + let path = path + .to_str() + .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; + let mut options = NdJsonReadOptions::default() + .table_partition_cols(convert_table_partition_cols(table_partition_cols)?) + .file_compression_type(parse_file_compression_type(file_compression_type)?); + options.schema_infer_max_records = schema_infer_max_records; + options.file_extension = file_extension; + let df = if let Some(schema) = schema { + options.schema = Some(&schema.0); + let result = self.ctx.read_json(path, options); + wait_for_future(py, result)? + } else { + let result = self.ctx.read_json(path, options); + wait_for_future(py, result)? + }; + Ok(PyDataFrame::new(df)) + } + + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = ( + path, + schema=None, + has_header=true, + delimiter=",", + schema_infer_max_records=1000, + file_extension=".csv", + table_partition_cols=vec![], + file_compression_type=None))] + pub fn read_csv( + &self, + path: &Bound<'_, PyAny>, + schema: Option>, + has_header: bool, + delimiter: &str, + schema_infer_max_records: usize, + file_extension: &str, + table_partition_cols: Vec<(String, String)>, + file_compression_type: Option, + py: Python, + ) -> PyDataFusionResult { + let delimiter = delimiter.as_bytes(); + if delimiter.len() != 1 { + return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( + "Delimiter must be a single character", + ))); + }; + let mut options = CsvReadOptions::new() .has_header(has_header) .delimiter(delimiter[0]) diff --git a/src/dataframe.rs b/src/dataframe.rs index 50227c3a6..cda4dd690 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -15,106 +15,412 @@ // specific language governing permissions and limitations // under the License. 
-use std::collections::HashMap; +use std::ffi::CString; use std::sync::Arc; -use datafusion::arrow::csv::WriterBuilder; -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::pyarrow::FromPyArrow; -use datafusion::arrow::pyarrow::PyArrowType; -use datafusion::arrow::record_batch::RecordBatch; -use datafusion::common::TableReference; -use datafusion::prelude::DataFrame; - -use pyo3::exceptions::PyTypeError; +use arrow::array::{new_null_array, RecordBatch, RecordBatchIterator, RecordBatchReader}; +use arrow::compute::can_cast_types; +use arrow::error::ArrowError; +use arrow::ffi::FFI_ArrowSchema; +use arrow::ffi_stream::FFI_ArrowArrayStream; +use arrow::util::display::{ArrayFormatter, FormatOptions}; +use datafusion::arrow::datatypes::Schema; +use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; +use datafusion::arrow::util::pretty; +use datafusion::common::UnnestOptions; +use datafusion::config::{CsvOptions, TableParquetOptions}; +use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; +use datafusion::datasource::TableProvider; +use datafusion::error::DataFusionError; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; +use datafusion::prelude::*; +use futures::{StreamExt, TryStreamExt}; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -use pyo3::types::{PyList, PyString, PyTuple}; +use pyo3::pybacked::PyBackedStr; +use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; +use tokio::task::JoinHandle; -use crate::errors::{py_datafusion_err, PyDataFusionError, PyDataFusionResult}; -use crate::expr::expr::PyExpr; -use crate::expr::window_expr::PyWindowExpr; +use crate::catalog::PyTable; +use crate::errors::{py_datafusion_err, PyDataFusionError}; +use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; -use crate::record_batch::{PyRecordBatch, TableData}; +use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; -use crate::utils::{get_tokio_runtime, wait_for_future}; -use crate::Dataset; +use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; +use crate::{ + errors::PyDataFusionResult, + expr::{sort_expr::PySortExpr, PyExpr}, +}; + +// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 +// - we have not decided on the table_provider approach yet +// this is an interim implementation +#[pyclass(name = "TableProvider", module = "datafusion")] +pub struct PyTableProvider { + provider: Arc, +} + +impl PyTableProvider { + pub fn new(provider: Arc) -> Self { + Self { provider } + } + + pub fn as_table(&self) -> PyTable { + let table_provider: Arc = self.provider.clone(); + PyTable::new(table_provider) + } +} + +/// Configuration for DataFrame display in Python environment +#[pyclass(name = "DisplayConfig", module = "datafusion")] +#[derive(Debug, Clone)] +pub struct DisplayConfig { + /// Maximum bytes to display for table presentation (default: 2MB) + #[pyo3(get, set)] + pub max_table_bytes: usize, + /// Minimum number of table rows to display (default: 20) + #[pyo3(get, set)] + pub min_table_rows: usize, + /// Maximum length of a cell before it gets minimized (default: 25) + #[pyo3(get, set)] + pub max_cell_length: usize, + /// Maximum number of rows to display in repr string output (default: 10) + #[pyo3(get, set)] + pub max_table_rows_in_repr: usize, +} + +#[pymethods] +impl DisplayConfig { + #[new] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, 
max_cell_length=None, max_table_rows_in_repr=None))] + fn new( + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + max_table_rows_in_repr: Option, + ) -> Self { + let default = DisplayConfig::default(); + Self { + max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), + min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), + max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), + max_table_rows_in_repr: max_table_rows_in_repr + .unwrap_or(default.max_table_rows_in_repr), + } + } +} + +impl Default for DisplayConfig { + fn default() -> Self { + Self { + max_table_bytes: 2 * 1024 * 1024, // 2 MB + min_table_rows: 20, + max_cell_length: 25, + max_table_rows_in_repr: 10, + } + } +} -/// Represents a DataFrame in DataFusion. +/// A PyDataFrame is a representation of a logical plan and an API to compose statements. +/// Use it to build a plan and `.collect()` to execute the plan and collect the result. +/// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. #[pyclass(name = "DataFrame", module = "datafusion", subclass)] #[derive(Clone)] pub struct PyDataFrame { df: Arc, + config: Arc, +} + +impl PyDataFrame { + /// creates a new PyDataFrame + pub fn new(df: DataFrame) -> Self { + Self { + df: Arc::new(df), + config: Arc::new(DisplayConfig::default()), + } + } } #[pymethods] impl PyDataFrame { - fn select(&mut self, expr: Vec) -> PyDataFusion { - let expr = expr.into_iter().map(|e| e.into()).collect::>(); - let df = self.df.select(expr).map_err(py_datafusion_err)?; + /// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]` + fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult { + if let Ok(key) = key.extract::() { + // df[col] + self.select_columns(vec![key]) + } else if let Ok(tuple) = key.downcast::() { + // df[col1, col2, col3] + let keys = tuple + .iter() + .map(|item| item.extract::()) + .collect::>>()?; + self.select_columns(keys) + } else if let Ok(keys) = key.extract::>() { + // df[[col1, col2, col3]] + self.select_columns(keys) + } else { + let message = "DataFrame can only be indexed by string index or indices".to_string(); + Err(PyDataFusionError::Common(message)) + } + } + + fn __repr__(&self, py: Python) -> PyDataFusionResult { + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + self.config.min_table_rows, + self.config.max_table_rows_in_repr, + &self.config, + ), + )?; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); + } + + let batches_as_displ = + pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?; + + let additional_str = match has_more { + true => "\nData truncated.", + false => "", + }; + + Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) + } + + fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + self.config.min_table_rows, + usize::MAX, + &self.config, + ), + )?; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); + } + + let table_uuid = uuid::Uuid::new_v4().to_string(); + + let mut html_str = " + + +
+            <style>
+            .expandable-container {
+                display: inline-block;
+                max-width: 200px;
+            }
+            .expandable {
+                white-space: nowrap;
+                overflow: hidden;
+                text-overflow: ellipsis;
+                display: block;
+            }
+            .full-text {
+                display: none;
+                white-space: normal;
+            }
+            .expand-btn {
+                cursor: pointer;
+                color: blue;
+                text-decoration: underline;
+                border: none;
+                background: none;
+            }
+            </style>
+
+            <div style=\"width: 100%; max-width: 1000px; max-height: 300px; overflow: auto; border: 1px solid #ccc;\">
+                <table style=\"border-collapse: collapse; min-width: 100%\">
+                    <thead>\n".to_string();
+
+        let schema = batches[0].schema();
+
+        let mut header = Vec::new();
+        for field in schema.fields() {
+            header.push(format!("<th style='border: 1px solid black; padding: 8px; text-align: left; white-space: nowrap;'>{}</th>", field.name()));
+        }
+        let header_str = header.join("");
+        html_str.push_str(&format!("<tr>{}</tr></thead><tbody>\n", header_str));
+
+        let batch_formatters = batches
+            .iter()
+            .map(|batch| {
+                batch
+                    .columns()
+                    .iter()
+                    .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default()))
+                    .map(|c| {
+                        c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string())))
+                    })
+                    .collect::<Result<Vec<_>, _>>()
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        let rows_per_batch = batches.iter().map(|batch| batch.num_rows());
+
+        // We need to build up row by row for html
+        let mut table_row = 0;
+        for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) {
+            for batch_row in 0..num_rows_in_batch {
+                table_row += 1;
+                let mut cells = Vec::new();
+                for (col, formatter) in batch_formatter.iter().enumerate() {
+                    let cell_data = formatter.value(batch_row).to_string();
+                    // From testing, primitive data types do not typically get larger than 21 characters
+                    if cell_data.len() > self.config.max_cell_length {
+                        let short_cell_data = &cell_data[0..self.config.max_cell_length];
+                        cells.push(format!("
+                            <td style='border: 1px solid black; padding: 8px; text-align: left; white-space: nowrap;'>
+                                <div class=\"expandable-container\">
+                                    <span class=\"expandable\" id=\"{table_uuid}-min-text-{table_row}-{col}\">{short_cell_data}</span>
+                                    <span class=\"full-text\" id=\"{table_uuid}-full-text-{table_row}-{col}\">{cell_data}</span>
+                                    <button class=\"expand-btn\" onclick=\"toggleDataFrameCellText('{table_uuid}',{table_row},{col})\">...</button>
+                                </div>
+                            </td>"));
+                    } else {
+                        cells.push(format!("<td style='border: 1px solid black; padding: 8px; text-align: left; white-space: nowrap;'>{}</td>", formatter.value(batch_row)));
+                    }
+                }
+                let row_str = cells.join("");
+                html_str.push_str(&format!("<tr>{}</tr>\n", row_str));
+            }
+        }
+        html_str.push_str("</tbody></table></div>\n");
\n"); + + html_str.push_str(" + + "); + + if has_more { + html_str.push_str("Data truncated due to size."); + } + + Ok(html_str) + } + + /// Calculate summary statistics for a DataFrame + fn describe(&self, py: Python) -> PyDataFusionResult { + let df = self.df.as_ref().clone(); + let stat_df = wait_for_future(py, df.describe())?; + Ok(Self::new(stat_df)) + } + + /// Returns the schema from the logical plan + fn schema(&self) -> PyArrowType { + PyArrowType(self.df.schema().into()) + } + + /// Convert this DataFrame into a Table that can be used in register_table + /// By convention, into_... methods consume self and return the new object. + /// Disabling the clippy lint, so we can use &self + /// because we're working with Python bindings + /// where objects are shared + /// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 + /// - we have not decided on the table_provider approach yet + #[allow(clippy::wrong_self_convention)] + fn into_view(&self) -> PyDataFusionResult { + // Call the underlying Rust DataFrame::into_view method. + // Note that the Rust method consumes self; here we clone the inner Arc + // so that we don’t invalidate this PyDataFrame. + let table_provider = self.df.as_ref().clone().into_view(); + let table_provider = PyTableProvider::new(table_provider); + + Ok(table_provider.as_table()) + } + + #[pyo3(signature = (*args))] + fn select_columns(&self, args: Vec) -> PyDataFusionResult { + let args = args.iter().map(|s| s.as_ref()).collect::>(); + let df = self.df.as_ref().clone().select_columns(&args)?; Ok(Self::new(df)) } - fn filter(&mut self, predicate: PyExpr) -> PyDataFusionResult { - let df = self - .df - .filter(predicate.into()) - .map_err(py_datafusion_err)?; + #[pyo3(signature = (*args))] + fn select(&self, args: Vec) -> PyDataFusionResult { + let expr = args.into_iter().map(|e| e.into()).collect(); + let df = self.df.as_ref().clone().select(expr)?; Ok(Self::new(df)) } - fn with_column(&mut self, name: &str, expr: PyExpr) -> PyDataFusionResult { - let df = self - .df - .with_column(name, expr.into()) - .map_err(py_datafusion_err)?; + #[pyo3(signature = (*args))] + fn drop(&self, args: Vec) -> PyDataFusionResult { + let cols = args.iter().map(|s| s.as_ref()).collect::>(); + let df = self.df.as_ref().clone().drop_columns(&cols)?; + Ok(Self::new(df)) + } + + fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { + let df = self.df.as_ref().clone().filter(predicate.into())?; Ok(Self::new(df)) } - fn with_columns(&mut self, exprs: Vec) -> PyDataFusionResult { - let mut df = self.df.clone(); + fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { + let df = self.df.as_ref().clone().with_column(name, expr.into())?; + Ok(Self::new(df)) + } + + fn with_columns(&self, exprs: Vec) -> PyDataFusionResult { + let mut df = self.df.as_ref().clone(); for expr in exprs { let expr: Expr = expr.into(); let name = format!("{}", expr.schema_name()); - df = df - .with_column(name.as_str(), expr) - .map_err(py_datafusion_err)? + df = df.with_column(name.as_str(), expr)? } Ok(Self::new(df)) } /// Rename one column by applying a new projection. This is a no-op if the column to be /// renamed does not exist. 
- fn with_column_renamed(&mut self, old_name: &str, new_name: &str) -> PyDataFusionResult { + fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyDataFusionResult { let df = self .df - .with_column_renamed(old_name, new_name) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .with_column_renamed(old_name, new_name)?; Ok(Self::new(df)) } - fn aggregate(&mut self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { + fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); - let df = self - .df - .aggregate(group_by, aggs) - .map_err(py_datafusion_err)?; + let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; Ok(Self::new(df)) } #[pyo3(signature = (*exprs))] - fn sort(&mut self, exprs: Vec) -> PyDataFusionResult { + fn sort(&self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); - let df = self.df.sort(exprs).map_err(py_datafusion_err)?; + let df = self.df.as_ref().clone().sort(exprs)?; Ok(Self::new(df)) } #[pyo3(signature = (count, offset=0))] - fn limit(&mut self, count: usize, offset: usize) -> PyDataFusionResult { - let df = self - .df - .limit(offset, Some(count)) - .map_err(py_datafusion_err)?; + fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { + let df = self.df.as_ref().clone().limit(offset, Some(count))?; Ok(Self::new(df)) } @@ -122,23 +428,23 @@ impl PyDataFrame { /// Unless some order is specified in the plan, there is no /// guarantee of the order of the result. fn collect(&self, py: Python) -> PyResult> { - let batches = - wait_for_future(py, self.df.clone().collect()).map_err(PyDataFusionError::from)?; + let batches = wait_for_future(py, self.df.as_ref().clone().collect()) + .map_err(PyDataFusionError::from)?; // cannot use PyResult> return type due to // https://github.com/PyO3/pyo3/issues/1813 batches.into_iter().map(|rb| rb.to_pyarrow(py)).collect() } /// Cache DataFrame. - fn cache(&mut self, py: Python) -> PyDataFusionResult { - let df = wait_for_future(py, self.df.clone().cache())?; + fn cache(&self, py: Python) -> PyDataFusionResult { + let df = wait_for_future(py, self.df.as_ref().clone().cache())?; Ok(Self::new(df)) } /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch /// maintaining the input partitioning. 
fn collect_partitioned(&self, py: Python) -> PyResult>> { - let batches = wait_for_future(py, self.df.clone().collect_partitioned()) + let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned()) .map_err(PyDataFusionError::from)?; batches @@ -150,22 +456,18 @@ impl PyDataFrame { /// Print the result, 20 lines by default #[pyo3(signature = (num=20))] fn show(&self, py: Python, num: usize) -> PyDataFusionResult<()> { - let df = self - .df - .clone() - .limit(0, Some(num)) - .map_err(py_datafusion_err)?; + let df = self.df.as_ref().clone().limit(0, Some(num))?; print_dataframe(py, df) } /// Filter out duplicate rows - fn distinct(&mut self) -> PyDataFusionResult { - let df = self.df.clone().distinct().map_err(py_datafusion_err)?; + fn distinct(&self) -> PyDataFusionResult { + let df = self.df.as_ref().clone().distinct()?; Ok(Self::new(df)) } fn join( - &mut self, + &self, right: PyDataFrame, how: &str, left_on: Vec, @@ -188,14 +490,18 @@ impl PyDataFrame { let left_keys = left_on.iter().map(|s| s.as_ref()).collect::>(); let right_keys = right_on.iter().map(|s| s.as_ref()).collect::>(); - let df = self - .df - .join(right.df.clone(), join_type, &left_keys, &right_keys, None)?; + let df = self.df.as_ref().clone().join( + right.df.as_ref().clone(), + join_type, + &left_keys, + &right_keys, + None, + )?; Ok(Self::new(df)) } fn join_on( - &mut self, + &self, right: PyDataFrame, on_exprs: Vec, how: &str, @@ -217,34 +523,32 @@ impl PyDataFrame { let df = self .df - .join_on(right.df.clone(), join_type, exprs) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .join_on(right.df.as_ref().clone(), join_type, exprs)?; Ok(Self::new(df)) } /// Print the query plan #[pyo3(signature = (verbose=false, analyze=false))] fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyDataFusionResult<()> { - let df = self - .df - .explain(verbose, analyze) - .map_err(py_datafusion_err)?; + let df = self.df.as_ref().clone().explain(verbose, analyze)?; print_dataframe(py, df) } /// Get the logical plan for this `DataFrame` fn logical_plan(&self) -> PyResult { - Ok(self.df.logical_plan().clone().into()) + Ok(self.df.as_ref().clone().logical_plan().clone().into()) } /// Get the optimized logical plan for this `DataFrame` fn optimized_logical_plan(&self) -> PyDataFusionResult { - Ok(self.df.clone().into_optimized_plan()?.into()) + Ok(self.df.as_ref().clone().into_optimized_plan()?.into()) } /// Get the execution plan for this `DataFrame` fn execution_plan(&self, py: Python) -> PyDataFusionResult { - let plan = wait_for_future(py, self.df.clone().create_physical_plan())?; + let plan = wait_for_future(py, self.df.as_ref().clone().create_physical_plan())?; Ok(plan.into()) } @@ -252,8 +556,9 @@ impl PyDataFrame { fn repartition(&self, num: usize) -> PyDataFusionResult { let new_df = self .df - .repartition(Partitioning::RoundRobinBatch(num)) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .repartition(Partitioning::RoundRobinBatch(num))?; Ok(Self::new(new_df)) } @@ -263,8 +568,9 @@ impl PyDataFrame { let expr = args.into_iter().map(|py_expr| py_expr.into()).collect(); let new_df = self .df - .repartition(Partitioning::Hash(expr, num)) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .repartition(Partitioning::Hash(expr, num))?; Ok(Self::new(new_df)) } @@ -274,13 +580,11 @@ impl PyDataFrame { fn union(&self, py_df: PyDataFrame, distinct: bool) -> PyDataFusionResult { let new_df = if distinct { self.df - .union_distinct(py_df.df.clone()) - .map_err(py_datafusion_err)? 
- } else { - self.df + .as_ref() .clone() - .union(py_df.df.clone()) - .map_err(py_datafusion_err)? + .union_distinct(py_df.df.as_ref().clone())? + } else { + self.df.as_ref().clone().union(py_df.df.as_ref().clone())? }; Ok(Self::new(new_df)) @@ -291,8 +595,9 @@ impl PyDataFrame { fn union_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df - .union_distinct(py_df.df.clone()) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .union_distinct(py_df.df.as_ref().clone())?; Ok(Self::new(new_df)) } @@ -303,8 +608,9 @@ impl PyDataFrame { let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); let df = self .df - .unnest_columns_with_options(&[column], unnest_options) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .unnest_columns_with_options(&[column], unnest_options)?; Ok(Self::new(df)) } @@ -320,8 +626,9 @@ impl PyDataFrame { let cols = columns.iter().map(|s| s.as_ref()).collect::>(); let df = self .df - .unnest_columns_with_options(&cols, unnest_options) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .unnest_columns_with_options(&cols, unnest_options)?; Ok(Self::new(df)) } @@ -329,18 +636,15 @@ impl PyDataFrame { fn intersect(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df - .intersect(py_df.df.clone()) - .map_err(py_datafusion_err)?; + .as_ref() + .clone() + .intersect(py_df.df.as_ref().clone())?; Ok(Self::new(new_df)) } /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { - let new_df = self - .df - .clone() - .except(py_df.df.clone()) - .map_err(py_datafusion_err)?; + let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; Ok(Self::new(new_df)) } @@ -352,9 +656,11 @@ impl PyDataFrame { }; wait_for_future( py, - self.df - .clone() - .write_csv(path, DataFrameWriteOptions::new(), Some(csv_options)), + self.df.as_ref().clone().write_csv( + path, + DataFrameWriteOptions::new(), + Some(csv_options), + ), )?; Ok(()) } @@ -411,7 +717,7 @@ impl PyDataFrame { wait_for_future( py, - self.df.clone().write_parquet( + self.df.as_ref().clone().write_parquet( path, DataFrameWriteOptions::new(), Option::from(options), @@ -425,6 +731,7 @@ impl PyDataFrame { wait_for_future( py, self.df + .as_ref() .clone() .write_json(path, DataFrameWriteOptions::new(), None), )?; @@ -450,7 +757,7 @@ impl PyDataFrame { py: Python<'py>, requested_schema: Option>, ) -> PyDataFusionResult> { - let mut batches = wait_for_future(py, self.df.clone().collect())?; + let mut batches = wait_for_future(py, self.df.as_ref().clone().collect())?; let mut schema: Schema = self.df.schema().to_owned().into(); if let Some(schema_capsule) = requested_schema { @@ -480,7 +787,7 @@ impl PyDataFrame { fn execute_stream(&self, py: Python) -> PyDataFusionResult { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; - let df = self.df.clone(); + let df = self.df.as_ref().clone(); let fut: JoinHandle> = rt.spawn(async move { df.execute_stream().await }); let stream = wait_for_future(py, fut).map_err(py_datafusion_err)?; @@ -490,7 +797,7 @@ impl PyDataFrame { fn execute_stream_partitioned(&self, py: Python) -> PyResult> { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; - let df = self.df.clone(); + let df = self.df.as_ref().clone(); let fut: JoinHandle>> = rt.spawn(async move { df.execute_stream_partitioned().await }); let stream = wait_for_future(py, 
fut).map_err(py_datafusion_err)?; @@ -545,43 +852,210 @@ impl PyDataFrame { // Executes this DataFrame to get the total number of rows. fn count(&self, py: Python) -> PyDataFusionResult { - Ok(wait_for_future(py, self.df.clone().count())?) + Ok(wait_for_future(py, self.df.as_ref().clone().count())?) } - #[pyo3(signature = (max_width=None, max_rows=None, show_nulls=None))] - pub fn to_string( - &self, - max_width: Option, - max_rows: Option, - show_nulls: Option, - py: Python, - ) -> PyDataFusionResult { - let batches = wait_for_future(py, self.df.clone().collect())?; + /// Get the current display configuration + #[getter] + fn display_config(&self) -> PyResult> { + Python::with_gil(|py| { + let config = (*self.config).clone(); + Py::new(py, config) + }) + } - let mut table = TableData::new(&batches)?; + /// Update display configuration + #[pyo3(signature = ( + max_table_bytes=None, + min_table_rows=None, + max_cell_length=None, + max_table_rows_in_repr=None + ))] + fn configure_display( + &mut self, + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + max_table_rows_in_repr: Option, + ) { + let mut new_config = (*self.config).clone(); + + if let Some(bytes) = max_table_bytes { + new_config.max_table_bytes = bytes; + } + + if let Some(rows) = min_table_rows { + new_config.min_table_rows = rows; + } - // Use the display configuration provided or default values - let max_width = max_width.unwrap_or(80); - let max_rows = max_rows; - let show_nulls = show_nulls.unwrap_or(false); + if let Some(length) = max_cell_length { + new_config.max_cell_length = length; + } - table.set_display_options(max_width, max_rows, show_nulls); + if let Some(rows) = max_table_rows_in_repr { + new_config.max_table_rows_in_repr = rows; + } - Ok(table.to_string()) + self.config = Arc::new(new_config); } - pub fn __repr__(&self, py: Python) -> PyDataFusionResult { - // Use default display configuration - self.to_string(None, None, None, py) + /// Reset display configuration to default values + #[pyo3(text_signature = "($self)")] + fn reset_display_config(&mut self) { + self.config = Arc::new(DisplayConfig::default()); } } -impl PyDataFrame { - pub fn new(df: DataFrame) -> Self { - Self { df: Arc::new(df) } +/// Print DataFrame +fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> { + // Get string representation of record batches + let batches = wait_for_future(py, df.collect())?; + let batches_as_string = pretty::pretty_format_batches(&batches); + let result = match batches_as_string { + Ok(batch) => format!("DataFrame()\n{batch}"), + Err(err) => format!("Error: {:?}", err.to_string()), + }; + + // Import the Python 'builtins' module to access the print function + // Note that println! 
does not print to the Python debug console and is not visible in notebooks, for instance
+    let print = py.import("builtins")?.getattr("print")?;
+    print.call1((result,))?;
+    Ok(())
+}
+
+fn project_schema(from_schema: Schema, to_schema: Schema) -> Result<Schema, ArrowError> {
+    let merged_schema = Schema::try_merge(vec![from_schema, to_schema.clone()])?;
+
+    let project_indices: Vec<usize> = to_schema
+        .fields
+        .iter()
+        .map(|field| field.name())
+        .filter_map(|field_name| merged_schema.index_of(field_name).ok())
+        .collect();
+
+    merged_schema.project(&project_indices)
+}
+
+fn record_batch_into_schema(
+    record_batch: RecordBatch,
+    schema: &Schema,
+) -> Result<RecordBatch, ArrowError> {
+    let schema = Arc::new(schema.clone());
+    let base_schema = record_batch.schema();
+    if base_schema.fields().len() == 0 {
+        // Nothing to project
+        return Ok(RecordBatch::new_empty(schema));
+    }
+
+    let array_size = record_batch.column(0).len();
+    let mut data_arrays = Vec::with_capacity(schema.fields().len());
+
+    for field in schema.fields() {
+        let desired_data_type = field.data_type();
+        if let Some(original_data) = record_batch.column_by_name(field.name()) {
+            let original_data_type = original_data.data_type();
+
+            if can_cast_types(original_data_type, desired_data_type) {
+                data_arrays.push(arrow::compute::kernels::cast(
+                    original_data,
+                    desired_data_type,
+                )?);
+            } else if field.is_nullable() {
+                data_arrays.push(new_null_array(desired_data_type, array_size));
+            } else {
+                return Err(ArrowError::CastError(format!("Attempting to cast to non-nullable and non-castable field {} during schema projection.", field.name())));
+            }
+        } else {
+            if !field.is_nullable() {
+                return Err(ArrowError::CastError(format!(
+                    "Attempting to set null to non-nullable field {} during schema projection.",
+                    field.name()
+                )));
+            }
+            data_arrays.push(new_null_array(desired_data_type, array_size));
+        }
+    }
 
-    pub fn dataframe(&self) -> Arc<DataFrame> {
-        self.df.clone()
+    RecordBatch::try_new(schema, data_arrays)
 }
+
+/// This is a helper function to return the first non-empty record batch from executing a DataFrame.
+/// It additionally returns a bool, which indicates if there are more record batches available.
+/// We do this so we can determine if we should indicate to the user that the data has been
+/// truncated. This collects until we have achieved both of the following conditions:
+///
+/// - We have collected our minimum number of rows
+/// - We have reached our limit, either data size or maximum number of rows
+///
+/// Otherwise it will return when the stream is exhausted. If you want a specific number of
+/// rows, set min_rows == max_rows.
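+///
+/// Illustrative arithmetic (not taken from a real run, just the steps the loop
+/// below performs): with a 2 MB `max_table_bytes` budget, if the size estimate
+/// reaches 3 MB after 30 collected rows, the ratio is 2/3, so the result is cut
+/// back to `round(30 * 2/3) = 20` rows (clamped up to `min_rows` if that would
+/// fall below the minimum), and `has_more` is set to flag the truncation.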
+async fn collect_record_batches_to_display( + df: DataFrame, + min_rows: usize, + max_rows: usize, + config: &DisplayConfig, +) -> Result<(Vec, bool), DataFusionError> { + let partitioned_stream = df.execute_stream_partitioned().await?; + let mut stream = futures::stream::iter(partitioned_stream).flatten(); + let mut size_estimate_so_far = 0; + let mut rows_so_far = 0; + let mut record_batches = Vec::default(); + let mut has_more = false; + + while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) + || rows_so_far < min_rows + { + let mut rb = match stream.next().await { + None => { + break; + } + Some(Ok(r)) => r, + Some(Err(e)) => return Err(e), + }; + + let mut rows_in_rb = rb.num_rows(); + if rows_in_rb > 0 { + size_estimate_so_far += rb.get_array_memory_size(); + + if size_estimate_so_far > config.max_table_bytes { + let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; + let total_rows = rows_in_rb + rows_so_far; + + let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; + if reduced_row_num < min_rows { + reduced_row_num = min_rows.min(total_rows); + } + + let limited_rows_this_rb = reduced_row_num - rows_so_far; + if limited_rows_this_rb < rows_in_rb { + rows_in_rb = limited_rows_this_rb; + rb = rb.slice(0, limited_rows_this_rb); + has_more = true; + } + } + + if rows_in_rb + rows_so_far > max_rows { + rb = rb.slice(0, max_rows - rows_so_far); + has_more = true; + } + + rows_so_far += rb.num_rows(); + record_batches.push(rb); + } + } + + if record_batches.is_empty() { + return Ok((Vec::default(), false)); } + + if !has_more { + // Data was not already truncated, so check to see if more record batches remain + has_more = match stream.try_next().await { + Ok(None) => false, // reached end + Ok(Some(_)) => true, + Err(_) => false, // Stream disconnected + }; + } + + Ok((record_batches, has_more)) } From 30c9d99a5a05777163f70a561b4ccffade452122 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 14:52:04 +0800 Subject: [PATCH 20/51] revert to before DisplayConfig in PyDataFrame --- python/datafusion/dataframe.py | 55 ------- python/tests/test_dataframe.py | 281 --------------------------------- src/dataframe.rs | 130 ++------------- 3 files changed, 11 insertions(+), 455 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 3b2382502..26fe8f453 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -49,7 +49,6 @@ import pyarrow as pa from datafusion._internal import DataFrame as DataFrameInternal - from datafusion._internal import DisplayConfig from datafusion._internal import expr as expr_internal from enum import Enum @@ -814,60 +813,6 @@ def count(self) -> int: """ return self.df.count() - def configure_display( - self, - max_table_bytes: Optional[int] = None, - min_table_rows: Optional[int] = None, - max_cell_length: Optional[int] = None, - max_table_rows_in_repr: Optional[int] = None, - ) -> None: - """Configure display options for DataFrame representation. - - Args: - max_table_bytes: Maximum bytes to display for table presentation - (default: 2MB). - Set to lower value for large tables to limit memory usage. - min_table_rows: Minimum number of table rows to display (default: 20). - This is used for initial display and in notebooks. - max_cell_length: Maximum length of a cell before it gets minimized - (default: 25). - Longer cells will be truncated with an expand button. 
- max_table_rows_in_repr: Maximum number of rows to display in string - representation - (default: 10). - - Raises: - ValueError: If any of the provided values are less than or equal to 0. - """ - if any( - value is not None and value <= 0 - for value in ( - max_table_bytes, - min_table_rows, - max_cell_length, - max_table_rows_in_repr, - ) - ): - error_msg = "All values must be greater than 0." - raise ValueError(error_msg) - - self.df.configure_display( - max_table_bytes, min_table_rows, max_cell_length, max_table_rows_in_repr - ) - - def reset_display_config(self) -> None: - """Reset display configuration to default values.""" - self.df.reset_display_config() - - @property - def display_config(self) -> DisplayConfig: - """Get the current display configuration. - - Returns: - DisplayConfig: The current display configuration settings - """ - return self.df.display_config - @deprecated("Use :py:func:`unnest_columns` instead.") def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame: """See :py:func:`unnest_columns`.""" diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 51cdc173d..eda13930d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1261,284 +1261,3 @@ def test_dataframe_repr_html(df) -> None: body_lines = [f"{v}" for inner in body_data for v in inner] body_pattern = "(.*?)".join(body_lines) assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 - - -def test_display_config(df): - """Test the display configuration properties are accessible.""" - config = df.display_config - - # Verify default values - assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB - assert config.min_table_rows == 20 - assert config.max_cell_length == 25 - assert config.max_table_rows_in_repr == 10 - - -def test_configure_display(df): - """Test setting display configuration properties.""" - # Modify the display configuration - df.configure_display( - max_table_bytes=1024 * 1024, - min_table_rows=10, - max_cell_length=50, - max_table_rows_in_repr=15, - ) - - # Verify the changes took effect - config = df.display_config - assert config.max_table_bytes == 1024 * 1024 # 1 MB - assert config.min_table_rows == 10 - assert config.max_cell_length == 50 - assert config.max_table_rows_in_repr == 15 - - # Test partial update (only changing one property) - df.configure_display(max_table_rows_in_repr=5) - config = df.display_config - assert config.max_table_bytes == 1024 * 1024 # previous value retained - assert config.min_table_rows == 10 # previous value retained - assert config.max_cell_length == 50 # previous value retained - assert config.max_table_rows_in_repr == 5 # only this value changed - - # Test with extreme values - # Zero values - with pytest.raises(ValueError, match=r".*must be greater than 0.*"): - df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0) - - # Test with negative values - # This tests for expected behavior when users accidentally pass negative values - # Since these are usize in Rust, we expect a Python ValueError when trying to pass - # negative values. 
- with pytest.raises(ValueError, match=r".*must be greater than 0.*"): - df.configure_display(max_table_bytes=-1) - - with pytest.raises(ValueError, match=r".*must be greater than 0.*"): - df.configure_display(min_table_rows=-5) - - with pytest.raises(ValueError, match=r".*must be greater than 0.*"): - df.configure_display(max_cell_length=-10) - - # Reset for next tests - df.reset_display_config() - - -def test_reset_display_config(df): - """Test resetting display configuration to defaults.""" - # First modify the configuration - df.configure_display( - max_table_bytes=1024 * 1024, - min_table_rows=10, - max_cell_length=50, - max_table_rows_in_repr=15, - ) - - # Verify changes took effect - config = df.display_config - assert config.max_table_bytes == 1024 * 1024 - assert config.min_table_rows == 10 - assert config.max_cell_length == 50 - assert config.max_table_rows_in_repr == 15 - - # Now reset to defaults - df.reset_display_config() - - # Verify defaults are restored - config = df.display_config - assert config.max_table_bytes == 2 * 1024 * 1024 # 2 MB - assert config.min_table_rows == 20 - assert config.max_cell_length == 25 - assert config.max_table_rows_in_repr == 10 - - -def test_min_table_rows_display(ctx): - """Test that at least min_table_rows rows are displayed.""" - # Create a dataframe with more rows than the default min_table_rows - rows = 100 - df = _create_numeric_test_df(ctx, rows) - - # Set min_table_rows to a specific value - custom_min_rows = 30 - df.configure_display(min_table_rows=custom_min_rows) - - # Get HTML representation - html_output = df._repr_html_() - - # Count table rows in the HTML (excluding header row) - # Each row has a tag - row_count = html_output.count("") - 1 # subtract 1 for the header row - - # Verify at least min_table_rows rows are displayed - assert row_count >= custom_min_rows, ( - f"Expected at least {custom_min_rows} rows, got {row_count}" - ) - - # If data was truncated, "Data truncated" message should be present - if row_count < rows: - assert "Data truncated" in html_output - - -def test_max_table_bytes_display(ctx): - """Test that reducing max_table_bytes limits the amount of data displayed.""" - # Create a dataframe with large string values to consume memory - # Each string is approximately 1000 bytes - large_strings = ["x" * 1000 for _ in range(50)] - batch = pa.RecordBatch.from_arrays([pa.array(large_strings)], names=["large_data"]) - df = ctx.create_dataframe([[batch]]) - - # First test with default settings - default_html = df._repr_html_() - default_row_count = default_html.count("") - 1 # subtract header row - - # Now set a very small max_table_bytes - df.configure_display(max_table_bytes=5000) # 5KB should only fit a few rows - limited_html = df._repr_html_() - limited_row_count = limited_html.count("") - 1 - - # Verify fewer rows are displayed with the byte limit - assert limited_row_count < default_row_count, ( - f"Expected fewer rows with byte limit. 
" - f"Default: {default_row_count}, Limited: {limited_row_count}" - ) - - # "Data truncated" should be present when limited - assert "Data truncated" in limited_html - - -def test_max_cell_length_display(ctx): - """Test that cells longer than max_cell_length are truncated in display.""" - # Create a dataframe with long string values - long_strings = [ - "short", - "medium text", - "this is a very long string that should be truncated", - ] - batch = pa.RecordBatch.from_arrays([pa.array(long_strings)], names=["text"]) - df = ctx.create_dataframe([[batch]]) - - # Set a small max_cell_length - max_length = 10 - df.configure_display(max_cell_length=max_length) - - # Get HTML representation - html_output = df._repr_html_() - - # Check for expand button for long text - assert "expandable-container" in html_output - - # Check that expandable class is used for long text - assert 'class="expandable"' in html_output - - # Look for the truncated text and expand button - long_text = long_strings[2] - assert long_text[:max_length] in html_output # Truncated text should be present - assert "expand-btn" in html_output # Expand button should be present - assert long_text in html_output # Full text should also be in the HTML (hidden) - - -def test_display_config_repr_string(ctx): - """Test that __repr__ respects display configuration.""" - # Create a dataframe with more rows than we want to show - # df.__repr__ returns max 10 rows by default, so we start test with 7 rows - rows = 7 - df = _create_numeric_test_df(ctx, rows) - - # Configure to show at least 5 rows in string representation - min_table_rows_in_display = 5 - df.configure_display(min_table_rows=min_table_rows_in_display) - - # Get the string representation - repr_str = df.__repr__() - - # Count the number of rows using helper function - lines_count = _count_lines_in_str(repr_str) - - assert lines_count >= min_table_rows_in_display - - # Now set min_rows higher and see if more rows appear - min_table_rows_in_display = 7 - rows = 11 - df = _create_numeric_test_df(ctx, rows) # Recreate to reset the state - df.configure_display(min_table_rows=min_table_rows_in_display) - - repr_str_more = df.__repr__() - # The string should contain "Data truncated" - assert "Data truncated" in repr_str_more - - # Count lines again - lines_count2 = _count_lines_in_str(repr_str_more) - - # Should show more rows now - assert lines_count2 > lines_count - assert lines_count2 >= min_table_rows_in_display - - -def _count_lines_in_str(repr_str: str) -> int: - """Count the number of rows displayed in a string representation. - - Args: - repr_str: String representation of the DataFrame. - - Returns: - Number of rows that appear in the string representation. - """ - # DataFrame tables are formatted with | value | patterns - # Count lines that match actual data rows (not headers or separators) - value_lines = 0 - for line in repr_str.split("\n"): - # Look for lines like "| 0 |", "| 1 |", etc. - if re.search(r"\|\s*\d+\s*\|", line): - value_lines += 1 - return value_lines - - -def _create_numeric_test_df(ctx, rows) -> DataFrame: - """Create a test dataframe with numeric values from 0 to rows-1. - - Args: - ctx: SessionContext to use for creating the dataframe. - rows: Number of rows to create. - - Returns: - DataFrame with a single column "values" containing numbers 0 to rows-1. 
- """ - data = list(range(rows)) - batch = pa.RecordBatch.from_arrays([pa.array(data)], names=["values"]) - return ctx.create_dataframe([[batch]]) - - -def test_max_table_rows_in_repr(ctx): - """Test that max_table_rows_in_repr controls the number of rows in string - representation. - """ - # Create a dataframe with more rows than the default max_table_rows_in_repr (10) - rows = 20 - df = _create_numeric_test_df(ctx, rows) - - # First test with default setting (should limit to 10 rows) - repr_str = df.__repr__() - lines_default = _count_lines_in_str(repr_str) - - # Default should be 10 rows max - assert lines_default <= 10 - assert "Data truncated" in repr_str - - # Now set a custom max_table_rows_in_repr value - custom_max_rows = 15 - df.configure_display(max_table_rows_in_repr=custom_max_rows) - - # Get the string representation with new configuration - repr_str_more = df.__repr__() - lines_custom = _count_lines_in_str(repr_str_more) - - # Should show more rows than default but not more than configured max - assert lines_custom > lines_default - assert lines_custom <= custom_max_rows - assert "Data truncated" in repr_str_more - - # Now set max_rows higher than total rows - should show all rows - df.configure_display(max_table_rows_in_repr=25) - repr_str_all = df.__repr__() - lines_all = _count_lines_in_str(repr_str_all) - - # Should show all rows (20) - assert lines_all == rows - assert "Data truncated" not in repr_str_all diff --git a/src/dataframe.rs b/src/dataframe.rs index cda4dd690..be10b8c28 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -72,56 +72,9 @@ impl PyTableProvider { PyTable::new(table_provider) } } - -/// Configuration for DataFrame display in Python environment -#[pyclass(name = "DisplayConfig", module = "datafusion")] -#[derive(Debug, Clone)] -pub struct DisplayConfig { - /// Maximum bytes to display for table presentation (default: 2MB) - #[pyo3(get, set)] - pub max_table_bytes: usize, - /// Minimum number of table rows to display (default: 20) - #[pyo3(get, set)] - pub min_table_rows: usize, - /// Maximum length of a cell before it gets minimized (default: 25) - #[pyo3(get, set)] - pub max_cell_length: usize, - /// Maximum number of rows to display in repr string output (default: 10) - #[pyo3(get, set)] - pub max_table_rows_in_repr: usize, -} - -#[pymethods] -impl DisplayConfig { - #[new] - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] - fn new( - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) -> Self { - let default = DisplayConfig::default(); - Self { - max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), - min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), - max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), - max_table_rows_in_repr: max_table_rows_in_repr - .unwrap_or(default.max_table_rows_in_repr), - } - } -} - -impl Default for DisplayConfig { - fn default() -> Self { - Self { - max_table_bytes: 2 * 1024 * 1024, // 2 MB - min_table_rows: 20, - max_cell_length: 25, - max_table_rows_in_repr: 10, - } - } -} +const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB +const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; +const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. 
@@ -130,16 +83,12 @@ impl Default for DisplayConfig { #[derive(Clone)] pub struct PyDataFrame { df: Arc, - config: Arc, } impl PyDataFrame { /// creates a new PyDataFrame pub fn new(df: DataFrame) -> Self { - Self { - df: Arc::new(df), - config: Arc::new(DisplayConfig::default()), - } + Self { df: Arc::new(df) } } } @@ -169,12 +118,7 @@ impl PyDataFrame { fn __repr__(&self, py: Python) -> PyDataFusionResult { let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - self.config.min_table_rows, - self.config.max_table_rows_in_repr, - &self.config, - ), + collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -197,9 +141,8 @@ impl PyDataFrame { py, collect_record_batches_to_display( self.df.as_ref().clone(), - self.config.min_table_rows, + MIN_TABLE_ROWS_TO_DISPLAY, usize::MAX, - &self.config, ), )?; if batches.is_empty() { @@ -275,8 +218,8 @@ impl PyDataFrame { for (col, formatter) in batch_formatter.iter().enumerate() { let cell_data = formatter.value(batch_row).to_string(); // From testing, primitive data types do not typically get larger than 21 characters - if cell_data.len() > self.config.max_cell_length { - let short_cell_data = &cell_data[0..self.config.max_cell_length]; + if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { + let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; cells.push(format!("
@@ -854,56 +797,6 @@ impl PyDataFrame { fn count(&self, py: Python) -> PyDataFusionResult { Ok(wait_for_future(py, self.df.as_ref().clone().count())?) } - - /// Get the current display configuration - #[getter] - fn display_config(&self) -> PyResult> { - Python::with_gil(|py| { - let config = (*self.config).clone(); - Py::new(py, config) - }) - } - - /// Update display configuration - #[pyo3(signature = ( - max_table_bytes=None, - min_table_rows=None, - max_cell_length=None, - max_table_rows_in_repr=None - ))] - fn configure_display( - &mut self, - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) { - let mut new_config = (*self.config).clone(); - - if let Some(bytes) = max_table_bytes { - new_config.max_table_bytes = bytes; - } - - if let Some(rows) = min_table_rows { - new_config.min_table_rows = rows; - } - - if let Some(length) = max_cell_length { - new_config.max_cell_length = length; - } - - if let Some(rows) = max_table_rows_in_repr { - new_config.max_table_rows_in_repr = rows; - } - - self.config = Arc::new(new_config); - } - - /// Reset display configuration to default values - #[pyo3(text_signature = "($self)")] - fn reset_display_config(&mut self) { - self.config = Arc::new(DisplayConfig::default()); - } } /// Print DataFrame @@ -993,7 +886,6 @@ async fn collect_record_batches_to_display( df: DataFrame, min_rows: usize, max_rows: usize, - config: &DisplayConfig, ) -> Result<(Vec, bool), DataFusionError> { let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); @@ -1002,7 +894,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < config.max_table_bytes && rows_so_far < max_rows) + while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { @@ -1017,8 +909,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > config.max_table_bytes { - let ratio = config.max_table_bytes as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { + let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; From 028f0ab091b105bd9ea6da48a04991c417de5ff3 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 14:59:50 +0800 Subject: [PATCH 21/51] feat: Add DataframeDisplayConfig for customizable DataFrame display options - Introduced `DataframeDisplayConfig` struct to manage display settings for DataFrames. - Added fields for maximum bytes, minimum rows, maximum cell length, and maximum rows in repr. - Implemented a constructor with default values for easy initialization. - Updated `PySessionConfig` to include `display_config` with default settings. 
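
A minimal usage sketch (hypothetical Python; the class is only exported to the
`datafusion` module in a later commit, and the values below are arbitrary):

    from datafusion import DataframeDisplayConfig, SessionConfig

    # Unspecified arguments keep the defaults (2 MB, 20 rows, 25 chars, 10 repr rows).
    display = DataframeDisplayConfig(max_cell_length=50, max_table_rows_in_repr=15)

    config = SessionConfig()
    config.display_config = display  # field exposed via #[pyo3(get, set)]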
--- src/context.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/context.rs b/src/context.rs index 0db0f4d7e..3a71362da 100644 --- a/src/context.rs +++ b/src/context.rs @@ -72,16 +72,71 @@ use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; use tokio::task::JoinHandle; +/// Configuration for displaying DataFrames +#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)] +#[derive(Clone)] +pub struct DataframeDisplayConfig { + /// Maximum bytes to display for table presentation (default: 2MB) + #[pyo3(get, set)] + pub max_table_bytes: usize, + /// Minimum number of table rows to display (default: 20) + #[pyo3(get, set)] + pub min_table_rows: usize, + /// Maximum length of a cell before it gets minimized (default: 25) + #[pyo3(get, set)] + pub max_cell_length: usize, + /// Maximum number of rows to display in repr string output (default: 10) + #[pyo3(get, set)] + pub max_table_rows_in_repr: usize, +} + +#[pymethods] +impl DataframeDisplayConfig { + #[new] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] + fn new( + max_table_bytes: Option, + min_table_rows: Option, + max_cell_length: Option, + max_table_rows_in_repr: Option, + ) -> Self { + let default = Self::default(); + Self { + max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), + min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), + max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), + max_table_rows_in_repr: max_table_rows_in_repr + .unwrap_or(default.max_table_rows_in_repr), + } + } +} + +impl Default for DataframeDisplayConfig { + fn default() -> Self { + Self { + max_table_bytes: 2 * 1024 * 1024, // 2 MB + min_table_rows: 20, + max_cell_length: 25, + max_table_rows_in_repr: 10, + } + } +} + /// Configuration options for a SessionContext #[pyclass(name = "SessionConfig", module = "datafusion", subclass)] #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, + #[pyo3(get, set)] + pub display_config: DataframeDisplayConfig, } impl From for PySessionConfig { fn from(config: SessionConfig) -> Self { - Self { config } + Self { + config, + display_config: DataframeDisplayConfig::default(), + } } } @@ -97,7 +152,10 @@ impl PySessionConfig { } } - Self { config } + Self { + config, + display_config: DataframeDisplayConfig::default(), + } } fn with_create_default_catalog_and_schema(&self, enabled: bool) -> Self { From b401e1a1a0f2698b4831cc00450ba69f292be9ca Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 15:03:06 +0800 Subject: [PATCH 22/51] feat: Add method to configure DataFrame display options in PySessionConfig --- src/context.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/context.rs b/src/context.rs index 3a71362da..7adaebada 100644 --- a/src/context.rs +++ b/src/context.rs @@ -214,6 +214,12 @@ impl PySessionConfig { Self::from(self.config.clone().with_repartition_file_min_size(size)) } + fn with_dataframe_display_config(&self, display_config: DataframeDisplayConfig) -> Self { + let mut config = self.clone(); + config.display_config = display_config; + config + } + fn with_parquet_pruning(&self, enabled: bool) -> Self { Self::from(self.config.clone().with_parquet_pruning(enabled)) } From d2a1dc92b4e93eb5a6a44df8b946894e998ba47e Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: 
Wed, 2 Apr 2025 15:07:14 +0800
Subject: [PATCH 23/51] feat: Add method to configure DataFrame display options
 in SessionConfig (python)

- Introduced `with_dataframe_display_config` method in `SessionConfig` to allow
  customization of DataFrame display settings.
- Parameters include `max_table_bytes`, `min_table_rows`, `max_cell_length`,
  and `max_table_rows_in_repr` for flexible display configurations.
- Utilizes `DataframeDisplayConfig` for internal management of display settings.

---
 python/datafusion/context.py | 40 +++++++++++++--
 src/context.rs               | 97 ++++++++++++++++++------------------
 src/lib.rs                   |  1 +
 3 files changed, 87 insertions(+), 51 deletions(-)

diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index 1429a4975..9adc2b654 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -36,6 +36,7 @@
 from ._internal import SessionConfig as SessionConfigInternal
 from ._internal import SessionContext as SessionContextInternal
 from ._internal import SQLOptions as SQLOptionsInternal
+from ._internal import DataframeDisplayConfig as DataframeDisplayConfigInternal
 
 if TYPE_CHECKING:
     import pathlib
@@ -89,6 +90,37 @@ def __init__(self, config_options: dict[str, str] | None = None) -> None:
         """
         self.config_internal = SessionConfigInternal(config_options)
 
+    def with_dataframe_display_config(
+        self,
+        max_table_bytes: int | None = None,
+        min_table_rows: int | None = None,
+        max_cell_length: int | None = None,
+        max_table_rows_in_repr: int | None = None,
+    ) -> SessionConfig:
+        """Configure the display options for DataFrames.
+
+        Args:
+            max_table_bytes: Maximum bytes to display for table presentation
+                (default: 2MB).
+            min_table_rows: Minimum number of table rows to display
+                (default: 20).
+            max_cell_length: Maximum length of a cell before it gets minimized
+                (default: 25).
+            max_table_rows_in_repr: Maximum number of rows to display in repr
+                string output (default: 10).
+
+        Returns:
+            A new :py:class:`SessionConfig` object with the updated display
+            settings.
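+
+        Example (an illustrative sketch; the argument values are arbitrary)::
+
+            config = SessionConfig().with_dataframe_display_config(
+                max_cell_length=50,
+                max_table_rows_in_repr=15,
+            )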
+ """ + + display_config = DataframeDisplayConfigInternal( + max_table_bytes=max_table_bytes, + min_table_rows=min_table_rows, + max_cell_length=max_cell_length, + max_table_rows_in_repr=max_table_rows_in_repr, + ) + + self.config_internal = self.config_internal.with_dataframe_display_config( + display_config + ) + return self + def with_create_default_catalog_and_schema( self, enabled: bool = True ) -> SessionConfig: @@ -806,9 +838,11 @@ def register_parquet( file_extension, skip_metadata, schema, - [sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order] - if file_sort_order is not None - else None, + ( + [sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order] + if file_sort_order is not None + else None + ), ) def register_csv( diff --git a/src/context.rs b/src/context.rs index 7adaebada..abf09b070 100644 --- a/src/context.rs +++ b/src/context.rs @@ -73,54 +73,6 @@ use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; use tokio::task::JoinHandle; /// Configuration for displaying DataFrames -#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)] -#[derive(Clone)] -pub struct DataframeDisplayConfig { - /// Maximum bytes to display for table presentation (default: 2MB) - #[pyo3(get, set)] - pub max_table_bytes: usize, - /// Minimum number of table rows to display (default: 20) - #[pyo3(get, set)] - pub min_table_rows: usize, - /// Maximum length of a cell before it gets minimized (default: 25) - #[pyo3(get, set)] - pub max_cell_length: usize, - /// Maximum number of rows to display in repr string output (default: 10) - #[pyo3(get, set)] - pub max_table_rows_in_repr: usize, -} - -#[pymethods] -impl DataframeDisplayConfig { - #[new] - #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] - fn new( - max_table_bytes: Option, - min_table_rows: Option, - max_cell_length: Option, - max_table_rows_in_repr: Option, - ) -> Self { - let default = Self::default(); - Self { - max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes), - min_table_rows: min_table_rows.unwrap_or(default.min_table_rows), - max_cell_length: max_cell_length.unwrap_or(default.max_cell_length), - max_table_rows_in_repr: max_table_rows_in_repr - .unwrap_or(default.max_table_rows_in_repr), - } - } -} - -impl Default for DataframeDisplayConfig { - fn default() -> Self { - Self { - max_table_bytes: 2 * 1024 * 1024, // 2 MB - min_table_rows: 20, - max_cell_length: 25, - max_table_rows_in_repr: 10, - } - } -} /// Configuration options for a SessionContext #[pyclass(name = "SessionConfig", module = "datafusion", subclass)] @@ -229,6 +181,55 @@ impl PySessionConfig { } } +#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)] +#[derive(Clone)] +pub struct DataframeDisplayConfig { + /// Maximum bytes to display for table presentation (default: 2MB) + #[pyo3(get, set)] + pub max_table_bytes: usize, + /// Minimum number of table rows to display (default: 20) + #[pyo3(get, set)] + pub min_table_rows: usize, + /// Maximum length of a cell before it gets minimized (default: 25) + #[pyo3(get, set)] + pub max_cell_length: usize, + /// Maximum number of rows to display in repr string output (default: 10) + #[pyo3(get, set)] + pub max_table_rows_in_repr: usize, +} + +#[pymethods] +impl DataframeDisplayConfig { + #[new] + #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))] + fn new( + max_table_bytes: Option, + min_table_rows: Option, + 
max_cell_length: Option<usize>,
+        max_table_rows_in_repr: Option<usize>,
+    ) -> Self {
+        let default = Self::default();
+        Self {
+            max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes),
+            min_table_rows: min_table_rows.unwrap_or(default.min_table_rows),
+            max_cell_length: max_cell_length.unwrap_or(default.max_cell_length),
+            max_table_rows_in_repr: max_table_rows_in_repr
+                .unwrap_or(default.max_table_rows_in_repr),
+        }
+    }
+}
+
+impl Default for DataframeDisplayConfig {
+    fn default() -> Self {
+        Self {
+            max_table_bytes: 2 * 1024 * 1024, // 2 MB
+            min_table_rows: 20,
+            max_cell_length: 25,
+            max_table_rows_in_repr: 10,
+        }
+    }
+}
+
 /// Runtime options for a SessionContext
 #[pyclass(name = "RuntimeEnvBuilder", module = "datafusion", subclass)]
 #[derive(Clone)]
diff --git a/src/lib.rs b/src/lib.rs
index ce93ff0c3..61be65555 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -82,6 +82,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
+    m.add_class::<DataframeDisplayConfig>()?;
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;

From 07d7cf680a954e90561d79487fd9309d87752a27 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Wed, 2 Apr 2025 15:25:51 +0800
Subject: [PATCH 24/51] refactor: rename DataframeDisplayConfig to
 PyDataframeDisplayConfig

---
 src/context.rs | 105 ++++++++++++++++++++++++-------------------------
 src/lib.rs     |   2 +-
 2 files changed, 53 insertions(+), 54 deletions(-)

diff --git a/src/context.rs b/src/context.rs
index abf09b070..9dac3ced0 100644
--- a/src/context.rs
+++ b/src/context.rs
@@ -73,6 +73,54 @@ use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType};
 use tokio::task::JoinHandle;

 /// Configuration for displaying DataFrames
+#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)]
+#[derive(Clone)]
+pub struct PyDataframeDisplayConfig {
+    /// Maximum bytes to display for table presentation (default: 2MB)
+    #[pyo3(get, set)]
+    pub max_table_bytes: usize,
+    /// Minimum number of table rows to display (default: 20)
+    #[pyo3(get, set)]
+    pub min_table_rows: usize,
+    /// Maximum length of a cell before it gets minimized (default: 25)
+    #[pyo3(get, set)]
+    pub max_cell_length: usize,
+    /// Maximum number of rows to display in repr string output (default: 10)
+    #[pyo3(get, set)]
+    pub max_table_rows_in_repr: usize,
+}
+
+#[pymethods]
+impl PyDataframeDisplayConfig {
+    #[new]
+    #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))]
+    fn new(
+        max_table_bytes: Option<usize>,
+        min_table_rows: Option<usize>,
+        max_cell_length: Option<usize>,
+        max_table_rows_in_repr: Option<usize>,
+    ) -> Self {
+        let default = Self::default();
+        Self {
+            max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes),
+            min_table_rows: min_table_rows.unwrap_or(default.min_table_rows),
+            max_cell_length: max_cell_length.unwrap_or(default.max_cell_length),
+            max_table_rows_in_repr: max_table_rows_in_repr
+                .unwrap_or(default.max_table_rows_in_repr),
+        }
+    }
+}
+
+impl Default for PyDataframeDisplayConfig {
+    fn default() -> Self {
+        Self {
+            max_table_bytes: 2 * 1024 * 1024, // 2 MB
+            min_table_rows: 20,
+            max_cell_length: 25,
+            max_table_rows_in_repr: 10,
+        }
+    }
+}

 /// Configuration options for a SessionContext
 #[pyclass(name = "SessionConfig", module = "datafusion", subclass)]
@@ -80,14 +128,14 @@
 pub struct PySessionConfig {
     pub config: SessionConfig,
     #[pyo3(get, set)]
-    pub display_config: DataframeDisplayConfig,
+    pub display_config: PyDataframeDisplayConfig,
 }

 impl From<SessionConfig> for PySessionConfig {
     fn from(config: SessionConfig) -> Self {
         Self {
             config,
-            display_config: DataframeDisplayConfig::default(),
+            display_config: PyDataframeDisplayConfig::default(),
         }
     }
 }
@@ -106,7 +154,7 @@ impl PySessionConfig {

         Self {
             config,
-            display_config: DataframeDisplayConfig::default(),
+            display_config: PyDataframeDisplayConfig::default(),
         }
     }
@@ -166,7 +214,7 @@ impl PySessionConfig {
         Self::from(self.config.clone().with_repartition_file_min_size(size))
     }

-    fn with_dataframe_display_config(&self, display_config: DataframeDisplayConfig) -> Self {
+    fn with_dataframe_display_config(&self, display_config: PyDataframeDisplayConfig) -> Self {
         let mut config = self.clone();
         config.display_config = display_config;
         config
@@ -181,55 +229,6 @@ impl PySessionConfig {
     }
 }

-#[pyclass(name = "DataframeDisplayConfig", module = "datafusion", subclass)]
-#[derive(Clone)]
-pub struct DataframeDisplayConfig {
-    /// Maximum bytes to display for table presentation (default: 2MB)
-    #[pyo3(get, set)]
-    pub max_table_bytes: usize,
-    /// Minimum number of table rows to display (default: 20)
-    #[pyo3(get, set)]
-    pub min_table_rows: usize,
-    /// Maximum length of a cell before it gets minimized (default: 25)
-    #[pyo3(get, set)]
-    pub max_cell_length: usize,
-    /// Maximum number of rows to display in repr string output (default: 10)
-    #[pyo3(get, set)]
-    pub max_table_rows_in_repr: usize,
-}
-
-#[pymethods]
-impl DataframeDisplayConfig {
-    #[new]
-    #[pyo3(signature = (max_table_bytes=None, min_table_rows=None, max_cell_length=None, max_table_rows_in_repr=None))]
-    fn new(
-        max_table_bytes: Option<usize>,
-        min_table_rows: Option<usize>,
-        max_cell_length: Option<usize>,
-        max_table_rows_in_repr: Option<usize>,
-    ) -> Self {
-        let default = Self::default();
-        Self {
-            max_table_bytes: max_table_bytes.unwrap_or(default.max_table_bytes),
-            min_table_rows: min_table_rows.unwrap_or(default.min_table_rows),
-            max_cell_length: max_cell_length.unwrap_or(default.max_cell_length),
-            max_table_rows_in_repr: max_table_rows_in_repr
-                .unwrap_or(default.max_table_rows_in_repr),
-        }
-    }
-}
-
-impl Default for DataframeDisplayConfig {
-    fn default() -> Self {
-        Self {
-            max_table_bytes: 2 * 1024 * 1024, // 2 MB
-            min_table_rows: 20,
-            max_cell_length: 25,
-            max_table_rows_in_repr: 10,
-        }
-    }
-}
-
 /// Runtime options for a SessionContext
 #[pyclass(name = "RuntimeEnvBuilder", module = "datafusion", subclass)]
 #[derive(Clone)]
diff --git a/src/lib.rs b/src/lib.rs
index 61be65555..a88b3e18c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -82,7 +82,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
-    m.add_class::<DataframeDisplayConfig>()?;
+    m.add_class::<PyDataframeDisplayConfig>()?;
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;

From 625a1f28683a2bfc491cf2ffae2ee8289a9aab25 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Wed, 2 Apr 2025 15:36:41 +0800
Subject: [PATCH 25/51] feat: Add DataframeDisplayConfig class for customizable
 DataFrame display options

- Introduced DataframeDisplayConfig to manage display settings for DataFrames.
- Added properties for max_table_bytes, min_table_rows, max_cell_length, and
  max_table_rows_in_repr.
- Each property includes getter and setter methods for easy configuration.
- Default values provided for each parameter to enhance usability.
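An illustrative usage sketch, exercising only the API introduced in this
patch (the values are arbitrary):

    from datafusion import DataframeDisplayConfig

    config = DataframeDisplayConfig(max_cell_length=10)
    # Property setters proxy through to the internal Rust-backed config.
    config.min_table_rows = 5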
--- python/datafusion/__init__.py | 2 + python/datafusion/context.py | 69 +++++++++++++++++++++++++++++++++++ src/context.rs | 1 - 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index d871fdb71..a724d15a3 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -38,6 +38,7 @@ RuntimeEnvBuilder, SessionConfig, SessionContext, + DataframeDisplayConfig, SQLOptions, ) from .dataframe import DataFrame @@ -70,6 +71,7 @@ "ScalarUDF", "SessionConfig", "SessionContext", + "DataframeDisplayConfig", "Table", "WindowFrame", "WindowUDF", diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 9adc2b654..c14cd21dc 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -79,6 +79,75 @@ class TableProviderExportable(Protocol): def __datafusion_table_provider__(self) -> object: ... # noqa: D105 +class DataframeDisplayConfig: + """Configuration for displaying DataFrame results. + + This class allows you to control how DataFrames are displayed in Python. + """ + + def __init__( + self, + max_table_bytes: int = None, + min_table_rows: int = None, + max_cell_length: int = None, + max_table_rows_in_repr: int = None, + ) -> None: + """Create a new :py:class:`DataframeDisplayConfig` instance. + + Args: + max_table_bytes: Maximum bytes to display for table presentation (default: 2MB) + min_table_rows: Minimum number of table rows to display (default: 20) + max_cell_length: Maximum length of a cell before it gets minimized (default: 25) + max_table_rows_in_repr: Maximum number of rows to display in repr string output (default: 10) + """ + self.config_internal = DataframeDisplayConfigInternal( + max_table_bytes=max_table_bytes, + min_table_rows=min_table_rows, + max_cell_length=max_cell_length, + max_table_rows_in_repr=max_table_rows_in_repr, + ) + + @property + def max_table_bytes(self) -> int: + """Get the maximum bytes to display for table presentation.""" + return self.config_internal.max_table_bytes + + @max_table_bytes.setter + def max_table_bytes(self, value: int) -> None: + """Set the maximum bytes to display for table presentation.""" + self.config_internal.max_table_bytes = value + + @property + def min_table_rows(self) -> int: + """Get the minimum number of table rows to display.""" + return self.config_internal.min_table_rows + + @min_table_rows.setter + def min_table_rows(self, value: int) -> None: + """Set the minimum number of table rows to display.""" + self.config_internal.min_table_rows = value + + @property + def max_cell_length(self) -> int: + """Get the maximum length of a cell before it gets minimized.""" + return self.config_internal.max_cell_length + + @max_cell_length.setter + def max_cell_length(self, value: int) -> None: + """Set the maximum length of a cell before it gets minimized.""" + self.config_internal.max_cell_length = value + + @property + def max_table_rows_in_repr(self) -> int: + """Get the maximum number of rows to display in repr string output.""" + return self.config_internal.max_table_rows_in_repr + + @max_table_rows_in_repr.setter + def max_table_rows_in_repr(self, value: int) -> None: + """Set the maximum number of rows to display in repr string output.""" + self.config_internal.max_table_rows_in_repr = value + + class SessionConfig: """Session configuration options.""" diff --git a/src/context.rs b/src/context.rs index 9dac3ced0..7a4ebf466 100644 --- a/src/context.rs +++ b/src/context.rs @@ -127,7 +127,6 @@ impl Default for 
PyDataframeDisplayConfig { #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, - #[pyo3(get, set)] pub display_config: PyDataframeDisplayConfig, } From 5dfb9ce7268f5cc9d3df24684ddd6d2c217c6ddf Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 16:18:18 +0800 Subject: [PATCH 26/51] Fix ruff errors --- python/datafusion/__init__.py | 2 +- python/datafusion/context.py | 45 ++++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index a724d15a3..23f6c971d 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -61,6 +61,7 @@ "DFSchema", "DataFrame", "Database", + "DataframeDisplayConfig", "ExecutionPlan", "Expr", "LogicalPlan", @@ -71,7 +72,6 @@ "ScalarUDF", "SessionConfig", "SessionContext", - "DataframeDisplayConfig", "Table", "WindowFrame", "WindowUDF", diff --git a/python/datafusion/context.py b/python/datafusion/context.py index c14cd21dc..ad5744958 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -19,7 +19,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Protocol +from typing import TYPE_CHECKING, Any, Optional, Protocol try: from warnings import deprecated # Python 3.13+ @@ -87,18 +87,22 @@ class DataframeDisplayConfig: def __init__( self, - max_table_bytes: int = None, - min_table_rows: int = None, - max_cell_length: int = None, - max_table_rows_in_repr: int = None, + max_table_bytes: Optional[int] = None, + min_table_rows: Optional[int] = None, + max_cell_length: Optional[int] = None, + max_table_rows_in_repr: Optional[int] = None, ) -> None: """Create a new :py:class:`DataframeDisplayConfig` instance. Args: - max_table_bytes: Maximum bytes to display for table presentation (default: 2MB) - min_table_rows: Minimum number of table rows to display (default: 20) - max_cell_length: Maximum length of a cell before it gets minimized (default: 25) - max_table_rows_in_repr: Maximum number of rows to display in repr string output (default: 10) + max_table_bytes: Maximum bytes to display for table presentation + (default: 2MB) + min_table_rows: Minimum number of table rows to display + (default: 20) + max_cell_length: Maximum length of a cell before it gets minimized + (default: 25) + max_table_rows_in_repr: Maximum number of rows to display in repr + string output (default: 10) """ self.config_internal = DataframeDisplayConfigInternal( max_table_bytes=max_table_bytes, @@ -161,22 +165,31 @@ def __init__(self, config_options: dict[str, str] | None = None) -> None: def with_dataframe_display_config( self, - max_table_bytes: int = None, - min_table_rows: int = None, - max_cell_length: int = None, - max_table_rows_in_repr: int = None, + max_table_bytes: Optional[int] = None, + min_table_rows: Optional[int] = None, + max_cell_length: Optional[int] = None, + max_table_rows_in_repr: Optional[int] = None, ) -> SessionConfig: """Configure the display options for DataFrames. 
Args: - max_table_bytes: Maximum bytes to display for table presentation (default: 2MB) + max_table_bytes: Maximum bytes to display for table presentation + (default: 2MB) min_table_rows: Minimum number of table rows to display (default: 20) - max_cell_length: Maximum length of a cell before it gets minimized (default: 25) - max_table_rows_in_repr: Maximum number of rows to display in repr string output (default: 10) + max_cell_length: Maximum length of a cell before it gets minimized + (default: 25) + max_table_rows_in_repr: Maximum number of rows to display in repr string + output (default: 10) Returns: A new :py:class:`SessionConfig` object with the updated display settings. """ + display_config = DataframeDisplayConfigInternal( + max_table_bytes=max_table_bytes, + min_table_rows=min_table_rows, + max_cell_length=max_cell_length, + max_table_rows_in_repr=max_table_rows_in_repr, + ) display_config = DataframeDisplayConfigInternal( max_table_bytes=max_table_bytes, From 065fa407f473a2e1e6a6d6217cd74955d23cb10d Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 16:34:48 +0800 Subject: [PATCH 27/51] feat: Enhance PyDataFrame to support customizable display options - Updated `PyDataFrame` constructor to accept a `PyDataframeDisplayConfig` parameter for improved DataFrame display customization. - Modified multiple methods in `PySessionContext` to pass the display configuration when creating `PyDataFrame` instances, ensuring consistent display settings across different DataFrame operations. --- src/context.rs | 39 +++++++++++++++++++++++++++------------ src/dataframe.rs | 9 +++++++-- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/context.rs b/src/context.rs index 7a4ebf466..06f5ac8ec 100644 --- a/src/context.rs +++ b/src/context.rs @@ -457,7 +457,7 @@ impl PySessionContext { pub fn sql(&mut self, query: &str, py: Python) -> PyDataFusionResult { let result = self.ctx.sql(query); let df = wait_for_future(py, result)?; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } #[pyo3(signature = (query, options=None))] @@ -474,7 +474,7 @@ impl PySessionContext { }; let result = self.ctx.sql_with_options(query, options); let df = wait_for_future(py, result)?; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } #[pyo3(signature = (partitions, name=None, schema=None))] @@ -509,13 +509,16 @@ impl PySessionContext { let table = wait_for_future(py, self._table(&table_name))?; - let df = PyDataFrame::new(table); + let df = PyDataFrame::new(table, self.ctx.display_config.clone()); Ok(df) } /// Create a DataFrame from an existing logical plan pub fn create_dataframe_from_logical_plan(&mut self, plan: PyLogicalPlan) -> PyDataFrame { - PyDataFrame::new(DataFrame::new(self.ctx.state(), plan.plan.as_ref().clone())) + PyDataFrame::new( + DataFrame::new(self.ctx.state(), plan.plan.as_ref().clone()), + self.ctx.display_config.clone(), + ) } /// Construct datafusion dataframe from Python list @@ -883,7 +886,7 @@ impl PySessionContext { pub fn table(&self, name: &str, py: Python) -> PyResult { let x = wait_for_future(py, self.ctx.table(name)) .map_err(|e| PyKeyError::new_err(e.to_string()))?; - Ok(PyDataFrame::new(x)) + Ok(PyDataFrame::new(x, self.ctx.display_config.clone())) } pub fn table_exist(&self, name: &str) -> PyDataFusionResult { @@ -891,7 +894,10 @@ impl PySessionContext { } pub fn empty_table(&self) -> PyDataFusionResult { - Ok(PyDataFrame::new(self.ctx.read_empty()?)) + Ok( + 
PyDataFrame::new(self.ctx.read_empty()?), + self.ctx.display_config.clone(), + ) } pub fn session_id(&self) -> String { @@ -926,7 +932,7 @@ impl PySessionContext { let result = self.ctx.read_json(path, options); wait_for_future(py, result)? }; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } #[allow(clippy::too_many_arguments)] @@ -971,12 +977,18 @@ impl PySessionContext { let paths = path.extract::>()?; let paths = paths.iter().map(|p| p as &str).collect::>(); let result = self.ctx.read_csv(paths, options); - let df = PyDataFrame::new(wait_for_future(py, result)?); + let df = PyDataFrame::new( + wait_for_future(py, result)?, + self.ctx.display_config.clone(), + ); Ok(df) } else { let path = path.extract::()?; let result = self.ctx.read_csv(path, options); - let df = PyDataFrame::new(wait_for_future(py, result)?); + let df = PyDataFrame::new( + wait_for_future(py, result)?, + self.ctx.display_config.clone(), + ); Ok(df) } } @@ -1014,7 +1026,10 @@ impl PySessionContext { .collect(); let result = self.ctx.read_parquet(path, options); - let df = PyDataFrame::new(wait_for_future(py, result)?); + let df = PyDataFrame::new( + wait_for_future(py, result)?, + self.ctx.display_config.clone(), + ); Ok(df) } @@ -1039,12 +1054,12 @@ impl PySessionContext { let read_future = self.ctx.read_avro(path, options); wait_for_future(py, read_future)? }; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } pub fn read_table(&self, table: &PyTable) -> PyDataFusionResult { let df = self.ctx.read_table(table.table())?; - Ok(PyDataFrame::new(df)) + Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) } fn __repr__(&self) -> PyResult { diff --git a/src/dataframe.rs b/src/dataframe.rs index be10b8c28..6e49a91ff 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -43,6 +43,7 @@ use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; use tokio::task::JoinHandle; use crate::catalog::PyTable; +use crate::context::PyDataframeDisplayConfig; use crate::errors::{py_datafusion_err, PyDataFusionError}; use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; @@ -83,12 +84,16 @@ const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; #[derive(Clone)] pub struct PyDataFrame { df: Arc, + display_config: PyDataframeDisplayConfig, } impl PyDataFrame { /// creates a new PyDataFrame - pub fn new(df: DataFrame) -> Self { - Self { df: Arc::new(df) } + pub fn new(df: DataFrame, display_config: PyDataframeDisplayConfig) -> Self { + Self { + df: Arc::new(df), + display_config, + } } } From 7fa2c7c4e86855b2b76a69cf020682229348ddd4 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 17:04:20 +0800 Subject: [PATCH 28/51] Amend PyDataFrame to use display_config instead of constants --- src/dataframe.rs | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index 6e49a91ff..c188207c6 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -73,9 +73,6 @@ impl PyTableProvider { PyTable::new(table_provider) } } -const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB -const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; -const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. 
@@ -84,7 +81,7 @@ const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; #[derive(Clone)] pub struct PyDataFrame { df: Arc, - display_config: PyDataframeDisplayConfig, + display_config: Arc, } impl PyDataFrame { @@ -92,7 +89,7 @@ impl PyDataFrame { pub fn new(df: DataFrame, display_config: PyDataframeDisplayConfig) -> Self { Self { df: Arc::new(df), - display_config, + display_config: Arc::new(display_config), } } } @@ -121,9 +118,23 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { + // Get display configuration values + let min_rows = self.display_config.min_table_rows; + let max_rows = self.display_config.max_table_rows_in_repr; + let max_bytes = self.display_config.max_table_bytes; + + // Collect record batches for display + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + self.display_config.min_table_rows, + self.display_config.max_table_rows_in_repr, + self.display_config.max_table_bytes, + ), let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + self.display_config.min_table_rows, self.display_config.max_table_rows_in_repr, self.display_config.max_table_bytes), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -146,8 +157,9 @@ impl PyDataFrame { py, collect_record_batches_to_display( self.df.as_ref().clone(), - MIN_TABLE_ROWS_TO_DISPLAY, - usize::MAX, + self.display_config.min_table_rows, + self.display_config.max_table_rows_in_repr, + self.display_config.max_table_bytes, ), )?; if batches.is_empty() { @@ -223,8 +235,8 @@ impl PyDataFrame { for (col, formatter) in batch_formatter.iter().enumerate() { let cell_data = formatter.value(batch_row).to_string(); // From testing, primitive data types do not typically get larger than 21 characters - if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { - let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; + if cell_data.len() > self.display_config.max_cell_length { + let short_cell_data = &cell_data[0..self.display_config.max_cell_length]; cells.push(format!("
@@ -891,6 +903,7 @@ async fn collect_record_batches_to_display( df: DataFrame, min_rows: usize, max_rows: usize, + max_bytes: usize, ) -> Result<(Vec, bool), DataFusionError> { let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); @@ -899,7 +912,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) + while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { @@ -914,8 +927,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { - let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > max_bytes { + let ratio = max_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; From cbc4759b3f731909f47ef3bb7b47e9d027b67dfe Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 17:35:58 +0800 Subject: [PATCH 29/51] refactor: Simplify PySessionConfig and PySessionContext by removing unnecessary display_config handling - Removed display_config from PySessionConfig, streamlining its structure. - Updated PySessionContext to directly manage display_config, ensuring consistent access across methods. - Adjusted methods in PySessionContext to utilize the new display_config handling, enhancing clarity and maintainability. - Cleaned up code in PyDataFrame to ensure it correctly references the updated display_config. 
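An illustrative call sequence after this refactor (a sketch against the
types introduced earlier in this series, not a definitive API):

    // Display settings now live on the context rather than on the config.
    let ctx = PySessionContext::new(None, None, None)?
        .with_display_config(PyDataframeDisplayConfig::default());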
--- src/context.rs | 74 ++++++++++++++++++++++-------------------------- src/dataframe.rs | 7 +---- 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/src/context.rs b/src/context.rs index 06f5ac8ec..f2aaf0626 100644 --- a/src/context.rs +++ b/src/context.rs @@ -127,15 +127,11 @@ impl Default for PyDataframeDisplayConfig { #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, - pub display_config: PyDataframeDisplayConfig, } impl From for PySessionConfig { fn from(config: SessionConfig) -> Self { - Self { - config, - display_config: PyDataframeDisplayConfig::default(), - } + Self { config } } } @@ -151,10 +147,7 @@ impl PySessionConfig { } } - Self { - config, - display_config: PyDataframeDisplayConfig::default(), - } + Self { config } } fn with_create_default_catalog_and_schema(&self, enabled: bool) -> Self { @@ -213,12 +206,6 @@ impl PySessionConfig { Self::from(self.config.clone().with_repartition_file_min_size(size)) } - fn with_dataframe_display_config(&self, display_config: PyDataframeDisplayConfig) -> Self { - let mut config = self.clone(); - config.display_config = display_config; - config - } - fn with_parquet_pruning(&self, enabled: bool) -> Self { Self::from(self.config.clone().with_parquet_pruning(enabled)) } @@ -332,6 +319,7 @@ impl PySQLOptions { #[derive(Clone)] pub struct PySessionContext { pub ctx: SessionContext, + pub display_config: PyDataframeDisplayConfig, } #[pymethods] @@ -341,6 +329,7 @@ impl PySessionContext { pub fn new( config: Option, runtime: Option, + display_config: Option, ) -> PyDataFusionResult { let config = if let Some(c) = config { c.config @@ -358,22 +347,33 @@ impl PySessionContext { .with_runtime_env(runtime) .with_default_features() .build(); + Ok(PySessionContext { ctx: SessionContext::new_with_state(session_state), + display_config: display_config.unwrap_or_default(), }) } pub fn enable_url_table(&self) -> PyResult { Ok(PySessionContext { ctx: self.ctx.clone().enable_url_table(), + display_config: self.display_config.clone(), }) } + pub fn with_display_config(&self, display_config: PyDataframeDisplayConfig) -> Self { + Self { + ctx: self.ctx.clone(), + display_config, + } + } + #[classmethod] #[pyo3(signature = ())] fn global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { Ok(Self { ctx: get_global_ctx().clone(), + display_config: PyDataframeDisplayConfig::default(), }) } @@ -457,7 +457,7 @@ impl PySessionContext { pub fn sql(&mut self, query: &str, py: Python) -> PyDataFusionResult { let result = self.ctx.sql(query); let df = wait_for_future(py, result)?; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } #[pyo3(signature = (query, options=None))] @@ -474,7 +474,7 @@ impl PySessionContext { }; let result = self.ctx.sql_with_options(query, options); let df = wait_for_future(py, result)?; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } #[pyo3(signature = (partitions, name=None, schema=None))] @@ -509,7 +509,7 @@ impl PySessionContext { let table = wait_for_future(py, self._table(&table_name))?; - let df = PyDataFrame::new(table, self.ctx.display_config.clone()); + let df = PyDataFrame::new(table, self.display_config.clone()); Ok(df) } @@ -517,7 +517,7 @@ impl PySessionContext { pub fn create_dataframe_from_logical_plan(&mut self, plan: PyLogicalPlan) -> PyDataFrame { PyDataFrame::new( DataFrame::new(self.ctx.state(), plan.plan.as_ref().clone()), - 
self.ctx.display_config.clone(), + self.display_config.clone(), ) } @@ -886,7 +886,7 @@ impl PySessionContext { pub fn table(&self, name: &str, py: Python) -> PyResult { let x = wait_for_future(py, self.ctx.table(name)) .map_err(|e| PyKeyError::new_err(e.to_string()))?; - Ok(PyDataFrame::new(x, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(x, self.display_config.clone())) } pub fn table_exist(&self, name: &str) -> PyDataFusionResult { @@ -894,10 +894,10 @@ impl PySessionContext { } pub fn empty_table(&self) -> PyDataFusionResult { - Ok( - PyDataFrame::new(self.ctx.read_empty()?), - self.ctx.display_config.clone(), - ) + Ok(PyDataFrame::new( + self.ctx.read_empty()?, + self.display_config.clone(), + )) } pub fn session_id(&self) -> String { @@ -932,7 +932,7 @@ impl PySessionContext { let result = self.ctx.read_json(path, options); wait_for_future(py, result)? }; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } #[allow(clippy::too_many_arguments)] @@ -977,18 +977,12 @@ impl PySessionContext { let paths = path.extract::>()?; let paths = paths.iter().map(|p| p as &str).collect::>(); let result = self.ctx.read_csv(paths, options); - let df = PyDataFrame::new( - wait_for_future(py, result)?, - self.ctx.display_config.clone(), - ); + let df = PyDataFrame::new(wait_for_future(py, result)?, self.display_config.clone()); Ok(df) } else { let path = path.extract::()?; let result = self.ctx.read_csv(path, options); - let df = PyDataFrame::new( - wait_for_future(py, result)?, - self.ctx.display_config.clone(), - ); + let df = PyDataFrame::new(wait_for_future(py, result)?, self.display_config.clone()); Ok(df) } } @@ -1026,10 +1020,7 @@ impl PySessionContext { .collect(); let result = self.ctx.read_parquet(path, options); - let df = PyDataFrame::new( - wait_for_future(py, result)?, - self.ctx.display_config.clone(), - ); + let df = PyDataFrame::new(wait_for_future(py, result)?, self.display_config.clone()); Ok(df) } @@ -1054,12 +1045,12 @@ impl PySessionContext { let read_future = self.ctx.read_avro(path, options); wait_for_future(py, read_future)? }; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } pub fn read_table(&self, table: &PyTable) -> PyDataFusionResult { let df = self.ctx.read_table(table.table())?; - Ok(PyDataFrame::new(df, self.ctx.display_config.clone())) + Ok(PyDataFrame::new(df, self.display_config.clone())) } fn __repr__(&self) -> PyResult { @@ -1175,6 +1166,9 @@ impl From for SessionContext { impl From for PySessionContext { fn from(ctx: SessionContext) -> PySessionContext { - PySessionContext { ctx } + PySessionContext { + ctx, + display_config: PyDataframeDisplayConfig::default(), + } } } diff --git a/src/dataframe.rs b/src/dataframe.rs index c188207c6..c4b98ceb1 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -118,11 +118,6 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { - // Get display configuration values - let min_rows = self.display_config.min_table_rows; - let max_rows = self.display_config.max_table_rows_in_repr; - let max_bytes = self.display_config.max_table_bytes; - // Collect record batches for display let (batches, has_more) = wait_for_future( py, @@ -605,7 +600,7 @@ impl PyDataFrame { /// Calculate the exception of two `DataFrame`s. 
The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df. self.display_config)) } /// Write a `DataFrame` to a CSV file. From 17379736e5ee82851e1be3fe8cae73747c08fd12 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:00:46 +0800 Subject: [PATCH 30/51] refactor: Update PyDataFrame methods to consistently use display_config for DataFrame creation --- src/dataframe.rs | 54 ++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index c4b98ceb1..5c06df985 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -127,10 +127,8 @@ impl PyDataFrame { self.display_config.max_table_rows_in_repr, self.display_config.max_table_bytes, ), - let (batches, has_more) = wait_for_future( - py, - self.display_config.min_table_rows, self.display_config.max_table_rows_in_repr, self.display_config.max_table_bytes), )?; + if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below return Ok("No data to display".to_string()); @@ -281,7 +279,7 @@ impl PyDataFrame { fn describe(&self, py: Python) -> PyDataFusionResult { let df = self.df.as_ref().clone(); let stat_df = wait_for_future(py, df.describe())?; - Ok(Self::new(stat_df)) + Ok(Self::new(stat_df, self.display_config.as_ref().clone())) } /// Returns the schema from the logical plan @@ -311,31 +309,31 @@ impl PyDataFrame { fn select_columns(&self, args: Vec) -> PyDataFusionResult { let args = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().select_columns(&args)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (*args))] fn select(&self, args: Vec) -> PyDataFusionResult { let expr = args.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().select(expr)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (*args))] fn drop(&self, args: Vec) -> PyDataFusionResult { let cols = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().drop_columns(&cols)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().filter(predicate.into())?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().with_column(name, expr.into())?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn with_columns(&self, exprs: Vec) -> PyDataFusionResult { @@ -345,7 +343,7 @@ impl PyDataFrame { let name = format!("{}", expr.schema_name()); df = df.with_column(name.as_str(), expr)? } - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Rename one column by applying a new projection. 
This is a no-op if the column to be @@ -356,27 +354,27 @@ impl PyDataFrame { .as_ref() .clone() .with_column_renamed(old_name, new_name)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (*exprs))] fn sort(&self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); let df = self.df.as_ref().clone().sort(exprs)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (count, offset=0))] fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { let df = self.df.as_ref().clone().limit(offset, Some(count))?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Executes the plan, returning a list of `RecordBatch`es. @@ -393,7 +391,7 @@ impl PyDataFrame { /// Cache DataFrame. fn cache(&self, py: Python) -> PyDataFusionResult { let df = wait_for_future(py, self.df.as_ref().clone().cache())?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch @@ -418,7 +416,7 @@ impl PyDataFrame { /// Filter out duplicate rows fn distinct(&self) -> PyDataFusionResult { let df = self.df.as_ref().clone().distinct()?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn join( @@ -452,7 +450,7 @@ impl PyDataFrame { &right_keys, None, )?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } fn join_on( @@ -481,7 +479,7 @@ impl PyDataFrame { .as_ref() .clone() .join_on(right.df.as_ref().clone(), join_type, exprs)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Print the query plan @@ -514,7 +512,7 @@ impl PyDataFrame { .as_ref() .clone() .repartition(Partitioning::RoundRobinBatch(num))?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Repartition a `DataFrame` based on a logical partitioning scheme. @@ -526,7 +524,7 @@ impl PyDataFrame { .as_ref() .clone() .repartition(Partitioning::Hash(expr, num))?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Calculate the union of two `DataFrame`s, preserving duplicate rows.The @@ -542,7 +540,7 @@ impl PyDataFrame { self.df.as_ref().clone().union(py_df.df.as_ref().clone())? }; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Calculate the distinct union of two `DataFrame`s. 
The @@ -553,7 +551,7 @@ impl PyDataFrame { .as_ref() .clone() .union_distinct(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } #[pyo3(signature = (column, preserve_nulls=true))] @@ -566,7 +564,7 @@ impl PyDataFrame { .as_ref() .clone() .unnest_columns_with_options(&[column], unnest_options)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } #[pyo3(signature = (columns, preserve_nulls=true))] @@ -584,7 +582,7 @@ impl PyDataFrame { .as_ref() .clone() .unnest_columns_with_options(&cols, unnest_options)?; - Ok(Self::new(df)) + Ok(Self::new(df, self.display_config.as_ref().clone())) } /// Calculate the intersection of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema @@ -594,13 +592,13 @@ impl PyDataFrame { .as_ref() .clone() .intersect(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df. self.display_config)) + Ok(Self::new(new_df, self.display_config.as_ref().clone())) } /// Write a `DataFrame` to a CSV file. @@ -907,9 +905,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) - || rows_so_far < min_rows - { + while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { break; From 354ff45be5d280e53d24f2a0ae53a1409f3ce9a1 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:10:03 +0800 Subject: [PATCH 31/51] feat: Add display configuration options to SessionContext for DataFrame presentation --- python/datafusion/context.py | 34 ++++++++++++++++++++++++++++++++++ src/context.rs | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index ad5744958..02b035c7f 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -622,6 +622,40 @@ def global_ctx(cls) -> SessionContext: wrapper.ctx = internal_ctx return wrapper + def with_display_config( + self, + max_table_bytes: Optional[int] = None, + min_table_rows: Optional[int] = None, + max_cell_length: Optional[int] = None, + max_table_rows_in_repr: Optional[int] = None, + ) -> SessionContext: + """Configure the display options for DataFrames. + + Args: + max_table_bytes: Maximum bytes to display for table presentation + (default: 2MB) + min_table_rows: Minimum number of table rows to display + (default: 20) + max_cell_length: Maximum length of a cell before it gets minimized + (default: 25) + max_table_rows_in_repr: Maximum number of rows to display in repr + string output (default: 10) + + Returns: + A new :py:class:`SessionContext` object with the updated display settings. 
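+
+        Example (an illustrative sketch; the limit value is arbitrary)::
+
+            ctx = SessionContext().with_display_config(max_cell_length=10)
+            df = ctx.sql("SELECT 1")  # DataFrames from ctx use these limits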
+ """ + display_config = DataframeDisplayConfig( + max_table_bytes=max_table_bytes, + min_table_rows=min_table_rows, + max_cell_length=max_cell_length, + max_table_rows_in_repr=max_table_rows_in_repr, + ) + + klass = self.__class__ + obj = klass.__new__(klass) + obj.ctx = self.ctx.with_display_config(display_config.config_internal) + return obj + def enable_url_table(self) -> SessionContext: """Control if local files can be queried as tables. diff --git a/src/context.rs b/src/context.rs index f2aaf0626..6147cceff 100644 --- a/src/context.rs +++ b/src/context.rs @@ -324,7 +324,7 @@ pub struct PySessionContext { #[pymethods] impl PySessionContext { - #[pyo3(signature = (config=None, runtime=None))] + #[pyo3(signature = (config=None, runtime=None, display_config=None))] #[new] pub fn new( config: Option, From 984b90637b55be6b9f6b8f981a30aff6a8c260e9 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:18:07 +0800 Subject: [PATCH 32/51] fix: Add validation for display configuration properties in DataframeDisplayConfig --- python/datafusion/context.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 02b035c7f..006c0cc32 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -119,6 +119,8 @@ def max_table_bytes(self) -> int: @max_table_bytes.setter def max_table_bytes(self, value: int) -> None: """Set the maximum bytes to display for table presentation.""" + if value <= 0: + raise ValueError("max_table_bytes must be greater than 0") self.config_internal.max_table_bytes = value @property @@ -129,6 +131,8 @@ def min_table_rows(self) -> int: @min_table_rows.setter def min_table_rows(self, value: int) -> None: """Set the minimum number of table rows to display.""" + if value <= 0: + raise ValueError("min_table_rows must be greater than 0") self.config_internal.min_table_rows = value @property @@ -139,6 +143,8 @@ def max_cell_length(self) -> int: @max_cell_length.setter def max_cell_length(self, value: int) -> None: """Set the maximum length of a cell before it gets minimized.""" + if value <= 0: + raise ValueError("max_cell_length must be greater than 0") self.config_internal.max_cell_length = value @property @@ -149,6 +155,8 @@ def max_table_rows_in_repr(self) -> int: @max_table_rows_in_repr.setter def max_table_rows_in_repr(self, value: int) -> None: """Set the maximum number of rows to display in repr string output.""" + if value <= 0: + raise ValueError("max_table_rows_in_repr must be greater than 0") self.config_internal.max_table_rows_in_repr = value From 1326d713fd61bd521a350949e1ce5ce44cc21866 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:25:30 +0800 Subject: [PATCH 33/51] feat: Integrate DataframeDisplayConfig into SessionContext initialization --- python/datafusion/context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 006c0cc32..28cd2c0f7 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -615,8 +615,8 @@ def __init__( """ config = config.config_internal if config is not None else None runtime = runtime.config_internal if runtime is not None else None - - self.ctx = SessionContextInternal(config, runtime) + display_config = DataframeDisplayConfigInternal() + self.ctx = SessionContextInternal(config, runtime, display_config) @classmethod def global_ctx(cls) -> SessionContext: From 0c4eaa61a7a68736d0ffad9cea22033f1c8c85f8 Mon Sep 17 
00:00:00 2001 From: Siew Kam Onn Date: Wed, 2 Apr 2025 18:25:45 +0800 Subject: [PATCH 34/51] test: Add tests for DataframeDisplayConfig initialization and SessionContext integration --- python/tests/test_dataframe.py | 97 ++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eda13930d..453d902df 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -31,6 +31,7 @@ from datafusion import functions as f from datafusion.expr import Window from pyarrow.csv import write_csv +from datafusion.context import DataframeDisplayConfig @pytest.fixture @@ -51,6 +52,102 @@ def df(): return ctx.from_arrow(batch) +def test_display_config(): + # Test display_config initialization + config = DataframeDisplayConfig( + max_table_bytes=1024, + min_table_rows=10, + max_cell_length=15, + max_table_rows_in_repr=5, + ) + + assert config.max_table_bytes == 1024 + assert config.min_table_rows == 10 + assert config.max_cell_length == 15 + assert config.max_table_rows_in_repr == 5 + + # Test property setters + config.max_table_bytes = 2048 + config.min_table_rows = 20 + config.max_cell_length = 30 + config.max_table_rows_in_repr = 10 + + assert config.max_table_bytes == 2048 + assert config.min_table_rows == 20 + assert config.max_cell_length == 30 + assert config.max_table_rows_in_repr == 10 + + # Test property setter validation + with pytest.raises(ValueError, match="max_table_bytes must be greater than 0"): + config.max_table_bytes = 0 + + with pytest.raises(ValueError, match="min_table_rows must be greater than 0"): + config.min_table_rows = -1 + + with pytest.raises(ValueError, match="max_cell_length must be greater than 0"): + config.max_cell_length = 0 + + with pytest.raises( + ValueError, match="max_table_rows_in_repr must be greater than 0" + ): + config.max_table_rows_in_repr = -5 + + +def test_session_with_display_config(): + # Test with_display_config returns a new context with updated config + ctx = SessionContext() + + # Verify the default values are used initially + df = ctx.from_pylist([{"a": 1, "b": "x" * 50, "c": 3}] * 100) + html_repr = df._repr_html_() + + # Create a new context with custom display config + ctx2 = ctx.with_display_config( + max_table_bytes=1024, + min_table_rows=5, + max_cell_length=10, + max_table_rows_in_repr=3, + ) + + # Create a dataframe with the same data but using the new context + df2 = ctx2.from_pylist([{"a": 1, "b": "x" * 50, "c": 3}] * 100) + html_repr2 = df2._repr_html_() + + # The HTML representation should be different with different display configs + assert html_repr != html_repr2 + + # Check that the second representation has the short cell data based on the configured length + assert f'' in html_repr2 -def test_display_config_in_init(): +def test_display_config_in_init(data): # Test providing display config directly in SessionContext constructor display_config = DataframeDisplayConfig( max_table_bytes=1024, @@ -131,7 +136,7 @@ def test_display_config_in_init(): ) ctx = SessionContext() - df1 = ctx.from_pylist([{"a": 1, "b": "x" * 50, "c": 3}] * 100) + df1 = ctx.from_pylist(data) html_repr1 = df1._repr_html_() # Create a context with custom display config through the with_display_config method @@ -141,7 +146,7 @@ def test_display_config_in_init(): max_cell_length=10, max_table_rows_in_repr=3, ) - df2 = ctx2.from_pylist([{"a": 1, "b": "x" * 50, "c": 3}] * 100) + df2 = ctx2.from_pylist(data) html_repr2 = df2._repr_html_() # Both methods should 
result in equivalent display configuration @@ -1360,7 +1365,7 @@ def test_dataframe_repr_html(df) -> None: assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 -def test_display_config_affects_repr(): +def test_display_config_affects_repr(data): max_table_rows_in_repr = 3 # Create a context with custom display config ctx = SessionContext().with_display_config( @@ -1368,71 +1373,62 @@ def test_display_config_affects_repr(): ) # Create a DataFrame with more rows than the display limit - data = [{"a": i, "b": f"value_{i}", "c": i * 10} for i in range(10)] df = ctx.from_pylist(data) - # Get the string representation - # +---+---------+----+ - # | a | b | c | - # +---+---------+----+ - # | 0 | value_0 | 0 | - # | 1 | value_1 | 10 | - # | 2 | value_2 | 20 | - # +---+---------+----+ - # Data truncated. repr_str = repr(df) # The representation should show truncated data (3 rows as specified) assert ( - repr_str.count("\n") <= max_table_rows_in_repr + 5 - ) # header row + separator lines + data rows + possibly truncation message + # 5 = 1 header row + 3 separator line + 1 truncation message + repr_str.count("\n") + <= max_table_rows_in_repr + 5 + ) assert "Data truncated" in repr_str # Create a context with larger display limit - ctx2 = SessionContext().with_display_config(max_table_rows_in_repr=15) + max_table_rows_in_repr = 100 + ctx2 = SessionContext().with_display_config( + max_table_rows_in_repr=max_table_rows_in_repr + ) df2 = ctx2.from_pylist(data) repr_str2 = repr(df2) # Should show all data without truncation message - assert repr_str2.count("\n") >= 10 # All rows should be shown + assert ( + # 4 = 1 header row + 3 separator lines + repr_str2.count("\n") + == max_table_rows_in_repr + 4 + ) # All rows should be shown assert "Data truncated" not in repr_str2 -def test_display_config_affects_html_repr(): +def test_display_config_affects_html_repr(data): # Create a context with custom display config to show only a small cell length ctx = SessionContext().with_display_config(max_cell_length=5) # Create a DataFrame with a column containing long strings - data = [ - {"a": 1, "b": "This is a very long string that should be truncated", "c": 100} - ] df = ctx.from_pylist(data) # Get the HTML representation html_str = df._repr_html_() # The cell should be truncated to 5 characters and have expansion button - assert ">This " in html_str # 5 character limit - assert "expandable" in html_str - assert "expand-btn" in html_str + assert ">xxxxx" in html_str # 5 character limit + expandable_class = 'class="expandable-container"' + assert expandable_class in html_str # Create a context with larger cell length limit - ctx2 = SessionContext().with_display_config(max_cell_length=50) + ctx2 = SessionContext().with_display_config(max_cell_length=60) df2 = ctx2.from_pylist(data) html_str2 = df2._repr_html_() # String shouldn't be truncated (or at least not in the same way) - if "expandable" in html_str2: - # If it still has an expandable div, it should contain more characters - assert ">This is a very long string that" in html_str2 - else: - # Or it might not need expansion at all - assert "This is a very long string that should be truncated" in html_str2 + assert expandable_class not in html_str2 -def test_display_config_rows_limit_in_html(): +def test_display_config_rows_limit_in_html(data): max_table_rows = 5 # Create a context with custom display config to limit rows ctx = SessionContext().with_display_config( @@ -1440,7 +1436,6 @@ def test_display_config_rows_limit_in_html(): ) # Create a DataFrame 
with 10 rows - data = [{"a": i, "b": f"value_{i}", "c": i * 10} for i in range(10)] df = ctx.from_pylist(data) # Get the HTML representation @@ -1452,7 +1447,7 @@ def test_display_config_rows_limit_in_html(): assert "Data truncated" in html_str # Create a context with larger row limit - max_table_rows = 20 + max_table_rows = 100 ctx2 = SessionContext().with_display_config( max_table_rows_in_repr=max_table_rows ) # Show more rows @@ -1462,11 +1457,11 @@ def test_display_config_rows_limit_in_html(): # Should show all rows row_count2 = html_str2.count("") - 1 # Subtract 1 for header row - assert row_count2 == 10 # Should show all 10 rows + assert row_count2 == max_table_rows assert "Data truncated" not in html_str2 -def test_display_config_max_bytes_limit(): +def test_display_config_max_bytes_limit(data): min_table_rows = 10 max_table_rows = 20 # Create a context with custom display config with very small byte limit @@ -1477,12 +1472,6 @@ def test_display_config_max_bytes_limit(): ) # Very small limit # Create a DataFrame with large content - # Generate some data with long strings to hit the byte limit quickly - large_string = "x" * 50 - data = [ - {"a": i, "b": large_string, "c": large_string} - for i in range(20) # 20 rows with long strings - ] df = ctx.from_pylist(data) # Get the HTML representation diff --git a/src/dataframe.rs b/src/dataframe.rs index 10247a79c..5d62ad2bd 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -944,9 +944,6 @@ async fn collect_record_batches_to_display( record_batches.push(rb); } } - println!( - "==> after while, size_estimate_so_far: {size_estimate_so_far}, rows_so_far: {rows_so_far}" - ); if record_batches.is_empty() { return Ok((Vec::default(), false)); From 2993854faf736c824c50edca6d0f3166b0ed8dd6 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 3 Apr 2025 14:31:18 +0800 Subject: [PATCH 41/51] fix: Update loop condition in collect_record_batches_to_display for correct row handling --- src/dataframe.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index 5d62ad2bd..5c06df985 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -905,7 +905,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while size_estimate_so_far < max_bytes && rows_so_far < max_rows && rows_so_far < min_rows { + while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { break; From 71c64b9c8718a0d941ba8f0e32de813548480819 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 3 Apr 2025 15:03:44 +0800 Subject: [PATCH 42/51] fix ruff errors --- python/datafusion/__init__.py | 16 +++++----------- python/datafusion/context.py | 24 ++++++++++++------------ python/tests/test_dataframe.py | 31 +++++++++++++++---------------- 3 files changed, 32 insertions(+), 39 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 23f6c971d..436c30e52 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -26,26 +26,20 @@ except ImportError: import importlib_metadata +# Local module imports from . import functions, object_store, substrait - -# The following imports are okay to remain as opaque to the user. 
from ._internal import Config from .catalog import Catalog, Database, Table -from .common import ( - DFSchema, -) +from .common import DFSchema from .context import ( + DataframeDisplayConfig, RuntimeEnvBuilder, + SQLOptions, SessionConfig, SessionContext, - DataframeDisplayConfig, - SQLOptions, ) from .dataframe import DataFrame -from .expr import ( - Expr, - WindowFrame, -) +from .expr import Expr, WindowFrame from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 4258c01fb..728bae458 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -26,26 +26,25 @@ except ImportError: from typing_extensions import deprecated # Python 3.12 +if TYPE_CHECKING: + import pandas as pd + import pathlib + import polars as pl + import pyarrow as pa + + from datafusion.plan import ExecutionPlan, LogicalPlan + from datafusion.catalog import Catalog, Table from datafusion.dataframe import DataFrame from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list from datafusion.record_batch import RecordBatchStream from datafusion.udf import AggregateUDF, ScalarUDF, WindowUDF +from ._internal import DataframeDisplayConfig as DataframeDisplayConfigInternal from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal +from ._internal import SQLOptions as SQLOptionsInternal from ._internal import SessionConfig as SessionConfigInternal from ._internal import SessionContext as SessionContextInternal -from ._internal import SQLOptions as SQLOptionsInternal -from ._internal import DataframeDisplayConfig as DataframeDisplayConfigInternal - -if TYPE_CHECKING: - import pathlib - - import pandas as pd - import polars as pl - import pyarrow as pa - - from datafusion.plan import ExecutionPlan, LogicalPlan class ArrowStreamExportable(Protocol): @@ -131,7 +130,8 @@ def _validate_positive(self, value: int, name: str) -> None: ValueError: If the value is not positive """ if value <= 0: - raise ValueError(f"{name} must be greater than 0") + error_message = f"{name} must be greater than 0" + raise ValueError(error_message) @property def max_table_bytes(self) -> int: diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index f1c74f25a..52b0dc4f1 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -29,9 +29,9 @@ literal, ) from datafusion import functions as f +from datafusion.context import DataframeDisplayConfig from datafusion.expr import Window from pyarrow.csv import write_csv -from datafusion.context import DataframeDisplayConfig @pytest.fixture @@ -57,6 +57,11 @@ def data(): return [{"a": 1, "b": "x" * 50, "c": 3}] * 100 +@pytest.fixture +def span_expandable_class(): + return '" in html_repr2 def test_display_config_in_init(data): - # Test providing display config directly in SessionContext constructor - display_config = DataframeDisplayConfig( - max_table_bytes=1024, - min_table_rows=5, - max_cell_length=10, - max_table_rows_in_repr=3, - ) + # Test default display config directly in SessionContext constructor ctx = SessionContext() df1 = ctx.from_pylist(data) @@ -1403,7 +1403,7 @@ def test_display_config_affects_repr(data): assert "Data truncated" not in repr_str2 -def test_display_config_affects_html_repr(data): +def test_display_config_affects_html_repr(data, span_expandable_class): # Create a context with custom display config to show only a 
From ec7033a223b8d0ac8c7024084e3b4be4470be974 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 3 Apr 2025 15:13:36 +0800
Subject: [PATCH 43/51] fix ruff errors

---
 python/datafusion/__init__.py |  2 +-
 python/datafusion/context.py  | 19 ++++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py
index c5ed77057..728b9c390 100644
--- a/python/datafusion/__init__.py
+++ b/python/datafusion/__init__.py
@@ -35,9 +35,9 @@
 from .context import (
     DataframeDisplayConfig,
     RuntimeEnvBuilder,
-    SQLOptions,
     SessionConfig,
     SessionContext,
+    SQLOptions,
 )
 from .dataframe import DataFrame
 from .expr import Expr, WindowFrame
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index 728bae458..73f7cbd09 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -26,14 +26,6 @@
 except ImportError:
     from typing_extensions import deprecated  # Python 3.12
 
-if TYPE_CHECKING:
-    import pandas as pd
-    import pathlib
-    import polars as pl
-    import pyarrow as pa
-
-    from datafusion.plan import ExecutionPlan, LogicalPlan
-
 from datafusion.catalog import Catalog, Table
 from datafusion.dataframe import DataFrame
 from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list
@@ -42,9 +34,18 @@
 from ._internal import DataframeDisplayConfig as DataframeDisplayConfigInternal
 from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal
-from ._internal import SQLOptions as SQLOptionsInternal
 from ._internal import SessionConfig as SessionConfigInternal
 from ._internal import SessionContext as SessionContextInternal
+from ._internal import SQLOptions as SQLOptionsInternal
+
+if TYPE_CHECKING:
+    import pathlib
+
+    import pandas as pd
+    import polars as pl
+    import pyarrow as pa
+
+    from datafusion.plan import ExecutionPlan, LogicalPlan
 
 
 class ArrowStreamExportable(Protocol):

From ad83fc5ebd19e93dd50519fc8e8c4529550470c2 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 3 Apr 2025 15:40:54 +0800
Subject: [PATCH 44/51] feat: Add optional display_config parameter to SessionContext constructor

---
 python/datafusion/context.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index 73f7cbd09..83fd046b3 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -571,6 +571,7 @@
     def __init__(
         self,
         config: SessionConfig | None = None,
         runtime: RuntimeEnvBuilder | None = None,
+        display_config: DataframeDisplayConfig | None = None,
     ) -> None:
         """Main interface for executing queries with DataFusion.
@@ -594,7 +595,9 @@
         """
         config = config.config_internal if config is not None else None
         runtime = runtime.config_internal if runtime is not None else None
-        display_config = DataframeDisplayConfigInternal()
+        display_config = (
+            display_config.config_internal if display_config is not None else None
+        )
         self.ctx = SessionContextInternal(config, runtime, display_config)
 
     @classmethod
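From Python, the new keyword composes with the existing config and runtime parameters. A minimal usage sketch, assuming the DataframeDisplayConfig constructor keywords exercised by the tests in this series (the values are illustrative):

    from datafusion import DataframeDisplayConfig, SessionContext

    # Leaving display_config unset (None) now passes None through to the
    # internal constructor, presumably falling back to the Rust-side defaults
    # instead of always materializing DataframeDisplayConfigInternal eagerly.
    display_config = DataframeDisplayConfig(
        max_table_bytes=1024,
        min_table_rows=5,
        max_cell_length=10,
        max_table_rows_in_repr=3,
    )
    ctx = SessionContext(display_config=display_config)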
From fb90fbc3dcef9cbef5eb0961767dcd6d222d0378 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Thu, 3 Apr 2025 15:41:55 +0800
Subject: [PATCH 45/51] fix: Update test data size and improve display config tests

- Reduced the size of test data in the `data` fixture from 100 to 10 entries
  for efficiency.
- Added a `normalize_uuid` function to standardize UUIDs in HTML
  representations for consistent testing.
- Modified `test_display_config_in_init` to use a custom display configuration
  and updated assertions to compare normalized HTML outputs.
- Enhanced readability of assertions in `test_display_config_affects_repr` by
  formatting conditions.
---
 python/tests/test_dataframe.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index a72d21b84..88e96e27d 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import dis
 import os
 import re
 from typing import Any
@@ -54,7 +55,7 @@ def df():
 
 @pytest.fixture
 def data():
-    return [{"a": 1, "b": "x" * 50, "c": 3}] * 100
+    return [{"a": 1, "b": "x" * 50, "c": 3}] * 10
 
 
 @pytest.fixture
@@ -62,6 +63,14 @@ def span_expandable_class():
     return '