Changes from all commits
62 commits
91ccd1e
feat: add streaming utilities, range support, and improve async handl…
kosiew Sep 1, 2025
f78e90b
refactor: improve DataFrame streaming, memory management, and error h…
kosiew Sep 1, 2025
e322521
feat: enhance DataFrame streaming and improve robustness, tests, and …
kosiew Sep 2, 2025
31e8ed1
feat: add testing utilities for DataFrame range generation
kosiew Sep 2, 2025
0130a72
feat: ensure proper resource management in DataFrame streaming
kosiew Sep 2, 2025
03e530c
refactor: replace spawn_stream and spawn_streams with spawn_future fo…
kosiew Sep 2, 2025
4a3f17d
feat: add test for Arrow C stream schema selection in DataFrame
kosiew Sep 2, 2025
f7a2407
test: rename and extend test_arrow_c_stream_to_table to include Recor…
kosiew Sep 2, 2025
b1d18a8
test: add validation for schema mismatch in Arrow C stream
kosiew Sep 2, 2025
eeb2a37
fix Ruff errors
kosiew Sep 2, 2025
748b7e2
Update docs/source/user-guide/dataframe/index.rst
kosiew Sep 7, 2025
5e650aa
test: add batch iteration test for DataFrame
kosiew Sep 8, 2025
ebd2191
refactor: simplify stream capsule creation in PyDataFrame
kosiew Sep 8, 2025
6bae74b
refactor: enhance stream capsule management in PyDataFrame
kosiew Sep 8, 2025
f0cbe06
refactor: enhance DataFrame and RecordBatchStream iteration support
kosiew Sep 8, 2025
295d04a
refactor: improve docstrings for DataFrame and RecordBatchStream methods
kosiew Sep 8, 2025
475c031
refactor: add to_record_batch_stream method and improve iteration sup…
kosiew Sep 8, 2025
06c9fc7
test: update test_iter_batches_dataframe to assert RecordBatch type a…
kosiew Sep 8, 2025
94432b5
fix: update table creation from batches to use to_pyarrow conversion
kosiew Sep 8, 2025
31ed8e7
test: add test_iter_returns_datafusion_recordbatch to verify RecordBa…
kosiew Sep 8, 2025
610aed3
docs: clarify RecordBatch reference and add PyArrow conversion example
kosiew Sep 8, 2025
1ebd3c1
test: improve test_iter_batches_dataframe to validate RecordBatch con…
kosiew Sep 8, 2025
2e4b963
test: enhance test_arrow_c_stream_to_table_and_reader for batch equal…
kosiew Sep 9, 2025
d0ee865
Shelve unrelated changes
kosiew Sep 9, 2025
16a249c
Fix documentation to reference datafusion.RecordBatch instead of pyar…
kosiew Sep 9, 2025
d91ecfa
Remove redundant to_record_batch_stream method from DataFrame class
kosiew Sep 9, 2025
21f286a
Refactor Arrow stream creation in PyDataFrame to use PyCapsule directly
kosiew Sep 10, 2025
831f56f
Add `once_cell` dependency and refactor Arrow array stream capsule na…
kosiew Sep 10, 2025
7b5e461
Add `cstr` dependency and refactor Arrow array stream capsule name ha…
kosiew Sep 10, 2025
d6e8132
Refactor test_iter_returns_datafusion_recordbatch to use RecordBatch …
kosiew Sep 10, 2025
8a250a4
Add streaming execution examples to DataFrame documentation
kosiew Sep 10, 2025
7789322
Rename `to_record_batch_stream` to `execute_stream` and update refere…
kosiew Sep 10, 2025
07a8169
Clean up formatting in Cargo.toml for improved readability
kosiew Sep 10, 2025
9e27cc6
Refactor Cargo.toml for improved formatting and readability
kosiew Sep 10, 2025
9dc3fb2
Merge branch 'main' into oom-1206
kosiew Sep 10, 2025
d3c68cc
Update python/tests/test_io.py
kosiew Sep 13, 2025
33f9024
Update python/datafusion/dataframe.py
kosiew Sep 13, 2025
7553b32
Refactor test_table_from_batches_stream to use pa.table for improved …
kosiew Sep 13, 2025
b6909a5
Remove deprecated to_record_batch_stream method; use execute_stream i…
kosiew Sep 13, 2025
f4e76ea
Add example for concurrent processing of partitioned streams using as…
kosiew Sep 13, 2025
b66b441
Update documentation to reflect changes in execute_stream return type…
kosiew Sep 13, 2025
2794c88
Update PyArrow streaming example to use pa.table for eager collection
kosiew Sep 13, 2025
17c4c2c
Enhance documentation for DataFrame streaming API, clarifying schema …
kosiew Sep 13, 2025
0ff4c0d
Clarify behavior of __arrow_c_stream__ execution, emphasizing increme…
kosiew Sep 13, 2025
f450e1d
Add note on limitations of `arrow::compute::cast` for schema transfor…
kosiew Sep 13, 2025
5dc5cfa
Update python/tests/test_io.py
kosiew Sep 13, 2025
fd08dc4
Rename test function for clarity: update `test_table_from_batches_str…
kosiew Sep 13, 2025
9baa49e
Update python/datafusion/dataframe.py
kosiew Sep 13, 2025
78f6c8a
Add documentation note for Arrow C Data Interface PyCapsule in DataFr…
kosiew Sep 13, 2025
5a53633
Enhance documentation on zero-copy streaming to Arrow-based Python li…
kosiew Sep 13, 2025
ccc8633
Fix formatting of section header for zero-copy streaming in DataFrame…
kosiew Sep 13, 2025
98ac3a1
Refine zero-copy streaming documentation by removing outdated informa…
kosiew Sep 13, 2025
759fb86
Add alternative method for creating RecordBatchReader from Arrow C st…
kosiew Sep 13, 2025
57d4162
Refactor tests to use RecordBatchReader.from_stream instead of deprec…
kosiew Sep 13, 2025
d66d496
Replace deprecated _import_from_c_capsule method with from_stream for…
kosiew Sep 13, 2025
d76a509
Update test description for arrow_c_stream_large_dataset to clarify s…
kosiew Sep 13, 2025
7433234
Add comments to clarify RSS measurement in test_arrow_c_stream_large_…
kosiew Sep 13, 2025
848665e
Fix ruff errors
kosiew Sep 13, 2025
13ebaf9
Update async iterator implementation in DataFrame to ensure compatibi…
kosiew Sep 13, 2025
dae501d
Fix async iterator implementation in DataFrame for compatibility with…
kosiew Sep 15, 2025
c36aa9a
fix typo
kosiew Sep 15, 2025
914f17e
Fix formatting in DataFrame documentation and add example usage for A…
kosiew Sep 15, 2025
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

41 changes: 33 additions & 8 deletions Cargo.toml
@@ -26,17 +26,34 @@ readme = "README.md"
license = "Apache-2.0"
edition = "2021"
rust-version = "1.78"
include = ["/src", "/datafusion", "/LICENSE.txt", "build.rs", "pyproject.toml", "Cargo.toml", "Cargo.lock"]
include = [
"/src",
"/datafusion",
"/LICENSE.txt",
"build.rs",
"pyproject.toml",
"Cargo.toml",
"Cargo.lock",
]

[features]
default = ["mimalloc"]
protoc = [ "datafusion-substrait/protoc" ]
protoc = ["datafusion-substrait/protoc"]
substrait = ["dep:datafusion-substrait"]

[dependencies]
tokio = { version = "1.45", features = ["macros", "rt", "rt-multi-thread", "sync"] }
pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] }
pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]}
tokio = { version = "1.45", features = [
"macros",
"rt",
"rt-multi-thread",
"sync",
] }
pyo3 = { version = "0.24", features = [
"extension-module",
"abi3",
"abi3-py39",
] }
pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"] }
pyo3-log = "0.12.4"
arrow = { version = "55.1.0", features = ["pyarrow"] }
datafusion = { version = "49.0.2", features = ["avro", "unicode_expressions"] }
@@ -45,15 +62,23 @@ datafusion-proto = { version = "49.0.2" }
datafusion-ffi = { version = "49.0.2" }
prost = "0.13.1" # keep in line with `datafusion-substrait`
uuid = { version = "1.18", features = ["v4"] }
mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
mimalloc = { version = "0.1", optional = true, default-features = false, features = [
"local_dynamic_tls",
] }
async-trait = "0.1.89"
futures = "0.3"
object_store = { version = "0.12.3", features = ["aws", "gcp", "azure", "http"] }
cstr = "0.2"
object_store = { version = "0.12.3", features = [
"aws",
"gcp",
"azure",
"http",
] }
url = "2"
log = "0.4.27"

[build-dependencies]
prost-types = "0.13.1" # keep in line with `datafusion-substrait`
prost-types = "0.13.1" # keep in line with `datafusion-substrait`
pyo3-build-config = "0.24"

[lib]
110 changes: 109 additions & 1 deletion docs/source/user-guide/dataframe/index.rst
@@ -145,10 +145,118 @@ To materialize the results of your DataFrame operations:

    # Display results
    df.show() # Print tabular format to console

    # Count rows
    count = df.count()

Zero-copy streaming to Arrow-based Python libraries
---------------------------------------------------

DataFusion DataFrames implement the ``__arrow_c_stream__`` protocol, enabling
zero-copy, lazy streaming into Arrow-based Python libraries. With the streaming
protocol, batches are produced on demand so you can process arbitrarily large
results without out-of-memory errors.

.. note::

    The protocol is implementation-agnostic and works with any Python library
    that understands the Arrow C streaming interface (for example, PyArrow
    or other Arrow-compatible implementations). The sections below provide a
    short PyArrow-specific example and general guidance for other
    implementations.

PyArrow
-------

.. code-block:: python

    import pyarrow as pa

    # Create a PyArrow RecordBatchReader without materializing all batches
    reader = pa.RecordBatchReader.from_stream(df)
    for batch in reader:
        ... # process each batch as it is produced

DataFrames are also iterable, yielding :class:`datafusion.RecordBatch`
objects lazily so you can loop over results directly without importing
PyArrow:

.. code-block:: python

    for batch in df:
        ... # each batch is a ``datafusion.RecordBatch``

Each batch exposes ``to_pyarrow()``, which converts it to a PyArrow
``RecordBatch``. To collect the entire DataFrame eagerly into a single
PyArrow table, use ``pa.table(df)``:

.. code-block:: python

    import pyarrow as pa

    table = pa.table(df)

Asynchronous iteration is supported as well, allowing integration with
``asyncio`` event loops:

.. code-block:: python

    async for batch in df:
        ... # process each batch as it is produced

To work with the stream directly, use ``execute_stream()``, which returns a
:class:`~datafusion.RecordBatchStream`:

.. code-block:: python

    stream = df.execute_stream()
    for batch in stream:
        ...

Execute as Stream
^^^^^^^^^^^^^^^^^

For finer control over streaming execution, use
:py:meth:`~datafusion.DataFrame.execute_stream` to obtain a
:py:class:`datafusion.RecordBatchStream`:

.. code-block:: python

    stream = df.execute_stream()
    for batch in stream:
        ... # process each batch as it is produced

.. tip::

    To get a PyArrow reader instead, call
    ``pa.RecordBatchReader.from_stream(df)``.

When partition boundaries are important,
:py:meth:`~datafusion.DataFrame.execute_stream_partitioned`
returns an iterable of :py:class:`datafusion.RecordBatchStream` objects, one per
partition:

.. code-block:: python

    for stream in df.execute_stream_partitioned():
        for batch in stream:
            ... # each stream yields RecordBatches

Review comment (Contributor):

    Interesting. Can these streams be polled concurrently? Can you do

        streams = list(df.execute_stream_partitioned())

    and then concurrently iterate over all the streams, yielding whatever batch comes in first? I suppose that would just do in Python what execute_stream is doing in Rust?

Reply (Contributor Author):

    Good question!
    I added a concurrent iteration example in the same document to clarify this.

To process partitions concurrently, first collect the streams into a list
and then poll each one in a separate ``asyncio`` task:

.. code-block:: python

    import asyncio

    async def consume(stream):
        async for batch in stream:
            ...

    streams = list(df.execute_stream_partitioned())
    await asyncio.gather(*(consume(s) for s in streams))

See :doc:`../io/arrow` for additional details on the Arrow interface.

HTML Rendering
--------------

16 changes: 12 additions & 4 deletions docs/source/user-guide/io/arrow.rst
@@ -60,14 +60,22 @@ Exporting from DataFusion
DataFusion DataFrames implement ``__arrow_c_stream__`` PyCapsule interface, so any
Python library that accepts these can import a DataFusion DataFrame directly.

.. warning::
    It is important to note that this will cause the DataFrame execution to happen, which may be
    a time consuming task. That is, you will cause a
    :py:func:`datafusion.dataframe.DataFrame.collect` operation call to occur.
.. note::
    Invoking ``__arrow_c_stream__`` still triggers execution of the underlying
    query, but batches are yielded incrementally rather than materialized all at
    once in memory. Consumers can process the stream as it arrives, avoiding the
    memory overhead of a full
    :py:func:`datafusion.dataframe.DataFrame.collect`.

For an example of this streamed execution and its memory safety, see the
``test_arrow_c_stream_large_dataset`` unit test in
:mod:`python.tests.test_io`.
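
For illustration (an editorial sketch, not part of the patch), the stream can be
consumed incrementally instead of being collected all at once; this assumes ``df``
is a DataFusion DataFrame as in the example below and a PyArrow release that
provides ``RecordBatchReader.from_stream``:

    import pyarrow as pa

    # The reader pulls record batches through __arrow_c_stream__ one at a
    # time, so only the current batch needs to be resident in memory.
    reader = pa.RecordBatchReader.from_stream(df)
    for batch in reader:
        ... # process each batch as it arrives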


.. ipython:: python

    from datafusion import col, lit

    df = df.select((col("a") * lit(1.5)).alias("c"), lit("df").alias("d"))
    pa.table(df)

56 changes: 47 additions & 9 deletions python/datafusion/dataframe.py
@@ -25,7 +25,9 @@
from typing import (
    TYPE_CHECKING,
    Any,
    AsyncIterator,
    Iterable,
    Iterator,
    Literal,
    Optional,
    Union,
@@ -42,7 +44,7 @@
from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
from datafusion.expr import Expr, SortExpr, sort_or_default
from datafusion.plan import ExecutionPlan, LogicalPlan
from datafusion.record_batch import RecordBatchStream
from datafusion.record_batch import RecordBatch, RecordBatchStream

if TYPE_CHECKING:
    import pathlib
@@ -296,6 +298,9 @@ def __init__(
class DataFrame:
"""Two dimensional table representation of data.

DataFrame objects are iterable; iterating over a DataFrame yields
:class:`datafusion.RecordBatch` instances lazily.

See :ref:`user_guide_concepts` in the online documentation for more information.
"""

@@ -312,7 +317,7 @@ def into_view(self) -> pa.Table:
        return self.df.into_view()

    def __getitem__(self, key: str | list[str]) -> DataFrame:
        """Return a new :py:class`DataFrame` with the specified column or columns.
        """Return a new :py:class:`DataFrame` with the specified column or columns.

        Args:
            key: Column name or list of column names to select.

@@ -1105,21 +1110,54 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram
        return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls))

    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
        """Export an Arrow PyCapsule Stream.
        """Export the DataFrame as an Arrow C Stream.

        The DataFrame is executed using DataFusion's streaming APIs and exposed via
        Arrow's C Stream interface. Record batches are produced incrementally, so the
        full result set is never materialized in memory.

Review comment (Contributor):

    Might be good to have a link somewhere in the docstring to
    https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

Reply (Contributor Author):

    Added

        This will execute and collect the DataFrame. We will attempt to respect the
        requested schema, but only trivial transformations will be applied such as only
        returning the fields listed in the requested schema if their data types match
        those in the DataFrame.
        When ``requested_schema`` is provided, DataFusion applies only simple
        projections such as selecting a subset of existing columns or reordering
        them. Column renaming, computed expressions, or type coercion are not
        supported through this interface.

        Args:
            requested_schema: Attempt to provide the DataFrame using this schema.
            requested_schema: Either a :py:class:`pyarrow.Schema` or an Arrow C
                Schema capsule (``PyCapsule``) produced by
                ``schema._export_to_c_capsule()``. The DataFrame will attempt to
                align its output with the fields and order specified by this schema.

        Returns:
            Arrow PyCapsule object.
            Arrow ``PyCapsule`` object representing an ``ArrowArrayStream``.

        Examples:
            >>> schema = df.schema()
            >>> stream = df.__arrow_c_stream__(schema)
            >>> capsule = schema._export_to_c_capsule()
            >>> stream = df.__arrow_c_stream__(capsule)

        Notes:
            The Arrow C Data Interface PyCapsule details are documented by Apache
            Arrow and can be found at:
            https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
        """
        # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages
        # ``execute_stream_partitioned`` under the hood to stream batches while
        # preserving the original partition order.
        return self.df.__arrow_c_stream__(requested_schema)

    def __iter__(self) -> Iterator[RecordBatch]:
        """Return an iterator over this DataFrame's record batches."""
        return iter(self.execute_stream())

    def __aiter__(self) -> AsyncIterator[RecordBatch]:
        """Return an async iterator over this DataFrame's record batches.

        We're using __aiter__ because we support Python < 3.10 where aiter() is not
        available.
        """
        return self.execute_stream().__aiter__()

    def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
        """Apply a function to the current DataFrame which returns another DataFrame.

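For reference, a minimal end-to-end sketch of the iteration and ``requested_schema``
behavior added above. This is illustrative only and not taken from the patch: the query
and column names are hypothetical, and it assumes a PyArrow release whose
``RecordBatchReader.from_stream`` forwards its ``schema`` argument to
``__arrow_c_stream__`` as an Arrow C schema capsule.

    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.sql("SELECT 1 AS a, 'x' AS b")  # hypothetical illustration query

    # Plain iteration yields datafusion.RecordBatch objects lazily.
    for batch in df:
        print(batch.to_pyarrow().num_rows)

    # Request a schema that keeps only column "a" (an existing column with a
    # matching type); only such trivial projections are applied.
    subset = pa.schema([pa.field("a", pa.int64())])
    reader = pa.RecordBatchReader.from_stream(df, schema=subset)
    for batch in reader:
        assert batch.schema.names == ["a"]
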
28 changes: 24 additions & 4 deletions python/datafusion/record_batch.py
@@ -46,6 +46,26 @@ def to_pyarrow(self) -> pa.RecordBatch:
"""Convert to :py:class:`pa.RecordBatch`."""
return self.record_batch.to_pyarrow()

    def __arrow_c_array__(
        self, requested_schema: object | None = None
    ) -> tuple[object, object]:
        """Export the record batch via the Arrow C Data Interface.

        This allows zero-copy interchange with libraries that support the
        `Arrow PyCapsule interface <https://arrow.apache.org/docs/format/
        CDataInterface/PyCapsuleInterface.html>`_.

        Args:
            requested_schema: Attempt to provide the record batch using this
                schema. Only straightforward projections such as column
                selection or reordering are applied.

        Returns:
            Two Arrow PyCapsule objects representing the ``ArrowArray`` and
            ``ArrowSchema``.
        """
        return self.record_batch.__arrow_c_array__(requested_schema)

Review comment (Contributor):

    👍 👍


class RecordBatchStream:
"""This class represents a stream of record batches.
@@ -63,19 +83,19 @@ def next(self) -> RecordBatch:
        return next(self)

    async def __anext__(self) -> RecordBatch:
        """Async iterator function."""
        """Return the next :py:class:`RecordBatch` in the stream asynchronously."""
        next_batch = await self.rbs.__anext__()
        return RecordBatch(next_batch)

    def __next__(self) -> RecordBatch:
        """Iterator function."""
        """Return the next :py:class:`RecordBatch` in the stream."""
        next_batch = next(self.rbs)
        return RecordBatch(next_batch)

    def __aiter__(self) -> typing_extensions.Self:
        """Async iterator function."""
        """Return an asynchronous iterator over record batches."""
        return self

    def __iter__(self) -> typing_extensions.Self:
        """Iterator function."""
        """Return an iterator over record batches."""
        return self
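
A brief sketch of consuming the new ``RecordBatch.__arrow_c_array__`` hook. This is
illustrative only: the query is hypothetical, and it assumes a recent PyArrow whose
``pa.record_batch()`` accepts objects implementing the Arrow PyCapsule array protocol.

    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.sql("SELECT 42 AS answer")  # hypothetical illustration query

    # Take one datafusion.RecordBatch from the DataFrame's stream.
    df_batch = next(iter(df))

    # pa.record_batch() detects __arrow_c_array__ and imports the data
    # zero-copy through the Arrow C Data Interface.
    pa_batch = pa.record_batch(df_batch)
    assert pa_batch.column("answer")[0].as_py() == 42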