feat(data): support list[str] URI columns in download() expression

Aydin-ab · Aydin-ab · commit 9b2e2501aec6 · 2026-06-15T20:11:30.000-07:00
The row-level download() expression only accepted a scalar str URI per row.
Rows that carry multiple files (e.g. a video row with N S3 frame paths) had
to hand-roll a per-row ThreadPoolExecutor. Accept a list<string> column
(also large_list / fixed_size_list of (large_)string): flatten every row's
URIs into one flat list, run them through the existing concurrent downloader
in a single pool, then re-nest into a list<binary> column preserving per-row
length and order (empty list -> [], null cell -> null, failed download ->
None in place).

Additive: the scalar str path is unchanged -- every list branch is gated
behind is_uri_list_column, which is false for scalar columns. Both the
obstore and PyArrow-threaded download paths and the partition actor are made
list-aware, and the range-split hidden-size-column optimization is deferred
for list columns.

Signed-off-by: Aydin Abiar <aydin@anyscale.com>
diff --git a/python/ray/data/_internal/planner/_download_list_utils.py b/python/ray/data/_internal/planner/_download_list_utils.py
@@ -0,0 +1,86 @@
+"""Helpers for download columns whose cells are lists of URIs.
+
+A scalar download column holds one URI string per row; a list column (e.g. the
+frame paths of one video) holds many. These helpers flatten a ``list<string>``
+column into a single flat URI list so it runs through the same concurrent
+downloader as the scalar path, then re-nest the downloaded bytes back into a
+``list<binary>`` column with the original per-row shape and order.
+"""
+from typing import List, Optional, Tuple
+
+import pyarrow as pa
+
+
+def is_uri_list_column(arrow_type: "pa.DataType") -> bool:
+    """Tell whether ``arrow_type`` is a list-of-strings (multi-URI) type.
+
+    True only for a ``list`` / ``large_list`` / ``fixed_size_list`` whose value
+    type is ``string`` or ``large_string``; scalar string types give ``False``.
+    """
+    types = pa.types
+    if (
+        types.is_list(arrow_type)
+        or types.is_large_list(arrow_type)
+        or types.is_fixed_size_list(arrow_type)
+    ):
+        inner = arrow_type.value_type
+        return types.is_string(inner) or types.is_large_string(inner)
+    return False
+
+
+def first_inner_uri(column: "pa.ChunkedArray") -> Optional[str]:
+    """Return the first non-null inner URI in a list<string> column, or ``None``.
+
+    Only used to pick the download path from a URI's scheme. Scans ``flatten()``
+    rather than ``values``: ``values`` ignores a sliced chunk's offset and keeps
+    entries under null cells, so it could return a URI outside the visible rows.
+    """
+    for chunk in column.iterchunks():
+        for value in chunk.flatten():
+            if value.is_valid:
+                return value.as_py()
+    return None
+
+
+def flatten_uri_list(
+    column: "pa.ChunkedArray",
+) -> Tuple[List[Optional[str]], List[Optional[int]]]:
+    """Unroll a list<string> URI column into one URI list plus row lengths.
+
+    The result is ``(flat_uris, row_lengths)``. ``flat_uris`` holds each row's
+    URIs back to back in row order, with null inner elements kept as ``None``
+    so positions still match the bytes downloaded later. ``row_lengths[i]`` is
+    the number of URIs in row ``i``, or ``None`` when that cell is null. Pair
+    with :func:`renest_downloaded_bytes`.
+    """
+    cells = column.to_pylist()
+    row_lengths: List[Optional[int]] = [
+        None if cell is None else len(cell) for cell in cells
+    ]
+    flat_uris: List[Optional[str]] = [
+        uri for cell in cells if cell is not None for uri in cell
+    ]
+    return flat_uris, row_lengths
+
+
+def renest_downloaded_bytes(
+    flat_bytes: List[Optional[bytes]], row_lengths: List[Optional[int]]
+) -> "pa.Array":
+    """Rebuild a ``list<binary>`` column from flat downloaded bytes.
+
+    Walks ``row_lengths`` and cuts ``flat_bytes`` into one inner list per row:
+    a ``None`` length becomes a null cell and consumes nothing, ``0`` becomes
+    an empty list, and any other length takes that many consecutive entries in
+    order. ``None`` entries in ``flat_bytes`` are carried over unchanged. The
+    result type is always ``list<binary>``, even when every row is empty or
+    null, because the type is passed explicitly instead of being inferred.
+    """
+    rows: List[Optional[List[Optional[bytes]]]] = []
+    cursor = 0
+    for count in row_lengths:
+        if count is not None:
+            rows.append(flat_bytes[cursor : cursor + count])
+            cursor += count
+        else:
+            rows.append(None)
+    return pa.array(rows, type=pa.list_(pa.binary()))
diff --git a/python/ray/data/_internal/planner/_obstore_download.py b/python/ray/data/_internal/planner/_obstore_download.py
@@ -9,6 +9,12 @@
 import pyarrow as pa
 import pyarrow.fs
 
+from ray.data._internal.planner._download_list_utils import (
+    first_inner_uri,
+    flatten_uri_list,
+    is_uri_list_column,
+    renest_downloaded_bytes,
+)
 from ray.data._internal.util import (
     RetryingPyFileSystem,
     _iter_arrow_table_for_target_max_block_size,
@@ -380,17 +386,23 @@ def download_bytes_async(
     if not isinstance(block, pa.Table):
         block = BlockAccessor.for_block(block).to_arrow()
 
-    first_uris = block.column(uri_column_names[0]).to_pylist()
-    if not first_uris:
-        yield block
-        return
+    first_column = block.column(uri_column_names[0])
+    if is_uri_list_column(first_column.type):
+        # list<string> column: peek the first inner URI for scheme detection.
+        first_uri = first_inner_uri(first_column)
+    else:
+        first_uris = first_column.to_pylist()
+        if not first_uris:
+            yield block
+            return
+        first_uri = first_uris[0]
 
     # Fall back to PyArrow for URI schemes obstore doesn't handle.
-    if not _is_obstore_supported_url(first_uris[0]):
+    if not _is_obstore_supported_url(first_uri):
         logger.debug(
             "URI scheme not supported by obstore (first URI: %s); "
             "falling back to PyArrow threaded download.",
-            first_uris[0],
+            first_uri,
         )
         yield from _yield_threaded_download_bytes(
             block,
@@ -420,7 +432,30 @@ def download_bytes_async(
     for uri_column_name, output_bytes_column_name in zip(
         uri_column_names, output_bytes_column_names
     ):
-        uris = output_block.column(uri_column_name).to_pylist()
+        column = output_block.column(uri_column_name)
+
+        if is_uri_list_column(column.type):
+            # Flatten every row's URIs into one flat list, download it through
+            # the same concurrent engine, then re-nest preserving per-row shape.
+            # List columns carry no __ray_file_size__ column (see
+            # AsyncPartitionActor), so no precomputed sizes are passed.
+            flat_uris, row_lengths = flatten_uri_list(column)
+            flat_bytes = (
+                asyncio.run(
+                    _download_uris_with_obstore(
+                        flat_uris, uri_column_name, filesystem=filesystem
+                    )
+                )
+                if flat_uris
+                else []
+            )
+            output_block = output_block.append_column(
+                output_bytes_column_name,
+                renest_downloaded_bytes(flat_bytes, row_lengths),
+            )
+            continue
+
+        uris = column.to_pylist()
 
         if not uris:
             continue
diff --git a/python/ray/data/_internal/planner/download_partition_actor.py b/python/ray/data/_internal/planner/download_partition_actor.py
@@ -8,6 +8,10 @@
 import pyarrow.fs as pafs
 from typing_extensions import override
 
+from ray.data._internal.planner._download_list_utils import (
+    flatten_uri_list,
+    is_uri_list_column,
+)
 from ray.data._internal.planner._obstore_download import (
     _FILE_SIZE_COLUMN_PREFIX,
     RAY_DATA_OBSTORE_RANGE_THRESHOLD,
@@ -73,11 +77,37 @@ def _partition_and_yield(self, block: pa.Table) -> Iterator[pa.Table]:
     def _sampled_file_sizes_for_partition_estimate(
         self, block: pa.Table, uri_column_name: str
     ) -> List[Optional[int]]:
-        uris = block.column(uri_column_name).to_pylist()
+        column = block.column(uri_column_name)
+        if is_uri_list_column(column.type):
+            return self._sampled_list_row_sizes(column)
+        uris = column.to_pylist()
         sample_uris = uris[: self.INIT_SAMPLE_BATCH_SIZE]
         # ``_sample_sizes`` returns concrete ``int``s; widen for this API.
         return cast(List[Optional[int]], self._sample_sizes(sample_uris))
 
+    def _sampled_list_row_sizes(
+        self, column: "pa.ChunkedArray"
+    ) -> List[Optional[int]]:
+        """Estimate per-row download sizes for a list<string> URI column.
+
+        Takes the first ``INIT_SAMPLE_BATCH_SIZE`` rows, looks up the size of
+        every inner URI via ``_sample_sizes``, and sums those sizes per row. A
+        null or empty cell gets an estimate of 0. The returned list has one
+        entry per sampled row.
+        """
+        head = column.slice(0, self.INIT_SAMPLE_BATCH_SIZE)
+        uris, lengths = flatten_uri_list(head)
+        # NOTE(review): null inner URIs are passed on as ``None`` -- confirm
+        # ``_sample_sizes`` tolerates them.
+        sizes = self._sample_sizes(uris)
+        estimates: List[Optional[int]] = []
+        cursor = 0
+        for count in lengths:
+            n = 0 if count is None else count  # null cell: no URIs consumed
+            estimates.append(sum(sizes[cursor : cursor + n]))
+            cursor += n
+        return estimates
+
     def _estimate_nrows_per_partition(self, block: pa.Table) -> int:
         sampled_file_sizes_by_column = {}
         for uri_column_name in self._uri_column_names:
@@ -214,8 +244,13 @@ def __call__(self, block: pa.Table) -> Iterator[pa.Table]:
         self._validate_uri_columns(block)
 
         if block.num_rows > 0 and RAY_DATA_OBSTORE_RANGE_THRESHOLD > 0:
-            first_uri = block.column(self._uri_column_names[0])[0].as_py()
-            if _is_obstore_supported_url(first_uri):
+            first_column = block.column(self._uri_column_names[0])
+            # Range-split size hints assume scalar string cells; skip the probe
+            # for list URI columns (their sizes are sampled without a hidden
+            # size column).
+            if not is_uri_list_column(
+                first_column.type
+            ) and _is_obstore_supported_url(first_column[0].as_py()):
                 block = self._attach_file_sizes(block)
 
         yield from self._partition_and_yield(block)
@@ -289,8 +324,14 @@ def _attach_file_sizes(self, block: pa.Table) -> pa.Table:
         download path falls back to HEAD via obstore.
         """
         for uri_column_name in self._uri_column_names:
+            column = block.column(uri_column_name)
+            if is_uri_list_column(column.type):
+                # Defer the range-split size-hint optimization for list URI
+                # columns: the download path samples their sizes directly and
+                # does not expect a hidden size column.
+                continue
             size_col = f"{_FILE_SIZE_COLUMN_PREFIX}{uri_column_name}"
-            uris = block.column(uri_column_name).to_pylist()
+            uris = column.to_pylist()
             # Fetches all file sizes (not just a sample).
             sizes = self._sample_sizes(uris)
             block = block.append_column(size_col, pa.array(sizes, type=pa.int64()))
diff --git a/python/ray/data/_internal/planner/plan_download_op.py b/python/ray/data/_internal/planner/plan_download_op.py
@@ -15,6 +15,11 @@
 )
 from ray.data._internal.logical.operators import Download
 from ray.data._internal.output_buffer import OutputBlockSizeOption
+from ray.data._internal.planner._download_list_utils import (
+    flatten_uri_list,
+    is_uri_list_column,
+    renest_downloaded_bytes,
+)
 from ray.data._internal.planner._obstore_download import (
     OBSTORE_AVAILABLE,
     _log_fallback_warning,
@@ -192,10 +197,25 @@ def download_bytes_threaded(
     for uri_column_name, output_bytes_column_name in zip(
         uri_column_names, output_bytes_column_names
     ):
-        # Extract URIs from PyArrow table
-        uris = output_block.column(uri_column_name).to_pylist()
+        # Extract URIs from PyArrow table. For a list<string> column, flatten
+        # every row's URIs into one flat list (tracked by row_lengths) so they
+        # all run through the same concurrent pool, then re-nest below.
+        column = output_block.column(uri_column_name)
+        is_list = is_uri_list_column(column.type)
+        if is_list:
+            uris, row_lengths = flatten_uri_list(column)
+        else:
+            uris = column.to_pylist()
 
         if len(uris) == 0:
+            if is_list:
+                # Rows exist but hold only empty/null lists: still append the
+                # re-nested (empty/null) list<binary> column so the output schema
+                # stays consistent with blocks that did download bytes.
+                output_block = output_block.append_column(
+                    output_bytes_column_name,
+                    renest_downloaded_bytes([], row_lengths),
+                )
             continue
 
         def load_uri_bytes(uri_iterator):
@@ -253,11 +273,17 @@ def load_uri_bytes(uri_iterator):
             )
         )
 
-        # Add the new column to the PyArrow table
-        output_block = output_block.add_column(
-            len(output_block.column_names),
+        # Add the new column to the PyArrow table. For a list column, re-nest
+        # the flat bytes back into one inner list per row (preserving length and
+        # order); failed downloads stay None in place.
+        new_column = (
+            renest_downloaded_bytes(uri_bytes, row_lengths)
+            if is_list
+            else pa.array(uri_bytes)
+        )
+        output_block = output_block.append_column(
             output_bytes_column_name,
-            pa.array(uri_bytes),
+            new_column,
         )
 
     yield from _iter_arrow_table_for_target_max_block_size(
diff --git a/python/ray/data/expressions.py b/python/ray/data/expressions.py
@@ -1487,6 +1487,11 @@ class DownloadExpr(Expr):
 
     uri_column_name: str
     filesystem: "pyarrow.fs.FileSystem" = None
+    # Nominal type only; unused on the download path. ``with_column`` lowers a
+    # ``DownloadExpr`` to a ``Download`` op (see ``Dataset.with_column``) whose
+    # ``infer_schema`` returns the input schema, so the output column's real type
+    # comes from the produced blocks: ``binary`` for a scalar URI column,
+    # ``list<binary>`` for a ``list<string>`` one.
     data_type: DataType = field(default_factory=lambda: DataType.binary(), init=False)
 
     def structurally_equals(self, other: Any) -> bool:
diff --git a/python/ray/data/tests/test_download_expression.py b/python/ray/data/tests/test_download_expression.py