
Commit 53fce90

Merge branch 'main' into ultralytics-yolo
2 parents: 3994e06 + 86fc806

25 files changed (+1055, -222 lines)

.github/workflows/tests-studio.yml

Lines changed: 3 additions & 0 deletions
@@ -75,6 +75,9 @@ jobs:
           path: './backend/datachain'
           fetch-depth: 0
 
+      - name: Set up FFmpeg
+        uses: AnimMouse/setup-ffmpeg@v1
+
       - name: Set up Python ${{ matrix.pyv }}
         uses: actions/setup-python@v5
         with:

.github/workflows/tests.yml

Lines changed: 3 additions & 0 deletions
@@ -78,6 +78,9 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.pull_request.head.sha || github.ref }}
 
+      - name: Set up FFmpeg
+        uses: AnimMouse/setup-ffmpeg@v1
+
       - name: Set up Python ${{ matrix.pyv }}
         uses: actions/setup-python@v5
         with:
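
Both workflows add the same FFmpeg setup step ahead of the Python setup, so the video dependencies introduced below have a working ffmpeg binary in CI. A minimal local sanity check (a sketch; it assumes only that ffmpeg should be on PATH, as the CI step provides):

# check_ffmpeg.py -- verify ffmpeg is available, mirroring the CI setup step
import shutil
import subprocess

ffmpeg = shutil.which("ffmpeg")
assert ffmpeg is not None, "ffmpeg not found on PATH"
out = subprocess.run([ffmpeg, "-version"], capture_output=True, text=True)
print(out.stdout.splitlines()[0])  # e.g. "ffmpeg version 6.x ..."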

examples/get_started/common_sql_functions.py

Lines changed: 0 additions & 59 deletions
@@ -52,62 +52,3 @@ def num_chars_udf(file):
     .select("a", "b", "greatest", "least")
     .show(10)
 )
-
-
-"""
-Expected output:
-
-                        file  num_chars
-path
-0    dogs-and-cats/cat.1.jpg        [1]
-1   dogs-and-cats/cat.1.json        [1]
-2   dogs-and-cats/cat.10.jpg     [1, 0]
-3  dogs-and-cats/cat.10.json     [1, 0]
-4  dogs-and-cats/cat.100.jpg  [1, 0, 0]
-
-[Limited by 5 rows]
-Processed: 400 rows [00:00, 14314.30 rows/s]
-                        file  length            parts
-path
-0    dogs-and-cats/cat.1.jpg       9    [cat, 1, jpg]
-1   dogs-and-cats/cat.1.json      10   [cat, 1, json]
-2   dogs-and-cats/cat.10.jpg      10   [cat, 10, jpg]
-3  dogs-and-cats/cat.10.json      11  [cat, 10, json]
-4  dogs-and-cats/cat.100.jpg      11  [cat, 100, jpg]
-
-[Limited by 5 rows]
-Processed: 400 rows [00:00, 16364.66 rows/s]
-                        file     stem   ext
-path
-0    dogs-and-cats/cat.1.jpg    cat.1   jpg
-1   dogs-and-cats/cat.1.json    cat.1  json
-2   dogs-and-cats/cat.10.jpg   cat.10   jpg
-3  dogs-and-cats/cat.10.json   cat.10  json
-4  dogs-and-cats/cat.100.jpg  cat.100   jpg
-
-[Limited by 5 rows]
-                        file  isdog  iscat
-path
-0    dogs-and-cats/cat.1.jpg      0      1
-1   dogs-and-cats/cat.1.json      0      1
-2   dogs-and-cats/cat.10.jpg      0      1
-3  dogs-and-cats/cat.10.json      0      1
-4  dogs-and-cats/cat.100.jpg      0      1
-
-[Limited by 5 rows]
-Processed: 400 rows [00:00, 16496.93 rows/s]
-   a  b  greatest  least
-0  2  1         2      1
-1  2  1         2      1
-2  2  2         2      2
-3  2  2         2      2
-4  2  3         3      2
-5  2  3         3      2
-6  2  4         4      2
-7  2  4         4      2
-8  2  3         3      2
-9  2  3         3      2
-
-[Limited by 10 rows]
-
-"""

pyproject.toml

Lines changed: 10 additions & 2 deletions
@@ -80,8 +80,16 @@ hf = [
     "numba>=0.60.0",
     "datasets[audio,vision]>=2.21.0"
 ]
+video = [
+    # Use 'av<14' because of incompatibility with imageio
+    # See https://github.com/PyAV-Org/PyAV/discussions/1700
+    "av<14",
+    "ffmpeg-python",
+    "imageio[ffmpeg]",
+    "opencv-python"
+]
 tests = [
-    "datachain[torch,remote,vector,hf]",
+    "datachain[torch,remote,vector,hf,video]",
     "pytest>=8,<9",
     "pytest-sugar>=0.9.6",
     "pytest-cov>=4.1.0",
@@ -110,7 +118,7 @@ examples = [
     "defusedxml",
     "accelerate",
     "huggingface_hub[hf_transfer]",
-    "ultralytics==8.3.68",
+    "ultralytics==8.3.70",
     "open_clip_torch"
 ]
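
The new video extra groups the video-processing dependencies behind pip install "datachain[video]", and the tests extra pulls it in so CI exercises the video code paths. A quick post-install smoke test (a sketch, assuming the extra has been installed):

# Confirm the pinned video dependencies import cleanly.
import av            # pinned to av<14 for imageio compatibility
import cv2           # opencv-python
import ffmpeg        # ffmpeg-python
import imageio

assert int(av.__version__.split(".")[0]) < 14  # matches the pin above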

src/datachain/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -4,9 +4,14 @@
     ArrowRow,
     File,
     FileError,
+    Image,
     ImageFile,
     TarVFile,
     TextFile,
+    Video,
+    VideoFile,
+    VideoFragment,
+    VideoFrame,
 )
 from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Aggregator, Generator, Mapper
@@ -27,13 +32,18 @@
     "File",
     "FileError",
     "Generator",
+    "Image",
     "ImageFile",
     "Mapper",
     "ModelStore",
     "Session",
     "Sys",
     "TarVFile",
     "TextFile",
+    "Video",
+    "VideoFile",
+    "VideoFragment",
+    "VideoFrame",
     "is_chain_type",
     "metrics",
     "param",

src/datachain/catalog/catalog.py

Lines changed: 2 additions & 0 deletions
@@ -1209,6 +1209,8 @@ def signed_url(
         **kwargs,
     ) -> str:
         client_config = client_config or self.client_config
+        if client_config.get("anon"):
+            content_disposition = None
         client = Client.get_client(source, self.cache, **client_config)
         return client.url(
             path,
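
The guard drops the Content-Disposition override when the client is anonymous, presumably because unsigned access cannot carry a signed response-header override. A stand-alone sketch of the decision logic (a hypothetical helper, not the actual method):

from typing import Optional

def effective_content_disposition(
    client_config: dict, content_disposition: Optional[str]
) -> Optional[str]:
    # Anonymous clients cannot use the override, so drop it.
    if client_config.get("anon"):
        return None
    return content_disposition

assert effective_content_disposition({"anon": True}, "attachment") is None
assert effective_content_disposition({}, "attachment") == "attachment"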

src/datachain/cli/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -160,6 +160,7 @@ def handle_dataset_command(args, catalog):
             local=args.local,
             all=args.all,
             team=args.team,
+            latest_only=not args.versions,
         ),
         "rm": lambda: rm_dataset(
             catalog,

src/datachain/cli/commands/datasets.py

Lines changed: 59 additions & 8 deletions
@@ -12,12 +12,27 @@
 from datachain.studio import list_datasets as list_datasets_studio
 
 
+def group_dataset_versions(datasets, latest_only=True):
+    grouped = {}
+    # Sort to ensure groupby works as expected
+    # (groupby expects consecutive items with the same key)
+    for name, version in sorted(datasets):
+        grouped.setdefault(name, []).append(version)
+
+    if latest_only:
+        # For each dataset name, pick the highest version.
+        return {name: max(versions) for name, versions in grouped.items()}
+    # For each dataset name, return a sorted list of unique versions.
+    return {name: sorted(set(versions)) for name, versions in grouped.items()}
+
+
 def list_datasets(
     catalog: "Catalog",
     studio: bool = False,
     local: bool = False,
     all: bool = True,
     team: Optional[str] = None,
+    latest_only: bool = True,
 ):
     token = Config().read().get("studio", {}).get("token")
     all, local, studio = determine_flavors(studio, local, all, token)
@@ -27,15 +42,48 @@ def list_datasets(
         set(list_datasets_studio(team=team)) if (all or studio) and token else set()
     )
 
+    # Group the datasets for both local and studio sources.
+    local_grouped = group_dataset_versions(local_datasets, latest_only)
+    studio_grouped = group_dataset_versions(studio_datasets, latest_only)
+
+    # Merge all dataset names from both sources.
+    all_dataset_names = sorted(set(local_grouped.keys()) | set(studio_grouped.keys()))
+
+    datasets = []
+    if latest_only:
+        # For each dataset name, get the latest version from each source (if available).
+        for name in all_dataset_names:
+            datasets.append((name, (local_grouped.get(name), studio_grouped.get(name))))
+    else:
+        # For each dataset name, merge all versions from both sources.
+        for name in all_dataset_names:
+            local_versions = local_grouped.get(name, [])
+            studio_versions = studio_grouped.get(name, [])
+
+            # If neither source has any versions, record it as (None, None).
+            if not local_versions and not studio_versions:
+                datasets.append((name, (None, None)))
+            else:
+                # For each unique version from either source, record its presence.
+                for version in sorted(set(local_versions) | set(studio_versions)):
+                    datasets.append(
+                        (
+                            name,
+                            (
+                                version if version in local_versions else None,
+                                version if version in studio_versions else None,
+                            ),
+                        )
+                    )
+
     rows = [
         _datasets_tabulate_row(
             name=name,
-            version=version,
             both=(all or (local and studio)) and token,
-            local=(name, version) in local_datasets,
-            studio=(name, version) in studio_datasets,
+            local_version=local_version,
+            studio_version=studio_version,
         )
-        for name, version in local_datasets.union(studio_datasets)
+        for name, (local_version, studio_version) in datasets
     ]
 
     print(tabulate(rows, headers="keys"))
@@ -47,14 +95,17 @@ def list_datasets_local(catalog: "Catalog"):
         yield (d.name, v.version)
 
 
-def _datasets_tabulate_row(name, version, both, local, studio):
+def _datasets_tabulate_row(name, both, local_version, studio_version):
     row = {
         "Name": name,
-        "Version": version,
     }
     if both:
-        row["Studio"] = "\u2714" if studio else "\u2716"
-        row["Local"] = "\u2714" if local else "\u2716"
+        row["Studio"] = f"v{studio_version}" if studio_version else "\u2716"
+        row["Local"] = f"v{local_version}" if local_version else "\u2716"
+    else:
+        latest_version = local_version or studio_version
+        row["Latest Version"] = f"v{latest_version}" if latest_version else "\u2716"
+
     return row
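A quick, self-contained illustration of group_dataset_versions, with the function body copied from the diff above and hypothetical (name, version) pairs shaped like what list_datasets_local yields:

def group_dataset_versions(datasets, latest_only=True):
    grouped = {}
    for name, version in sorted(datasets):
        grouped.setdefault(name, []).append(version)
    if latest_only:
        return {name: max(versions) for name, versions in grouped.items()}
    return {name: sorted(set(versions)) for name, versions in grouped.items()}

pairs = [("cats", 1), ("cats", 2), ("dogs", 1), ("cats", 2)]
print(group_dataset_versions(pairs))                     # {'cats': 2, 'dogs': 1}
print(group_dataset_versions(pairs, latest_only=False))  # {'cats': [1, 2], 'dogs': [1]}
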
src/datachain/cli/parser/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -254,6 +254,12 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         description="List datasets.",
         formatter_class=CustomHelpFormatter,
     )
+    datasets_ls_parser.add_argument(
+        "--versions",
+        action="store_true",
+        default=False,
+        help="List all the versions of each dataset",
+    )
     datasets_ls_parser.add_argument(
         "--studio",
         action="store_true",
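
Together with the handler change in src/datachain/cli/__init__.py above, the new flag flips latest_only. A minimal sketch of that flow, using a bare argparse parser rather than the real one:

import argparse

parser = argparse.ArgumentParser(prog="datachain datasets ls")
parser.add_argument("--versions", action="store_true", default=False,
                    help="List all the versions of each dataset")

args = parser.parse_args(["--versions"])
latest_only = not args.versions  # matches latest_only=not args.versions above
print(latest_only)  # False: list every version instead of only the latest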

src/datachain/dataset.py

Lines changed: 3 additions & 0 deletions
@@ -628,6 +628,9 @@ def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
         self.versions.sort(key=lambda v: v.version)
         return self
 
+    def latest_version(self) -> DatasetListVersion:
+        return max(self.versions, key=lambda v: v.version)
+
     @property
     def is_bucket_listing(self) -> bool:
         """
