
Commit 53fce90

Merge branch 'main' into ultralytics-yolo
2 parents: 3994e06 + 86fc806

25 files changed (+1055, -222 lines)

.github/workflows/tests-studio.yml

Lines changed: 3 additions & 0 deletions
@@ -75,6 +75,9 @@ jobs:
           path: './backend/datachain'
           fetch-depth: 0
 
+      - name: Set up FFmpeg
+        uses: AnimMouse/setup-ffmpeg@v1
+
       - name: Set up Python ${{ matrix.pyv }}
         uses: actions/setup-python@v5
         with:

.github/workflows/tests.yml

Lines changed: 3 additions & 0 deletions
@@ -78,6 +78,9 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.pull_request.head.sha || github.ref }}
 
+      - name: Set up FFmpeg
+        uses: AnimMouse/setup-ffmpeg@v1
+
       - name: Set up Python ${{ matrix.pyv }}
         uses: actions/setup-python@v5
         with:
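
Both workflows add the same FFmpeg setup step ahead of the Python setup, so the video dependencies introduced below have a working ffmpeg binary in CI. A minimal local sanity check (a sketch; it assumes only that ffmpeg should be on PATH, as the CI step provides):

# check_ffmpeg.py -- verify ffmpeg is available, mirroring the CI setup step
import shutil
import subprocess

ffmpeg = shutil.which("ffmpeg")
assert ffmpeg is not None, "ffmpeg not found on PATH"
out = subprocess.run([ffmpeg, "-version"], capture_output=True, text=True)
print(out.stdout.splitlines()[0])  # e.g. "ffmpeg version 6.x ..."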

examples/get_started/common_sql_functions.py

Lines changed: 0 additions & 59 deletions
@@ -52,62 +52,3 @@ def num_chars_udf(file):
     .select("a", "b", "greatest", "least")
     .show(10)
 )
-
-
-"""
-Expected output:
-
-                        file  num_chars
-path
-0    dogs-and-cats/cat.1.jpg        [1]
-1   dogs-and-cats/cat.1.json        [1]
-2   dogs-and-cats/cat.10.jpg     [1, 0]
-3  dogs-and-cats/cat.10.json     [1, 0]
-4  dogs-and-cats/cat.100.jpg  [1, 0, 0]
-
-[Limited by 5 rows]
-Processed: 400 rows [00:00, 14314.30 rows/s]
-                        file  length            parts
-path
-0    dogs-and-cats/cat.1.jpg       9    [cat, 1, jpg]
-1   dogs-and-cats/cat.1.json      10   [cat, 1, json]
-2   dogs-and-cats/cat.10.jpg      10   [cat, 10, jpg]
-3  dogs-and-cats/cat.10.json      11  [cat, 10, json]
-4  dogs-and-cats/cat.100.jpg      11  [cat, 100, jpg]
-
-[Limited by 5 rows]
-Processed: 400 rows [00:00, 16364.66 rows/s]
-                        file     stem   ext
-path
-0    dogs-and-cats/cat.1.jpg    cat.1   jpg
-1   dogs-and-cats/cat.1.json    cat.1  json
-2   dogs-and-cats/cat.10.jpg   cat.10   jpg
-3  dogs-and-cats/cat.10.json   cat.10  json
-4  dogs-and-cats/cat.100.jpg  cat.100   jpg
-
-[Limited by 5 rows]
-                        file  isdog  iscat
-path
-0    dogs-and-cats/cat.1.jpg      0      1
-1   dogs-and-cats/cat.1.json      0      1
-2   dogs-and-cats/cat.10.jpg      0      1
-3  dogs-and-cats/cat.10.json      0      1
-4  dogs-and-cats/cat.100.jpg      0      1
-
-[Limited by 5 rows]
-Processed: 400 rows [00:00, 16496.93 rows/s]
-   a  b  greatest  least
-0  2  1         2      1
-1  2  1         2      1
-2  2  2         2      2
-3  2  2         2      2
-4  2  3         3      2
-5  2  3         3      2
-6  2  4         4      2
-7  2  4         4      2
-8  2  3         3      2
-9  2  3         3      2
-
-[Limited by 10 rows]
-
-"""

pyproject.toml

Lines changed: 10 additions & 2 deletions
@@ -80,8 +80,16 @@ hf = [
     "numba>=0.60.0",
     "datasets[audio,vision]>=2.21.0"
 ]
+video = [
+    # Use 'av<14' because of incompatibility with imageio
+    # See https://github.com/PyAV-Org/PyAV/discussions/1700
+    "av<14",
+    "ffmpeg-python",
+    "imageio[ffmpeg]",
+    "opencv-python"
+]
 tests = [
-    "datachain[torch,remote,vector,hf]",
+    "datachain[torch,remote,vector,hf,video]",
     "pytest>=8,<9",
     "pytest-sugar>=0.9.6",
     "pytest-cov>=4.1.0",
@@ -110,7 +118,7 @@ examples = [
     "defusedxml",
     "accelerate",
     "huggingface_hub[hf_transfer]",
-    "ultralytics==8.3.68",
+    "ultralytics==8.3.70",
     "open_clip_torch"
 ]
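
The new video extra groups the video-processing dependencies behind pip install "datachain[video]", and the tests extra pulls it in so CI exercises the video code paths. A quick post-install smoke test (a sketch, assuming the extra has been installed):

# Confirm the pinned video dependencies import cleanly.
import av            # pinned to av<14 for imageio compatibility
import cv2           # opencv-python
import ffmpeg        # ffmpeg-python
import imageio

assert int(av.__version__.split(".")[0]) < 14  # matches the pin above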

src/datachain/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -4,9 +4,14 @@
     ArrowRow,
     File,
     FileError,
+    Image,
     ImageFile,
     TarVFile,
     TextFile,
+    Video,
+    VideoFile,
+    VideoFragment,
+    VideoFrame,
 )
 from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Aggregator, Generator, Mapper
@@ -27,13 +32,18 @@
     "File",
     "FileError",
     "Generator",
+    "Image",
     "ImageFile",
     "Mapper",
     "ModelStore",
     "Session",
     "Sys",
     "TarVFile",
     "TextFile",
+    "Video",
+    "VideoFile",
+    "VideoFragment",
+    "VideoFrame",
     "is_chain_type",
     "metrics",
     "param",

src/datachain/catalog/catalog.py

Lines changed: 2 additions & 0 deletions
@@ -1209,6 +1209,8 @@ def signed_url(
         **kwargs,
     ) -> str:
         client_config = client_config or self.client_config
+        if client_config.get("anon"):
+            content_disposition = None
         client = Client.get_client(source, self.cache, **client_config)
         return client.url(
             path,
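
The guard drops the Content-Disposition override when the client is anonymous, presumably because unsigned access cannot carry a signed response-header override. A stand-alone sketch of the decision logic (a hypothetical helper, not the actual method):

from typing import Optional

def effective_content_disposition(
    client_config: dict, content_disposition: Optional[str]
) -> Optional[str]:
    # Anonymous clients cannot use the override, so drop it.
    if client_config.get("anon"):
        return None
    return content_disposition

assert effective_content_disposition({"anon": True}, "attachment") is None
assert effective_content_disposition({}, "attachment") == "attachment"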

src/datachain/cli/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -160,6 +160,7 @@ def handle_dataset_command(args, catalog):
             local=args.local,
             all=args.all,
             team=args.team,
+            latest_only=not args.versions,
         ),
         "rm": lambda: rm_dataset(
             catalog,

src/datachain/cli/commands/datasets.py

Lines changed: 59 additions & 8 deletions
@@ -12,12 +12,27 @@
 from datachain.studio import list_datasets as list_datasets_studio
 
 
+def group_dataset_versions(datasets, latest_only=True):
+    grouped = {}
+    # Sort to ensure groupby works as expected
+    # (groupby expects consecutive items with the same key)
+    for name, version in sorted(datasets):
+        grouped.setdefault(name, []).append(version)
+
+    if latest_only:
+        # For each dataset name, pick the highest version.
+        return {name: max(versions) for name, versions in grouped.items()}
+    # For each dataset name, return a sorted list of unique versions.
+    return {name: sorted(set(versions)) for name, versions in grouped.items()}
+
+
 def list_datasets(
     catalog: "Catalog",
     studio: bool = False,
     local: bool = False,
     all: bool = True,
     team: Optional[str] = None,
+    latest_only: bool = True,
 ):
     token = Config().read().get("studio", {}).get("token")
     all, local, studio = determine_flavors(studio, local, all, token)
@@ -27,15 +42,48 @@ def list_datasets(
         set(list_datasets_studio(team=team)) if (all or studio) and token else set()
     )
 
+    # Group the datasets for both local and studio sources.
+    local_grouped = group_dataset_versions(local_datasets, latest_only)
+    studio_grouped = group_dataset_versions(studio_datasets, latest_only)
+
+    # Merge all dataset names from both sources.
+    all_dataset_names = sorted(set(local_grouped.keys()) | set(studio_grouped.keys()))
+
+    datasets = []
+    if latest_only:
+        # For each dataset name, get the latest version from each source (if available).
+        for name in all_dataset_names:
+            datasets.append((name, (local_grouped.get(name), studio_grouped.get(name))))
+    else:
+        # For each dataset name, merge all versions from both sources.
+        for name in all_dataset_names:
+            local_versions = local_grouped.get(name, [])
+            studio_versions = studio_grouped.get(name, [])
+
+            # If neither source has any versions, record it as (None, None).
+            if not local_versions and not studio_versions:
+                datasets.append((name, (None, None)))
+            else:
+                # For each unique version from either source, record its presence.
+                for version in sorted(set(local_versions) | set(studio_versions)):
+                    datasets.append(
+                        (
+                            name,
+                            (
+                                version if version in local_versions else None,
+                                version if version in studio_versions else None,
+                            ),
+                        )
+                    )
+
     rows = [
         _datasets_tabulate_row(
             name=name,
-            version=version,
             both=(all or (local and studio)) and token,
-            local=(name, version) in local_datasets,
-            studio=(name, version) in studio_datasets,
+            local_version=local_version,
+            studio_version=studio_version,
         )
-        for name, version in local_datasets.union(studio_datasets)
+        for name, (local_version, studio_version) in datasets
     ]
 
     print(tabulate(rows, headers="keys"))
@@ -47,14 +95,17 @@ def list_datasets_local(catalog: "Catalog"):
         yield (d.name, v.version)
 
 
-def _datasets_tabulate_row(name, version, both, local, studio):
+def _datasets_tabulate_row(name, both, local_version, studio_version):
     row = {
         "Name": name,
-        "Version": version,
     }
     if both:
-        row["Studio"] = "\u2714" if studio else "\u2716"
-        row["Local"] = "\u2714" if local else "\u2716"
+        row["Studio"] = f"v{studio_version}" if studio_version else "\u2716"
+        row["Local"] = f"v{local_version}" if local_version else "\u2716"
+    else:
+        latest_version = local_version or studio_version
+        row["Latest Version"] = f"v{latest_version}" if latest_version else "\u2716"
+
     return row
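A quick, self-contained illustration of group_dataset_versions, with the function body copied from the diff above and hypothetical (name, version) pairs shaped like what list_datasets_local yields:

def group_dataset_versions(datasets, latest_only=True):
    grouped = {}
    for name, version in sorted(datasets):
        grouped.setdefault(name, []).append(version)
    if latest_only:
        return {name: max(versions) for name, versions in grouped.items()}
    return {name: sorted(set(versions)) for name, versions in grouped.items()}

pairs = [("cats", 1), ("cats", 2), ("dogs", 1), ("cats", 2)]
print(group_dataset_versions(pairs))                     # {'cats': 2, 'dogs': 1}
print(group_dataset_versions(pairs, latest_only=False))  # {'cats': [1, 2], 'dogs': [1]}
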
src/datachain/cli/parser/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -254,6 +254,12 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         description="List datasets.",
         formatter_class=CustomHelpFormatter,
     )
+    datasets_ls_parser.add_argument(
+        "--versions",
+        action="store_true",
+        default=False,
+        help="List all the versions of each dataset",
+    )
     datasets_ls_parser.add_argument(
         "--studio",
         action="store_true",
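
Together with the handler change in src/datachain/cli/__init__.py above, the new flag flips latest_only. A minimal sketch of that flow, using a bare argparse parser rather than the real one:

import argparse

parser = argparse.ArgumentParser(prog="datachain datasets ls")
parser.add_argument("--versions", action="store_true", default=False,
                    help="List all the versions of each dataset")

args = parser.parse_args(["--versions"])
latest_only = not args.versions  # matches latest_only=not args.versions above
print(latest_only)  # False: list every version instead of only the latest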

src/datachain/dataset.py

Lines changed: 3 additions & 0 deletions
@@ -628,6 +628,9 @@ def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
         self.versions.sort(key=lambda v: v.version)
         return self
 
+    def latest_version(self) -> DatasetListVersion:
+        return max(self.versions, key=lambda v: v.version)
+
     @property
     def is_bucket_listing(self) -> bool:
         """
