Skip to content

Download using xet #1305

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
May 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d5529b1
Download using xet
coyotte508 Mar 21, 2025
f9b2761
fix file-download-info test
coyotte508 Mar 21, 2025
7c6b0a0
safetensors
coyotte508 Mar 21, 2025
1e6c638
load etag & size from xet payload
coyotte508 Mar 21, 2025
75b51b0
Merge branch 'main' into download-xet
coyotte508 Mar 22, 2025
b0b58db
Merge remote-tracking branch 'origin/main' into download-xet
coyotte508 Mar 24, 2025
b02c4e3
fix file-download-info
coyotte508 Mar 24, 2025
c4f5e40
update download-file tests
coyotte508 Mar 24, 2025
10761d4
doc
coyotte508 Mar 24, 2025
290d086
fix mocked tests
coyotte508 Mar 24, 2025
e604d27
fix E2E
coyotte508 Mar 24, 2025
f1c81be
Merge branch 'main' into download-xet
coyotte508 Apr 2, 2025
f0c8024
merge main
coyotte508 Apr 8, 2025
6a1d819
fix test again?
coyotte508 Apr 8, 2025
a698ecf
enable browser tests again
coyotte508 Apr 8, 2025
8a7b415
download web blob with access token
coyotte508 Apr 8, 2025
01789b5
fixup! download web blob with access token
coyotte508 Apr 8, 2025
411ed2d
reorder
coyotte508 Apr 8, 2025
6ab4ad1
test to download private file
coyotte508 Apr 8, 2025
ba9990f
throw errors in WebBlob
coyotte508 Apr 8, 2025
17d17a9
fixup! throw errors in WebBlob
coyotte508 Apr 8, 2025
754550e
Merge branch 'main' into download-xet
coyotte508 Apr 29, 2025
bbd15d1
remove 10s timeout
coyotte508 Apr 29, 2025
12ca93d
remove more timeouts
coyotte508 Apr 29, 2025
60af372
Fix E2Es maybe cc @SBrandeis @hanouticelina
coyotte508 Apr 29, 2025
f8d10cf
Merge branch 'main' into download-xet
coyotte508 Apr 29, 2025
b4edda3
increase timeout for browser
coyotte508 Apr 29, 2025
cd1ae6f
debug json format for file info in browser mode
coyotte508 Apr 29, 2025
7bd33f8
Merge remote-tracking branch 'origin/main' into download-xet
coyotte508 Apr 30, 2025
3786afc
use headers now
coyotte508 Apr 30, 2025
5030616
lint
coyotte508 Apr 30, 2025
bbc3a76
remove extra dev dep
coyotte508 Apr 30, 2025
e559afe
Merge remote-tracking branch 'origin/main' into download-xet
coyotte508 May 5, 2025
524e987
Use X-Linked-Etag header in priority
coyotte508 May 5, 2025
5e0a117
Merge branch 'main' into download-xet
coyotte508 May 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@ It's not a hard requirement, but please consider using an icon from [Gitmoji](ht

If you want to run only specific tests, you can do `pnpm test -- -t "test name"`.

You can also do `npx vitest ./packages/hub/src/utils/XetBlob.spec.ts` to run a specific test file.

Or `cd packages/hub && npx vitest --browser.name=chrome --browser.headless --config vitest-browser.config.mts ./src/utils/XetBlob.spec.ts` to run browser tests on a specific file
You can also do `pnpm --filter hub test ./src/utils/XetBlob.spec.ts` to run a specific test file.

## Adding a package

Expand Down
3 changes: 2 additions & 1 deletion e2e/ts/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
package-lock.json
package-lock.json
pnpm-lock.yaml
3 changes: 0 additions & 3 deletions packages/hub/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,6 @@
],
"author": "Hugging Face",
"license": "MIT",
"devDependencies": {
"@types/node": "^20.11.28"
},
"dependencies": {
"@huggingface/tasks": "workspace:^"
}
Expand Down
17 changes: 0 additions & 17 deletions packages/hub/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 1 addition & 5 deletions packages/hub/src/lib/commit.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ describe("commit", () => {

try {
const readme1 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL });
assert.strictEqual(readme1?.status, 200);
assert(readme1, "Readme doesn't exist");

const nodeOperation: CommitFile[] = isFrontend
? []
Expand Down Expand Up @@ -77,11 +77,9 @@ describe("commit", () => {
});

const fileContent = await downloadFile({ repo, path: "test.txt", hubUrl: TEST_HUB_URL });
assert.strictEqual(fileContent?.status, 200);
assert.strictEqual(await fileContent?.text(), "This is me");

const lfsFileContent = await downloadFile({ repo, path: "test.lfs.txt", hubUrl: TEST_HUB_URL });
assert.strictEqual(lfsFileContent?.status, 200);
assert.strictEqual(await lfsFileContent?.text(), lfsContent);

const lfsFileUrl = `${TEST_HUB_URL}/${repoName}/raw/main/test.lfs.txt`;
Expand All @@ -98,15 +96,13 @@ size ${lfsContent.length}

if (!isFrontend) {
const fileUrlContent = await downloadFile({ repo, path: "tsconfig.json", hubUrl: TEST_HUB_URL });
assert.strictEqual(fileUrlContent?.status, 200);
assert.strictEqual(
await fileUrlContent?.text(),
(await import("node:fs")).readFileSync("./tsconfig.json", "utf-8")
);
}

const webResourceContent = await downloadFile({ repo, path: "lamaral.json", hubUrl: TEST_HUB_URL });
assert.strictEqual(webResourceContent?.status, 200);
assert.strictEqual(await webResourceContent?.text(), await (await fetch(tokenizerJsonUrl)).text());

const readme2 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL });
Expand Down
31 changes: 22 additions & 9 deletions packages/hub/src/lib/download-file-to-cache-dir.spec.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
import { expect, test, describe, vi, beforeEach } from "vitest";
import type { RepoDesignation, RepoId } from "../types/public";
import { dirname, join } from "node:path";
import { lstat, mkdir, stat, symlink, writeFile, rename } from "node:fs/promises";
import { lstat, mkdir, stat, symlink, rename } from "node:fs/promises";
import { pathsInfo } from "./paths-info";
import type { Stats } from "node:fs";
import { createWriteStream, type Stats } from "node:fs";
import { getHFHubCachePath, getRepoFolderName } from "./cache-management";
import { toRepoId } from "../utils/toRepoId";
import { downloadFileToCacheDir } from "./download-file-to-cache-dir";
import { createSymlink } from "../utils/symlink";

vi.mock("node:fs/promises", () => ({
writeFile: vi.fn(),
rename: vi.fn(),
symlink: vi.fn(),
lstat: vi.fn(),
mkdir: vi.fn(),
stat: vi.fn(),
}));

vi.mock("node:fs", () => ({
createWriteStream: vi.fn(),
}));

vi.mock("./paths-info", () => ({
pathsInfo: vi.fn(),
}));
Expand Down Expand Up @@ -63,11 +66,15 @@ describe("downloadFileToCacheDir", () => {
beforeEach(() => {
vi.resetAllMocks();
// mock 200 request
vi.mocked(fetchMock).mockResolvedValue({
status: 200,
ok: true,
body: "dummy-body",
} as unknown as Response);
vi.mocked(fetchMock).mockResolvedValue(
new Response("dummy-body", {
status: 200,
headers: {
etag: DUMMY_ETAG,
"Content-Range": "bytes 0-54/55",
},
})
);

// prevent the cache from being used
vi.mocked(stat).mockRejectedValue(new Error("Do not exists"));
Expand Down Expand Up @@ -235,6 +242,9 @@ describe("downloadFileToCacheDir", () => {
},
]);

// eslint-disable-next-line @typescript-eslint/no-explicit-any
vi.mocked(createWriteStream).mockReturnValue(async function* () {} as any);

const output = await downloadFileToCacheDir({
repo: DUMMY_REPO,
path: "/README.md",
Expand Down Expand Up @@ -276,6 +286,9 @@ describe("downloadFileToCacheDir", () => {
},
]);

// eslint-disable-next-line @typescript-eslint/no-explicit-any
vi.mocked(createWriteStream).mockReturnValue(async function* () {} as any);

await downloadFileToCacheDir({
repo: DUMMY_REPO,
path: "/README.md",
Expand All @@ -284,7 +297,7 @@ describe("downloadFileToCacheDir", () => {

const incomplete = `${expectedBlob}.incomplete`;
// 1. should write fetch#response#body to incomplete file
expect(writeFile).toHaveBeenCalledWith(incomplete, "dummy-body");
expect(createWriteStream).toHaveBeenCalledWith(incomplete);
// 2. should rename the incomplete file to the expected blob name
expect(rename).toHaveBeenCalledWith(incomplete, expectedBlob);
// 3. should create symlink pointing to blob
Expand Down
15 changes: 10 additions & 5 deletions packages/hub/src/lib/download-file-to-cache-dir.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import { getHFHubCachePath, getRepoFolderName } from "./cache-management";
import { dirname, join } from "node:path";
import { writeFile, rename, lstat, mkdir, stat } from "node:fs/promises";
import { rename, lstat, mkdir, stat } from "node:fs/promises";
import type { CommitInfo, PathInfo } from "./paths-info";
import { pathsInfo } from "./paths-info";
import type { CredentialsParams, RepoDesignation } from "../types/public";
import { toRepoId } from "../utils/toRepoId";
import { downloadFile } from "./download-file";
import { createSymlink } from "../utils/symlink";
import { Readable } from "node:stream";
import type { ReadableStream } from "node:stream/web";
import { pipeline } from "node:stream/promises";
import { createWriteStream } from "node:fs";

export const REGEX_COMMIT_HASH: RegExp = new RegExp("^[0-9a-f]{40}$");

Expand Down Expand Up @@ -115,15 +119,16 @@ export async function downloadFileToCacheDir(
const incomplete = `${blobPath}.incomplete`;
console.debug(`Downloading ${params.path} to ${incomplete}`);

const response: Response | null = await downloadFile({
const blob: Blob | null = await downloadFile({
...params,
revision: commitHash,
});

if (!response || !response.ok || !response.body) throw new Error(`invalid response for file ${params.path}`);
if (!blob) {
throw new Error(`invalid response for file ${params.path}`);
}

// @ts-expect-error resp.body is a Stream, but Stream in internal to node
await writeFile(incomplete, response.body);
await pipeline(Readable.fromWeb(blob.stream() as ReadableStream), createWriteStream(incomplete));

// rename the .incomplete file to the expected blob path
await rename(incomplete, blobPath);
Expand Down
115 changes: 66 additions & 49 deletions packages/hub/src/lib/download-file.spec.ts
Original file line number Diff line number Diff line change
@@ -1,65 +1,82 @@
import { expect, test, describe, vi } from "vitest";
import { expect, test, describe, assert } from "vitest";
import { downloadFile } from "./download-file";
import type { RepoId } from "../types/public";

const DUMMY_REPO: RepoId = {
name: "hello-world",
type: "model",
};
import { deleteRepo } from "./delete-repo";
import { createRepo } from "./create-repo";
import { TEST_ACCESS_TOKEN, TEST_HUB_URL, TEST_USER } from "../test/consts";
import { insecureRandomString } from "../utils/insecureRandomString";

describe("downloadFile", () => {
test("hubUrl params should overwrite HUB_URL", async () => {
const fetchMock: typeof fetch = vi.fn();
vi.mocked(fetchMock).mockResolvedValue({
status: 200,
ok: true,
} as Response);
test("should download regular file", async () => {
const blob = await downloadFile({
repo: {
type: "model",
name: "openai-community/gpt2",
},
path: "README.md",
});

const text = await blob?.slice(0, 1000).text();
assert(
text?.includes(`---
language: en
tags:
- exbert

license: mit
---


# GPT-2

await downloadFile({
repo: DUMMY_REPO,
path: "/README.md",
hubUrl: "http://dummy-hub",
fetch: fetchMock,
Test the whole generation capabilities here: https://transformer.huggingface.co/doc/gpt2-large`)
);
});
test("should downoad xet file", async () => {
const blob = await downloadFile({
repo: {
type: "model",
name: "celinah/xet-experiments",
},
path: "large_text.txt",
});

expect(fetchMock).toHaveBeenCalledWith("http://dummy-hub/hello-world/resolve/main//README.md", expect.anything());
const text = await blob?.slice(0, 100).text();
expect(text).toMatch("this is a text file.".repeat(10).slice(0, 100));
});

test("raw params should use raw url", async () => {
const fetchMock: typeof fetch = vi.fn();
vi.mocked(fetchMock).mockResolvedValue({
status: 200,
ok: true,
} as Response);
test("should download private file", async () => {
const repoName = `datasets/${TEST_USER}/TEST-${insecureRandomString()}`;

await downloadFile({
repo: DUMMY_REPO,
path: "README.md",
raw: true,
fetch: fetchMock,
const result = await createRepo({
accessToken: TEST_ACCESS_TOKEN,
hubUrl: TEST_HUB_URL,
private: true,
repo: repoName,
files: [{ path: ".gitattributes", content: new Blob(["*.html filter=lfs diff=lfs merge=lfs -text"]) }],
});

expect(fetchMock).toHaveBeenCalledWith("https://huggingface.co/hello-world/raw/main/README.md", expect.anything());
});
assert.deepStrictEqual(result, {
repoUrl: `${TEST_HUB_URL}/${repoName}`,
});

try {
const blob = await downloadFile({
repo: repoName,
path: ".gitattributes",
hubUrl: TEST_HUB_URL,
accessToken: TEST_ACCESS_TOKEN,
});

test("internal server error should propagate the error", async () => {
const fetchMock: typeof fetch = vi.fn();
vi.mocked(fetchMock).mockResolvedValue({
status: 500,
ok: false,
headers: new Map<string, string>([["Content-Type", "application/json"]]),
json: () => ({
error: "Dummy internal error",
}),
} as unknown as Response);
assert(blob, "File should be found");

await expect(async () => {
await downloadFile({
repo: DUMMY_REPO,
path: "README.md",
raw: true,
fetch: fetchMock,
const text = await blob?.text();
assert.strictEqual(text, "*.html filter=lfs diff=lfs merge=lfs -text");
} finally {
await deleteRepo({
repo: repoName,
hubUrl: TEST_HUB_URL,
accessToken: TEST_ACCESS_TOKEN,
});
}).rejects.toThrowError("Dummy internal error");
}
});
});
Loading