Skip to content

Download using xet #1305

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@ It's not a hard requirement, but please consider using an icon from [Gitmoji](ht

If you want to run only specific tests, you can do `pnpm test -- -t "test name"`.

You can also do `npx vitest ./packages/hub/src/utils/XetBlob.spec.ts` to run a specific test file.

Or `cd packages/hub && npx vitest --browser.name=chrome --browser.headless --config vitest-browser.config.mts ./src/utils/XetBlob.spec.ts` to run browser tests on a specific file
You can also do `pnpm --filter hub test ./src/utils/XetBlob.spec.ts` to run a specific test file.

## Adding a package

Expand Down
6 changes: 1 addition & 5 deletions packages/hub/src/lib/commit.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ describe("commit", () => {

try {
const readme1 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL });
assert.strictEqual(readme1?.status, 200);
assert(readme1, "Readme doesn't exist");

const nodeOperation: CommitFile[] = isFrontend
? []
Expand Down Expand Up @@ -77,11 +77,9 @@ describe("commit", () => {
});

const fileContent = await downloadFile({ repo, path: "test.txt", hubUrl: TEST_HUB_URL });
assert.strictEqual(fileContent?.status, 200);
assert.strictEqual(await fileContent?.text(), "This is me");

const lfsFileContent = await downloadFile({ repo, path: "test.lfs.txt", hubUrl: TEST_HUB_URL });
assert.strictEqual(lfsFileContent?.status, 200);
assert.strictEqual(await lfsFileContent?.text(), lfsContent);

const lfsFileUrl = `${TEST_HUB_URL}/${repoName}/raw/main/test.lfs.txt`;
Expand All @@ -98,15 +96,13 @@ size ${lfsContent.length}

if (!isFrontend) {
const fileUrlContent = await downloadFile({ repo, path: "tsconfig.json", hubUrl: TEST_HUB_URL });
assert.strictEqual(fileUrlContent?.status, 200);
assert.strictEqual(
await fileUrlContent?.text(),
(await import("node:fs")).readFileSync("./tsconfig.json", "utf-8")
);
}

const webResourceContent = await downloadFile({ repo, path: "lamaral.json", hubUrl: TEST_HUB_URL });
assert.strictEqual(webResourceContent?.status, 200);
assert.strictEqual(await webResourceContent?.text(), await (await fetch(tokenizerJsonUrl)).text());

const readme2 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL });
Expand Down
31 changes: 22 additions & 9 deletions packages/hub/src/lib/download-file-to-cache-dir.spec.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
import { expect, test, describe, vi, beforeEach } from "vitest";
import type { RepoDesignation, RepoId } from "../types/public";
import { dirname, join } from "node:path";
import { lstat, mkdir, stat, symlink, writeFile, rename } from "node:fs/promises";
import { lstat, mkdir, stat, symlink, rename } from "node:fs/promises";
import { pathsInfo } from "./paths-info";
import type { Stats } from "node:fs";
import { createWriteStream, type Stats } from "node:fs";
import { getHFHubCachePath, getRepoFolderName } from "./cache-management";
import { toRepoId } from "../utils/toRepoId";
import { downloadFileToCacheDir } from "./download-file-to-cache-dir";
import { createSymlink } from "../utils/symlink";

vi.mock("node:fs/promises", () => ({
writeFile: vi.fn(),
rename: vi.fn(),
symlink: vi.fn(),
lstat: vi.fn(),
mkdir: vi.fn(),
stat: vi.fn(),
}));

vi.mock("node:fs", () => ({
createWriteStream: vi.fn(),
}));

vi.mock("./paths-info", () => ({
pathsInfo: vi.fn(),
}));
Expand Down Expand Up @@ -63,11 +66,15 @@ describe("downloadFileToCacheDir", () => {
beforeEach(() => {
vi.resetAllMocks();
// mock 200 request
vi.mocked(fetchMock).mockResolvedValue({
status: 200,
ok: true,
body: "dummy-body",
} as unknown as Response);
vi.mocked(fetchMock).mockResolvedValue(
new Response("dummy-body", {
status: 200,
headers: {
etag: DUMMY_ETAG,
"Content-Range": "bytes 0-54/55",
},
})
);

// prevent the cache from being used
vi.mocked(stat).mockRejectedValue(new Error("Do not exists"));
Expand Down Expand Up @@ -235,6 +242,9 @@ describe("downloadFileToCacheDir", () => {
},
]);

// eslint-disable-next-line @typescript-eslint/no-explicit-any
vi.mocked(createWriteStream).mockReturnValue(async function* () {} as any);

const output = await downloadFileToCacheDir({
repo: DUMMY_REPO,
path: "/README.md",
Expand Down Expand Up @@ -276,6 +286,9 @@ describe("downloadFileToCacheDir", () => {
},
]);

// eslint-disable-next-line @typescript-eslint/no-explicit-any
vi.mocked(createWriteStream).mockReturnValue(async function* () {} as any);

await downloadFileToCacheDir({
repo: DUMMY_REPO,
path: "/README.md",
Expand All @@ -284,7 +297,7 @@ describe("downloadFileToCacheDir", () => {

const incomplete = `${expectedBlob}.incomplete`;
// 1. should write fetch#response#body to incomplete file
expect(writeFile).toHaveBeenCalledWith(incomplete, "dummy-body");
expect(createWriteStream).toHaveBeenCalledWith(incomplete);
// 2. should rename the incomplete to the blob expected name
expect(rename).toHaveBeenCalledWith(incomplete, expectedBlob);
// 3. should create symlink pointing to blob
Expand Down
15 changes: 10 additions & 5 deletions packages/hub/src/lib/download-file-to-cache-dir.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import { getHFHubCachePath, getRepoFolderName } from "./cache-management";
import { dirname, join } from "node:path";
import { writeFile, rename, lstat, mkdir, stat } from "node:fs/promises";
import { rename, lstat, mkdir, stat } from "node:fs/promises";
import type { CommitInfo, PathInfo } from "./paths-info";
import { pathsInfo } from "./paths-info";
import type { CredentialsParams, RepoDesignation } from "../types/public";
import { toRepoId } from "../utils/toRepoId";
import { downloadFile } from "./download-file";
import { createSymlink } from "../utils/symlink";
import { Readable } from "node:stream";
import type { ReadableStream } from "node:stream/web";
import { pipeline } from "node:stream/promises";
import { createWriteStream } from "node:fs";

export const REGEX_COMMIT_HASH: RegExp = new RegExp("^[0-9a-f]{40}$");

Expand Down Expand Up @@ -115,15 +119,16 @@ export async function downloadFileToCacheDir(
const incomplete = `${blobPath}.incomplete`;
console.debug(`Downloading ${params.path} to ${incomplete}`);

const response: Response | null = await downloadFile({
const blob: Blob | null = await downloadFile({
...params,
revision: commitHash,
});

if (!response || !response.ok || !response.body) throw new Error(`invalid response for file ${params.path}`);
if (!blob) {
throw new Error(`invalid response for file ${params.path}`);
}

// @ts-expect-error resp.body is a Stream, but Stream is internal to node
await writeFile(incomplete, response.body);
await pipeline(Readable.fromWeb(blob.stream() as ReadableStream), createWriteStream(incomplete));

// rename the .incomplete file to the expected blob path
await rename(incomplete, blobPath);
Expand Down
115 changes: 66 additions & 49 deletions packages/hub/src/lib/download-file.spec.ts
Original file line number Diff line number Diff line change
@@ -1,65 +1,82 @@
import { expect, test, describe, vi } from "vitest";
import { expect, test, describe, assert } from "vitest";
import { downloadFile } from "./download-file";
import type { RepoId } from "../types/public";

const DUMMY_REPO: RepoId = {
name: "hello-world",
type: "model",
};
import { deleteRepo } from "./delete-repo";
import { createRepo } from "./create-repo";
import { TEST_ACCESS_TOKEN, TEST_HUB_URL, TEST_USER } from "../test/consts";
import { insecureRandomString } from "../utils/insecureRandomString";

describe("downloadFile", () => {
test("hubUrl params should overwrite HUB_URL", async () => {
const fetchMock: typeof fetch = vi.fn();
vi.mocked(fetchMock).mockResolvedValue({
status: 200,
ok: true,
} as Response);
test("should download regular file", async () => {
const blob = await downloadFile({
repo: {
type: "model",
name: "openai-community/gpt2",
},
path: "README.md",
});

const text = await blob?.slice(0, 1000).text();
assert(
text?.includes(`---
language: en
tags:
- exbert

license: mit
---


# GPT-2

await downloadFile({
repo: DUMMY_REPO,
path: "/README.md",
hubUrl: "http://dummy-hub",
fetch: fetchMock,
Test the whole generation capabilities here: https://transformer.huggingface.co/doc/gpt2-large`)
);
});
test("should downoad xet file", async () => {
const blob = await downloadFile({
repo: {
type: "model",
name: "celinah/xet-experiments",
},
path: "large_text.txt",
});

expect(fetchMock).toHaveBeenCalledWith("http://dummy-hub/hello-world/resolve/main//README.md", expect.anything());
const text = await blob?.slice(0, 100).text();
expect(text).toMatch("this is a text file.".repeat(10).slice(0, 100));
});

test("raw params should use raw url", async () => {
const fetchMock: typeof fetch = vi.fn();
vi.mocked(fetchMock).mockResolvedValue({
status: 200,
ok: true,
} as Response);
test("should download private file", async () => {
const repoName = `datasets/${TEST_USER}/TEST-${insecureRandomString()}`;

await downloadFile({
repo: DUMMY_REPO,
path: "README.md",
raw: true,
fetch: fetchMock,
const result = await createRepo({
accessToken: TEST_ACCESS_TOKEN,
hubUrl: TEST_HUB_URL,
private: true,
repo: repoName,
files: [{ path: ".gitattributes", content: new Blob(["*.html filter=lfs diff=lfs merge=lfs -text"]) }],
});

expect(fetchMock).toHaveBeenCalledWith("https://huggingface.co/hello-world/raw/main/README.md", expect.anything());
});
assert.deepStrictEqual(result, {
repoUrl: `${TEST_HUB_URL}/${repoName}`,
});

try {
const blob = await downloadFile({
repo: repoName,
path: ".gitattributes",
hubUrl: TEST_HUB_URL,
accessToken: TEST_ACCESS_TOKEN,
});

test("internal server error should propagate the error", async () => {
const fetchMock: typeof fetch = vi.fn();
vi.mocked(fetchMock).mockResolvedValue({
status: 500,
ok: false,
headers: new Map<string, string>([["Content-Type", "application/json"]]),
json: () => ({
error: "Dummy internal error",
}),
} as unknown as Response);
assert(blob, "File should be found");

await expect(async () => {
await downloadFile({
repo: DUMMY_REPO,
path: "README.md",
raw: true,
fetch: fetchMock,
const text = await blob?.text();
assert.strictEqual(text, "*.html filter=lfs diff=lfs merge=lfs -text");
} finally {
await deleteRepo({
repo: repoName,
hubUrl: TEST_HUB_URL,
accessToken: TEST_ACCESS_TOKEN,
});
}).rejects.toThrowError("Dummy internal error");
}
});
});
72 changes: 42 additions & 30 deletions packages/hub/src/lib/download-file.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { HUB_URL } from "../consts";
import { createApiError } from "../error";
import type { CredentialsParams, RepoDesignation } from "../types/public";
import { checkCredentials } from "../utils/checkCredentials";
import { toRepoId } from "../utils/toRepoId";
import { WebBlob } from "../utils/WebBlob";
import { XetBlob } from "../utils/XetBlob";
import type { FileDownloadInfoOutput } from "./file-download-info";
import { fileDownloadInfo } from "./file-download-info";

/**
* @returns null when the file doesn't exist
Expand All @@ -23,43 +24,54 @@ export async function downloadFile(
* @default "main"
*/
revision?: string;
/**
* Fetch only a specific part of the file
*/
range?: [number, number];
hubUrl?: string;
/**
* Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
*/
fetch?: typeof fetch;
/**
* Whether to use the xet protocol to download the file (if applicable).
*
* Currently there's experimental support for it, so it's not enabled by default.
*
* It will be enabled automatically in a future minor version.
*
* @default false
*/
xet?: boolean;
/**
* Can save an http request if provided
*/
downloadInfo?: FileDownloadInfoOutput;
} & Partial<CredentialsParams>
): Promise<Response | null> {
): Promise<Blob | null> {
const accessToken = checkCredentials(params);
const repoId = toRepoId(params.repo);
const url = `${params.hubUrl ?? HUB_URL}/${repoId.type === "model" ? "" : `${repoId.type}s/`}${repoId.name}/${
params.raw ? "raw" : "resolve"
}/${encodeURIComponent(params.revision ?? "main")}/${params.path}`;

const resp = await (params.fetch ?? fetch)(url, {
headers: {
...(accessToken
? {
Authorization: `Bearer ${accessToken}`,
}
: {}),
...(params.range
? {
Range: `bytes=${params.range[0]}-${params.range[1]}`,
}
: {}),
},
});
const info =
params.downloadInfo ??
(await fileDownloadInfo({
accessToken,
repo: params.repo,
path: params.path,
revision: params.revision,
hubUrl: params.hubUrl,
fetch: params.fetch,
raw: params.raw,
}));

if (resp.status === 404 && resp.headers.get("X-Error-Code") === "EntryNotFound") {
if (!info) {
return null;
} else if (!resp.ok) {
throw await createApiError(resp);
}

return resp;
if (info.xet && params.xet) {
return new XetBlob({
hash: info.xet.hash,
refreshUrl: info.xet.refreshUrl.href,
fetch: params.fetch,
accessToken,
size: info.size,
});
}

return new WebBlob(new URL(info.url), 0, info.size, "", true, params.fetch ?? fetch, accessToken);
}
Loading
Loading