From 59d8dbac512fd3f4028f46c1cc147b8d4e94ffc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?=
 =?UTF-8?q?=E0=A4=A4=29?=
Date: Thu, 16 Jan 2025 16:40:05 +0545
Subject: [PATCH] ReferenceFileSystem: use fs.open instead of fs._open

There is a bug in `fsspec==2024.12.0` that causes the
`ReferenceFileSystem` to incorrectly make `fs._open` return a coroutine
object instead of a file-like object. (See a proposed PR to fix this
issue: fsspec/filesystem_spec#1769.)

We have a test for the expected behavior
(`test_arrow_generator_partitioned` in `tests/unit/lib/test_arrow.py`)
running in the CI environment. But that does not fail because the latest
version of `fsspec` does not get installed in the CI due to the upper
limit set by the `datasets` library.

The `datasets` library is only installed as part of the `hf` and `tests`
extras, so the default installation of `datachain` will encounter this
issue.

Fixes https://github.com/iterative/datachain/issues/806.
---
 src/datachain/lib/arrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datachain/lib/arrow.py b/src/datachain/lib/arrow.py
index 28954477b..2c3aebb2f 100644
--- a/src/datachain/lib/arrow.py
+++ b/src/datachain/lib/arrow.py
@@ -33,7 +33,7 @@ def _open(self, path, mode="rb", *args, **kwargs):
         # reads the whole file in-memory.
         (uri,) = self.references[path]
         protocol, _ = split_protocol(uri)
-        return self.fss[protocol]._open(uri, mode, *args, **kwargs)
+        return self.fss[protocol].open(uri, mode, *args, **kwargs)
 
 
 class ArrowGenerator(Generator):