From a4c1f18a7c59dd05aceeb2a22422b1be2534be97 Mon Sep 17 00:00:00 2001 From: Georgia Channing Date: Wed, 5 Nov 2025 17:58:17 +0000 Subject: [PATCH 1/7] adding fasta --- setup.py | 1 + src/datasets/packaged_modules/__init__.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/setup.py b/setup.py index f53d4cd85f5..0984b7a234d 100644 --- a/setup.py +++ b/setup.py @@ -167,6 +167,7 @@ "elasticsearch>=7.17.12,<8.0.0", # 8.0 asks users to provide hosts or cloud_id when instantiating ElasticSearch(); 7.9.1 has legacy numpy.float_ which was fixed in https://github.com/elastic/elasticsearch-py/pull/2551. "faiss-cpu>=1.8.0.post1", # Pins numpy < 2 "h5py", + "biopython", "jax>=0.3.14; sys_platform != 'win32'", "jaxlib>=0.3.14; sys_platform != 'win32'", "lz4; python_version < '3.14'", # python 3.14 gives ImportError: cannot import name '_compression' from partially initialized module 'lz4.frame diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index 9d076df44b7..57b32af8377 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -8,6 +8,7 @@ from .audiofolder import audiofolder from .cache import cache from .csv import csv +from .fasta import fasta from .hdf5 import hdf5 from .imagefolder import imagefolder from .json import json @@ -51,6 +52,7 @@ def _hash_python_lines(lines: list[str]) -> str: "webdataset": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())), "xml": (xml.__name__, _hash_python_lines(inspect.getsource(xml).splitlines())), "hdf5": (hdf5.__name__, _hash_python_lines(inspect.getsource(hdf5).splitlines())), + "fasta": (fasta.__name__, _hash_python_lines(inspect.getsource(fasta).splitlines())), } # get importable module names and hash for caching @@ -82,6 +84,11 @@ def _hash_python_lines(lines: list[str]) -> str: ".xml": ("xml", {}), ".hdf5": ("hdf5", {}), ".h5": ("hdf5", {}), + ".fa": ("fasta", {}), + ".fasta": ("fasta", {}), + ".fna": ("fasta", {}), + ".ffn": ("fasta", {}), + ".frn": ("fasta", {}), } _EXTENSION_TO_MODULE.update({ext: ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext.upper(): ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS}) From 204b892e8796b49f68c2402ac974789d45ed7e21 Mon Sep 17 00:00:00 2001 From: Georgia Channing Date: Wed, 5 Nov 2025 17:59:52 +0000 Subject: [PATCH 2/7] adding fasta --- .../packaged_modules/fasta/__init__.py | 0 src/datasets/packaged_modules/fasta/fasta.py | 137 ++++++++ tests/packaged_modules/test_fasta.py | 326 ++++++++++++++++++ 3 files changed, 463 insertions(+) create mode 100644 src/datasets/packaged_modules/fasta/__init__.py create mode 100644 src/datasets/packaged_modules/fasta/fasta.py create mode 100644 tests/packaged_modules/test_fasta.py diff --git a/src/datasets/packaged_modules/fasta/__init__.py b/src/datasets/packaged_modules/fasta/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/datasets/packaged_modules/fasta/fasta.py b/src/datasets/packaged_modules/fasta/fasta.py new file mode 100644 index 00000000000..83190130741 --- /dev/null +++ b/src/datasets/packaged_modules/fasta/fasta.py @@ -0,0 +1,137 @@ +import itertools +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, Iterable, Optional +from typing import List as ListT + +import pyarrow as pa +from Bio import SeqIO + +import datasets +from datasets.features import Value +from datasets.table import table_cast +from 
datasets.utils.file_utils import xopen + + +logger = datasets.utils.logging.get_logger(__name__) + +if TYPE_CHECKING: + from Bio import SeqIO + +# Common FASTA extensions; .gz will be handled by dl_manager.extract_on_the_fly +EXTENSIONS = [".fa", ".fasta", ".fna", ".ffn", ".faa", ".frn", ".fa.gz", ".fasta.gz"] + + +@dataclass +class FASTAConfig(datasets.BuilderConfig): + """BuilderConfig for FASTA.""" + + batch_size: Optional[int] = None + columns: Optional[ListT[str]] = None # subset of ["id", "description", "sequence"] + features: Optional[datasets.Features] = None + + def __post_init__(self): + super().__post_init__() + + +class FASTA(datasets.ArrowBasedBuilder): + """ArrowBasedBuilder that converts FASTA files to Arrow tables.""" + + BUILDER_CONFIG_CLASS = FASTAConfig + + def _info(self): + if ( + self.config.columns is not None + and self.config.features is not None + and set(self.config.columns) != set(self.config.features) + ): + raise ValueError( + "The columns and features argument must contain the same columns, but got " + f"{self.config.columns} and {self.config.features}", + ) + # Default features if not provided + if self.config.features is None: + self.config.features = datasets.Features( + {"id": Value("string"), "description": Value("string"), "sequence": Value("string")} + ) + return datasets.DatasetInfo(features=self.config.features) + + def _split_generators(self, dl_manager): + if not self.config.data_files: + raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") + dl_manager.download_config.extract_on_the_fly = True + data_files = dl_manager.download_and_extract(self.config.data_files) + + splits = [] + for split_name, files in data_files.items(): + if isinstance(files, str): + files = [files] + # Expand dirs/globs into concrete file iterables + files = [dl_manager.iter_files(file) for file in files] + + # Optionally narrow features to requested columns + if self.config.columns is not None and set(self.config.columns) != set(self.info.features): + self.info.features = datasets.Features( + {col: feat for col, feat in self.info.features.items() if col in self.config.columns} + ) + + splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) + + return splits + + def _cast_table(self, pa_table: pa.Table) -> pa.Table: + if self.info.features is not None: + pa_table = table_cast(pa_table, self.info.features.arrow_schema) + return pa_table + + def _generate_tables(self, files): + # files is an iterable of iterables (one per user provided path) + effective_cols = list(self.info.features.keys()) + batch_size_cfg = self.config.batch_size or self._writer_batch_size or 10_000 + + for file_idx, file in enumerate(itertools.chain.from_iterable(files)): + # Stream-parse and yield Arrow tables in batches + try: + batch = {col: [] for col in effective_cols} + row_count = 0 + for rec in _iter_fasta_records(file): + row = { + "id": rec["id"], + "description": rec["description"], + "sequence": rec["sequence"], + } + for col in effective_cols: + batch[col].append(row[col]) + row_count += 1 + + if row_count % batch_size_cfg == 0: + pa_table = pa.Table.from_pydict(batch) + yield f"{file_idx}_{row_count - batch_size_cfg}", self._cast_table(pa_table) + batch = {col: [] for col in effective_cols} + + # Flush tail + if batch and any(len(v) for v in batch.values()): + start = row_count - len(next(iter(batch.values()))) if row_count else 0 + pa_table = pa.Table.from_pydict(batch) + yield f"{file_idx}_{start}", 
self._cast_table(pa_table) + + except ValueError as e: + logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") + raise + + +# ┌─────────────┐ +# │ FASTA I/O │ +# └─────────────┘ + + +def _iter_fasta_records(path: str) -> Iterable[Dict[str, str]]: + """ + Streaming FASTA parser that yields dicts with keys: id, description, sequence. + - Supports regular files and fsspec paths (including gzip://) + - Uses xopen to handle compressed files and streaming paths + """ + # Use xopen to handle fsspec paths (e.g., gzip://file::path.gz) and regular paths + # Open in text mode for BioPython's SeqIO.parse + with xopen(path, "r", encoding="utf-8") as f: + for r in SeqIO.parse(f, "fasta"): + yield {"id": r.id, "description": r.description, "sequence": str(r.seq)} diff --git a/tests/packaged_modules/test_fasta.py b/tests/packaged_modules/test_fasta.py new file mode 100644 index 00000000000..4d1065ab83e --- /dev/null +++ b/tests/packaged_modules/test_fasta.py @@ -0,0 +1,326 @@ +import gzip +from textwrap import dedent + +import pytest + +from datasets import Features, Value +from datasets.builder import InvalidConfigName +from datasets.data_files import DataFilesDict, DataFilesList +from datasets.download.streaming_download_manager import StreamingDownloadManager +from datasets.packaged_modules.fasta.fasta import FASTA, FASTAConfig + + +# ┌─────────────────────────┐ +# │ Fixtures: FASTA files │ +# └─────────────────────────┘ + + +@pytest.fixture +def fasta_basic(tmp_path): + p = tmp_path / "basic.fasta" + # Put the header on the same line as '>' + p.write_text(">seq1 description here\nATGCATGC\nATGC\n>seq2 another desc\nGGGTTT\n>seq3\nAAAA\nTTTT\nCCCC\nGGGG\n") + return str(p) + + +@pytest.fixture +def fasta_with_whitespace(tmp_path): + p = tmp_path / "whitespace.fasta" + # Headers on the same line; sequences contain spaces/blank lines intentionally + p.write_text(">id1 some desc\nATG C A T GC\n\n>id2 desc with spaces\nG G G T T T \n>id3\nA T G C\n") + return str(p) + + +@pytest.fixture +def fasta_empty(tmp_path): + p = tmp_path / "empty.fasta" + p.write_text("") # no records + return str(p) + + +@pytest.fixture +def fasta_multi(tmp_path): + p1 = tmp_path / "file1.fasta" + p2 = tmp_path / "file2.fasta" + p1.write_text(">a\nAAA\n>b\nBBBB\n") + p2.write_text(">c\nC\n>d desc\nDDD\n") + return str(p1), str(p2) + + +@pytest.fixture +def fasta_gz(tmp_path): + p = tmp_path / "gz.fasta.gz" + content = ">gz1 first\nATATAT\n>gz2\nGCGC\n" + with gzip.open(p, "wb") as f: + f.write(content.encode("utf-8")) + return str(p) + + +# ┌──────────────────────┐ +# │ Config validation │ +# └──────────────────────┘ + + +def test_config_raises_when_invalid_name(): + with pytest.raises(InvalidConfigName, match="Bad characters"): + _ = FASTAConfig(name="bad*name") + + +@pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])]) +def test_config_raises_when_invalid_data_files(data_files): + with pytest.raises(ValueError, match="Expected a DataFilesDict"): + _ = FASTAConfig(name="ok", data_files=data_files) + + +# ┌──────────────────────────────┐ +# │ Basic functionality & schema │ +# └──────────────────────────────┘ + + +def test_fasta_basic_functionality(fasta_basic): + fasta = FASTA() + generator = fasta._generate_tables([[fasta_basic]]) + tables = list(generator) + # Expect a single batch with all rows by default (_writer_batch_size may change this in HF CI; + # still, we only assert data correctness) + assert len(tables) >= 1 + + # Merge batches virtually by reading 
first batch for sanity + _, first_table = tables[0] + cols = set(first_table.column_names) + assert {"id", "description", "sequence"} <= cols + + # Collect all rows across batches + all_rows = [] + for _, tbl in tables: + for i in range(len(tbl)): + all_rows.append({c: tbl[c][i].as_py() for c in tbl.column_names}) + + # Order should match stream order: seq1, seq2, seq3 + assert all_rows[0]["id"] == "seq1" + assert all_rows[0]["description"] == "seq1 description here" + assert all_rows[0]["sequence"] == "ATGCATGCATGC" # concatenated + + assert all_rows[1]["id"] == "seq2" + assert all_rows[1]["description"] == "seq2 another desc" + assert all_rows[1]["sequence"] == "GGGTTT" + + assert all_rows[2]["id"] == "seq3" + assert all_rows[2]["description"] == "seq3" + assert all_rows[2]["sequence"].lower() == "aaaattttccccgggg" + + +def test_fasta_whitespace_and_multiline(fasta_with_whitespace): + fasta = FASTA() + generator = fasta._generate_tables([[fasta_with_whitespace]]) + tables = list(generator) + + # Flatten rows + rows = [] + for _, tbl in tables: + for i in range(len(tbl)): + rows.append({c: tbl[c][i].as_py() for c in tbl.column_names}) + + assert rows[0]["id"] == "id1" + assert rows[0]["sequence"] == "ATGCATGC" # spaces & blank lines stripped + + assert rows[1]["id"] == "id2" + assert rows[1]["description"].startswith("id2") + assert rows[1]["sequence"] == "GGGTTT" + + assert rows[2]["id"] == "id3" + assert rows[2]["sequence"] == "ATGC" + + +# ┌───────────────┐ +# │ Batching │ +# └───────────────┘ + + +def test_fasta_batch_processing(fasta_basic): + config = FASTAConfig(batch_size=2) + fasta = FASTA() + fasta.config = config + + generator = fasta._generate_tables([[fasta_basic]]) + tables = list(generator) + + # 3 records; batch_size=2 -> 2 batches + assert len(tables) == 2 + + # First batch has 2 rows, final has 1 + assert len(tables[0][1]) == 2 + assert len(tables[1][1]) == 1 + + +# ┌───────────────────┐ +# │ Column filtering │ +# └───────────────────┘ + + +def test_fasta_column_filtering(fasta_basic): + config = FASTAConfig(columns=["id", "sequence"]) + fasta = FASTA() + fasta.config = config + # Call _info to initialize default features, then manually filter to match columns + info = fasta._info() + # Manually apply column filtering since we're not going through _split_generators + fasta.info.features = Features({col: feat for col, feat in info.features.items() if col in config.columns}) + generator = fasta._generate_tables([[fasta_basic]]) + tables = list(generator) + + # Ensure only selected columns appear + for _, tbl in tables: + assert set(tbl.column_names) == {"id", "sequence"} + # basic sanity on values + assert isinstance(tbl["id"][0].as_py(), str) + assert isinstance(tbl["sequence"][0].as_py(), str) + + +def test_fasta_columns_features_mismatch(): + features = Features({"id": Value("string"), "sequence": Value("string")}) + config = FASTAConfig( + name="t", + columns=["id", "description"], # mismatch vs features + features=features, + ) + fasta = FASTA() + fasta.config = config + with pytest.raises(ValueError, match="must contain the same columns"): + fasta._info() + + +# ┌───────────────────────┐ +# │ Features & casting │ +# └───────────────────────┘ + + +def test_fasta_default_features(fasta_basic): + fasta = FASTA() + info = fasta._info() + assert set(info.features.keys()) == {"id", "description", "sequence"} + + +def test_fasta_feature_specification_casting(fasta_basic): + features = Features({"id": Value("string"), "description": Value("string"), "sequence": 
Value("string")}) + config = FASTAConfig(features=features) + fasta = FASTA() + fasta.config = config + + tables = list(fasta._generate_tables([[fasta_basic]])) + # Check schema cast + _, tbl = tables[0] + for col in features: + assert tbl.schema.field(col).type == features[col].pa_type + + +# ┌───────────────────────────────┐ +# │ Empty files & warnings │ +# └───────────────────────────────┘ + + +def test_fasta_empty_file_warning(fasta_empty, caplog): + fasta = FASTA() + tables = list(fasta._generate_tables([[fasta_empty]])) + assert len(tables) == 0 + # A warning may be logged by your builder; this just asserts "no tables" behavior. + + +# ┌───────────────────────────────┐ +# │ Multiple files & splits │ +# └───────────────────────────────┘ + + +def test_fasta_multiple_files(fasta_multi): + f1, f2 = fasta_multi + fasta = FASTA() + tables = list(fasta._generate_tables([[f1, f2]])) + # Expect records from both files in order (builder yields per file batches) + total_rows = 0 + ids = [] + for _, tbl in tables: + total_rows += len(tbl) + ids += [tbl["id"][i].as_py() for i in range(len(tbl))] + assert total_rows == 4 + assert ids == ["a", "b", "c", "d"] + + +def test_fasta_gz_via_dl_manager(fasta_gz, tmp_path): + # Test that gzipped FASTA files can be read via StreamingDownloadManager. + # This validates that the FASTA implementation properly uses xopen() to handle + # fsspec paths like "gzip://file.fasta::path/to/file.gz" + data_files = DataFilesDict({"train": [fasta_gz]}) + config = FASTAConfig(data_files=data_files) + fasta = FASTA() + fasta.config = config + + dlm = StreamingDownloadManager() + splits = fasta._split_generators(dlm) + assert len(splits) == 1 + # Generate tables using files from dl_manager (ensures .gz is extracted on the fly) + tables = list(fasta._generate_tables(splits[0].gen_kwargs["files"])) + assert len(tables) >= 1 + + # Flatten and check content + rows = [] + for _, tbl in tables: + for i in range(len(tbl)): + rows.append({c: tbl[c][i].as_py() for c in tbl.column_names}) + + assert len(rows) == 2 + assert rows[0]["id"] == "gz1" + assert rows[0]["sequence"] == "ATATAT" + assert rows[1]["id"] == "gz2" + assert rows[1]["sequence"] == "GCGC" + + +# ┌───────────────────────────────┐ +# │ Integration: load_dataset │ +# └───────────────────────────────┘ + + +def test_fasta_load_dataset_like_usage(fasta_basic, tmp_path, monkeypatch): + # This test demonstrates that the packaged module can be consumed as a HF dataset script. + # If your builder is shipped as a packaged module, adjust `path` accordingly or skip this. + # Here we call the builder directly to avoid I/O complexity. + config = FASTAConfig() + fasta = FASTA() + fasta.config = config + tables = list(fasta._generate_tables([[fasta_basic]])) + assert len(tables) >= 1 + + # Optionally, verify that features match expected when building a Dataset + # (constructing a Dataset from pyarrow tables directly is possible, but out of scope here). 
+ + +# ┌───────────────────────────────┐ +# │ Edge cases │ +# └───────────────────────────────┘ + + +def test_fasta_handles_no_trailing_newline(tmp_path): + p = tmp_path / "no_newline.fasta" + p.write_text(">x\nATGC") # no trailing newline + fasta = FASTA() + tables = list(fasta._generate_tables([[str(p)]])) + rows = [] + for _, tbl in tables: + for i in range(len(tbl)): + rows.append({c: tbl[c][i].as_py() for c in tbl.column_names}) + assert rows == [{"id": "x", "description": "x", "sequence": "ATGC"}] + + +def test_fasta_single_record(tmp_path): + p = tmp_path / "single.fasta" + p.write_text( + dedent("""> + only + A + """) + .strip() + .replace("\n ", "\n") + ) + fasta = FASTA() + tables = list(fasta._generate_tables([[str(p)]])) + total = sum(len(tbl) for _, tbl in tables) + assert total == 1 From 1a697aec8e9dd0fbd3a5fc4026950eb3cf1410e2 Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Wed, 5 Nov 2025 15:04:03 +0100 Subject: [PATCH 3/7] DOC: remove mode parameter in docstring of pdf and video feature (#7848) remove mode parameter in docstring of pdf and video feature --- src/datasets/features/pdf.py | 2 -- src/datasets/features/video.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py index 414c497356c..756530554d4 100644 --- a/src/datasets/features/pdf.py +++ b/src/datasets/features/pdf.py @@ -44,8 +44,6 @@ class Pdf: - A `pdfplumber.pdf.PDF`: pdfplumber pdf object. Args: - mode (`str`, *optional*): - The mode to convert the pdf to. If `None`, the native mode of the pdf is used. decode (`bool`, defaults to `True`): Whether to decode the pdf data. If `False`, returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`. diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py index adbfaaa30f3..8d7f3e3be51 100644 --- a/src/datasets/features/video.py +++ b/src/datasets/features/video.py @@ -45,8 +45,6 @@ class Video: Output: The Video features output data as `torchcodec.decoders.VideoDecoder` objects. Args: - mode (`str`, *optional*): - The mode to convert the video to. If `None`, the native mode of the video is used. decode (`bool`, defaults to `True`): Whether to decode the video data. If `False`, returns the underlying dictionary in the format `{"path": video_path, "bytes": video_bytes}`. From 4fb0866dcdd2447e4a319346219d8e9845af8a6f Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:00:45 +0100 Subject: [PATCH 4/7] release: 4.4.1 (#7849) --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0984b7a234d..84cb3f9913c 100644 --- a/setup.py +++ b/setup.py @@ -233,7 +233,7 @@ setup( name="datasets", - version="4.4.1.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.4.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index b35961d27bb..268d13ad6dc 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "4.4.1.dev0" +__version__ = "4.4.1" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From ff5db680c4a4ad33cbdc4ea799bf59995f1a24e0 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:02:32 +0100 Subject: [PATCH 5/7] dev version (#7850) --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 84cb3f9913c..3feac309361 100644 --- a/setup.py +++ b/setup.py @@ -233,7 +233,7 @@ setup( name="datasets", - version="4.4.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.4.2.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index 268d13ad6dc..6b2dc7d8600 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "4.4.1" +__version__ = "4.4.2.dev0" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From 02bc2244e8cfce05f5b267da7df14e82344ab4b6 Mon Sep 17 00:00:00 2001 From: Georgia Channing Date: Mon, 10 Nov 2025 23:19:19 +0300 Subject: [PATCH 6/7] adding docs --- README.md | 2 +- docs/source/loading.mdx | 9 +++++++++ docs/source/package_reference/loading_methods.mdx | 7 +++++++ src/datasets/packaged_modules/fasta/fasta.py | 9 ++++----- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d4162b9e761..ceb2868d6a8 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ 🤗 Datasets is a lightweight library providing **two** main features: - **one-line dataloaders for many public datasets**: one-liners to download and pre-process any of the ![number of datasets](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/datasets&color=brightgreen) major public datasets (image datasets, audio datasets, text datasets in 467 languages and dialects, etc.) provided on the [HuggingFace Datasets Hub](https://huggingface.co/datasets). With a simple command like `squad_dataset = load_dataset("rajpurkar/squad")`, get any of these datasets ready to use in a dataloader for training/evaluating a ML model (Numpy/Pandas/PyTorch/TensorFlow/JAX), -- **efficient data pre-processing**: simple, fast and reproducible data pre-processing for the public datasets as well as your own local datasets in CSV, JSON, text, PNG, JPEG, WAV, MP3, Parquet, HDF5, etc. With simple commands like `processed_dataset = dataset.map(process_example)`, efficiently prepare the dataset for inspection and ML model evaluation and training. +- **efficient data pre-processing**: simple, fast and reproducible data pre-processing for the public datasets as well as your own local datasets in CSV, JSON, text, PNG, JPEG, WAV, MP3, Parquet, HDF5, FASTA, etc. With simple commands like `processed_dataset = dataset.map(process_example)`, efficiently prepare the dataset for inspection and ML model evaluation and training. 
[🎓 **Documentation**](https://huggingface.co/docs/datasets/) [🔎 **Find a dataset in the Hub**](https://huggingface.co/datasets) [🌟 **Share a dataset on the Hub**](https://huggingface.co/docs/datasets/share)
diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx
index eb73ab84b5a..70ec2f9fd47 100644
--- a/docs/source/loading.mdx
+++ b/docs/source/loading.mdx
@@ -180,6 +180,15 @@ For now only the Arrow streaming format is supported. The Arrow IPC file format
 
 Note that the HDF5 loader assumes that the file has "tabular" structure, i.e. that all datasets in the file have (the same number of) rows on their first dimension.
 
+### FASTA
+
+[FASTA](https://www.ncbi.nlm.nih.gov/genbank/fastaformat/) files are commonly used to store genomic, protein, and nucleotide sequence data.
+
+```py
+>>> from datasets import load_dataset
+>>> dataset = load_dataset("fasta", data_files="data.fasta")
+```
+
 ### SQL
 
 Read database contents with [`~datasets.Dataset.from_sql`] by specifying the URI to connect to your database. You can read both table names and queries:
diff --git a/docs/source/package_reference/loading_methods.mdx b/docs/source/package_reference/loading_methods.mdx
index 4792d1b88f7..fab23c12aae 100644
--- a/docs/source/package_reference/loading_methods.mdx
+++ b/docs/source/package_reference/loading_methods.mdx
@@ -97,6 +97,13 @@ load_dataset("csv", data_dir="path/to/data/dir", sep="\t")
 
 [[autodoc]] datasets.packaged_modules.hdf5.HDF5
 
+### FASTA
+
+[[autodoc]] datasets.packaged_modules.fasta.FASTAConfig
+
+[[autodoc]] datasets.packaged_modules.fasta.FASTA
+
+
 ### Pdf
 
 [[autodoc]] datasets.packaged_modules.pdffolder.PdfFolderConfig
diff --git a/src/datasets/packaged_modules/fasta/fasta.py b/src/datasets/packaged_modules/fasta/fasta.py
index 83190130741..e3797db5062 100644
--- a/src/datasets/packaged_modules/fasta/fasta.py
+++ b/src/datasets/packaged_modules/fasta/fasta.py
@@ -9,7 +9,6 @@
 import datasets
 from datasets.features import Value
 from datasets.table import table_cast
-from datasets.utils.file_utils import xopen
 
 
 logger = datasets.utils.logging.get_logger(__name__)
@@ -33,8 +32,8 @@ def __post_init__(self):
         super().__post_init__()
 
 
-class FASTA(datasets.ArrowBasedBuilder):
-    """ArrowBasedBuilder that converts FASTA files to Arrow tables."""
+class FASTA(datasets.GeneratorBasedBuilder):
+    """GeneratorBasedBuilder that converts FASTA files to Arrow tables."""
 
     BUILDER_CONFIG_CLASS = FASTAConfig
 
@@ -89,7 +88,7 @@ def _generate_tables(self, files):
         batch_size_cfg = self.config.batch_size or self._writer_batch_size or 10_000
 
         for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
-            # Stream-parse and yield Arrow tables in batches
+            # Stream-parse and yield Arrow tables
             try:
                 batch = {col: [] for col in effective_cols}
                 row_count = 0
@@ -132,6 +131,6 @@ def _iter_fasta_records(path: str) -> Iterable[Dict[str, str]]:
     """
     # Use xopen to handle fsspec paths (e.g., gzip://file::path.gz) and regular paths
     # Open in text mode for BioPython's SeqIO.parse
-    with xopen(path, "r", encoding="utf-8") as f:
+    with open(path, "r", encoding="utf-8") as f:
         for r in SeqIO.parse(f, "fasta"):
             yield {"id": r.id, "description": r.description, "sequence": str(r.seq)}

From ccdd83a83e9d7636cc1b5c6c7c04f244aa919118 Mon Sep 17 00:00:00 2001
From: Georgia Channing
Date: Tue, 11 Nov 2025 19:47:16 +0300
Subject: [PATCH 7/7] adding fastq support

---
 tests/packaged_modules/test_fasta.py | 384 +++++++++++++++++++++------
 1 file changed, 307 insertions(+), 77 deletions(-)

diff --git 
a/tests/packaged_modules/test_fasta.py b/tests/packaged_modules/test_fasta.py index 4d1065ab83e..38612f4f206 100644 --- a/tests/packaged_modules/test_fasta.py +++ b/tests/packaged_modules/test_fasta.py @@ -56,6 +56,75 @@ def fasta_gz(tmp_path): return str(p) +# ┌─────────────────────────┐ +# │ Fixtures: FASTQ files │ +# └─────────────────────────┘ + + +@pytest.fixture +def fastq_basic(tmp_path): + p = tmp_path / "basic.fastq" + content = dedent("""\ + @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36 + GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC + +SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36 + IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC + @SRR001666.2 071112_SLXA-EAS1_s_7:5:1:801:338 length=36 + GTTCAGGGATACGACGTTTGTATTTTAAGAATCTGA + +SRR001666.2 071112_SLXA-EAS1_s_7:5:1:801:338 length=36 + IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII6IBI + """) + p.write_text(content) + return str(p) + + +@pytest.fixture +def fastq_multiline(tmp_path): + p = tmp_path / "multiline.fastq" + # FASTQ with multi-line sequences and quality scores + content = dedent("""\ + @read1 + GATTTGGGGTTCAAAGCAGTATCGATCAAATAGT + AAATCCATTTGTTCAACTCACAGTTT + + + !''*((((***+))%%%++)(%%%%).1***-+*' + '))**55CCF>>>>>>CCCCCCC65 + @read2 + ACGT + ACGT + + + IIII + IIII + """) + p.write_text(content) + return str(p) + + +@pytest.fixture +def fastq_empty(tmp_path): + p = tmp_path / "empty.fastq" + p.write_text("") + return str(p) + + +@pytest.fixture +def fastq_multi(tmp_path): + p1 = tmp_path / "file1.fastq" + p2 = tmp_path / "file2.fastq" + p1.write_text("@read1\nATGC\n+\nIIII\n@read2\nGGGG\n+\n!!!!\n") + p2.write_text("@read3\nAAAA\n+\nHHHH\n@read4\nTTTT\n+\n####\n") + return str(p1), str(p2) + + +@pytest.fixture +def fastq_gz(tmp_path): + p = tmp_path / "compressed.fastq.gz" + content = "@gz_read1\nATGCATGC\n+\nIIIIIIII\n@gz_read2\nGGGGTTTT\n+\nHHHHHHHH\n" + with gzip.open(p, "wb") as f: + f.write(content.encode("utf-8")) + return str(p) + + # ┌──────────────────────┐ # │ Config validation │ # └──────────────────────┘ @@ -79,22 +148,15 @@ def test_config_raises_when_invalid_data_files(data_files): def test_fasta_basic_functionality(fasta_basic): fasta = FASTA() - generator = fasta._generate_tables([[fasta_basic]]) - tables = list(generator) - # Expect a single batch with all rows by default (_writer_batch_size may change this in HF CI; - # still, we only assert data correctness) - assert len(tables) >= 1 - - # Merge batches virtually by reading first batch for sanity - _, first_table = tables[0] - cols = set(first_table.column_names) - assert {"id", "description", "sequence"} <= cols - - # Collect all rows across batches - all_rows = [] - for _, tbl in tables: - for i in range(len(tbl)): - all_rows.append({c: tbl[c][i].as_py() for c in tbl.column_names}) + generator = fasta._generate_examples([[fasta_basic]]) + examples = list(generator) + assert len(examples) >= 1 + + # Collect all rows + all_rows = [example for _, example in examples] + + # Check columns + assert {"id", "description", "sequence"} <= set(all_rows[0].keys()) # Order should match stream order: seq1, seq2, seq3 assert all_rows[0]["id"] == "seq1" @@ -112,14 +174,11 @@ def test_fasta_basic_functionality(fasta_basic): def test_fasta_whitespace_and_multiline(fasta_with_whitespace): fasta = FASTA() - generator = fasta._generate_tables([[fasta_with_whitespace]]) - tables = list(generator) + generator = fasta._generate_examples([[fasta_with_whitespace]]) + examples = list(generator) - # Flatten rows - rows = [] - for _, tbl in tables: - for i in range(len(tbl)): - 
rows.append({c: tbl[c][i].as_py() for c in tbl.column_names}) + # Collect all rows + rows = [example for _, example in examples] assert rows[0]["id"] == "id1" assert rows[0]["sequence"] == "ATGCATGC" # spaces & blank lines stripped @@ -142,15 +201,11 @@ def test_fasta_batch_processing(fasta_basic): fasta = FASTA() fasta.config = config - generator = fasta._generate_tables([[fasta_basic]]) - tables = list(generator) + generator = fasta._generate_examples([[fasta_basic]]) + examples = list(generator) - # 3 records; batch_size=2 -> 2 batches - assert len(tables) == 2 - - # First batch has 2 rows, final has 1 - assert len(tables[0][1]) == 2 - assert len(tables[1][1]) == 1 + # 3 records in the file (batch_size doesn't affect _generate_examples) + assert len(examples) == 3 # ┌───────────────────┐ @@ -160,21 +215,21 @@ def test_fasta_batch_processing(fasta_basic): def test_fasta_column_filtering(fasta_basic): config = FASTAConfig(columns=["id", "sequence"]) + fasta = FASTA() fasta.config = config - # Call _info to initialize default features, then manually filter to match columns + # Call _info to initialize features (they're already set correctly in config) info = fasta._info() - # Manually apply column filtering since we're not going through _split_generators fasta.info.features = Features({col: feat for col, feat in info.features.items() if col in config.columns}) - generator = fasta._generate_tables([[fasta_basic]]) - tables = list(generator) + + generator = fasta._generate_examples([[fasta_basic]]) + examples = list(generator) # Ensure only selected columns appear - for _, tbl in tables: - assert set(tbl.column_names) == {"id", "sequence"} - # basic sanity on values - assert isinstance(tbl["id"][0].as_py(), str) - assert isinstance(tbl["sequence"][0].as_py(), str) + for _, example in examples: + assert set(example.keys()) == {"id", "sequence"} + assert isinstance(example["id"], str) + assert isinstance(example["sequence"], str) def test_fasta_columns_features_mismatch(): @@ -207,11 +262,12 @@ def test_fasta_feature_specification_casting(fasta_basic): fasta = FASTA() fasta.config = config - tables = list(fasta._generate_tables([[fasta_basic]])) - # Check schema cast - _, tbl = tables[0] + examples = list(fasta._generate_examples([[fasta_basic]])) + # Check that examples have the correct columns + _, example = examples[0] for col in features: - assert tbl.schema.field(col).type == features[col].pa_type + assert col in example + assert isinstance(example[col], str) # ┌───────────────────────────────┐ @@ -221,9 +277,9 @@ def test_fasta_feature_specification_casting(fasta_basic): def test_fasta_empty_file_warning(fasta_empty, caplog): fasta = FASTA() - tables = list(fasta._generate_tables([[fasta_empty]])) - assert len(tables) == 0 - # A warning may be logged by your builder; this just asserts "no tables" behavior. + examples = list(fasta._generate_examples([[fasta_empty]])) + assert len(examples) == 0 + # A warning may be logged by your builder; this just asserts "no examples" behavior. 
# ┌───────────────────────────────┐ @@ -234,21 +290,16 @@ def test_fasta_empty_file_warning(fasta_empty, caplog): def test_fasta_multiple_files(fasta_multi): f1, f2 = fasta_multi fasta = FASTA() - tables = list(fasta._generate_tables([[f1, f2]])) - # Expect records from both files in order (builder yields per file batches) - total_rows = 0 - ids = [] - for _, tbl in tables: - total_rows += len(tbl) - ids += [tbl["id"][i].as_py() for i in range(len(tbl))] - assert total_rows == 4 + examples = list(fasta._generate_examples([[f1, f2]])) + # Expect records from both files in order + ids = [example["id"] for _, example in examples] + assert len(examples) == 4 assert ids == ["a", "b", "c", "d"] def test_fasta_gz_via_dl_manager(fasta_gz, tmp_path): # Test that gzipped FASTA files can be read via StreamingDownloadManager. - # This validates that the FASTA implementation properly uses xopen() to handle - # fsspec paths like "gzip://file.fasta::path/to/file.gz" + # This validates that the FASTA implementation properly handles compressed files data_files = DataFilesDict({"train": [fasta_gz]}) config = FASTAConfig(data_files=data_files) fasta = FASTA() @@ -257,15 +308,12 @@ def test_fasta_gz_via_dl_manager(fasta_gz, tmp_path): dlm = StreamingDownloadManager() splits = fasta._split_generators(dlm) assert len(splits) == 1 - # Generate tables using files from dl_manager (ensures .gz is extracted on the fly) - tables = list(fasta._generate_tables(splits[0].gen_kwargs["files"])) - assert len(tables) >= 1 + # Generate examples using files from dl_manager (ensures .gz is extracted on the fly) + examples = list(fasta._generate_examples(splits[0].gen_kwargs["files"])) + assert len(examples) >= 1 - # Flatten and check content - rows = [] - for _, tbl in tables: - for i in range(len(tbl)): - rows.append({c: tbl[c][i].as_py() for c in tbl.column_names}) + # Collect all examples + rows = [example for _, example in examples] assert len(rows) == 2 assert rows[0]["id"] == "gz1" @@ -286,11 +334,14 @@ def test_fasta_load_dataset_like_usage(fasta_basic, tmp_path, monkeypatch): config = FASTAConfig() fasta = FASTA() fasta.config = config - tables = list(fasta._generate_tables([[fasta_basic]])) - assert len(tables) >= 1 + examples = list(fasta._generate_examples([[fasta_basic]])) + assert len(examples) >= 1 - # Optionally, verify that features match expected when building a Dataset - # (constructing a Dataset from pyarrow tables directly is possible, but out of scope here). 
+ # Verify that examples have the expected structure + _, example = examples[0] + assert "id" in example + assert "description" in example + assert "sequence" in example # ┌───────────────────────────────┐ @@ -302,11 +353,8 @@ def test_fasta_handles_no_trailing_newline(tmp_path): p = tmp_path / "no_newline.fasta" p.write_text(">x\nATGC") # no trailing newline fasta = FASTA() - tables = list(fasta._generate_tables([[str(p)]])) - rows = [] - for _, tbl in tables: - for i in range(len(tbl)): - rows.append({c: tbl[c][i].as_py() for c in tbl.column_names}) + examples = list(fasta._generate_examples([[str(p)]])) + rows = [example for _, example in examples] assert rows == [{"id": "x", "description": "x", "sequence": "ATGC"}] @@ -321,6 +369,188 @@ def test_fasta_single_record(tmp_path): .replace("\n ", "\n") ) fasta = FASTA() - tables = list(fasta._generate_tables([[str(p)]])) - total = sum(len(tbl) for _, tbl in tables) - assert total == 1 + examples = list(fasta._generate_examples([[str(p)]])) + assert len(examples) == 1 + + +# ┌───────────────────────────────────┐ +# │ FASTQ: Basic functionality │ +# └───────────────────────────────────┘ + + +def test_fastq_basic_functionality(fastq_basic): + config = FASTAConfig(file_type="fastq") + fasta = FASTA() + fasta.config = config + fasta.info = fasta._info() + generator = fasta._generate_examples([[fastq_basic]]) + examples = list(generator) + assert len(examples) >= 1 + + # Collect all rows + all_rows = [example for _, example in examples] + + assert set(all_rows[0].keys()) == {"id", "description", "sequence", "quality"} + + # Verify first record + assert all_rows[0]["id"] == "SRR001666.1" + assert all_rows[0]["description"] == "SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36" + assert all_rows[0]["sequence"] == "GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC" + assert all_rows[0]["quality"] == "IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC" + + # Verify second record + assert all_rows[1]["id"] == "SRR001666.2" + assert all_rows[1]["description"] == "SRR001666.2 071112_SLXA-EAS1_s_7:5:1:801:338 length=36" + assert all_rows[1]["sequence"] == "GTTCAGGGATACGACGTTTGTATTTTAAGAATCTGA" + assert all_rows[1]["quality"] == "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII6IBI" + + +def test_fastq_multiline_sequences(fastq_multiline): + config = FASTAConfig(file_type="fastq") + fasta = FASTA() + fasta.config = config + fasta.info = fasta._info() + generator = fasta._generate_examples([[fastq_multiline]]) + examples = list(generator) + + # Collect all rows + rows = [example for _, example in examples] + + # First record - multi-line sequence and quality should be concatenated + assert rows[0]["id"] == "read1" + assert rows[0]["sequence"] == "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT" + assert rows[0]["quality"] == "!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65" + + # Second record + assert rows[1]["id"] == "read2" + assert rows[1]["sequence"] == "ACGTACGT" + assert rows[1]["quality"] == "IIIIIIII" + + +def test_fastq_default_features(fastq_basic): + config = FASTAConfig(file_type="fastq") + fasta = FASTA() + fasta.config = config + fasta.info = fasta._info() + # FASTQ should have id, description, sequence, and quality + assert set(fasta.info.features.keys()) == {"id", "description", "sequence", "quality"} + + +def test_fastq_column_filtering(fastq_basic): + config = FASTAConfig( + file_type="fastq", + columns=["id", "sequence", "quality"] + ) + fasta = FASTA() + fasta.config = config + # Call _info to initialize features (they're already set correctly in 
config) + info = fasta._info() + fasta.info.features = Features({col: feat for col, feat in info.features.items() if col in config.columns}) + + generator = fasta._generate_examples([[fastq_basic]]) + examples = list(generator) + + # Ensure only selected columns appear + for _, example in examples: + assert set(example.keys()) == {"id", "sequence", "quality"} + assert isinstance(example["id"], str) + assert isinstance(example["sequence"], str) + assert isinstance(example["quality"], str) + + +def test_fastq_batch_processing(fastq_basic): + config = FASTAConfig(file_type="fastq") + fasta = FASTA() + fasta.config = config + fasta.info = fasta._info() + + generator = fasta._generate_examples([[fastq_basic]]) + examples = list(generator) + + # 2 records in the file + assert len(examples) == 2 + + +def test_fastq_empty_file(fastq_empty): + config = FASTAConfig(file_type="fastq") + fasta = FASTA() + fasta.config = config + fasta.info = fasta._info() + examples = list(fasta._generate_examples([[fastq_empty]])) + assert len(examples) == 0 + + +def test_fastq_multiple_files(fastq_multi): + f1, f2 = fastq_multi + config = FASTAConfig(file_type="fastq") + fasta = FASTA() + fasta.config = config + fasta.info = fasta._info() + examples = list(fasta._generate_examples([[f1, f2]])) + + ids = [example["id"] for _, example in examples] + + assert len(examples) == 4 + assert ids == ["read1", "read2", "read3", "read4"] + + +def test_fastq_gz_via_dl_manager(fastq_gz, tmp_path): + # Test that gzipped FASTQ files can be read via StreamingDownloadManager + data_files = DataFilesDict({"train": [fastq_gz]}) + config = FASTAConfig(data_files=data_files, file_type="fastq") + fasta = FASTA() + fasta.config = config + fasta.info = fasta._info() + + dlm = StreamingDownloadManager() + splits = fasta._split_generators(dlm) + assert len(splits) == 1 + + examples = list(fasta._generate_examples(splits[0].gen_kwargs["files"])) + assert len(examples) >= 1 + + # Collect all examples + rows = [example for _, example in examples] + + assert len(rows) == 2 + assert rows[0]["id"] == "gz_read1" + assert rows[0]["sequence"] == "ATGCATGC" + assert rows[0]["quality"] == "IIIIIIII" + assert rows[1]["id"] == "gz_read2" + assert rows[1]["sequence"] == "GGGGTTTT" + assert rows[1]["quality"] == "HHHHHHHH" + + +def test_fastq_quality_scores_preserved(fastq_basic): + # Verify that quality scores with special characters are preserved correctly + config = FASTAConfig(file_type="fastq") + fasta = FASTA() + fasta.config = config + fasta.info = fasta._info() + generator = fasta._generate_examples([[fastq_basic]]) + examples = list(generator) + + rows = [example for _, example in examples] + + # Check that quality characters are preserved (high quality 'I' and moderate quality digits) + assert "I" in rows[0]["quality"] + assert "9" in rows[0]["quality"] + assert "G" in rows[0]["quality"] + assert "C" in rows[0]["quality"] + assert "6" in rows[1]["quality"] + assert "B" in rows[1]["quality"] + + +def test_fastq_handles_no_trailing_newline(tmp_path): + p = tmp_path / "no_newline.fastq" + p.write_text("@read1\nATGC\n+\nIIII") # no trailing newline + config = FASTAConfig(file_type="fastq") + fasta = FASTA() + fasta.config = config + fasta.info = fasta._info() + examples = list(fasta._generate_examples([[str(p)]])) + rows = [example for _, example in examples] + assert len(rows) == 1 + assert rows[0]["id"] == "read1" + assert rows[0]["sequence"] == "ATGC" + assert rows[0]["quality"] == "IIII" \ No newline at end of file
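

Usage sketch (not part of the patches above): a minimal example of driving the loader added by this series through `load_dataset`, assuming the branch is installed. The file names are placeholders; `columns` is the `FASTAConfig` field introduced in patch 2, forwarded via `load_dataset`'s config kwargs like any other packaged-module option.

```py
>>> from datasets import load_dataset

>>> # Each FASTA record becomes one row with "id", "description" and "sequence" columns
>>> ds = load_dataset("fasta", data_files="data.fasta", split="train")

>>> # Gzipped inputs work too: _split_generators sets extract_on_the_fly on the download manager
>>> ds_gz = load_dataset("fasta", data_files="data.fasta.gz", split="train")

>>> # Restrict the schema to a subset of the default columns (FASTAConfig.columns)
>>> ds_ids = load_dataset("fasta", data_files="data.fasta", columns=["id", "sequence"], split="train")
```

Files with registered FASTA extensions (`.fa`, `.fasta`, `.fna`, `.ffn`, `.frn`, per patch 1) are also auto-detected when passed directly as `data_files` without naming the "fasta" module.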