Use pyupgrade --py39-plus for remaining files #7437

Open · wants to merge 3 commits into main
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -1,3 +1,6 @@
+[project]
+requires-python = ">=3.9"
+
 [tool.ruff]
 line-length = 119
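Declaring `requires-python = ">=3.9"` gives pyupgrade (and Ruff's UP rules) license to assume a 3.9 baseline everywhere. As a rough illustration of the main rewrite `--py39-plus` performs, here is a hypothetical before/after (the function is made up, not from this repo):

# Before, the 3.8-compatible spelling:
#     from typing import Dict, List
#     def count_tokens(lines: List[str]) -> Dict[str, int]: ...

# After: PEP 585 builtin generics, no typing import needed on Python 3.9+.
def count_tokens(lines: list[str]) -> dict[str, int]:
    counts: dict[str, int] = {}
    for line in lines:
        for token in line.split():
            counts[token] = counts.get(token, 0) + 1
    return counts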
2 changes: 1 addition & 1 deletion src/datasets/packaged_modules/__init__.py
@@ -95,7 +95,7 @@ def _hash_python_lines(lines: list[str]) -> str:
     _MODULE_TO_EXTENSIONS[_module].append(".zip")

 # Used to filter data files based on file names
-_MODULE_TO_METADATA_FILE_NAMES: Dict[str, List[str]] = {}
+_MODULE_TO_METADATA_FILE_NAMES: dict[str, list[str]] = {}
 for _module in _MODULE_TO_EXTENSIONS:
     _MODULE_TO_METADATA_FILE_NAMES[_module] = []
 _MODULE_TO_METADATA_FILE_NAMES["imagefolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
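Worth noting: `dict[str, list[str]]` is a real runtime expression on Python 3.9+, not just a type-checker spelling, so module-level annotations like the one above evaluate fine without `from __future__ import annotations`. A quick sketch:

alias = dict[str, list[str]]   # a types.GenericAlias, built at runtime
print(alias)                   # dict[str, list[str]]
print(alias.__origin__)        # <class 'dict'>
print(alias.__args__)          # (<class 'str'>, list[str])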
@@ -2,8 +2,9 @@
 import io
 import itertools
 import os
+from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import Any, Callable, Iterator, Optional, Union
+from typing import Any, Callable, Optional, Union

 import pandas as pd
 import pyarrow as pa
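Since Python 3.9, the container ABCs in `collections.abc` are subscriptable, so `typing.Iterator` (deprecated since 3.9) can be swapped for its `collections.abc` counterpart. A minimal, hypothetical example of the resulting style:

from collections.abc import Iterator

def pairs(items: list[int]) -> Iterator[tuple[int, int]]:
    # Yield consecutive pairs; collections.abc.Iterator supports
    # subscripting on 3.9+ (PEP 585), so typing.Iterator is unnecessary.
    for a, b in zip(items, items[1:]):
        yield a, b

print(list(pairs([1, 2, 3])))  # [(1, 2), (2, 3)]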
1 change: 0 additions & 1 deletion templates/new_dataset_script.py
@@ -15,7 +15,6 @@
 """TODO: Add a description here."""


-import csv
 import json
 import os
3 changes: 1 addition & 2 deletions tests/distributed_scripts/run_torch_distributed.py
@@ -1,6 +1,5 @@
 import os
 from argparse import ArgumentParser
-from typing import List

 import torch.utils.data

@@ -16,7 +15,7 @@ class FailedTestError(RuntimeError):
     pass


-def gen(shards: List[str]):
+def gen(shards: list[str]):
     for shard in shards:
         for i in range(NUM_ITEMS_PER_SHARD):
             yield {"i": i, "shard": shard}
3 changes: 1 addition & 2 deletions tests/features/test_features.py
@@ -1,5 +1,4 @@
 import datetime
-from typing import List, Tuple
 from unittest import TestCase
 from unittest.mock import MagicMock, patch

@@ -858,7 +857,7 @@ def test_features_to_arrow_schema(features: Features):


 @pytest.mark.parametrize("features", NESTED_COMPARISON)
-def test_features_alignment(features: Tuple[List[Features], Features]):
+def test_features_alignment(features: tuple[list[Features], Features]):
     inputs, expected = features
     _check_if_features_can_be_aligned(inputs)  # Check that we can align, will raise otherwise.
     assert _align_features(inputs) == expected
3 changes: 1 addition & 2 deletions tests/io/test_sql.py
@@ -57,8 +57,7 @@ def iter_sql_file(sqlite_path):
     with contextlib.closing(sqlite3.connect(sqlite_path)) as con:
         cur = con.cursor()
         cur.execute("SELECT * FROM dataset")
-        for row in cur:
-            yield row
+        yield from cur


 @require_sqlalchemy
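`yield from` delegates to a sub-iterable and is pyupgrade's idiomatic replacement for a bare `for ... yield` loop. A standalone sketch of the equivalence (names are illustrative):

def rows_loop(cursor):
    for row in cursor:  # pre-rewrite spelling
        yield row

def rows_delegate(cursor):
    # post-rewrite: yields the same values, and also forwards
    # send()/throw() when the source is itself a generator
    yield from cursor

assert list(rows_loop(iter([1, 2]))) == list(rows_delegate(iter([1, 2])))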
2 changes: 1 addition & 1 deletion tests/packaged_modules/test_webdataset.py
@@ -93,7 +93,7 @@ def test_gzipped_text_webdataset(gzipped_text_wds_file, text_path):
     _, examples = zip(*generator)
     assert len(examples) == 3
     assert isinstance(examples[0]["txt.gz"], str)
-    with open(text_path, "r") as f:
+    with open(text_path) as f:
         assert examples[0]["txt.gz"].replace("\r\n", "\n") == f.read().replace("\r\n", "\n")
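`"r"` is `open()`'s default mode, so pyupgrade drops it as redundant. The two calls below are equivalent (the scratch file is hypothetical):

from pathlib import Path

path = Path("example.txt")  # hypothetical scratch file
path.write_text("hello\n")

with open(path, "r") as f:  # explicit mode, pre-rewrite
    before = f.read()
with open(path) as f:       # mode omitted; "r" is the default
    after = f.read()
assert before == after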
14 changes: 6 additions & 8 deletions tests/test_arrow_dataset.py
@@ -3142,12 +3142,11 @@ def test_tf_dataset_options(self, in_memory):
         self.assertEqual(len(tf_dataset), 2)  # One batch of 3 and one batch of 1
         self.assertEqual(len(tf_dataset_with_drop), 1)  # Incomplete batch of 1 is dropped
         # Test that `NotImplementedError` is raised `batch_size` is None and `num_workers` is > 0
-        if sys.version_info >= (3, 8):
-            with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:
-                with self.assertRaisesRegex(
-                    NotImplementedError, "`batch_size` must be specified when using multiple workers"
-                ):
-                    dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2)
+        with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:
+            with self.assertRaisesRegex(
+                NotImplementedError, "`batch_size` must be specified when using multiple workers"
+            ):
+                dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2)
         del tf_dataset  # For correct cleanup
         del tf_dataset_with_drop

@@ -3927,8 +3926,7 @@ def _gen():
                 {"col_1": "2", "col_2": 2, "col_3": 2.0},
                 {"col_1": "3", "col_2": 3, "col_3": 3.0},
             ]
-            for item in data:
-                yield item
+            yield from data

         return _gen
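With `requires-python = ">=3.9"` in place, a `sys.version_info >= (3, 8)` check is always true, so pyupgrade flattens the branch. The pattern being removed, in miniature (the variable is illustrative):

import sys

# Before: on a >=3.9 baseline this condition can never be false.
if sys.version_info >= (3, 8):
    run_test = True

# After: the guard is dropped and its body dedented.
run_test = True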
3 changes: 1 addition & 2 deletions tests/test_data_files.py
@@ -1,7 +1,6 @@
 import copy
 import os
 from pathlib import Path
-from typing import List
 from unittest.mock import patch

 import fsspec

@@ -509,7 +508,7 @@ def test_DataFilesPatternsDict(text_file):
     assert isinstance(data_files_dict["train"], DataFilesList)


-def mock_fs(file_paths: List[str]):
+def mock_fs(file_paths: list[str]):
     """
     Set up a mock filesystem for fsspec containing the provided files
2 changes: 1 addition & 1 deletion tests/test_file_utils.py
@@ -361,7 +361,7 @@ def _readd_double_slash_removed_by_path(path_as_posix: str) -> str:
         (
             str(Path().resolve()),
             ("file.txt",),
-            str((Path().resolve() / "file.txt")),
+            str(Path().resolve() / "file.txt"),
         ),
     ],
 )
6 changes: 3 additions & 3 deletions tests/test_table.py
@@ -2,7 +2,7 @@
 import pickle
 from decimal import Decimal
 from functools import partial
-from typing import List, Union
+from typing import Union
 from unittest.mock import MagicMock

 import numpy as np

@@ -40,7 +40,7 @@ def in_memory_pa_table(arrow_file) -> pa.Table:
     return pa.ipc.open_stream(arrow_file).read_all()


-def _to_testing_blocks(table: TableBlock) -> List[List[TableBlock]]:
+def _to_testing_blocks(table: TableBlock) -> list[list[TableBlock]]:
     assert len(table) > 2
     blocks = [
         [table.slice(0, 2)],

@@ -1049,7 +1049,7 @@ def test_concat_tables(arrow_file, in_memory_pa_table):
     assert isinstance(concatenated_table.blocks[0][2], InMemoryTable)


-def _interpolation_search_ground_truth(arr: List[int], x: int) -> Union[int, IndexError]:
+def _interpolation_search_ground_truth(arr: list[int], x: int) -> Union[int, IndexError]:
     for i in range(len(arr) - 1):
         if arr[i] <= x < arr[i + 1]:
             return i
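Note that `--py39-plus` replaces `List`/`Dict`/`Tuple` but leaves `Union` and `Optional` alone: the `X | Y` union syntax (PEP 604) only arrived in Python 3.10, so on a 3.9 baseline `typing.Union` is still required, as in the hunk above. A quick sketch of that boundary (functions are illustrative):

from typing import Optional, Union

# Fine on 3.9: builtin generics (PEP 585).
def head(xs: list[int]) -> Optional[int]:
    return xs[0] if xs else None

# The `int | None` spelling (PEP 604) needs Python 3.10+,
# so --py39-plus keeps Union/Optional unchanged.
def widen(x: Union[int, float]) -> float:
    return float(x)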
4 changes: 2 additions & 2 deletions utils/release.py
@@ -30,7 +30,7 @@

 def update_version_in_file(fname, version, pattern):
     """Update the version in one file using a specific pattern."""
-    with open(fname, "r", encoding="utf-8", newline="\n") as f:
+    with open(fname, encoding="utf-8", newline="\n") as f:
         code = f.read()
     re_pattern, replace = REPLACE_PATTERNS[pattern]
     replace = replace.replace("VERSION", version)

@@ -47,7 +47,7 @@ def global_version_update(version):

 def get_version():
     """Reads the current version in the __init__."""
-    with open(REPLACE_FILES["init"], "r") as f:
+    with open(REPLACE_FILES["init"]) as f:
         code = f.read()
     default_version = REPLACE_PATTERNS["init"][0].search(code).groups()[0]
     return packaging.version.parse(default_version)