From f62f0b8b3dff536e66a3cb57af0eacd0ac750b5c Mon Sep 17 00:00:00 2001
From: cyy
Date: Thu, 6 Mar 2025 10:07:45 +0800
Subject: [PATCH 1/3] Use pyupgrade --py39-plus for remaining files

---
 src/datasets/packaged_modules/__init__.py          |  2 +-
 .../folder_based_builder/folder_based_builder.py   |  3 ++-
 templates/new_dataset_script.py                    |  1 -
 tests/distributed_scripts/run_torch_distributed.py |  3 +--
 tests/features/test_features.py                    |  3 +--
 tests/io/test_sql.py                               |  3 +--
 tests/packaged_modules/test_webdataset.py          |  2 +-
 tests/test_arrow_dataset.py                        | 14 ++++++--------
 tests/test_data_files.py                           |  3 +--
 tests/test_file_utils.py                           |  2 +-
 tests/test_table.py                                |  6 +++---
 utils/release.py                                   |  4 ++--
 12 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py
index f61c6ddd3de..f20290d6127 100644
--- a/src/datasets/packaged_modules/__init__.py
+++ b/src/datasets/packaged_modules/__init__.py
@@ -95,7 +95,7 @@ def _hash_python_lines(lines: list[str]) -> str:
     _MODULE_TO_EXTENSIONS[_module].append(".zip")
 
 # Used to filter data files based on file names
-_MODULE_TO_METADATA_FILE_NAMES: Dict[str, List[str]] = {}
+_MODULE_TO_METADATA_FILE_NAMES: dict[str, list[str]] = {}
 for _module in _MODULE_TO_EXTENSIONS:
     _MODULE_TO_METADATA_FILE_NAMES[_module] = []
 _MODULE_TO_METADATA_FILE_NAMES["imagefolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
index 44f604e9ee5..77d044f5358 100644
--- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
+++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
@@ -2,8 +2,9 @@
 import io
 import itertools
 import os
+from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import Any, Callable, Iterator, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import pandas as pd
 import pyarrow as pa
diff --git a/templates/new_dataset_script.py b/templates/new_dataset_script.py
index 3f8d14181b5..0e047db2525 100644
--- a/templates/new_dataset_script.py
+++ b/templates/new_dataset_script.py
@@ -15,7 +15,6 @@
 """TODO: Add a description here."""
 
 
-import csv
 import json
 import os
 
diff --git a/tests/distributed_scripts/run_torch_distributed.py b/tests/distributed_scripts/run_torch_distributed.py
index 4cfe247dd74..68ecd8984d9 100644
--- a/tests/distributed_scripts/run_torch_distributed.py
+++ b/tests/distributed_scripts/run_torch_distributed.py
@@ -1,6 +1,5 @@
 import os
 from argparse import ArgumentParser
-from typing import List
 
 import torch.utils.data
 
@@ -16,7 +15,7 @@ class FailedTestError(RuntimeError):
     pass
 
 
-def gen(shards: List[str]):
+def gen(shards: list[str]):
     for shard in shards:
         for i in range(NUM_ITEMS_PER_SHARD):
             yield {"i": i, "shard": shard}
diff --git a/tests/features/test_features.py b/tests/features/test_features.py
index 6234d7ede62..53b308c07b7 100644
--- a/tests/features/test_features.py
+++ b/tests/features/test_features.py
@@ -1,5 +1,4 @@
 import datetime
-from typing import List, Tuple
 from unittest import TestCase
 from unittest.mock import MagicMock, patch
 
@@ -858,7 +857,7 @@ def test_features_to_arrow_schema(features: Features):
 
 
 @pytest.mark.parametrize("features", NESTED_COMPARISON)
-def test_features_alignment(features: Tuple[List[Features], Features]):
+def test_features_alignment(features: tuple[list[Features], Features]):
     inputs, expected = features
     _check_if_features_can_be_aligned(inputs)  # Check that we can align, will raise otherwise.
     assert _align_features(inputs) == expected
diff --git a/tests/io/test_sql.py b/tests/io/test_sql.py
index 5adda22033f..daefc5c87b3 100644
--- a/tests/io/test_sql.py
+++ b/tests/io/test_sql.py
@@ -57,8 +57,7 @@ def iter_sql_file(sqlite_path):
     with contextlib.closing(sqlite3.connect(sqlite_path)) as con:
         cur = con.cursor()
         cur.execute("SELECT * FROM dataset")
-        for row in cur:
-            yield row
+        yield from cur
 
 
 @require_sqlalchemy
diff --git a/tests/packaged_modules/test_webdataset.py b/tests/packaged_modules/test_webdataset.py
index 128f13022fc..6374628cbd2 100644
--- a/tests/packaged_modules/test_webdataset.py
+++ b/tests/packaged_modules/test_webdataset.py
@@ -93,7 +93,7 @@ def test_gzipped_text_webdataset(gzipped_text_wds_file, text_path):
     _, examples = zip(*generator)
     assert len(examples) == 3
     assert isinstance(examples[0]["txt.gz"], str)
-    with open(text_path, "r") as f:
+    with open(text_path) as f:
         assert examples[0]["txt.gz"].replace("\r\n", "\n") == f.read().replace("\r\n", "\n")
 
 
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 20fab1962e4..7569201431d 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -3175,12 +3175,11 @@ def test_tf_dataset_options(self, in_memory):
             self.assertEqual(len(tf_dataset), 2)  # One batch of 3 and one batch of 1
             self.assertEqual(len(tf_dataset_with_drop), 1)  # Incomplete batch of 1 is dropped
         # Test that `NotImplementedError` is raised `batch_size` is None and `num_workers` is > 0
-        if sys.version_info >= (3, 8):
-            with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:
-                with self.assertRaisesRegex(
-                    NotImplementedError, "`batch_size` must be specified when using multiple workers"
-                ):
-                    dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2)
+        with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:
+            with self.assertRaisesRegex(
+                NotImplementedError, "`batch_size` must be specified when using multiple workers"
+            ):
+                dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2)
         del tf_dataset  # For correct cleanup
         del tf_dataset_with_drop
 
@@ -3960,8 +3959,7 @@ def _gen():
                 {"col_1": "2", "col_2": 2, "col_3": 2.0},
                 {"col_1": "3", "col_2": 3, "col_3": 3.0},
             ]
-            for item in data:
-                yield item
+            yield from data
 
         return _gen
 
diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index 74f48dbd2d5..7b3ea7ac914 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -1,7 +1,6 @@
 import copy
 import os
 from pathlib import Path
-from typing import List
 from unittest.mock import patch
 
 import fsspec
@@ -509,7 +508,7 @@ def test_DataFilesPatternsDict(text_file):
     assert isinstance(data_files_dict["train"], DataFilesList)
 
 
-def mock_fs(file_paths: List[str]):
+def mock_fs(file_paths: list[str]):
     """
     Set up a mock filesystem for fsspec containing the provided files
 
diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py
index 6f6ac01df9a..ba0bfd33278 100644
--- a/tests/test_file_utils.py
+++ b/tests/test_file_utils.py
@@ -361,7 +361,7 @@ def _readd_double_slash_removed_by_path(path_as_posix: str) -> str:
         (
             str(Path().resolve()),
             ("file.txt",),
-            str((Path().resolve() / "file.txt")),
+            str(Path().resolve() / "file.txt"),
         ),
     ],
 )
diff --git a/tests/test_table.py b/tests/test_table.py
index 3d3db09e5d6..4258b8bda45 100644
--- a/tests/test_table.py
+++ b/tests/test_table.py
@@ -2,7 +2,7 @@
 import pickle
 from decimal import Decimal
 from functools import partial
-from typing import List, Union
+from typing import Union
 from unittest.mock import MagicMock
 
 import numpy as np
@@ -40,7 +40,7 @@ def in_memory_pa_table(arrow_file) -> pa.Table:
     return pa.ipc.open_stream(arrow_file).read_all()
 
 
-def _to_testing_blocks(table: TableBlock) -> List[List[TableBlock]]:
+def _to_testing_blocks(table: TableBlock) -> list[list[TableBlock]]:
     assert len(table) > 2
     blocks = [
         [table.slice(0, 2)],
@@ -1049,7 +1049,7 @@ def test_concat_tables(arrow_file, in_memory_pa_table):
     assert isinstance(concatenated_table.blocks[0][2], InMemoryTable)
 
 
-def _interpolation_search_ground_truth(arr: List[int], x: int) -> Union[int, IndexError]:
+def _interpolation_search_ground_truth(arr: list[int], x: int) -> Union[int, IndexError]:
     for i in range(len(arr) - 1):
         if arr[i] <= x < arr[i + 1]:
             return i
diff --git a/utils/release.py b/utils/release.py
index 04a0cf02793..7e0be943a64 100644
--- a/utils/release.py
+++ b/utils/release.py
@@ -30,7 +30,7 @@
 
 def update_version_in_file(fname, version, pattern):
     """Update the version in one file using a specific pattern."""
-    with open(fname, "r", encoding="utf-8", newline="\n") as f:
+    with open(fname, encoding="utf-8", newline="\n") as f:
         code = f.read()
     re_pattern, replace = REPLACE_PATTERNS[pattern]
     replace = replace.replace("VERSION", version)
@@ -47,7 +47,7 @@ def global_version_update(version):
 
 def get_version():
     """Reads the current version in the __init__."""
-    with open(REPLACE_FILES["init"], "r") as f:
+    with open(REPLACE_FILES["init"]) as f:
         code = f.read()
     default_version = REPLACE_PATTERNS["init"][0].search(code).groups()[0]
     return packaging.version.parse(default_version)

From 6a4b7830299ed23d020c545a2d77ba86f41d4197 Mon Sep 17 00:00:00 2001
From: cyy
Date: Thu, 6 Mar 2025 10:15:34 +0800
Subject: [PATCH 2/3] Set ruff target version

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 511a9e0d744..4b1c14cbc7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,6 @@
+[project]
+requires-python = ">=3.9"
+
 [tool.ruff]
 line-length = 119
 

From 5dac55f8e74de175b3e6a250899d0d1eae763ad1 Mon Sep 17 00:00:00 2001
From: cyy
Date: Tue, 15 Apr 2025 22:47:49 +0800
Subject: [PATCH 3/3] More fixes

Signed-off-by: cyy
---
 src/datasets/features/pdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py
index 7e62c50831c..21029b8bb82 100644
--- a/src/datasets/features/pdf.py
+++ b/src/datasets/features/pdf.py
@@ -166,7 +166,7 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf.PDF":
 
         return pdf
 
-    def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
+    def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
         """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
        from .features import Value
 
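
Aside for readers unfamiliar with the tool: the hunks in PATCH 1/3 are mechanical rewrites produced by `pyupgrade --py39-plus`, so they are behavior-preserving. The snippet below is a minimal, self-contained sketch of the main rewrite categories applied above -- PEP 585 builtin generics (`list[str]` instead of `typing.List[str]`), `collections.abc.Iterator` instead of the deprecated `typing.Iterator` alias, `yield from` in place of a bare `for`/`yield` loop, and dropping the redundant `"r"` mode from `open()`. All function names and paths in it are invented for illustration; none of this code is from the repository.

# Post-pyupgrade style, valid on Python >= 3.9. Invented example code.
from collections.abc import Iterator  # pyupgrade rewrites `from typing import Iterator`


def gen(shards: list[str]) -> Iterator[dict[str, int]]:
    # The annotations were `List[str]` / `Dict[str, int]` before PEP 585 builtin generics.
    for i, shard in enumerate(shards):
        yield {"shard_index": i, "name_length": len(shard)}


def iter_lines(path: str) -> Iterator[str]:
    # `open(path, "r", encoding="utf-8")` loses the explicit "r": it is the default mode.
    with open(path, encoding="utf-8") as f:
        # `for line in f: yield line` collapses to `yield from f`.
        yield from f


if __name__ == "__main__":
    # Tiny smoke test for the sketch above.
    print(list(gen(["a", "bc"])))  # [{'shard_index': 0, 'name_length': 1}, {'shard_index': 1, 'name_length': 2}]

PATCH 2/3 then declares `requires-python = ">=3.9"`, which ruff uses to infer its `target-version` when none is set explicitly, so its pyupgrade-derived `UP` rules will not reintroduce pre-3.9 idioms -- presumably the intent behind that commit's "Set ruff target version" subject.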