From f62f0b8b3dff536e66a3cb57af0eacd0ac750b5c Mon Sep 17 00:00:00 2001
From: cyy
Date: Thu, 6 Mar 2025 10:07:45 +0800
Subject: [PATCH 1/3] Use pyupgrade --py39-plus for remaining files

---
 src/datasets/packaged_modules/__init__.py          |  2 +-
 .../folder_based_builder/folder_based_builder.py   |  3 ++-
 templates/new_dataset_script.py                    |  1 -
 tests/distributed_scripts/run_torch_distributed.py |  3 +--
 tests/features/test_features.py                    |  3 +--
 tests/io/test_sql.py                               |  3 +--
 tests/packaged_modules/test_webdataset.py          |  2 +-
 tests/test_arrow_dataset.py                        | 14 ++++++--------
 tests/test_data_files.py                           |  3 +--
 tests/test_file_utils.py                           |  2 +-
 tests/test_table.py                                |  6 +++---
 utils/release.py                                   |  4 ++--
 12 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py
index f61c6ddd3de..f20290d6127 100644
--- a/src/datasets/packaged_modules/__init__.py
+++ b/src/datasets/packaged_modules/__init__.py
@@ -95,7 +95,7 @@ def _hash_python_lines(lines: list[str]) -> str:
     _MODULE_TO_EXTENSIONS[_module].append(".zip")
 
 # Used to filter data files based on file names
-_MODULE_TO_METADATA_FILE_NAMES: Dict[str, List[str]] = {}
+_MODULE_TO_METADATA_FILE_NAMES: dict[str, list[str]] = {}
 for _module in _MODULE_TO_EXTENSIONS:
     _MODULE_TO_METADATA_FILE_NAMES[_module] = []
 _MODULE_TO_METADATA_FILE_NAMES["imagefolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
index 44f604e9ee5..77d044f5358 100644
--- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
+++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
@@ -2,8 +2,9 @@
 import io
 import itertools
 import os
+from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import Any, Callable, Iterator, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import pandas as pd
 import pyarrow as pa
diff --git a/templates/new_dataset_script.py b/templates/new_dataset_script.py
index 3f8d14181b5..0e047db2525 100644
--- a/templates/new_dataset_script.py
+++ b/templates/new_dataset_script.py
@@ -15,7 +15,6 @@
 """TODO: Add a description here."""
 
 
-import csv
 import json
 import os
 
diff --git a/tests/distributed_scripts/run_torch_distributed.py b/tests/distributed_scripts/run_torch_distributed.py
index 4cfe247dd74..68ecd8984d9 100644
--- a/tests/distributed_scripts/run_torch_distributed.py
+++ b/tests/distributed_scripts/run_torch_distributed.py
@@ -1,6 +1,5 @@
 import os
 from argparse import ArgumentParser
-from typing import List
 
 import torch.utils.data
 
@@ -16,7 +15,7 @@ class FailedTestError(RuntimeError):
     pass
 
 
-def gen(shards: List[str]):
+def gen(shards: list[str]):
     for shard in shards:
         for i in range(NUM_ITEMS_PER_SHARD):
             yield {"i": i, "shard": shard}
diff --git a/tests/features/test_features.py b/tests/features/test_features.py
index 6234d7ede62..53b308c07b7 100644
--- a/tests/features/test_features.py
+++ b/tests/features/test_features.py
@@ -1,5 +1,4 @@
 import datetime
-from typing import List, Tuple
 from unittest import TestCase
 from unittest.mock import MagicMock, patch
 
@@ -858,7 +857,7 @@ def test_features_to_arrow_schema(features: Features):
 
 
 @pytest.mark.parametrize("features", NESTED_COMPARISON)
-def test_features_alignment(features: Tuple[List[Features], Features]):
+def test_features_alignment(features: tuple[list[Features], Features]):
     inputs, expected = features
     _check_if_features_can_be_aligned(inputs)  # Check that we can align, will raise otherwise.
     assert _align_features(inputs) == expected
diff --git a/tests/io/test_sql.py b/tests/io/test_sql.py
index 5adda22033f..daefc5c87b3 100644
--- a/tests/io/test_sql.py
+++ b/tests/io/test_sql.py
@@ -57,8 +57,7 @@ def iter_sql_file(sqlite_path):
     with contextlib.closing(sqlite3.connect(sqlite_path)) as con:
         cur = con.cursor()
         cur.execute("SELECT * FROM dataset")
-        for row in cur:
-            yield row
+        yield from cur
 
 
 @require_sqlalchemy
diff --git a/tests/packaged_modules/test_webdataset.py b/tests/packaged_modules/test_webdataset.py
index 128f13022fc..6374628cbd2 100644
--- a/tests/packaged_modules/test_webdataset.py
+++ b/tests/packaged_modules/test_webdataset.py
@@ -93,7 +93,7 @@ def test_gzipped_text_webdataset(gzipped_text_wds_file, text_path):
     _, examples = zip(*generator)
     assert len(examples) == 3
     assert isinstance(examples[0]["txt.gz"], str)
-    with open(text_path, "r") as f:
+    with open(text_path) as f:
         assert examples[0]["txt.gz"].replace("\r\n", "\n") == f.read().replace("\r\n", "\n")
 
 
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 20fab1962e4..7569201431d 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -3175,12 +3175,11 @@ def test_tf_dataset_options(self, in_memory):
             self.assertEqual(len(tf_dataset), 2)  # One batch of 3 and one batch of 1
             self.assertEqual(len(tf_dataset_with_drop), 1)  # Incomplete batch of 1 is dropped
         # Test that `NotImplementedError` is raised `batch_size` is None and `num_workers` is > 0
-        if sys.version_info >= (3, 8):
-            with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:
-                with self.assertRaisesRegex(
-                    NotImplementedError, "`batch_size` must be specified when using multiple workers"
-                ):
-                    dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2)
+        with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:
+            with self.assertRaisesRegex(
+                NotImplementedError, "`batch_size` must be specified when using multiple workers"
+            ):
+                dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2)
         del tf_dataset  # For correct cleanup
         del tf_dataset_with_drop
 
@@ -3960,8 +3959,7 @@ def _gen():
                 {"col_1": "2", "col_2": 2, "col_3": 2.0},
                 {"col_1": "3", "col_2": 3, "col_3": 3.0},
             ]
-            for item in data:
-                yield item
+            yield from data
 
         return _gen
 
diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index 74f48dbd2d5..7b3ea7ac914 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -1,7 +1,6 @@
 import copy
 import os
 from pathlib import Path
-from typing import List
 from unittest.mock import patch
 
 import fsspec
@@ -509,7 +508,7 @@ def test_DataFilesPatternsDict(text_file):
     assert isinstance(data_files_dict["train"], DataFilesList)
 
 
-def mock_fs(file_paths: List[str]):
+def mock_fs(file_paths: list[str]):
     """
     Set up a mock filesystem for fsspec containing the provided files
 
diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py
index 6f6ac01df9a..ba0bfd33278 100644
--- a/tests/test_file_utils.py
+++ b/tests/test_file_utils.py
@@ -361,7 +361,7 @@ def _readd_double_slash_removed_by_path(path_as_posix: str) -> str:
         (
             str(Path().resolve()),
             ("file.txt",),
-            str((Path().resolve() / "file.txt")),
+            str(Path().resolve() / "file.txt"),
         ),
     ],
 )
diff --git a/tests/test_table.py b/tests/test_table.py
index 3d3db09e5d6..4258b8bda45 100644
--- a/tests/test_table.py
+++ b/tests/test_table.py
@@ -2,7 +2,7 @@
 import pickle
 from decimal import Decimal
 from functools import partial
-from typing import List, Union
+from typing import Union
 from unittest.mock import MagicMock
 
 import numpy as np
@@ -40,7 +40,7 @@ def in_memory_pa_table(arrow_file) -> pa.Table:
     return pa.ipc.open_stream(arrow_file).read_all()
 
 
-def _to_testing_blocks(table: TableBlock) -> List[List[TableBlock]]:
+def _to_testing_blocks(table: TableBlock) -> list[list[TableBlock]]:
     assert len(table) > 2
     blocks = [
         [table.slice(0, 2)],
@@ -1049,7 +1049,7 @@ def test_concat_tables(arrow_file, in_memory_pa_table):
     assert isinstance(concatenated_table.blocks[0][2], InMemoryTable)
 
 
-def _interpolation_search_ground_truth(arr: List[int], x: int) -> Union[int, IndexError]:
+def _interpolation_search_ground_truth(arr: list[int], x: int) -> Union[int, IndexError]:
     for i in range(len(arr) - 1):
         if arr[i] <= x < arr[i + 1]:
             return i
diff --git a/utils/release.py b/utils/release.py
index 04a0cf02793..7e0be943a64 100644
--- a/utils/release.py
+++ b/utils/release.py
@@ -30,7 +30,7 @@
 
 def update_version_in_file(fname, version, pattern):
     """Update the version in one file using a specific pattern."""
-    with open(fname, "r", encoding="utf-8", newline="\n") as f:
+    with open(fname, encoding="utf-8", newline="\n") as f:
         code = f.read()
     re_pattern, replace = REPLACE_PATTERNS[pattern]
     replace = replace.replace("VERSION", version)
@@ -47,7 +47,7 @@ def global_version_update(version):
 
 def get_version():
     """Reads the current version in the __init__."""
-    with open(REPLACE_FILES["init"], "r") as f:
+    with open(REPLACE_FILES["init"]) as f:
         code = f.read()
     default_version = REPLACE_PATTERNS["init"][0].search(code).groups()[0]
     return packaging.version.parse(default_version)

From 6a4b7830299ed23d020c545a2d77ba86f41d4197 Mon Sep 17 00:00:00 2001
From: cyy
Date: Thu, 6 Mar 2025 10:15:34 +0800
Subject: [PATCH 2/3] Set ruff target version

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 511a9e0d744..4b1c14cbc7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,6 @@
+[project]
+requires-python = ">=3.9"
+
 [tool.ruff]
 line-length = 119
 

From 5dac55f8e74de175b3e6a250899d0d1eae763ad1 Mon Sep 17 00:00:00 2001
From: cyy
Date: Tue, 15 Apr 2025 22:47:49 +0800
Subject: [PATCH 3/3] More fixes

Signed-off-by: cyy
---
 src/datasets/features/pdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py
index 7e62c50831c..21029b8bb82 100644
--- a/src/datasets/features/pdf.py
+++ b/src/datasets/features/pdf.py
@@ -166,7 +166,7 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf.PDF":
 
         return pdf
 
-    def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
+    def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
         """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
        from .features import Value
 
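
Aside for readers unfamiliar with the tool: the hunks in PATCH 1/3 are mechanical rewrites produced by `pyupgrade --py39-plus`, so they are behavior-preserving. The snippet below is a minimal, self-contained sketch of the main rewrite categories applied above -- PEP 585 builtin generics (`list[str]` instead of `typing.List[str]`), `collections.abc.Iterator` instead of the deprecated `typing.Iterator` alias, `yield from` in place of a bare `for`/`yield` loop, and dropping the redundant `"r"` mode from `open()`. All function names and paths in it are invented for illustration; none of this code is from the repository.

# Post-pyupgrade style, valid on Python >= 3.9. Invented example code.
from collections.abc import Iterator  # pyupgrade rewrites `from typing import Iterator`


def gen(shards: list[str]) -> Iterator[dict[str, int]]:
    # The annotations were `List[str]` / `Dict[str, int]` before PEP 585 builtin generics.
    for i, shard in enumerate(shards):
        yield {"shard_index": i, "name_length": len(shard)}


def iter_lines(path: str) -> Iterator[str]:
    # `open(path, "r", encoding="utf-8")` loses the explicit "r": it is the default mode.
    with open(path, encoding="utf-8") as f:
        # `for line in f: yield line` collapses to `yield from f`.
        yield from f


if __name__ == "__main__":
    # Tiny smoke test for the sketch above.
    print(list(gen(["a", "bc"])))  # [{'shard_index': 0, 'name_length': 1}, {'shard_index': 1, 'name_length': 2}]

PATCH 2/3 then declares `requires-python = ">=3.9"`, which ruff uses to infer its `target-version` when none is set explicitly, so its pyupgrade-derived `UP` rules will not reintroduce pre-3.9 idioms -- presumably the intent behind that commit's "Set ruff target version" subject.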