diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..4f525be6
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,39 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-json
+      - id: check-added-large-files
+      - id: check-merge-conflict
+
+  - repo: https://github.com/psf/black
+    rev: 24.4.2
+    hooks:
+      - id: black
+        language_version: python3
+        args: ["--line-length", "120"]
+
+  - repo: https://github.com/PyCQA/flake8
+    rev: 7.1.0
+    hooks:
+      - id: flake8
+        additional_dependencies: [flake8-bugbear]
+        args: ["--max-line-length", "120", "--ignore=F401"]
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.11.0
+    hooks:
+      - id: mypy
+        args: ["--ignore-missing-imports"]
+
+  - repo: local
+    hooks:
+      - id: cargo-fmt
+        name: cargo fmt
+        entry: cargo fmt --
+        language: system
+        types: [rust]
+        pass_filenames: false
diff --git a/python/Makefile b/python/Makefile
index 4badaca8..3be8f905 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -30,6 +30,8 @@ setup-venv: ## Setup the virtualenv
 setup: ## Setup the requirements
 	$(info --- Setup dependencies ---)
 	pip install "$(MATURIN_VERSION)"
+	pip install pre-commit
+	pre-commit install
 
 .PHONY: build
 build: setup ## Build Python binding of delta-rs
diff --git a/python/hudi/_internal.pyi b/python/hudi/_internal.pyi
index fd97cc31..7d3e0a66 100644
--- a/python/hudi/_internal.pyi
+++ b/python/hudi/_internal.pyi
@@ -21,7 +21,6 @@
 import pyarrow
 
 __version__: str
-
 @dataclass(init=False)
 class HudiFileSlice:
     file_group_id: str
@@ -33,24 +32,16 @@
 
     def base_file_relative_path(self) -> str: ...
 
-
 @dataclass(init=False)
 class HudiTable:
-
     def __init__(
-            self,
-            table_uri: str,
-            options: Optional[Dict[str, str]] = None,
+        self,
+        table_uri: str,
+        options: Optional[Dict[str, str]] = None,
     ): ...
-
     def get_schema(self) -> "pyarrow.Schema": ...
-
     def split_file_slices(self, n: int) -> List[List[HudiFileSlice]]: ...
-
     def get_file_slices(self) -> List[HudiFileSlice]: ...
-
     def read_file_slice(self, base_file_relative_path) -> pyarrow.RecordBatch: ...
-
     def read_snapshot(self) -> List["pyarrow.RecordBatch"]: ...
-
     def read_snapshot_as_of(self, timestamp: str) -> List["pyarrow.RecordBatch"]: ...
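Note: once `make setup` has run `pre-commit install`, the hooks configured above run on every commit; `pre-commit run --all-files` applies them to the whole tree on demand.

As a reading aid for the stub above, a minimal usage sketch of the declared API. The table path is hypothetical, and the top-level `from hudi import HudiTable` re-export is assumed from the tests that follow:

```python
import pyarrow as pa

from hudi import HudiTable  # assumed top-level re-export of _internal.HudiTable

# Hypothetical path to an existing Hudi table.
table = HudiTable("/tmp/trips_table")

print(table.get_schema().names)  # field names of the pyarrow.Schema

for fs in table.get_file_slices():  # flat list of HudiFileSlice
    print(fs.commit_time, fs.num_records, fs.base_file_relative_path())

# Materialize the latest snapshot into a single pyarrow Table.
snapshot = pa.Table.from_batches(table.read_snapshot())
```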
diff --git a/python/tests/test_table_read.py b/python/tests/test_table_read.py
index e56463c5..d9947875 100644
--- a/python/tests/test_table_read.py
+++ b/python/tests/test_table_read.py
@@ -28,20 +28,32 @@ def test_sample_table(get_sample_table):
     table_path = get_sample_table
     table = HudiTable(table_path)
 
-    assert table.get_schema().names == ['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key',
-                                        '_hoodie_partition_path', '_hoodie_file_name', 'ts', 'uuid', 'rider', 'driver',
-                                        'fare', 'city']
+    assert table.get_schema().names == [
+        "_hoodie_commit_time",
+        "_hoodie_commit_seqno",
+        "_hoodie_record_key",
+        "_hoodie_partition_path",
+        "_hoodie_file_name",
+        "ts",
+        "uuid",
+        "rider",
+        "driver",
+        "fare",
+        "city",
+    ]
 
     file_slices = table.get_file_slices()
     assert len(file_slices) == 5
-    assert set(f.commit_time for f in file_slices) == {'20240402123035233', '20240402144910683'}
+    assert set(f.commit_time for f in file_slices) == {"20240402123035233", "20240402144910683"}
     assert all(f.num_records == 1 for f in file_slices)
     file_slice_paths = [f.base_file_relative_path() for f in file_slices]
-    assert set(file_slice_paths) == {'chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet',
-                                     'san_francisco/d9082ffd-2eb1-4394-aefc-deb4a61ecc57-0_1-9-0_20240402123035233.parquet',
-                                     'san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet',
-                                     'san_francisco/5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet',
-                                     'sao_paulo/ee915c68-d7f8-44f6-9759-e691add290d8-0_3-11-0_20240402123035233.parquet'}
+    assert set(file_slice_paths) == {
+        "chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet",
+        "san_francisco/d9082ffd-2eb1-4394-aefc-deb4a61ecc57-0_1-9-0_20240402123035233.parquet",
+        "san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet",
+        "san_francisco/5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet",
+        "sao_paulo/ee915c68-d7f8-44f6-9759-e691add290d8-0_3-11-0_20240402123035233.parquet",
+    }
 
     batch = table.read_file_slice(file_slice_paths[0])
     t = pa.Table.from_batches([batch])
@@ -54,28 +66,71 @@ def test_sample_table(get_sample_table):
     batches = table.read_snapshot()
     t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
-    assert t.to_pylist() == [{'_hoodie_commit_time': '20240402144910683', 'ts': 1695046462179,
-                              'uuid': '9909a8b1-2d15-4d3d-8ec9-efc48c536a00', 'fare': 339.0},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695091554788,
-                              'uuid': 'e96c4396-3fad-413a-a942-4cb36106d721', 'fare': 27.7},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695115999911,
-                              'uuid': 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa', 'fare': 17.85},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695159649087,
-                              'uuid': '334e26e9-8355-45cc-97c6-c31daf0df330', 'fare': 19.1},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695516137016,
-                              'uuid': 'e3cf430c-889d-4015-bc98-59bdce1e530c', 'fare': 34.15}]
+    assert t.to_pylist() == [
+        {
+            "_hoodie_commit_time": "20240402144910683",
+            "ts": 1695046462179,
+            "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
+            "fare": 339.0,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695091554788,
+            "uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
+            "fare": 27.7,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695115999911,
+            "uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
+            "fare": 17.85,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695159649087,
+            "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
+            "fare": 19.1,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695516137016,
+            "uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
+            "fare": 34.15,
+        },
+    ]
 
-    table = HudiTable(table_path, {
-        "hoodie.read.as.of.timestamp": "20240402123035233"})
+    table = HudiTable(table_path, {"hoodie.read.as.of.timestamp": "20240402123035233"})
 
     batches = table.read_snapshot()
     t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
-    assert t.to_pylist() == [{'_hoodie_commit_time': '20240402123035233', 'ts': 1695046462179,
-                              'uuid': '9909a8b1-2d15-4d3d-8ec9-efc48c536a00', 'fare': 33.9},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695091554788,
-                              'uuid': 'e96c4396-3fad-413a-a942-4cb36106d721', 'fare': 27.7},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695115999911,
-                              'uuid': 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa', 'fare': 17.85},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695159649087,
-                              'uuid': '334e26e9-8355-45cc-97c6-c31daf0df330', 'fare': 19.1},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695516137016,
-                              'uuid': 'e3cf430c-889d-4015-bc98-59bdce1e530c', 'fare': 34.15}]
+    assert t.to_pylist() == [
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695046462179,
+            "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
+            "fare": 33.9,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695091554788,
+            "uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
+            "fare": 27.7,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695115999911,
+            "uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
+            "fare": 17.85,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695159649087,
+            "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
+            "fare": 19.1,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695516137016,
+            "uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
+            "fare": 34.15,
+        },
+    ]
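The two snapshots above differ in the first row (fare 339.0 at the latest commit vs. 33.9 as of 20240402123035233), which is what pins the time-travel behavior. The stub also declares `read_snapshot_as_of`, which takes the timestamp directly and is presumably equivalent to the option-driven read; a hedged sketch of both spellings (the path is hypothetical, the re-export assumption as before):

```python
import pyarrow as pa

from hudi import HudiTable  # assumed top-level re-export, as in the tests

table_path = "/tmp/trips_table"  # hypothetical table location

# Option-driven time travel, as exercised in the test above.
as_of = HudiTable(table_path, {"hoodie.read.as.of.timestamp": "20240402123035233"})
t1 = pa.Table.from_batches(as_of.read_snapshot())

# The dedicated method declared in the stub, passing the timestamp directly.
t2 = pa.Table.from_batches(HudiTable(table_path).read_snapshot_as_of("20240402123035233"))
```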