Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-json
- id: check-added-large-files
- id: check-merge-conflict

- repo: https://github.com/psf/black
rev: 24.4.2
hooks:
- id: black
language_version: python3
args: ["--line-length", "120"]

- repo: https://github.com/PyCQA/flake8
rev: 7.1.0
hooks:
- id: flake8
additional_dependencies: [flake8-bugbear]
args: ["--max-line-length", "120", "--ignore=F401"]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.11.0
hooks:
- id: mypy
args: [ "--ignore-missing-imports" ]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can all the python tools settings be put in pyproject.toml ? we can use section like [tool.mypy], [tool.ruff] ? we can use ruff instead of flake8+black, much faster


- repo: local
hooks:
- id: cargo-fmt
name: cargo fmt
entry: cargo fmt --
language: system
types: [rust]
pass_filenames: false
2 changes: 2 additions & 0 deletions python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ setup-venv: ## Setup the virtualenv
setup: ## Setup the requirements
$(info --- Setup dependencies ---)
pip install "$(MATURIN_VERSION)"
pip install pre-commit
pre-commit install

.PHONY: build
build: setup ## Build Python binding of delta-rs
Expand Down
15 changes: 3 additions & 12 deletions python/hudi/_internal.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import pyarrow

__version__: str


@dataclass(init=False)
class HudiFileSlice:
file_group_id: str
Expand All @@ -33,24 +32,16 @@ class HudiFileSlice:

def base_file_relative_path(self) -> str: ...


@dataclass(init=False)
class HudiTable:

def __init__(
self,
table_uri: str,
options: Optional[Dict[str, str]] = None,
self,
table_uri: str,
options: Optional[Dict[str, str]] = None,
): ...

def get_schema(self) -> "pyarrow.Schema": ...

def split_file_slices(self, n: int) -> List[List[HudiFileSlice]]: ...

def get_file_slices(self) -> List[HudiFileSlice]: ...

def read_file_slice(self, base_file_relative_path) -> pyarrow.RecordBatch: ...

def read_snapshot(self) -> List["pyarrow.RecordBatch"]: ...

def read_snapshot_as_of(self, timestamp: str) -> List["pyarrow.RecordBatch"]: ...
117 changes: 86 additions & 31 deletions python/tests/test_table_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,32 @@ def test_sample_table(get_sample_table):
table_path = get_sample_table
table = HudiTable(table_path)

assert table.get_schema().names == ['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key',
'_hoodie_partition_path', '_hoodie_file_name', 'ts', 'uuid', 'rider', 'driver',
'fare', 'city']
assert table.get_schema().names == [
"_hoodie_commit_time",
"_hoodie_commit_seqno",
"_hoodie_record_key",
"_hoodie_partition_path",
"_hoodie_file_name",
"ts",
"uuid",
"rider",
"driver",
"fare",
"city",
]

file_slices = table.get_file_slices()
assert len(file_slices) == 5
assert set(f.commit_time for f in file_slices) == {'20240402123035233', '20240402144910683'}
assert set(f.commit_time for f in file_slices) == {"20240402123035233", "20240402144910683"}
assert all(f.num_records == 1 for f in file_slices)
file_slice_paths = [f.base_file_relative_path() for f in file_slices]
assert set(file_slice_paths) == {'chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet',
'san_francisco/d9082ffd-2eb1-4394-aefc-deb4a61ecc57-0_1-9-0_20240402123035233.parquet',
'san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet',
'san_francisco/5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet',
'sao_paulo/ee915c68-d7f8-44f6-9759-e691add290d8-0_3-11-0_20240402123035233.parquet'}
assert set(file_slice_paths) == {
"chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet",
"san_francisco/d9082ffd-2eb1-4394-aefc-deb4a61ecc57-0_1-9-0_20240402123035233.parquet",
"san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet",
"san_francisco/5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet",
"sao_paulo/ee915c68-d7f8-44f6-9759-e691add290d8-0_3-11-0_20240402123035233.parquet",
}

batch = table.read_file_slice(file_slice_paths[0])
t = pa.Table.from_batches([batch])
Expand All @@ -54,28 +66,71 @@ def test_sample_table(get_sample_table):

batches = table.read_snapshot()
t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
assert t.to_pylist() == [{'_hoodie_commit_time': '20240402144910683', 'ts': 1695046462179,
'uuid': '9909a8b1-2d15-4d3d-8ec9-efc48c536a00', 'fare': 339.0},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695091554788,
'uuid': 'e96c4396-3fad-413a-a942-4cb36106d721', 'fare': 27.7},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695115999911,
'uuid': 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa', 'fare': 17.85},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695159649087,
'uuid': '334e26e9-8355-45cc-97c6-c31daf0df330', 'fare': 19.1},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695516137016,
'uuid': 'e3cf430c-889d-4015-bc98-59bdce1e530c', 'fare': 34.15}]
assert t.to_pylist() == [
{
"_hoodie_commit_time": "20240402144910683",
"ts": 1695046462179,
"uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
"fare": 339.0,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695091554788,
"uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
"fare": 27.7,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695115999911,
"uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
"fare": 17.85,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695159649087,
"uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
"fare": 19.1,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695516137016,
"uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
"fare": 34.15,
},
]

table = HudiTable(table_path, {
"hoodie.read.as.of.timestamp": "20240402123035233"})
table = HudiTable(table_path, {"hoodie.read.as.of.timestamp": "20240402123035233"})
batches = table.read_snapshot()
t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
assert t.to_pylist() == [{'_hoodie_commit_time': '20240402123035233', 'ts': 1695046462179,
'uuid': '9909a8b1-2d15-4d3d-8ec9-efc48c536a00', 'fare': 33.9},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695091554788,
'uuid': 'e96c4396-3fad-413a-a942-4cb36106d721', 'fare': 27.7},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695115999911,
'uuid': 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa', 'fare': 17.85},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695159649087,
'uuid': '334e26e9-8355-45cc-97c6-c31daf0df330', 'fare': 19.1},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695516137016,
'uuid': 'e3cf430c-889d-4015-bc98-59bdce1e530c', 'fare': 34.15}]
assert t.to_pylist() == [
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695046462179,
"uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
"fare": 33.9,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695091554788,
"uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
"fare": 27.7,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695115999911,
"uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
"fare": 17.85,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695159649087,
"uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
"fare": 19.1,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695516137016,
"uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
"fare": 34.15,
},
]