Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/hudi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
HudiFileGroupReader,
HudiFileSlice,
HudiInstant,
HudiReadConfig,
HudiTable,
HudiTableConfig,
HudiTimeline,
)
from hudi._internal import __version__ as __version__
Expand All @@ -32,7 +34,9 @@
"HudiFileGroupReader",
"HudiFileSlice",
"HudiInstant",
"HudiReadConfig",
"HudiTable",
"HudiTableBuilder",
"HudiTableConfig",
"HudiTimeline",
]
33 changes: 33 additions & 0 deletions python/hudi/_internal.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,45 @@
# specific language governing permissions and limitations
# under the License.
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional, Tuple

import pyarrow # type: ignore

__version__: str

class HudiTableConfig(Enum):
    """Table-level configuration keys for Hudi tables.

    Each member's value is the literal property-key string; most of these
    keys are persisted in the table's ``hoodie.properties`` file.
    """

    # File layout and location.
    BASE_FILE_FORMAT = "hoodie.table.base.file.format"
    BASE_PATH = "hoodie.base.path"
    CHECKSUM = "hoodie.table.checksum"
    # Schema and naming.
    CREATE_SCHEMA = "hoodie.table.create.schema"
    DATABASE_NAME = "hoodie.database.name"
    # Partitioning behavior.
    DROPS_PARTITION_FIELDS = "hoodie.datasource.write.drop.partition.columns"
    IS_HIVE_STYLE_PARTITIONING = "hoodie.datasource.write.hive_style_partitioning"
    IS_PARTITION_PATH_URLENCODED = "hoodie.datasource.write.partitionpath.urlencode"
    # Key generation and record fields.
    KEY_GENERATOR_CLASS = "hoodie.table.keygenerator.class"
    PARTITION_FIELDS = "hoodie.table.partition.fields"
    PRECOMBINE_FIELD = "hoodie.table.precombine.field"
    POPULATES_META_FIELDS = "hoodie.populate.meta.fields"
    RECORD_KEY_FIELDS = "hoodie.table.recordkey.fields"
    RECORD_MERGE_STRATEGY = "hoodie.table.record.merge.strategy"
    # Table identity and versioning.
    TABLE_NAME = "hoodie.table.name"
    TABLE_TYPE = "hoodie.table.type"
    TABLE_VERSION = "hoodie.table.version"
    # Timeline layout.
    TIMELINE_LAYOUT_VERSION = "hoodie.timeline.layout.version"
    TIMELINE_TIMEZONE = "hoodie.table.timeline.timezone"

class HudiReadConfig(Enum):
    """Configurations for reading Hudi tables.

    Each member's value is the literal option-key string accepted by the
    table builder's option-setting methods.
    """

    FILE_GROUP_START_TIMESTAMP = "hoodie.read.file_group.start_timestamp"
    FILE_GROUP_END_TIMESTAMP = "hoodie.read.file_group.end_timestamp"
    INPUT_PARTITIONS = "hoodie.read.input.partitions"
    LISTING_PARALLELISM = "hoodie.read.listing.parallelism"
    USE_READ_OPTIMIZED_MODE = "hoodie.read.use.read_optimized.mode"

@dataclass(init=False)
class HudiFileGroupReader:
"""
Expand Down
22 changes: 14 additions & 8 deletions python/hudi/table/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
# specific language governing permissions and limitations
# under the License.
from dataclasses import dataclass, field
from typing import Dict, Optional
from typing import Dict, Optional, Union

from hudi._internal import HudiTable, build_hudi_table
from hudi._internal import HudiReadConfig, HudiTable, HudiTableConfig, build_hudi_table


@dataclass
Expand Down Expand Up @@ -57,18 +57,21 @@ def _add_options(
target_attr = getattr(self, f"{category}_options") if category else self.options
target_attr.update(options)

def with_hudi_option(
    self, k: Union[str, HudiTableConfig, HudiReadConfig], v: str
) -> "HudiTableBuilder":
    """
    Adds a Hudi option to the builder.

    Parameters:
        k (Union[str, HudiTableConfig, HudiReadConfig]): The key of the option.
            Enum members are resolved to their key string via ``.value``;
            plain strings are used as-is.
        v (str): The value of the option.

    Returns:
        HudiTableBuilder: The builder instance (fluent API).
    """
    # Accept config enums for type-safe keys while keeping the plain-string
    # API backward compatible.
    key = k.value if isinstance(k, (HudiTableConfig, HudiReadConfig)) else k
    self._add_options({key: v}, "hudi")
    return self

def with_hudi_options(self, hudi_options: Dict[str, str]) -> "HudiTableBuilder":
Expand Down Expand Up @@ -113,18 +116,21 @@ def with_storage_options(
self._add_options(storage_options, "storage")
return self

def with_option(
    self, k: Union[str, HudiTableConfig, HudiReadConfig], v: str
) -> "HudiTableBuilder":
    """
    Adds a generic option to the builder.

    Parameters:
        k (Union[str, HudiTableConfig, HudiReadConfig]): The key of the option.
            Enum members are resolved to their key string via ``.value``;
            plain strings are used as-is.
        v (str): The value of the option.

    Returns:
        HudiTableBuilder: The builder instance (fluent API).
    """
    # Mirror with_hudi_option: enums become their key string, strings pass
    # through unchanged, so existing string-based callers keep working.
    key = k.value if isinstance(k, (HudiTableConfig, HudiReadConfig)) else k
    self._add_options({key: v})
    return self

def with_options(self, options: Dict[str, str]) -> "HudiTableBuilder":
Expand Down
132 changes: 132 additions & 0 deletions python/src/internal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ use tokio::runtime::Runtime;

#[cfg(feature = "datafusion")]
use datafusion::error::DataFusionError;
use hudi::config::read::HudiReadConfig;
use hudi::config::table::HudiTableConfig;
use hudi::error::CoreError;
use hudi::file_group::file_slice::FileSlice;
use hudi::file_group::reader::FileGroupReader;
Expand Down Expand Up @@ -603,6 +605,136 @@ pub fn build_hudi_table(
Ok(HudiTable { inner })
}

/// Python-facing wrapper around the core `HudiTableConfig` enum.
///
/// Exposed to Python as `HudiTableConfig`; each `#[classattr]` constant in the
/// `#[pymethods]` block below wraps one core variant so the property-key
/// strings are defined only once, in the Rust crate.
#[cfg(not(tarpaulin_include))]
#[pyclass(name = "HudiTableConfig")]
#[derive(Clone, Debug)]
pub struct PyHudiTableConfig {
    // The wrapped core config entry; its key string is surfaced via `value`.
    inner: HudiTableConfig,
}

#[cfg(not(tarpaulin_include))]
#[pymethods]
impl PyHudiTableConfig {
    // One class attribute per core variant, named like a Python enum member.
    // NOTE(review): this is a plain pyclass, not a Python `enum.Enum`, even
    // though the `.pyi` stub models it as one — confirm stub/runtime parity.
    #[classattr]
    const BASE_FILE_FORMAT: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::BaseFileFormat,
    };
    #[classattr]
    const BASE_PATH: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::BasePath,
    };
    #[classattr]
    const CHECKSUM: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::Checksum,
    };
    #[classattr]
    const CREATE_SCHEMA: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::CreateSchema,
    };
    #[classattr]
    const DATABASE_NAME: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::DatabaseName,
    };
    #[classattr]
    const DROPS_PARTITION_FIELDS: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::DropsPartitionFields,
    };
    #[classattr]
    const IS_HIVE_STYLE_PARTITIONING: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::IsHiveStylePartitioning,
    };
    #[classattr]
    const IS_PARTITION_PATH_URLENCODED: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::IsPartitionPathUrlencoded,
    };
    #[classattr]
    const KEY_GENERATOR_CLASS: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::KeyGeneratorClass,
    };
    #[classattr]
    const PARTITION_FIELDS: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::PartitionFields,
    };
    #[classattr]
    const PRECOMBINE_FIELD: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::PrecombineField,
    };
    #[classattr]
    const POPULATES_META_FIELDS: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::PopulatesMetaFields,
    };
    #[classattr]
    const RECORD_KEY_FIELDS: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::RecordKeyFields,
    };
    #[classattr]
    const RECORD_MERGE_STRATEGY: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::RecordMergeStrategy,
    };
    #[classattr]
    const TABLE_NAME: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::TableName,
    };
    #[classattr]
    const TABLE_TYPE: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::TableType,
    };
    #[classattr]
    const TABLE_VERSION: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::TableVersion,
    };
    #[classattr]
    const TIMELINE_LAYOUT_VERSION: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::TimelineLayoutVersion,
    };
    #[classattr]
    const TIMELINE_TIMEZONE: PyHudiTableConfig = PyHudiTableConfig {
        inner: HudiTableConfig::TimelineTimezone,
    };

    /// Returns the underlying configuration key string
    /// (e.g. `"hoodie.table.name"` for `TABLE_NAME`), matching the
    /// `.value` attribute of a Python `Enum` member.
    #[getter]
    fn value(&self) -> String {
        self.inner.as_ref().to_string()
    }
}

/// Python-facing wrapper around the core `HudiReadConfig` enum.
///
/// Exposed to Python as `HudiReadConfig`; each `#[classattr]` constant below
/// wraps one core variant so the read-option key strings are defined only
/// once, in the Rust crate.
#[cfg(not(tarpaulin_include))]
#[pyclass(name = "HudiReadConfig")]
#[derive(Clone, Debug)]
pub struct PyHudiReadConfig {
    // The wrapped core config entry; its key string is surfaced via `value`.
    inner: HudiReadConfig,
}

#[cfg(not(tarpaulin_include))]
#[pymethods]
impl PyHudiReadConfig {
    // One class attribute per core variant, named like a Python enum member.
    #[classattr]
    const FILE_GROUP_START_TIMESTAMP: PyHudiReadConfig = PyHudiReadConfig {
        inner: HudiReadConfig::FileGroupStartTimestamp,
    };
    #[classattr]
    const FILE_GROUP_END_TIMESTAMP: PyHudiReadConfig = PyHudiReadConfig {
        inner: HudiReadConfig::FileGroupEndTimestamp,
    };
    #[classattr]
    const INPUT_PARTITIONS: PyHudiReadConfig = PyHudiReadConfig {
        inner: HudiReadConfig::InputPartitions,
    };
    #[classattr]
    const LISTING_PARALLELISM: PyHudiReadConfig = PyHudiReadConfig {
        inner: HudiReadConfig::ListingParallelism,
    };
    #[classattr]
    const USE_READ_OPTIMIZED_MODE: PyHudiReadConfig = PyHudiReadConfig {
        inner: HudiReadConfig::UseReadOptimizedMode,
    };

    /// Returns the underlying configuration key string
    /// (e.g. `"hoodie.read.input.partitions"` for `INPUT_PARTITIONS`),
    /// matching the `.value` attribute of a Python `Enum` member.
    #[getter]
    fn value(&self) -> String {
        self.inner.as_ref().to_string()
    }
}

#[cfg(not(tarpaulin_include))]
pub fn rt() -> &'static Runtime {
static TOKIO_RT: OnceLock<Runtime> = OnceLock::new();
Expand Down
7 changes: 6 additions & 1 deletion python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,16 @@ mod datafusion_internal;
fn _internal(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add("__version__", env!("CARGO_PKG_VERSION"))?;

use internal::{HudiFileGroupReader, HudiFileSlice, HudiInstant, HudiTable, HudiTimeline};
use internal::{
HudiFileGroupReader, HudiFileSlice, HudiInstant, HudiTable, HudiTimeline, PyHudiReadConfig,
PyHudiTableConfig,
};
m.add_class::<HudiFileGroupReader>()?;
m.add_class::<HudiFileSlice>()?;
m.add_class::<HudiInstant>()?;
m.add_class::<PyHudiReadConfig>()?;
m.add_class::<HudiTable>()?;
m.add_class::<PyHudiTableConfig>()?;
m.add_class::<HudiTimeline>()?;

#[cfg(feature = "datafusion")]
Expand Down
60 changes: 59 additions & 1 deletion python/tests/test_table_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pyarrow as pa
import pytest

from hudi import HudiTableBuilder
from hudi import HudiReadConfig, HudiTableBuilder, HudiTableConfig


@pytest.fixture
Expand Down Expand Up @@ -141,3 +141,61 @@ def test_setting_table_options(
table.hudi_options().get("hoodie.read.file_group.start_timestamp")
== "resolved value"
)


def test_with_hudi_option_enum(builder):
    """HudiTableConfig/HudiReadConfig members resolve to their key strings."""
    builder.with_hudi_option(HudiTableConfig.TABLE_NAME, "test_table")
    builder.with_hudi_option(HudiReadConfig.INPUT_PARTITIONS, "5")

    opts = builder.hudi_options
    assert opts["hoodie.table.name"] == "test_table"
    assert opts["hoodie.read.input.partitions"] == "5"


def test_with_option_enum(builder):
    """Enum keys are accepted by the generic with_option as well."""
    builder.with_option(HudiTableConfig.BASE_FILE_FORMAT, "parquet")
    builder.with_option(HudiReadConfig.LISTING_PARALLELISM, "10")

    opts = builder.options
    assert opts["hoodie.table.base.file.format"] == "parquet"
    assert opts["hoodie.read.listing.parallelism"] == "10"


def test_enum_values_match_expected_strings():
    """Test that enum values match the expected configuration key strings."""
    cases = [
        (HudiTableConfig.TABLE_NAME, "hoodie.table.name"),
        (HudiTableConfig.TABLE_TYPE, "hoodie.table.type"),
        (HudiTableConfig.BASE_FILE_FORMAT, "hoodie.table.base.file.format"),
        (HudiReadConfig.INPUT_PARTITIONS, "hoodie.read.input.partitions"),
        (HudiReadConfig.LISTING_PARALLELISM, "hoodie.read.listing.parallelism"),
        (HudiReadConfig.USE_READ_OPTIMIZED_MODE, "hoodie.read.use.read_optimized.mode"),
        (
            HudiReadConfig.FILE_GROUP_START_TIMESTAMP,
            "hoodie.read.file_group.start_timestamp",
        ),
        (
            HudiReadConfig.FILE_GROUP_END_TIMESTAMP,
            "hoodie.read.file_group.end_timestamp",
        ),
    ]
    for member, expected_key in cases:
        assert member.value == expected_key


def test_mixed_string_and_enum_usage(builder):
    """String keys and enum keys can be combined on the same builder."""
    builder.with_hudi_option(HudiTableConfig.TABLE_NAME, "enum_table")
    builder.with_hudi_option("custom.string.key", "string_value")

    got = builder.hudi_options
    assert got["custom.string.key"] == "string_value"
    assert got["hoodie.table.name"] == "enum_table"


def test_backward_compatibility(builder):
    """Plain string keys keep working exactly as before enum support."""
    builder.with_hudi_option("hoodie.table.name", "string_table").with_option(
        "hoodie.read.input.partitions", "8"
    )

    assert builder.hudi_options["hoodie.table.name"] == "string_table"
    assert builder.options["hoodie.read.input.partitions"] == "8"
Loading
Loading