diff --git a/metadata-ingestion/scripts/docs_config_table.py b/metadata-ingestion/scripts/docs_config_table.py index 3c73d11eac1cc3..67e60bb3d517c2 100644 --- a/metadata-ingestion/scripts/docs_config_table.py +++ b/metadata-ingestion/scripts/docs_config_table.py @@ -364,8 +364,7 @@ def should_hide_field(schema_field, current_source: str, schema_dict: Dict[str, properties = def_schema.get("properties", {}) if field_name in properties: field_schema = properties[field_name] - schema_extra = field_schema.get("schema_extra", {}) - supported_sources = schema_extra.get("supported_sources") + supported_sources = field_schema.get("supported_sources", {}) if supported_sources and current_source: return current_source.lower() not in [s.lower() for s in supported_sources] diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index 4cc8b1c79eb3f6..ae11023d80faf7 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -1,7 +1,8 @@ [mypy] plugins = ./tests/test_helpers/sqlalchemy_mypy_plugin.py, - pydantic.mypy + pydantic.mypy, + pydantic.v1.mypy exclude = ^(venv/|build/|dist/|examples/transforms/setup.py) ignore_missing_imports = yes namespace_packages = no diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index a4190617b80479..14393dfe03317c 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -18,10 +18,7 @@ "typing_extensions>=4.5.0", # Actual dependencies. "typing-inspect", - # pydantic 1.8.2 is incompatible with mypy 0.910. - # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. - # pydantic 1.10.3 is incompatible with typing-extensions 4.1.1 - https://github.com/pydantic/pydantic/issues/4885 - "pydantic>=1.10.0,!=1.10.3", + "pydantic>=2.4.0", "mixpanel>=4.9.0", # Airflow depends on fairly old versions of sentry-sdk, so we want to be loose with our constraints. "sentry-sdk", @@ -58,12 +55,6 @@ "ruamel.yaml", } -pydantic_no_v2 = { - # pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887 - # Tags sources that require the pydantic v2 API. - "pydantic<2", -} - rest_common = {"requests", "requests_file"} kafka_common = { @@ -119,7 +110,7 @@ dbt_common = { *sqlglot_lib, - "more_itertools", + "more-itertools", } cachetools_lib = { @@ -507,10 +498,7 @@ # It's technically wrong for packages to depend on setuptools. However, it seems mlflow does it anyways. "setuptools", }, - "datahub-debug": { - "dnspython==2.7.0", - "requests" - }, + "datahub-debug": {"dnspython==2.7.0", "requests"}, "mode": {"requests", "python-liquid", "tenacity>=8.0.1"} | sqlglot_lib, "mongodb": {"pymongo[srv]>=3.11", "packaging"}, "mssql": sql_common | mssql_common, @@ -729,7 +717,6 @@ if plugin for dependency in plugins[plugin] ), - *pydantic_no_v2, } dev_requirements = { @@ -964,30 +951,7 @@ extras_require={ "base": list(framework_common), **{ - plugin: list( - framework_common - | ( - # While pydantic v2 support is experimental, require that all plugins - # continue to use v1. This will ensure that no ingestion recipes break. - pydantic_no_v2 - if plugin - not in { - "airflow", - "datahub-rest", - "datahub-kafka", - "sync-file-emitter", - "sql-parser", - # Some sources have been manually tested for compatibility with pydantic v2. 
- "iceberg", - "feast", - "bigquery-slim", - "snowflake-slim", - "mysql", # tested in smoke-test - } - else set() - ) - | dependencies - ) + plugin: list(framework_common | dependencies) for (plugin, dependencies) in plugins.items() }, "all": list( diff --git a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py index 0c1eb820a1fcc1..59c6d41eb5a09a 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py +++ b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py @@ -17,6 +17,7 @@ import yaml from pydantic import ( BaseModel, + ConfigDict, Field, StrictStr, root_validator, @@ -66,33 +67,17 @@ StructuredPropertyUrn, TagUrn, ) -from datahub.pydantic.compat import ( - PYDANTIC_VERSION, -) from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities.urns.dataset_urn import DatasetUrn -logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class StrictModel(BaseModel): - """ - Base model with strict validation. - Compatible with both Pydantic v1 and v2. - """ - - if PYDANTIC_VERSION >= 2: - # Pydantic v2 config - model_config = { - "validate_assignment": True, - "extra": "forbid", - } - else: - # Pydantic v1 config - class Config: - validate_assignment = True - extra = "forbid" + model_config = ConfigDict( + validate_assignment=True, + extra="forbid", + ) # Define type aliases for the complex types @@ -134,7 +119,7 @@ def urn_strip(urn: str) -> str: class SchemaFieldSpecification(StrictModel): id: Optional[str] = None - urn: Optional[str] = None + urn: Optional[str] = Field(default=None, validate_default=True) structured_properties: Optional[StructuredProperties] = None type: Optional[str] = None nativeDataType: Optional[str] = None @@ -288,66 +273,33 @@ def _from_datahub_type( return "record" raise ValueError(f"Type {input_type} is not a valid primitive type") - if PYDANTIC_VERSION < 2: - - def dict(self, **kwargs): - """Custom dict method for Pydantic v1 to handle YAML serialization properly.""" - exclude = kwargs.pop("exclude", None) or set() - - # If description and doc are identical, exclude doc from the output - if self.description == self.doc and self.description is not None: - exclude.add("doc") - - # if nativeDataType and type are identical, exclude nativeDataType from the output - if self.nativeDataType == self.type and self.nativeDataType is not None: - exclude.add("nativeDataType") + def model_dump(self, **kwargs): + """Custom model_dump method for Pydantic v2 to handle YAML serialization properly.""" + exclude = kwargs.pop("exclude", None) or set() - # if the id is the same as the urn's fieldPath, exclude id from the output + # If description and doc are identical, exclude doc from the output + if self.description == self.doc and self.description is not None: + exclude.add("doc") - if self.urn: - field_urn = SchemaFieldUrn.from_string(self.urn) - if Dataset._simplify_field_path(field_urn.field_path) == self.id: - exclude.add("urn") + # if nativeDataType and type are identical, exclude nativeDataType from the output + if self.nativeDataType == self.type and self.nativeDataType is not None: + exclude.add("nativeDataType") - kwargs.pop("exclude_defaults", None) + # if the id is the same as the urn's fieldPath, exclude id from the output + if self.urn: + field_urn = SchemaFieldUrn.from_string(self.urn) + if Dataset._simplify_field_path(field_urn.field_path) == self.id: + exclude.add("urn") - self.structured_properties = ( - 
StructuredPropertiesHelper.simplify_structured_properties_list( - self.structured_properties - ) + self.structured_properties = ( + StructuredPropertiesHelper.simplify_structured_properties_list( + self.structured_properties ) - - return super().dict(exclude=exclude, exclude_defaults=True, **kwargs) - - else: - # For v2, implement model_dump with similar logic as dict - def model_dump(self, **kwargs): - """Custom model_dump method for Pydantic v2 to handle YAML serialization properly.""" - exclude = kwargs.pop("exclude", None) or set() - - # If description and doc are identical, exclude doc from the output - if self.description == self.doc and self.description is not None: - exclude.add("doc") - - # if nativeDataType and type are identical, exclude nativeDataType from the output - if self.nativeDataType == self.type and self.nativeDataType is not None: - exclude.add("nativeDataType") - - # if the id is the same as the urn's fieldPath, exclude id from the output - if self.urn: - field_urn = SchemaFieldUrn.from_string(self.urn) - if Dataset._simplify_field_path(field_urn.field_path) == self.id: - exclude.add("urn") - - self.structured_properties = ( - StructuredPropertiesHelper.simplify_structured_properties_list( - self.structured_properties - ) + ) + if hasattr(super(), "model_dump"): + return super().model_dump( # type: ignore + exclude=exclude, exclude_defaults=True, **kwargs ) - if hasattr(super(), "model_dump"): - return super().model_dump( # type: ignore - exclude=exclude, exclude_defaults=True, **kwargs - ) class SchemaSpecification(BaseModel): @@ -380,9 +332,9 @@ class Dataset(StrictModel): id: Optional[str] = None platform: Optional[str] = None env: str = "PROD" - urn: Optional[str] = None + urn: Optional[str] = Field(default=None, validate_default=True) description: Optional[str] = None - name: Optional[str] = None + name: Optional[str] = Field(default=None, validate_default=True) schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema") downstreams: Optional[List[str]] = None properties: Optional[Dict[str, str]] = None @@ -940,80 +892,42 @@ def from_datahub( downstreams=downstreams if config.include_downstreams else None, ) - if PYDANTIC_VERSION < 2: + def model_dump(self, **kwargs): + """Custom model_dump method to handle YAML serialization properly.""" + exclude = kwargs.pop("exclude", None) or set() - def dict(self, **kwargs): - """Custom dict method for Pydantic v1 to handle YAML serialization properly.""" - exclude = kwargs.pop("exclude", set()) + # If id and name are identical, exclude name from the output + if self.id == self.name and self.id is not None: + exclude.add("name") - # If id and name are identical, exclude name from the output - if self.id == self.name and self.id is not None: - exclude.add("name") - - # if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output - if self.subtypes and len(self.subtypes) == 1: - self.subtype = self.subtypes[0] - exclude.add("subtypes") + # if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output + if self.subtypes and len(self.subtypes) == 1: + self.subtype = self.subtypes[0] + exclude.add("subtypes") + if hasattr(super(), "model_dump"): + result = super().model_dump(exclude=exclude, **kwargs) # type: ignore + else: result = super().dict(exclude=exclude, **kwargs) - # Custom handling for schema_metadata/schema - if self.schema_metadata and "schema" in result: - schema_data = result["schema"] - - # Handle fields 
if they exist - if "fields" in schema_data and isinstance(schema_data["fields"], list): - # Process each field using its custom dict method - processed_fields = [] - if self.schema_metadata and self.schema_metadata.fields: - for field in self.schema_metadata.fields: - if field: - # Use dict method for Pydantic v1 - processed_field = field.dict(**kwargs) - processed_fields.append(processed_field) - - # Replace the fields in the result with the processed ones - schema_data["fields"] = processed_fields - - return result - else: - - def model_dump(self, **kwargs): - """Custom model_dump method for Pydantic v2 to handle YAML serialization properly.""" - exclude = kwargs.pop("exclude", None) or set() - - # If id and name are identical, exclude name from the output - if self.id == self.name and self.id is not None: - exclude.add("name") - - # if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output - if self.subtypes and len(self.subtypes) == 1: - self.subtype = self.subtypes[0] - exclude.add("subtypes") - - if hasattr(super(), "model_dump"): - result = super().model_dump(exclude=exclude, **kwargs) # type: ignore - else: - result = super().dict(exclude=exclude, **kwargs) - - # Custom handling for schema_metadata/schema - if self.schema_metadata and "schema" in result: - schema_data = result["schema"] + # Custom handling for schema_metadata/schema + if self.schema_metadata and "schema" in result: + schema_data = result["schema"] - # Handle fields if they exist - if "fields" in schema_data and isinstance(schema_data["fields"], list): - # Process each field using its custom model_dump method - processed_fields = [] - if self.schema_metadata and self.schema_metadata.fields: - for field in self.schema_metadata.fields: - if field: - processed_field = field.model_dump(**kwargs) - processed_fields.append(processed_field) + # Handle fields if they exist + if "fields" in schema_data and isinstance(schema_data["fields"], list): + # Process each field using its custom model_dump method + processed_fields = [] + if self.schema_metadata and self.schema_metadata.fields: + for field in self.schema_metadata.fields: + if field: + processed_field = field.model_dump(**kwargs) + processed_fields.append(processed_field) - # Replace the fields in the result with the processed ones - schema_data["fields"] = processed_fields + # Replace the fields in the result with the processed ones + schema_data["fields"] = processed_fields - return result + return result def to_yaml( self, @@ -1025,13 +939,7 @@ def to_yaml( Returns True if file was written, False if no changes were detected. 
""" # Create new model data - # Create new model data - choose dict() or model_dump() based on Pydantic version - if PYDANTIC_VERSION >= 2: - new_data = self.model_dump( - exclude_none=True, exclude_unset=True, by_alias=True - ) - else: - new_data = self.dict(exclude_none=True, exclude_unset=True, by_alias=True) + new_data = self.model_dump(exclude_none=True, exclude_unset=True, by_alias=True) # Set up ruamel.yaml for preserving comments yaml_handler = YAML(typ="rt") # round-trip mode diff --git a/metadata-ingestion/src/datahub/api/entities/external/restricted_text.py b/metadata-ingestion/src/datahub/api/entities/external/restricted_text.py index 99e5534c65a4c7..4eb0b17a513650 100644 --- a/metadata-ingestion/src/datahub/api/entities/external/restricted_text.py +++ b/metadata-ingestion/src/datahub/api/entities/external/restricted_text.py @@ -13,20 +13,8 @@ from typing import Any, ClassVar, Optional, Set, Union -# Check Pydantic version and import accordingly -try: - from pydantic import VERSION - - PYDANTIC_V2 = int(VERSION.split(".")[0]) >= 2 -except (ImportError, AttributeError): - # Fallback for older versions that don't have VERSION - PYDANTIC_V2 = False - -if PYDANTIC_V2: - from pydantic import GetCoreSchemaHandler # type: ignore[attr-defined] - from pydantic_core import core_schema -else: - from pydantic.validators import str_validator +from pydantic import GetCoreSchemaHandler +from pydantic_core import core_schema class RestrictedTextConfig: @@ -179,69 +167,24 @@ def with_config( truncation_suffix=truncation_suffix, ) - # Pydantic v2 methods - if PYDANTIC_V2: - - @classmethod - def _validate( - cls, - __input_value: Union[str, "RestrictedText"], - _: core_schema.ValidationInfo, - ) -> "RestrictedText": - """Validate and create a RestrictedText instance.""" - if isinstance(__input_value, RestrictedText): - return __input_value - return cls(__input_value) - - @classmethod - def __get_pydantic_core_schema__( - cls, source: type[Any], handler: GetCoreSchemaHandler - ) -> core_schema.CoreSchema: - """Get the Pydantic core schema for this type.""" - return core_schema.with_info_after_validator_function( - cls._validate, - core_schema.str_schema(), - field_name=cls.__name__, - ) - - # Pydantic v1 methods - else: - - @classmethod - def __get_validators__(cls): - """Pydantic v1 validator method.""" - yield cls.validate - - @classmethod - def validate(cls, v, field=None): - """Validate and create a RestrictedText instance for Pydantic v1.""" - if isinstance(v, RestrictedText): - return v - - if not isinstance(v, str): - # Let pydantic handle the string validation - v = str_validator(v) - - # Create instance - instance = cls(v) - - # Check if there's a field default that contains configuration - if ( - field - and hasattr(field, "default") - and isinstance(field.default, RestrictedTextConfig) - ): - config = field.default - instance._configure( - max_length=config.max_length, - forbidden_chars=config.forbidden_chars, - replacement_char=config.replacement_char, - truncation_suffix=config.truncation_suffix, - ) - - return instance - - @classmethod - def __modify_schema__(cls, field_schema): - """Modify the JSON schema for Pydantic v1.""" - field_schema.update(type="string", examples=["example string"]) + @classmethod + def _validate( + cls, + __input_value: Union[str, "RestrictedText"], + _: core_schema.ValidationInfo, + ) -> "RestrictedText": + """Validate and create a RestrictedText instance.""" + if isinstance(__input_value, RestrictedText): + return __input_value + return cls(__input_value) 
+ + @classmethod + def __get_pydantic_core_schema__( + cls, source_type: Any, handler: GetCoreSchemaHandler + ) -> core_schema.CoreSchema: + """Get the Pydantic core schema for this type.""" + return core_schema.with_info_after_validator_function( + cls._validate, + core_schema.str_schema(), + field_name=cls.__name__, + ) diff --git a/metadata-ingestion/src/datahub/cli/quickstart_versioning.py b/metadata-ingestion/src/datahub/cli/quickstart_versioning.py index 9739af5127f4d1..16bf7559d45ef8 100644 --- a/metadata-ingestion/src/datahub/cli/quickstart_versioning.py +++ b/metadata-ingestion/src/datahub/cli/quickstart_versioning.py @@ -21,7 +21,7 @@ class QuickstartExecutionPlan(BaseModel): composefile_git_ref: str docker_tag: str - mysql_tag: Optional[str] + mysql_tag: Optional[str] = None def _is_it_a_version(version: str) -> bool: diff --git a/metadata-ingestion/src/datahub/configuration/_config_enum.py b/metadata-ingestion/src/datahub/configuration/_config_enum.py index 190a006b077d9f..3c9349382bc4ea 100644 --- a/metadata-ingestion/src/datahub/configuration/_config_enum.py +++ b/metadata-ingestion/src/datahub/configuration/_config_enum.py @@ -1,10 +1,8 @@ from enum import Enum +from typing import Any -import pydantic -import pydantic.types -import pydantic.validators - -from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 +from pydantic import GetCoreSchemaHandler +from pydantic_core import CoreSchema, core_schema class ConfigEnum(Enum): @@ -17,25 +15,13 @@ def _generate_next_value_( # type: ignore # From https://stackoverflow.com/a/44785241/5004662. return name - if PYDANTIC_VERSION_2: - # if TYPE_CHECKING: - # from pydantic import GetCoreSchemaHandler - - @classmethod - def __get_pydantic_core_schema__(cls, source_type, handler): # type: ignore - from pydantic_core import core_schema - - return core_schema.no_info_before_validator_function( - cls.validate, handler(source_type) - ) - - else: - - @classmethod - def __get_validators__(cls) -> "pydantic.types.CallableGenerator": - # We convert the text to uppercase before attempting to match it to an enum value. - yield cls.validate - yield pydantic.validators.enum_member_validator + @classmethod + def __get_pydantic_core_schema__( + cls, source_type: Any, handler: GetCoreSchemaHandler + ) -> CoreSchema: + return core_schema.no_info_before_validator_function( + cls.validate, handler(source_type) + ) @classmethod def validate(cls, v): # type: ignore[no-untyped-def] diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index aae894f1c5370e..bca4162ae70432 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -14,14 +14,12 @@ runtime_checkable, ) -import pydantic from cached_property import cached_property -from pydantic import BaseModel, Extra, ValidationError +from pydantic import BaseModel, ValidationError from pydantic.fields import Field from typing_extensions import Protocol, Self from datahub.configuration._config_enum import ConfigEnum as ConfigEnum -from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.utilities.dedup_list import deduplicate_list REDACT_KEYS = { @@ -97,35 +95,22 @@ def _schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None: for key in remove_fields: del schema["properties"][key] - # This is purely to suppress pydantic's warnings, since this class is used everywhere. 
- if PYDANTIC_VERSION_2: - extra = "forbid" - ignored_types = (cached_property,) - json_schema_extra = _schema_extra - else: - extra = Extra.forbid - underscore_attrs_are_private = True - keep_untouched = ( - cached_property, - ) # needed to allow cached_property to work. See https://github.com/samuelcolvin/pydantic/issues/1241 for more info. - schema_extra = _schema_extra + extra = "forbid" + ignored_types = (cached_property,) + json_schema_extra = _schema_extra @classmethod def parse_obj_allow_extras(cls, obj: Any) -> Self: - if PYDANTIC_VERSION_2: - try: - with unittest.mock.patch.dict( - cls.model_config, # type: ignore - {"extra": "allow"}, - clear=False, - ): - cls.model_rebuild(force=True) # type: ignore - return cls.parse_obj(obj) - finally: + try: + with unittest.mock.patch.dict( + cls.model_config, # type: ignore + {"extra": "allow"}, + clear=False, + ): cls.model_rebuild(force=True) # type: ignore - else: - with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow): return cls.parse_obj(obj) + finally: + cls.model_rebuild(force=True) # type: ignore class PermissiveConfigModel(ConfigModel): @@ -135,10 +120,7 @@ class PermissiveConfigModel(ConfigModel): # It is usually used for argument bags that are passed through to third-party libraries. class Config: - if PYDANTIC_VERSION_2: # noqa: SIM108 - extra = "allow" - else: - extra = Extra.allow + extra = "allow" class TransformerSemantics(ConfigEnum): diff --git a/metadata-ingestion/src/datahub/configuration/import_resolver.py b/metadata-ingestion/src/datahub/configuration/import_resolver.py index 19627c7b8c9569..6708121adc7d0e 100644 --- a/metadata-ingestion/src/datahub/configuration/import_resolver.py +++ b/metadata-ingestion/src/datahub/configuration/import_resolver.py @@ -12,4 +12,4 @@ def _pydantic_resolver(v: Union[T, str]) -> T: def pydantic_resolve_key(field: str) -> classmethod: - return pydantic.validator(field, pre=True, allow_reuse=True)(_pydantic_resolver) + return pydantic.validator(field, pre=True, allow_reuse=True)(_pydantic_resolver) # type: ignore diff --git a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py index bd931abe2e84d1..a6e5ef7067ddfb 100644 --- a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py +++ b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py @@ -1,40 +1,11 @@ -import pydantic.version -from packaging.version import Version - -PYDANTIC_VERSION_2: bool -if Version(pydantic.version.VERSION) >= Version("2.0"): - PYDANTIC_VERSION_2 = True -else: - PYDANTIC_VERSION_2 = False - - -# This can be used to silence deprecation warnings while we migrate. 
-if PYDANTIC_VERSION_2: - from pydantic import PydanticDeprecatedSince20 # type: ignore -else: - - class PydanticDeprecatedSince20(Warning): # type: ignore - pass - - -if PYDANTIC_VERSION_2: - from pydantic import BaseModel as GenericModel - from pydantic.v1 import ( # type: ignore - BaseModel as v1_BaseModel, - Extra as v1_Extra, - Field as v1_Field, - root_validator as v1_root_validator, - validator as v1_validator, - ) -else: - from pydantic import ( # type: ignore - BaseModel as v1_BaseModel, - Extra as v1_Extra, - Field as v1_Field, - root_validator as v1_root_validator, - validator as v1_validator, - ) - from pydantic.generics import GenericModel # type: ignore +from pydantic import PydanticDeprecatedSince20 +from pydantic.v1 import ( # type: ignore + BaseModel as v1_BaseModel, + Extra as v1_Extra, + Field as v1_Field, + root_validator as v1_root_validator, + validator as v1_validator, +) class v1_ConfigModel(v1_BaseModel): @@ -48,10 +19,10 @@ class Config: underscore_attrs_are_private = True +# TODO: Remove the warning ignore on PydanticDeprecatedSince20. + __all__ = [ - "PYDANTIC_VERSION_2", "PydanticDeprecatedSince20", - "GenericModel", "v1_ConfigModel", "v1_Field", "v1_root_validator", diff --git a/metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py b/metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py index 6134c4dab48174..bfc5423ae9382f 100644 --- a/metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py +++ b/metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py @@ -31,4 +31,4 @@ def _validate_deprecated(cls: Type, values: dict) -> dict: # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264 # This hack ensures that multiple field deprecated do not overwrite each other. _validate_deprecated.__name__ = f"{_validate_deprecated.__name__}_{field}" - return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_deprecated) + return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_deprecated) # type: ignore diff --git a/metadata-ingestion/src/datahub/configuration/validate_field_removal.py b/metadata-ingestion/src/datahub/configuration/validate_field_removal.py index 6b8f0df0588e68..ace3dbf94b925e 100644 --- a/metadata-ingestion/src/datahub/configuration/validate_field_removal.py +++ b/metadata-ingestion/src/datahub/configuration/validate_field_removal.py @@ -25,4 +25,4 @@ def _validate_field_removal(cls: Type, values: dict) -> dict: # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264 # This hack ensures that multiple field removals do not overwrite each other. _validate_field_removal.__name__ = f"{_validate_field_removal.__name__}_{field}" - return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_removal) + return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_removal) # type: ignore diff --git a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py index de2a16e9bf247d..a1ad06ed1cb2bd 100644 --- a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py +++ b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py @@ -49,6 +49,4 @@ def _validate_field_rename(cls: Type, values: dict) -> dict: # validator with pre=True gets all the values that were passed in. 
# Given that a renamed field doesn't show up in the fields list, we can't use # the field-level validator, even with a different field name. - return pydantic.root_validator(pre=True, skip_on_failure=True, allow_reuse=True)( - _validate_field_rename - ) + return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename) # type: ignore diff --git a/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py b/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py index 0baaf4f0264b99..034d9dfdd5cc20 100644 --- a/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py +++ b/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py @@ -28,4 +28,4 @@ def _validate_field( # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264 # This hack ensures that multiple field deprecated do not overwrite each other. _validate_field.__name__ = f"{_validate_field.__name__}_{field}" - return pydantic.validator(field, pre=True, allow_reuse=True)(_validate_field) + return pydantic.validator(field, pre=True, allow_reuse=True)(_validate_field) # type: ignore diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py index ba03083854e785..a6e92493a07c37 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py @@ -7,7 +7,6 @@ from pydantic.fields import Field from datahub.configuration.common import ConfigModel -from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.ingestion.glossary.classifier import Classifier from datahub.utilities.str_enum import StrEnum @@ -51,10 +50,7 @@ class ValuesFactorConfig(ConfigModel): class PredictionFactorsAndWeights(ConfigModel): class Config: - if PYDANTIC_VERSION_2: - populate_by_name = True - else: - allow_population_by_field_name = True + populate_by_name = True Name: float = Field(alias="name") Description: float = Field(alias="description") @@ -64,10 +60,7 @@ class Config: class InfoTypeConfig(ConfigModel): class Config: - if PYDANTIC_VERSION_2: - populate_by_name = True - else: - allow_population_by_field_name = True + populate_by_name = True Prediction_Factors_and_Weights: PredictionFactorsAndWeights = Field( description="Factors and their weights to consider when predicting info types", diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py index 4982bef370d92b..067dff70cc990f 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py @@ -85,7 +85,9 @@ class PipelineConfig(ConfigModel): source: SourceConfig sink: Optional[DynamicTypedConfig] = None transformers: Optional[List[DynamicTypedConfig]] = None - flags: FlagsConfig = Field(default=FlagsConfig(), hidden_from_docs=True) + flags: FlagsConfig = Field( + default=FlagsConfig(), json_schema_extra={"hidden_from_docs": True} + ) reporting: List[ReporterConfig] = [] run_id: str = DEFAULT_RUN_ID datahub_api: Optional[DatahubClientConfig] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/abs/config.py b/metadata-ingestion/src/datahub/ingestion/source/abs/config.py index 583876a8dda6c0..0df1644ddcffa2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/abs/config.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/abs/config.py @@ -151,7 +151,7 @@ def platform_not_empty(cls, platform: Any, values: dict) -> str: raise ValueError("platform must not be empty") return platform - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def ensure_profiling_pattern_is_passed_to_profiling( cls, values: Dict[str, Any] ) -> Dict[str, Any]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/abs/datalake_profiler_config.py b/metadata-ingestion/src/datahub/ingestion/source/abs/datalake_profiler_config.py index d12ff7415faefc..58e930eb6e809c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/abs/datalake_profiler_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/abs/datalake_profiler_config.py @@ -72,7 +72,7 @@ class DataLakeProfilerConfig(ConfigModel): description="Whether to profile for the sample values for all columns.", ) - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def ensure_field_level_settings_are_normalized( cls: "DataLakeProfilerConfig", values: Dict[str, Any] ) -> Dict[str, Any]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/tag_entities.py b/metadata-ingestion/src/datahub/ingestion/source/aws/tag_entities.py index b122388851d2ea..e3f858ef7032d2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/tag_entities.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/tag_entities.py @@ -37,7 +37,7 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId): tag_key: str tag_value: Optional[str] = None - platform_instance: Optional[str] + platform_instance: Optional[str] = None catalog: Optional[str] = None exists_in_lake_formation: bool = False persisted: bool = False @@ -227,7 +227,7 @@ class LakeFormationTagPlatformResource(BaseModel, ExternalEntity): datahub_urns: LinkedResourceSet managed_by_datahub: bool id: LakeFormationTagPlatformResourceId - allowed_values: Optional[List[str]] + allowed_values: Optional[List[str]] = None def get_id(self) -> ExternalEntityId: return self.id diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py index 46de4e09d7ee5b..546ae04e991556 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py @@ -81,7 +81,7 @@ def get_credentials( ) return self.sas_token if self.sas_token is not None else self.account_key - @root_validator() + @root_validator(skip_on_failure=True) def _check_credential_values(cls, values: Dict) -> Dict: if ( values.get("account_key") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 29d13da550a0cc..560d301445cc62 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -184,7 +184,7 @@ class BigQueryFilterConfig(SQLFilterConfig): # NOTE: `schema_pattern` is added here only to hide it from docs. 
schema_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) @root_validator(pre=False, skip_on_failure=True) @@ -321,7 +321,7 @@ class BigQueryV2Config( ) number_of_datasets_process_in_batch: int = Field( - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, default=10000, description="Number of table queried in batch when getting metadata. This is a low level config property " "which should be touched with care.", @@ -440,13 +440,13 @@ def have_table_data_read_permission(self) -> bool: ) run_optimized_column_query: bool = Field( - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, default=False, description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.", ) file_backed_cache_size: int = Field( - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, default=2000, description="Maximum number of entries for the in-memory caches of FileBacked data structures.", ) @@ -459,7 +459,7 @@ def have_table_data_read_permission(self) -> bool: schema_resolution_batch_size: int = Field( default=100, description="The number of tables to process in a batch when resolving schema from DataHub.", - hidden_from_schema=True, + json_schema_extra={"hidden_from_docs": True}, ) max_threads_dataset_parallelism: int = Field( @@ -518,6 +518,6 @@ def validate_bigquery_audit_metadata_datasets( def get_table_pattern(self, pattern: List[str]) -> str: return "|".join(pattern) if pattern else "" - platform_instance_not_supported_for_bigquery = pydantic_removed_field( + platform_instance_not_supported_for_bigquery: classmethod = pydantic_removed_field( "platform_instance" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index 0f9471219c6590..a7bff0ee643dc9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -91,7 +91,7 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig): description="Local path to store the audit log.", # TODO: For now, this is simply an advanced config to make local testing easier. # Eventually, we will want to store date-specific files in the directory and use it as a cache. - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) user_email_pattern: AllowDenyPattern = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py index d09bdb07449776..2a7544e1c4953c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py @@ -62,8 +62,8 @@ class SortKey(ConfigModel): date_format: Optional[str] = Field( default=None, - type=str, description="The date format to use when sorting. This is used to parse the date from the key. 
The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.", + validate_default=True, ) @pydantic.validator("date_format", always=True) @@ -110,7 +110,7 @@ class Config: # This is not used yet, but will be used in the future to sort the partitions sort_key: Optional[SortKey] = Field( - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, default=None, description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.", ) @@ -260,7 +260,7 @@ def get_folder_named_vars( ) -> Union[None, parse.Result, parse.Match]: return self.compiled_folder_include.parse(path) - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def validate_no_double_stars(cls, values: Dict) -> Dict: if "include" not in values: return values @@ -476,6 +476,7 @@ def glob_include(self): return glob_include @pydantic.root_validator(skip_on_failure=True) + @classmethod def validate_path_spec(cls, values: Dict) -> Dict[str, Any]: # validate that main fields are populated required_fields = ["include", "file_types", "default_extension"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py index d5f41cc2eb5a41..265dc5a607aad7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -101,13 +101,13 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): pull_from_datahub_api: bool = Field( default=False, description="Use the DataHub API to fetch versioned aspects.", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) max_workers: int = Field( default=5 * (os.cpu_count() or 4), description="Number of worker threads to use for datahub api ingestion.", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern()) @@ -119,7 +119,7 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): ) structured_properties_template_cache_invalidation_interval: int = Field( - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, default=60, description="Interval in seconds to invalidate the structured properties template cache.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py index a5a35f9750d07a..5c57c59af2b99b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py @@ -102,7 +102,7 @@ class ProfileConfig(GEProfilingBaseConfig): ) include_field_median_value: bool = Field( default=False, - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, description="Median causes a number of issues in Dremio.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 7b9bd9dd097709..5d6b0dc56905c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -125,7 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig): description="Profile table only if it has 
been updated since these many number of days. " "If set to `null`, no constraint of last modified time for tables to profile. " "Supported only in `snowflake` and `BigQuery`.", - schema_extra={"supported_sources": ["snowflake", "bigquery"]}, + json_schema_extra={"supported_sources": ["snowflake", "bigquery"]}, ) profile_table_size_limit: Optional[int] = Field( @@ -133,7 +133,7 @@ class GEProfilingConfig(GEProfilingBaseConfig): description="Profile tables only if their size is less than specified GBs. If set to `null`, " "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and " "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.", - schema_extra={ + json_schema_extra={ "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"] }, ) @@ -143,14 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig): description="Profile tables only if their row count is less than specified count. " "If set to `null`, no limit on the row count of tables to profile. Supported only in " "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.", - schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]}, + json_schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]}, ) profile_table_row_count_estimate_only: bool = Field( default=False, description="Use an approximate query for row count. This will be much faster but slightly " "less accurate. Only supported for Postgres and MySQL. ", - schema_extra={"supported_sources": ["postgres", "mysql"]}, + json_schema_extra={"supported_sources": ["postgres", "mysql"]}, ) # The query combiner enables us to combine multiple queries into a single query, @@ -167,32 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig): default=True, description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. " "If enabled, latest partition data is used for profiling.", - schema_extra={"supported_sources": ["athena", "bigquery"]}, + json_schema_extra={"supported_sources": ["athena", "bigquery"]}, ) partition_datetime: Optional[datetime.datetime] = Field( default=None, description="If specified, profile only the partition which matches this datetime. " "If not specified, profile the latest partition. Only Bigquery supports this.", - schema_extra={"supported_sources": ["bigquery"]}, + json_schema_extra={"supported_sources": ["bigquery"]}, ) use_sampling: bool = Field( default=True, description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. " "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ", - schema_extra={"supported_sources": ["bigquery", "snowflake"]}, + json_schema_extra={"supported_sources": ["bigquery", "snowflake"]}, ) sample_size: int = Field( default=10000, description="Number of rows to be sampled from table for column level profiling." "Applicable only if `use_sampling` is set to True.", - schema_extra={"supported_sources": ["bigquery", "snowflake"]}, + json_schema_extra={"supported_sources": ["bigquery", "snowflake"]}, ) profile_external_tables: bool = Field( default=False, description="Whether to profile external tables. 
Only Snowflake and Redshift supports this.", - schema_extra={"supported_sources": ["redshift", "snowflake"]}, + json_schema_extra={"supported_sources": ["redshift", "snowflake"]}, ) tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/hex/hex.py b/metadata-ingestion/src/datahub/ingestion/source/hex/hex.py index 808a1af767ed05..b02ce3f089ce18 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/hex/hex.py +++ b/metadata-ingestion/src/datahub/ingestion/source/hex/hex.py @@ -68,7 +68,7 @@ class HexSourceConfig( ) include_components: bool = Field( default=True, - desciption="Include Hex Components in the ingestion", + description="Include Hex Components in the ingestion", ) page_size: int = Field( default=HEX_API_PAGE_SIZE_DEFAULT, diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index c97840950af2aa..d84b5fa7f6875c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -524,11 +524,11 @@ def _get_dataset_properties_aspect( custom_properties["format-version"] = str(table.metadata.format_version) custom_properties["partition-spec"] = str(self._get_partition_aspect(table)) last_modified: Optional[int] = table.metadata.last_updated_ms - if table.current_snapshot(): - custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id) - custom_properties["manifest-list"] = table.current_snapshot().manifest_list + if current_snapshot := table.current_snapshot(): + custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id) + custom_properties["manifest-list"] = current_snapshot.manifest_list if not last_modified: - last_modified = int(table.current_snapshot().timestamp_ms) + last_modified = int(current_snapshot.timestamp_ms) if "created-at" in custom_properties: try: dt = dateutil_parser.isoparse(custom_properties["created-at"]) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index 0f8d86a2cbd295..b3168b2e4adae5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -6,6 +6,7 @@ import pydantic from looker_sdk.sdk.api40.models import DBConnection from pydantic import Field, validator +from pydantic_core import core_schema from datahub.configuration import ConfigModel from datahub.configuration.common import AllowDenyPattern, ConfigurationError @@ -31,23 +32,29 @@ class NamingPattern(ConfigModel): pattern: str @classmethod - def __get_validators__(cls): - yield cls.pydantic_accept_raw_pattern - yield cls.validate - yield cls.pydantic_validate_pattern - - @classmethod - def pydantic_accept_raw_pattern(cls, v): - if isinstance(v, (NamingPattern, dict)): - return v - assert isinstance(v, str), "pattern must be a string" - return {"pattern": v} - - @classmethod - def pydantic_validate_pattern(cls, v): - assert isinstance(v, NamingPattern) - assert v.validate_pattern(cls.REQUIRE_AT_LEAST_ONE_VAR) - return v + def __get_pydantic_core_schema__( + cls, source_type: Any, handler: Any + ) -> core_schema.CoreSchema: + """Pydantic v2 core schema for NamingPattern validation.""" + + def validate_string_input(value: Any) -> Union[NamingPattern, dict]: + if isinstance(value, (cls, dict)): + return value 
+ if isinstance(value, str): + return {"pattern": value} + raise ValueError("pattern must be a string") + + def validate_pattern_rules(instance: "NamingPattern") -> "NamingPattern": + instance.validate_pattern(cls.REQUIRE_AT_LEAST_ONE_VAR) + return instance + + return core_schema.chain_schema( + [ + core_schema.no_info_plain_validator_function(validate_string_input), + handler(cls), + core_schema.no_info_plain_validator_function(validate_pattern_rules), + ] + ) @classmethod def allowed_docstring(cls) -> str: @@ -136,7 +143,7 @@ class LookerCommonConfig(EnvConfigMixin, PlatformInstanceConfigMixin): # TODO: This shouldn't be part of the config. "looker", description="Default platform name.", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) extract_column_level_lineage: bool = Field( True, diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index a2f012c1fa0a65..7411c3c2c4a867 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -203,7 +203,7 @@ class ModeConfig( items_per_page: int = Field( default=DEFAULT_API_ITEMS_PER_PAGE, description="Number of items per page for paginated API requests.", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) @validator("connect_uri") diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 64f6b130e0fe79..59b51fed26996f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -57,7 +57,6 @@ ) log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) _type_mapping: Dict[Union[Type, str], Type] = { "list": UnionTypeClass, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 3805d744ccb526..d318c0bfcf8e16 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -292,12 +292,12 @@ class PowerBiDashboardSourceConfig( StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin ): platform_name: str = pydantic.Field( - default=Constant.PLATFORM_NAME, hidden_from_docs=True + default=Constant.PLATFORM_NAME, json_schema_extra={"hidden_from_docs": True} ) platform_urn: str = pydantic.Field( default=builder.make_data_platform_urn(platform=Constant.PLATFORM_NAME), - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) # Organization Identifier @@ -306,7 +306,7 @@ class PowerBiDashboardSourceConfig( workspace_id: Optional[str] = pydantic.Field( default=None, description="[deprecated] Use workspace_id_pattern instead", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) # PowerBi workspace identifier workspace_id_pattern: AllowDenyPattern = pydantic.Field( @@ -333,7 +333,7 @@ class PowerBiDashboardSourceConfig( "DataHub supported datasources." "You can configured platform instance for dataset lineage. 
" "See Quickstart Recipe for mapping", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) ) # PowerBI datasource's server to platform instance mapping @@ -531,7 +531,7 @@ class PowerBiDashboardSourceConfig( metadata_api_timeout: int = pydantic.Field( default=30, description="timeout in seconds for Metadata Rest Api.", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) @root_validator(skip_on_failure=True) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py index b30f7317ca2371..f7260a194f8360 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py @@ -27,10 +27,8 @@ class CatalogItem(BaseModel): is_favorite: bool = Field(alias="IsFavorite") user_info: Any = Field(None, alias="UserInfo") display_name: Optional[str] = Field(None, alias="DisplayName") - has_data_sources: bool = Field(default=False, alias="HasDataSources") - data_sources: Optional[List["DataSource"]] = Field( - default_factory=list, alias="DataSources" - ) + has_data_sources: bool = Field(False, alias="HasDataSources") + data_sources: Optional[List["DataSource"]] = Field(None, alias="DataSources") @validator("display_name", always=True) def validate_diplay_name(cls, value, values): diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py index 80b834edb3940d..dcb816bbd8b7e8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py @@ -97,7 +97,7 @@ class RedshiftConfig( scheme: str = Field( default="redshift+redshift_connector", description="", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) _database_alias_removed = pydantic_removed_field("database_alias") diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/datashares.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/datashares.py index a4e7d509fda5e4..652f3b8a21bd02 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/datashares.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/datashares.py @@ -26,7 +26,7 @@ class OutboundSharePlatformResource(BaseModel): namespace: str - platform_instance: Optional[str] + platform_instance: Optional[str] = None env: str source_database: str share_name: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/slack/slack.py b/metadata-ingestion/src/datahub/ingestion/source/slack/slack.py index c0798aec350850..1cd65df7bd09c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/slack/slack.py +++ b/metadata-ingestion/src/datahub/ingestion/source/slack/slack.py @@ -203,38 +203,31 @@ class SlackSourceConfig( description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email`, `users.profile:read`, and `team:read` scopes.", ) enrich_user_metadata: bool = Field( - type=bool, - default=True, + True, description="When enabled, will enrich provisioned DataHub users' metadata with information from Slack.", ) ingest_users: bool = Field( - type=bool, - default=True, + True, description="Whether to ingest users. 
When set to true, will ingest all users in the Slack workspace (as platform resources) to simplify user enrichment after they are provisioned on DataHub.", ) api_requests_per_min: int = Field( - type=int, - default=10, + 10, description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.", ) ingest_public_channels: bool = Field( - type=bool, - default=False, + False, description="Whether to ingest public channels. If set to true needs `channels:read` scope.", ) channels_iteration_limit: int = Field( - type=int, - default=200, + 200, description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.", ) channel_min_members: int = Field( - type=int, - default=2, + 2, description="Ingest channels with at least this many members.", ) should_ingest_archived_channels: bool = Field( - type=bool, - default=False, + False, description="Whether to ingest archived channels.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 195f8e3eefd3fc..927dfd7af4abf3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -262,7 +262,7 @@ class SnowflakeV2Config( ) structured_properties_template_cache_invalidation_interval: int = Field( - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, default=60, description="Interval in seconds to invalidate the structured properties template cache.", ) @@ -332,7 +332,7 @@ class SnowflakeV2Config( # Allows empty containers to be ingested before datasets are added, avoiding permission errors warn_no_datasets: bool = Field( - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, default=False, description="If True, warns when no datasets are found during ingestion. 
If False, ingestion fails when no datasets are found.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 63077f7f1a35ca..af29d9e01f7301 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -72,7 +72,7 @@ class ColumnUpstreamJob(BaseModel): class ColumnUpstreamLineage(BaseModel): - column_name: Optional[str] + column_name: Optional[str] = None upstreams: List[ColumnUpstreamJob] = Field(default_factory=list) @@ -91,9 +91,9 @@ class Query(BaseModel): class UpstreamLineageEdge(BaseModel): DOWNSTREAM_TABLE_NAME: str DOWNSTREAM_TABLE_DOMAIN: str - UPSTREAM_TABLES: Optional[List[UpstreamTableNode]] - UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]] - QUERIES: Optional[List[Query]] + UPSTREAM_TABLES: Optional[List[UpstreamTableNode]] = None + UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]] = None + QUERIES: Optional[List[Query]] = None _json_upstream_tables = pydantic_parse_json("UPSTREAM_TABLES") _json_upstream_columns = pydantic_parse_json("UPSTREAM_COLUMNS") diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 3075b48dc393d9..bec3f83c1dc2e7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -109,7 +109,7 @@ class SnowflakeQueriesExtractorConfig(ConfigModel): description="Local path to store the audit log.", # TODO: For now, this is simply an advanced config to make local testing easier. # Eventually, we will want to store date-specific files in the directory and use it as a cache. 
- hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) include_lineage: bool = True diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index 2e1c7d0ed6ee56..17a1b441ff1c46 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -127,7 +127,11 @@ class ClickHouseConfig( ): # defaults host_port: str = Field(default="localhost:8123", description="ClickHouse host URL.") - scheme: str = Field(default="clickhouse", description="", hidden_from_docs=True) + scheme: str = Field( + default="clickhouse", + description="", + json_schema_extra={"hidden_from_docs": True}, + ) password: pydantic.SecretStr = Field( default=pydantic.SecretStr(""), description="password" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 562e9192e0a28a..895c5a30073c21 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -651,10 +651,12 @@ def get_view_definition_patched(self, connection, view_name, schema=None, **kw): class HiveConfig(TwoTierSQLAlchemyConfig): # defaults - scheme: str = Field(default="hive", hidden_from_docs=True) + scheme: str = Field(default="hive", json_schema_extra={"hidden_from_docs": True}) # Overriding as table location lineage is richer implementation here than with include_table_location_lineage - include_table_location_lineage: bool = Field(default=False, hidden_from_docs=True) + include_table_location_lineage: bool = Field( + default=False, json_schema_extra={"hidden_from_docs": True} + ) emit_storage_lineage: bool = Field( default=False, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive_metastore.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive_metastore.py index 6d5977fbb07594..fab011ed48a789 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive_metastore.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive_metastore.py @@ -99,7 +99,11 @@ class HiveMetastore(BasicSQLAlchemyConfig): default="localhost:3306", description="Host URL and port to connect to. 
Example: localhost:3306", ) - scheme: str = Field(default="mysql+pymysql", description="", hidden_from_docs=True) + scheme: str = Field( + default="mysql+pymysql", + description="", + json_schema_extra={"hidden_from_docs": True}, + ) database_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), @@ -124,7 +128,7 @@ class HiveMetastore(BasicSQLAlchemyConfig): ) include_view_lineage: bool = Field( - default=False, description="", hidden_from_docs=True + default=False, description="", json_schema_extra={"hidden_from_docs": True} ) include_catalog_name_in_ids: bool = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index d0c9d724056a69..bf8ba5332aedc0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -75,7 +75,11 @@ class SQLServerConfig(BasicSQLAlchemyConfig): # defaults host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") - scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True) + scheme: str = Field( + default="mssql+pytds", + description="", + json_schema_extra={"hidden_from_docs": True}, + ) # TODO: rename to include_procedures ? include_stored_procedures: bool = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py b/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py index eb5292fda125d2..4adff61c10660d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py @@ -87,7 +87,9 @@ def _get_full_table( # type: ignore class PrestoConfig(TrinoConfig): # defaults - scheme: str = Field(default="presto", description="", hidden_from_docs=True) + scheme: str = Field( + default="presto", description="", json_schema_extra={"hidden_from_docs": True} + ) @platform_name("Presto", doc_order=1) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py index d7ea886a86b388..01ba24c3ab536c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py @@ -221,7 +221,9 @@ class ConnectorDetail(PlatformInstanceConfigMixin, EnvConfigMixin): class TrinoConfig(BasicSQLAlchemyConfig): # defaults - scheme: str = Field(default="trino", description="", hidden_from_docs=True) + scheme: str = Field( + default="trino", description="", json_schema_extra={"hidden_from_docs": True} + ) database: str = Field(description="database (catalog)") catalog_to_connector_details: Dict[str, ConnectorDetail] = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py index fa29e33dc421ab..6270145486477f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py @@ -31,7 +31,7 @@ class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig): # The superclass contains a `schema_pattern` field, so we need this here # to override the documentation. 
default=AllowDenyPattern.allow_all(), - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, description="Deprecated in favour of database_pattern.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py index f3e8e774e4388f..66ae5f78c15448 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py @@ -56,7 +56,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin): use_schema_resolver: bool = Field( description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. Turn off only for testing.", default=True, - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) default_db: Optional[str] = Field( description="The default database to use for unqualified table names", diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py index 994854836ae379..c261dea7236710 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py @@ -11,7 +11,6 @@ ConfigurationError, DynamicTypedConfig, ) -from datahub.configuration.pydantic_migration_helpers import GenericModel from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.api.common import PipelineContext @@ -58,22 +57,22 @@ class StatefulIngestionConfig(ConfigModel): max_checkpoint_state_size: pydantic.PositiveInt = Field( default=2**24, # 16 MB description="The maximum size of the checkpoint state in bytes. Default is 16MB", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) state_provider: Optional[DynamicTypedStateProviderConfig] = Field( default=None, description="The ingestion state provider configuration.", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) ignore_old_state: bool = Field( default=False, description="If set to True, ignores the previous checkpoint state.", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) ignore_new_state: bool = Field( default=False, description="If set to True, ignores the current checkpoint state.", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) @pydantic.root_validator(skip_on_failure=True) @@ -89,7 +88,7 @@ def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: CustomConfig = TypeVar("CustomConfig", bound=StatefulIngestionConfig) -class StatefulIngestionConfigBase(GenericModel, Generic[CustomConfig]): +class StatefulIngestionConfigBase(ConfigModel, Generic[CustomConfig]): """ Base configuration class for stateful ingestion for source configs to inherit from. 
""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 232cf0613beaae..0856635b60daa4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -659,7 +659,7 @@ def projects_backward_compatibility(cls, values: Dict) -> Dict: return values - @root_validator() + @root_validator(skip_on_failure=True) def validate_config_values(cls, values: Dict) -> Dict: tags_for_hidden_assets = values.get("tags_for_hidden_assets") ingest_tags = values.get("ingest_tags") diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index 4e797436262cfd..5fcc6f8cb1ddd4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -251,7 +251,7 @@ class UnityCatalogSourceConfig( lineage_max_workers: int = pydantic.Field( default=5 * (os.cpu_count() or 4), description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.", - hidden_from_docs=True, + json_schema_extra={"hidden_from_docs": True}, ) include_usage_statistics: bool = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/tag_entities.py b/metadata-ingestion/src/datahub/ingestion/source/unity/tag_entities.py index 992ba5a6d8110a..c01a01d5d997a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/tag_entities.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/tag_entities.py @@ -36,7 +36,7 @@ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId): tag_key: str tag_value: Optional[str] = None - platform_instance: Optional[str] + platform_instance: Optional[str] = None exists_in_unity_catalog: bool = False persisted: bool = False @@ -218,7 +218,7 @@ class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity): datahub_urns: LinkedResourceSet managed_by_datahub: bool id: UnityCatalogTagPlatformResourceId - allowed_values: Optional[List[str]] + allowed_values: Optional[List[str]] = None def get_id(self) -> ExternalEntityId: return self.id diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py index 81a88f84179bfd..53139e1590b926 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py @@ -60,7 +60,7 @@ class TrinoConnectorInfo(BaseModel): partitionIds: List[str] - truncated: Optional[bool] + truncated: Optional[bool] = None class TrinoAccessedMetadata(BaseModel): @@ -80,7 +80,7 @@ class TrinoJoinedAccessEvent(BaseModel): table: Optional[str] = None accessed_metadata: List[TrinoAccessedMetadata] starttime: datetime = Field(alias="create_time") - endtime: Optional[datetime] = Field(alias="end_time") + endtime: Optional[datetime] = Field(None, alias="end_time") class EnvBasedSourceBaseConfig: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py index 02ddc1735c0d11..6293f3a7b0e55e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py @@ -200,7 +200,9 @@ 
class BaseUsageConfig(BaseTimeWindowConfig): "Total character limit for all queries in a single usage aspect." " Queries will be truncated to length `queries_character_limit / top_n_queries`." ), - hidden_from_docs=True, # Don't want to encourage people to break elasticsearch + json_schema_extra={ + "hidden_from_docs": True + }, # Don't want to encourage people to break elasticsearch ) top_n_queries: pydantic.PositiveInt = Field( diff --git a/metadata-ingestion/src/datahub/pydantic/__init__.py b/metadata-ingestion/src/datahub/pydantic/__init__.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/metadata-ingestion/src/datahub/pydantic/compat.py b/metadata-ingestion/src/datahub/pydantic/compat.py deleted file mode 100644 index f6c1208ae85afd..00000000000000 --- a/metadata-ingestion/src/datahub/pydantic/compat.py +++ /dev/null @@ -1,58 +0,0 @@ -import functools -from typing import Any, Callable, Optional, TypeVar, cast - -# Define a type variable for the decorator -F = TypeVar("F", bound=Callable[..., Any]) - - -# Check which Pydantic version is installed -def get_pydantic_version() -> int: - """Determine if Pydantic v1 or v2 is installed.""" - try: - import pydantic - - version = pydantic.__version__ - return 1 if version.startswith("1.") else 2 - except (ImportError, AttributeError): - # Default to v1 if we can't determine version - return 1 - - -PYDANTIC_VERSION = get_pydantic_version() - - -# Create compatibility layer for dict-like methods -def compat_dict_method(v1_method: Optional[Callable] = None) -> Callable: - """ - Decorator to make a dict method work with both Pydantic v1 and v2. - - In v1: Uses the decorated method (typically dict) - In v2: Redirects to model_dump with appropriate parameter mapping - """ - - def decorator(func: F) -> F: - @functools.wraps(func) - def wrapper(self, *args, **kwargs): - if PYDANTIC_VERSION >= 2: - # Map v1 parameters to v2 parameters - # exclude -> exclude - # exclude_unset -> exclude_unset - # exclude_defaults -> exclude_defaults - # exclude_none -> exclude_none - # by_alias -> by_alias - model_dump_kwargs = kwargs.copy() - - # Handle the 'exclude' parameter differently between versions - exclude = kwargs.get("exclude", set()) - if isinstance(exclude, (set, dict)): - model_dump_kwargs["exclude"] = exclude - - return self.model_dump(**model_dump_kwargs) - return func(self, *args, **kwargs) - - return cast(F, wrapper) - - # Allow use as both @compat_dict_method and @compat_dict_method() - if v1_method is None: - return decorator - return decorator(v1_method) diff --git a/metadata-ingestion/src/datahub/sdk/search_filters.py b/metadata-ingestion/src/datahub/sdk/search_filters.py index d395e7b0090142..9a67c2962b115a 100644 --- a/metadata-ingestion/src/datahub/sdk/search_filters.py +++ b/metadata-ingestion/src/datahub/sdk/search_filters.py @@ -15,7 +15,6 @@ import pydantic from datahub.configuration.common import ConfigModel -from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.ingestion.graph.client import flexible_entity_type_to_graphql from datahub.ingestion.graph.filters import ( FilterOperator, @@ -34,12 +33,7 @@ class _BaseFilter(ConfigModel): class Config: - # We can't wrap this in a TYPE_CHECKING block because the pydantic plugin - # doesn't recognize it properly. So unfortunately we'll need to live - # with the deprecation warning w/ pydantic v2. 
- allow_population_by_field_name = True - if PYDANTIC_VERSION_2: - populate_by_name = True + populate_by_name = True @abc.abstractmethod def compile(self) -> _OrFilters: @@ -319,21 +313,13 @@ def dfs(self) -> Iterator[_BaseFilter]: # Required to resolve forward references to "Filter" -if PYDANTIC_VERSION_2: - _And.model_rebuild() # type: ignore - _Or.model_rebuild() # type: ignore - _Not.model_rebuild() # type: ignore -else: - _And.update_forward_refs() - _Or.update_forward_refs() - _Not.update_forward_refs() +_And.model_rebuild() +_Or.model_rebuild() +_Not.model_rebuild() def load_filters(obj: Any) -> Filter: - if PYDANTIC_VERSION_2: - return pydantic.TypeAdapter(Filter).validate_python(obj) # type: ignore - else: - return pydantic.parse_obj_as(Filter, obj) # type: ignore + return pydantic.TypeAdapter(Filter).validate_python(obj) # We need FilterDsl for two reasons: diff --git a/metadata-ingestion/src/datahub/sql_parsing/_models.py b/metadata-ingestion/src/datahub/sql_parsing/_models.py index d586e7d6d9045b..97da2880476bdb 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/_models.py +++ b/metadata-ingestion/src/datahub/sql_parsing/_models.py @@ -4,7 +4,6 @@ import sqlglot from pydantic import BaseModel -from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.metadata.schema_classes import SchemaFieldDataTypeClass @@ -16,10 +15,8 @@ class _ParserBaseModel( }, ): def json(self, *args: Any, **kwargs: Any) -> str: - if PYDANTIC_VERSION_2: - return super().model_dump_json(*args, **kwargs) # type: ignore - else: - return super().json(*args, **kwargs) + # TODO: Remove this method and migrate to model_dump_json. + return super().model_dump_json(*args, **kwargs) @functools.total_ordering diff --git a/metadata-ingestion/src/datahub/utilities/lossy_collections.py b/metadata-ingestion/src/datahub/utilities/lossy_collections.py index 31d6d0eb842d04..66051d23535447 100644 --- a/metadata-ingestion/src/datahub/utilities/lossy_collections.py +++ b/metadata-ingestion/src/datahub/utilities/lossy_collections.py @@ -1,7 +1,8 @@ import random -from typing import Dict, Generic, Iterable, Iterator, List, Set, TypeVar, Union +from typing import Any, Dict, Generic, Iterable, Iterator, List, Set, TypeVar, Union -from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 +from pydantic import GetCoreSchemaHandler +from pydantic_core import CoreSchema, core_schema T = TypeVar("T") _KT = TypeVar("_KT") @@ -47,15 +48,11 @@ def __repr__(self) -> str: def __str__(self) -> str: return repr(self) - if PYDANTIC_VERSION_2: - # With pydantic 2, it doesn't recognize that this is a list subclass, - # so we need to make it explicit. - - @classmethod - def __get_pydantic_core_schema__(cls, source_type, handler): # type: ignore - from pydantic_core import core_schema - - return core_schema.no_info_after_validator_function(cls, handler(list)) + @classmethod + def __get_pydantic_core_schema__( + cls, source_type: Any, handler: GetCoreSchemaHandler + ) -> CoreSchema: + return core_schema.no_info_after_validator_function(cls, handler(list)) def as_obj(self) -> List[Union[T, str]]: from datahub.ingestion.api.report import Report
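
Note on the recurring `hidden_from_docs` change above: pydantic v2 deprecates arbitrary extra keyword arguments on `Field(...)`, so per-field metadata now goes through `json_schema_extra`, from where it is merged into the generated JSON schema and can be read back by downstream tooling. A minimal sketch of the pattern, using a hypothetical `ExampleSourceConfig` model that is not part of this PR:

from pydantic import BaseModel, Field

class ExampleSourceConfig(BaseModel):
    # Hypothetical stand-in for the config classes touched above.
    scheme: str = Field(
        default="example",
        description="",
        json_schema_extra={"hidden_from_docs": True},
    )

# The extra keys are merged into the field's entry in the generated schema.
schema = ExampleSourceConfig.model_json_schema()
assert schema["properties"]["scheme"]["hidden_from_docs"] is True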
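
The `= None` additions above (`column_name`, `UPSTREAM_TABLES`, `allowed_values`, `truncated`, and so on) exist because pydantic v2 no longer treats a bare `Optional[...]` annotation as having an implicit default: such a field is required unless a default is given. A small illustration with hypothetical models:

from typing import List, Optional

from pydantic import BaseModel, ValidationError

class EdgeWithDefault(BaseModel):
    # Optional *and* defaulted: may be omitted, as in the models above.
    upstream_tables: Optional[List[str]] = None

class EdgeWithoutDefault(BaseModel):
    # Optional but required in v2: a value (possibly None) must be passed.
    upstream_tables: Optional[List[str]]

EdgeWithDefault()  # fine
try:
    EdgeWithoutDefault()
except ValidationError as exc:
    print(exc)  # upstream_tables: Field required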
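
The `@root_validator()` → `@root_validator(skip_on_failure=True)` change in tableau.py reflects a v2 rule: the deprecated v1-style `root_validator` with `pre=False` must set `skip_on_failure=True`, otherwise the class definition raises a `PydanticUserError`. A hedged sketch with a made-up model and check, loosely modeled on the real validator:

from typing import List

from pydantic import BaseModel, root_validator

class ExampleTableauConfig(BaseModel):
    # Hypothetical model; the real validator lives in the Tableau source config.
    ingest_tags: bool = False
    tags_for_hidden_assets: List[str] = []

    @root_validator(skip_on_failure=True)
    def validate_config_values(cls, values):
        # With skip_on_failure=True the validator is skipped when field-level
        # validation has already failed, which is what v2 requires for a
        # v1-style root_validator with pre=False.
        if values.get("tags_for_hidden_assets") and not values.get("ingest_tags"):
            raise ValueError("tags_for_hidden_assets requires ingest_tags")
        return values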
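
The `__get_pydantic_core_schema__` hook kept in lossy_collections.py is needed because pydantic v2 does not automatically validate arbitrary `list` subclasses. A minimal, self-contained sketch of the same idea, using a hypothetical `TruncatingList` instead of `LossyList`:

from typing import Any, List, TypeVar

from pydantic import BaseModel, GetCoreSchemaHandler
from pydantic_core import CoreSchema, core_schema

T = TypeVar("T")

class TruncatingList(List[T]):
    # Hypothetical list subclass that pydantic v2 would otherwise reject.

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source_type: Any, handler: GetCoreSchemaHandler
    ) -> CoreSchema:
        # Validate the input as a plain list, then re-wrap it in this subclass.
        return core_schema.no_info_after_validator_function(cls, handler(list))

class ExampleReport(BaseModel):
    samples: TruncatingList[int] = TruncatingList()

report = ExampleReport(samples=[1, 2, 3])
assert isinstance(report.samples, TruncatingList)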
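
Finally, the search_filters.py hunk swaps v1's `pydantic.parse_obj_as` for v2's `TypeAdapter` and `update_forward_refs()` for `model_rebuild()`. A rough standalone example; the union and field names below are invented and do not match the real filter DSL:

from typing import List, Union

from pydantic import BaseModel, TypeAdapter

class ExactFilter(BaseModel):
    field: str
    value: str

class AndFilter(BaseModel):
    and_: List["Filter"]

Filter = Union[AndFilter, ExactFilter]

# Forward references to "Filter" are resolved explicitly in v2.
AndFilter.model_rebuild()

# TypeAdapter validates non-BaseModel types such as unions.
parsed = TypeAdapter(Filter).validate_python({"field": "platform", "value": "snowflake"})
assert isinstance(parsed, ExactFilter)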