
drop pydantic v1 support #14014

Draft · wants to merge 11 commits into base: master
3 changes: 1 addition & 2 deletions metadata-ingestion/scripts/docs_config_table.py
@@ -364,8 +364,7 @@ def should_hide_field(schema_field, current_source: str, schema_dict: Dict[str,
properties = def_schema.get("properties", {})
if field_name in properties:
field_schema = properties[field_name]
schema_extra = field_schema.get("schema_extra", {})
supported_sources = schema_extra.get("supported_sources")
supported_sources = field_schema.get("supported_sources", {})

if supported_sources and current_source:
return current_source.lower() not in [s.lower() for s in supported_sources]
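In pydantic v2, keys passed to a field via `json_schema_extra` are merged directly into that field's JSON schema, so custom metadata such as `supported_sources` no longer sits under a nested `schema_extra` key; the docs generator can read it straight off the field schema. A minimal sketch of that behavior (the `ExampleConfig` model and its field are hypothetical):

```python
from pydantic import BaseModel, Field


class ExampleConfig(BaseModel):
    # Hypothetical field; "supported_sources" is an illustrative custom key.
    profiling_enabled: bool = Field(
        default=False,
        json_schema_extra={"supported_sources": ["snowflake", "bigquery"]},
    )


schema = ExampleConfig.model_json_schema()
field_schema = schema["properties"]["profiling_enabled"]
# v2 flattens json_schema_extra into the field schema itself:
assert field_schema["supported_sources"] == ["snowflake", "bigquery"]
```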
3 changes: 2 additions & 1 deletion metadata-ingestion/setup.cfg
@@ -1,7 +1,8 @@
[mypy]
plugins =
./tests/test_helpers/sqlalchemy_mypy_plugin.py,
pydantic.mypy
pydantic.mypy,
pydantic.v1.mypy
exclude = ^(venv/|build/|dist/|examples/transforms/setup.py)
ignore_missing_imports = yes
namespace_packages = no
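Registering both plugins keeps mypy coverage intact during the migration: pydantic 2.x bundles the entire v1 API under the `pydantic.v1` namespace, and `pydantic.v1.mypy` type-checks any modules that still import it. A hedged sketch of what that legacy path looks like (the `LegacyConfig` model is illustrative):

```python
# pydantic 2.x ships the v1 API under pydantic.v1, so not-yet-migrated modules
# keep working (and keep type-checking via the pydantic.v1.mypy plugin).
from pydantic.v1 import BaseModel as BaseModelV1


class LegacyConfig(BaseModelV1):
    name: str

    class Config:  # v1-style configuration, still valid under pydantic.v1
        extra = "forbid"


legacy = LegacyConfig(name="orders")
```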
44 changes: 4 additions & 40 deletions metadata-ingestion/setup.py
@@ -18,10 +18,7 @@
"typing_extensions>=4.5.0",
# Actual dependencies.
"typing-inspect",
# pydantic 1.8.2 is incompatible with mypy 0.910.
# See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
# pydantic 1.10.3 is incompatible with typing-extensions 4.1.1 - https://github.com/pydantic/pydantic/issues/4885
"pydantic>=1.10.0,!=1.10.3",
"pydantic>=2.4.0",
"mixpanel>=4.9.0",
# Airflow depends on fairly old versions of sentry-sdk, so we want to be loose with our constraints.
"sentry-sdk",
@@ -58,12 +55,6 @@
"ruamel.yaml",
}

pydantic_no_v2 = {
# pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887
# Tags sources that require the pydantic v2 API.
"pydantic<2",
}

rest_common = {"requests", "requests_file"}

kafka_common = {
@@ -119,7 +110,7 @@

dbt_common = {
*sqlglot_lib,
"more_itertools",
"more-itertools",
}

cachetools_lib = {
@@ -507,10 +498,7 @@
# It's technically wrong for packages to depend on setuptools. However, it seems mlflow does it anyways.
"setuptools",
},
"datahub-debug": {
"dnspython==2.7.0",
"requests"
},
"datahub-debug": {"dnspython==2.7.0", "requests"},
"mode": {"requests", "python-liquid", "tenacity>=8.0.1"} | sqlglot_lib,
"mongodb": {"pymongo[srv]>=3.11", "packaging"},
"mssql": sql_common | mssql_common,
@@ -729,7 +717,6 @@
if plugin
for dependency in plugins[plugin]
),
*pydantic_no_v2,
}

dev_requirements = {
@@ -964,30 +951,7 @@
extras_require={
"base": list(framework_common),
**{
plugin: list(
framework_common
| (
# While pydantic v2 support is experimental, require that all plugins
# continue to use v1. This will ensure that no ingestion recipes break.
pydantic_no_v2
if plugin
not in {
"airflow",
"datahub-rest",
"datahub-kafka",
"sync-file-emitter",
"sql-parser",
# Some sources have been manually tested for compatibility with pydantic v2.
"iceberg",
"feast",
"bigquery-slim",
"snowflake-slim",
"mysql", # tested in smoke-test
}
else set()
)
| dependencies
)
plugin: list(framework_common | dependencies)
for (plugin, dependencies) in plugins.items()
},
"all": list(
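The net effect on `extras_require` is that every plugin now receives the same union of shared and plugin-specific dependencies, with no per-plugin pydantic pin. A simplified sketch of that wiring (the sets below are small stand-ins for the real ones defined in setup.py):

```python
# Illustrative subsets of the sets defined above.
framework_common = {"pydantic>=2.4.0", "typing-inspect", "mixpanel>=4.9.0"}
plugins = {
    "mongodb": {"pymongo[srv]>=3.11", "packaging"},
    "mode": {"requests", "python-liquid", "tenacity>=8.0.1"},
}

# With the pydantic_no_v2 gate gone, the comprehension collapses to one
# expression per plugin: shared deps union plugin deps.
extras_require = {
    plugin: sorted(framework_common | dependencies)
    for plugin, dependencies in plugins.items()
}
```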
208 changes: 58 additions & 150 deletions metadata-ingestion/src/datahub/api/entities/dataset/dataset.py
@@ -17,6 +17,7 @@
import yaml
from pydantic import (
BaseModel,
ConfigDict,
Field,
StrictStr,
root_validator,
@@ -66,33 +67,17 @@
StructuredPropertyUrn,
TagUrn,
)
from datahub.pydantic.compat import (
PYDANTIC_VERSION,
)
from datahub.specific.dataset import DatasetPatchBuilder
from datahub.utilities.urns.dataset_urn import DatasetUrn

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class StrictModel(BaseModel):
"""
Base model with strict validation.
Compatible with both Pydantic v1 and v2.
"""

if PYDANTIC_VERSION >= 2:
# Pydantic v2 config
model_config = {
"validate_assignment": True,
"extra": "forbid",
}
else:
# Pydantic v1 config
class Config:
validate_assignment = True
extra = "forbid"
model_config = ConfigDict(
validate_assignment=True,
extra="forbid",
)
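The v1 inner `class Config` becomes a single `model_config = ConfigDict(...)` assignment in v2. A small sketch of what the two settings enforce (the `Strict` model is hypothetical):

```python
from pydantic import BaseModel, ConfigDict, ValidationError


class Strict(BaseModel):
    model_config = ConfigDict(validate_assignment=True, extra="forbid")
    name: str


m = Strict(name="orders")
try:
    m.name = 123  # validate_assignment re-runs validation on assignment
except ValidationError:
    print("assignment rejected")

try:
    Strict(name="orders", owner="x")  # extra="forbid" rejects unknown fields
except ValidationError:
    print("extra field rejected")
```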


# Define type aliases for the complex types
@@ -134,7 +119,7 @@ def urn_strip(urn: str) -> str:

class SchemaFieldSpecification(StrictModel):
id: Optional[str] = None
urn: Optional[str] = None
urn: Optional[str] = Field(default=None, validate_default=True)
structured_properties: Optional[StructuredProperties] = None
type: Optional[str] = None
nativeDataType: Optional[str] = None
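`validate_default=True` on `urn` replaces v1's `@validator(..., always=True)`: in v2, validators skip default values unless the field opts in. A hedged sketch with a hypothetical model (the urn format here is illustrative):

```python
from typing import Optional

from pydantic import BaseModel, Field, field_validator


class Entity(BaseModel):
    id: str
    urn: Optional[str] = Field(default=None, validate_default=True)

    @field_validator("urn", mode="before")
    @classmethod
    def derive_urn(cls, v, info):
        # Runs even when urn stays at its default of None.
        return v or f"urn:li:example:{info.data['id']}"


assert Entity(id="x").urn == "urn:li:example:x"
```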
@@ -288,66 +273,33 @@ def _from_datahub_type(
return "record"
raise ValueError(f"Type {input_type} is not a valid primitive type")

if PYDANTIC_VERSION < 2:

def dict(self, **kwargs):
"""Custom dict method for Pydantic v1 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

# If description and doc are identical, exclude doc from the output
if self.description == self.doc and self.description is not None:
exclude.add("doc")

# if nativeDataType and type are identical, exclude nativeDataType from the output
if self.nativeDataType == self.type and self.nativeDataType is not None:
exclude.add("nativeDataType")
def model_dump(self, **kwargs):
"""Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

# if the id is the same as the urn's fieldPath, exclude id from the output
# If description and doc are identical, exclude doc from the output
if self.description == self.doc and self.description is not None:
exclude.add("doc")

if self.urn:
field_urn = SchemaFieldUrn.from_string(self.urn)
if Dataset._simplify_field_path(field_urn.field_path) == self.id:
exclude.add("urn")
# if nativeDataType and type are identical, exclude nativeDataType from the output
if self.nativeDataType == self.type and self.nativeDataType is not None:
exclude.add("nativeDataType")

kwargs.pop("exclude_defaults", None)
# if the id is the same as the urn's fieldPath, exclude id from the output
if self.urn:
field_urn = SchemaFieldUrn.from_string(self.urn)
if Dataset._simplify_field_path(field_urn.field_path) == self.id:
exclude.add("urn")

self.structured_properties = (
StructuredPropertiesHelper.simplify_structured_properties_list(
self.structured_properties
)
self.structured_properties = (
StructuredPropertiesHelper.simplify_structured_properties_list(
self.structured_properties
)

return super().dict(exclude=exclude, exclude_defaults=True, **kwargs)

else:
# For v2, implement model_dump with similar logic as dict
def model_dump(self, **kwargs):
"""Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

# If description and doc are identical, exclude doc from the output
if self.description == self.doc and self.description is not None:
exclude.add("doc")

# if nativeDataType and type are identical, exclude nativeDataType from the output
if self.nativeDataType == self.type and self.nativeDataType is not None:
exclude.add("nativeDataType")

# if the id is the same as the urn's fieldPath, exclude id from the output
if self.urn:
field_urn = SchemaFieldUrn.from_string(self.urn)
if Dataset._simplify_field_path(field_urn.field_path) == self.id:
exclude.add("urn")

self.structured_properties = (
StructuredPropertiesHelper.simplify_structured_properties_list(
self.structured_properties
)
)
if hasattr(super(), "model_dump"):
return super().model_dump( # type: ignore
exclude=exclude, exclude_defaults=True, **kwargs
)
if hasattr(super(), "model_dump"):
return super().model_dump( # type: ignore
exclude=exclude, exclude_defaults=True, **kwargs
)
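The rewrite above is largely mechanical: v1's `dict()`/`json()` become v2's `model_dump()`/`model_dump_json()`, with the same `exclude` and `exclude_defaults` keywords. A quick sketch of the semantics these overrides rely on:

```python
from pydantic import BaseModel


class Point(BaseModel):
    x: int = 0
    y: int = 0


p = Point(y=2)
assert p.model_dump(exclude_defaults=True) == {"y": 2}  # drops default-valued fields
assert p.model_dump(exclude={"x"}) == {"y": 2}          # explicit exclusion
```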


class SchemaSpecification(BaseModel):
@@ -380,9 +332,9 @@ class Dataset(StrictModel):
id: Optional[str] = None
platform: Optional[str] = None
env: str = "PROD"
urn: Optional[str] = None
urn: Optional[str] = Field(default=None, validate_default=True)
description: Optional[str] = None
name: Optional[str] = None
name: Optional[str] = Field(default=None, validate_default=True)
schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema")
downstreams: Optional[List[str]] = None
properties: Optional[Dict[str, str]] = None
@@ -940,80 +892,42 @@ def from_datahub(
downstreams=downstreams if config.include_downstreams else None,
)

if PYDANTIC_VERSION < 2:
def model_dump(self, **kwargs):
"""Custom model_dump method to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

def dict(self, **kwargs):
"""Custom dict method for Pydantic v1 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", set())
# If id and name are identical, exclude name from the output
if self.id == self.name and self.id is not None:
exclude.add("name")

# If id and name are identical, exclude name from the output
if self.id == self.name and self.id is not None:
exclude.add("name")

# if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
if self.subtypes and len(self.subtypes) == 1:
self.subtype = self.subtypes[0]
exclude.add("subtypes")
# if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
if self.subtypes and len(self.subtypes) == 1:
self.subtype = self.subtypes[0]
exclude.add("subtypes")

if hasattr(super(), "model_dump"):
result = super().model_dump(exclude=exclude, **kwargs) # type: ignore
else:
result = super().dict(exclude=exclude, **kwargs)

# Custom handling for schema_metadata/schema
if self.schema_metadata and "schema" in result:
schema_data = result["schema"]

# Handle fields if they exist
if "fields" in schema_data and isinstance(schema_data["fields"], list):
# Process each field using its custom dict method
processed_fields = []
if self.schema_metadata and self.schema_metadata.fields:
for field in self.schema_metadata.fields:
if field:
# Use dict method for Pydantic v1
processed_field = field.dict(**kwargs)
processed_fields.append(processed_field)

# Replace the fields in the result with the processed ones
schema_data["fields"] = processed_fields

return result
else:

def model_dump(self, **kwargs):
"""Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

# If id and name are identical, exclude name from the output
if self.id == self.name and self.id is not None:
exclude.add("name")

# if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
if self.subtypes and len(self.subtypes) == 1:
self.subtype = self.subtypes[0]
exclude.add("subtypes")

if hasattr(super(), "model_dump"):
result = super().model_dump(exclude=exclude, **kwargs) # type: ignore
else:
result = super().dict(exclude=exclude, **kwargs)

# Custom handling for schema_metadata/schema
if self.schema_metadata and "schema" in result:
schema_data = result["schema"]
# Custom handling for schema_metadata/schema
if self.schema_metadata and "schema" in result:
schema_data = result["schema"]

# Handle fields if they exist
if "fields" in schema_data and isinstance(schema_data["fields"], list):
# Process each field using its custom model_dump method
processed_fields = []
if self.schema_metadata and self.schema_metadata.fields:
for field in self.schema_metadata.fields:
if field:
processed_field = field.model_dump(**kwargs)
processed_fields.append(processed_field)
# Handle fields if they exist
if "fields" in schema_data and isinstance(schema_data["fields"], list):
# Process each field using its custom model_dump method
processed_fields = []
if self.schema_metadata and self.schema_metadata.fields:
for field in self.schema_metadata.fields:
if field:
processed_field = field.model_dump(**kwargs)
processed_fields.append(processed_field)

# Replace the fields in the result with the processed ones
schema_data["fields"] = processed_fields
# Replace the fields in the result with the processed ones
schema_data["fields"] = processed_fields

return result
return result
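One subtlety motivates the manual field loop above: pydantic v2 serializes nested models in its Rust core, so a parent's `model_dump()` does not invoke a child's overridden `model_dump()`; nested output has to be re-processed by hand. A minimal sketch of the pattern, using hypothetical `FieldSpec`/`SchemaSpec` models:

```python
from typing import List, Optional

from pydantic import BaseModel


class FieldSpec(BaseModel):
    id: str
    doc: Optional[str] = None

    def model_dump(self, **kwargs):
        exclude = kwargs.pop("exclude", None) or set()
        if self.doc is None:
            exclude.add("doc")  # field-level exclusion logic
        return super().model_dump(exclude=exclude, **kwargs)


class SchemaSpec(BaseModel):
    fields: List[FieldSpec]

    def model_dump(self, **kwargs):
        result = super().model_dump(**kwargs)
        # super() serialized the children generically; re-run each child's
        # custom model_dump() so its exclusions actually apply.
        result["fields"] = [f.model_dump(**kwargs) for f in self.fields]
        return result


assert SchemaSpec(fields=[FieldSpec(id="a")]).model_dump() == {"fields": [{"id": "a"}]}
```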

def to_yaml(
self,
@@ -1025,13 +939,7 @@ def to_yaml(
Returns True if file was written, False if no changes were detected.
"""
# Create new model data
# Create new model data - choose dict() or model_dump() based on Pydantic version
if PYDANTIC_VERSION >= 2:
new_data = self.model_dump(
exclude_none=True, exclude_unset=True, by_alias=True
)
else:
new_data = self.dict(exclude_none=True, exclude_unset=True, by_alias=True)
new_data = self.model_dump(exclude_none=True, exclude_unset=True, by_alias=True)

# Set up ruamel.yaml for preserving comments
yaml_handler = YAML(typ="rt") # round-trip mode
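`to_yaml()` leans on ruamel.yaml's round-trip mode so that rewriting a dataset file keeps hand-written comments and key order; only values that `model_dump()` actually changed are touched. A small sketch of that behavior:

```python
import io

from ruamel.yaml import YAML

yaml_handler = YAML(typ="rt")  # round-trip mode preserves comments and ordering
doc = yaml_handler.load("name: orders  # my dataset\n")
doc["name"] = "orders_v2"

buf = io.StringIO()
yaml_handler.dump(doc, buf)
assert "# my dataset" in buf.getvalue()  # the comment survives the edit
```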