
drop pydantic v1 support #14014

Draft · wants to merge 11 commits into base: master
3 changes: 1 addition & 2 deletions metadata-ingestion/scripts/docs_config_table.py
@@ -364,8 +364,7 @@ def should_hide_field(schema_field, current_source: str, schema_dict: Dict[str,
properties = def_schema.get("properties", {})
if field_name in properties:
field_schema = properties[field_name]
schema_extra = field_schema.get("schema_extra", {})
supported_sources = schema_extra.get("supported_sources")
supported_sources = field_schema.get("supported_sources", {})

if supported_sources and current_source:
return current_source.lower() not in [s.lower() for s in supported_sources]
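In pydantic v2, keys passed to a field via `json_schema_extra` are merged directly into that field's JSON schema, so custom metadata such as `supported_sources` no longer sits under a nested `schema_extra` key; the docs generator can read it straight off the field schema. A minimal sketch of that behavior (the `ExampleConfig` model and its field are hypothetical):

```python
from pydantic import BaseModel, Field


class ExampleConfig(BaseModel):
    # Hypothetical field; "supported_sources" is an illustrative custom key.
    profiling_enabled: bool = Field(
        default=False,
        json_schema_extra={"supported_sources": ["snowflake", "bigquery"]},
    )


schema = ExampleConfig.model_json_schema()
field_schema = schema["properties"]["profiling_enabled"]
# v2 flattens json_schema_extra into the field schema itself:
assert field_schema["supported_sources"] == ["snowflake", "bigquery"]
```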
3 changes: 2 additions & 1 deletion metadata-ingestion/setup.cfg
@@ -1,7 +1,8 @@
[mypy]
plugins =
./tests/test_helpers/sqlalchemy_mypy_plugin.py,
pydantic.mypy
pydantic.mypy,
pydantic.v1.mypy
exclude = ^(venv/|build/|dist/|examples/transforms/setup.py)
ignore_missing_imports = yes
namespace_packages = no
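Registering both plugins keeps mypy coverage intact during the migration: pydantic 2.x bundles the entire v1 API under the `pydantic.v1` namespace, and `pydantic.v1.mypy` type-checks any modules that still import it. A hedged sketch of what that legacy path looks like (the `LegacyConfig` model is illustrative):

```python
# pydantic 2.x ships the v1 API under pydantic.v1, so not-yet-migrated modules
# keep working (and keep type-checking via the pydantic.v1.mypy plugin).
from pydantic.v1 import BaseModel as BaseModelV1


class LegacyConfig(BaseModelV1):
    name: str

    class Config:  # v1-style configuration, still valid under pydantic.v1
        extra = "forbid"


legacy = LegacyConfig(name="orders")
```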
44 changes: 4 additions & 40 deletions metadata-ingestion/setup.py
@@ -18,10 +18,7 @@
"typing_extensions>=4.5.0",
# Actual dependencies.
"typing-inspect",
# pydantic 1.8.2 is incompatible with mypy 0.910.
# See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
# pydantic 1.10.3 is incompatible with typing-extensions 4.1.1 - https://github.com/pydantic/pydantic/issues/4885
"pydantic>=1.10.0,!=1.10.3",
"pydantic>=2.4.0",
"mixpanel>=4.9.0",
# Airflow depends on fairly old versions of sentry-sdk, so we want to be loose with our constraints.
"sentry-sdk",
@@ -58,12 +55,6 @@
"ruamel.yaml",
}

pydantic_no_v2 = {
# pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887
# Tags sources that require the pydantic v2 API.
"pydantic<2",
}

rest_common = {"requests", "requests_file"}

kafka_common = {
@@ -119,7 +110,7 @@

dbt_common = {
*sqlglot_lib,
"more_itertools",
"more-itertools",
}

cachetools_lib = {
@@ -507,10 +498,7 @@
# It's technically wrong for packages to depend on setuptools. However, it seems mlflow does it anyways.
"setuptools",
},
"datahub-debug": {
"dnspython==2.7.0",
"requests"
},
"datahub-debug": {"dnspython==2.7.0", "requests"},
"mode": {"requests", "python-liquid", "tenacity>=8.0.1"} | sqlglot_lib,
"mongodb": {"pymongo[srv]>=3.11", "packaging"},
"mssql": sql_common | mssql_common,
@@ -729,7 +717,6 @@
if plugin
for dependency in plugins[plugin]
),
*pydantic_no_v2,
}

dev_requirements = {
@@ -964,30 +951,7 @@
extras_require={
"base": list(framework_common),
**{
plugin: list(
framework_common
| (
# While pydantic v2 support is experimental, require that all plugins
# continue to use v1. This will ensure that no ingestion recipes break.
pydantic_no_v2
if plugin
not in {
"airflow",
"datahub-rest",
"datahub-kafka",
"sync-file-emitter",
"sql-parser",
# Some sources have been manually tested for compatibility with pydantic v2.
"iceberg",
"feast",
"bigquery-slim",
"snowflake-slim",
"mysql", # tested in smoke-test
}
else set()
)
| dependencies
)
plugin: list(framework_common | dependencies)
for (plugin, dependencies) in plugins.items()
},
"all": list(
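The net effect on `extras_require` is that every plugin now receives the same union of shared and plugin-specific dependencies, with no per-plugin pydantic pin. A simplified sketch of that wiring (the sets below are small stand-ins for the real ones defined in setup.py):

```python
# Illustrative subsets of the sets defined above.
framework_common = {"pydantic>=2.4.0", "typing-inspect", "mixpanel>=4.9.0"}
plugins = {
    "mongodb": {"pymongo[srv]>=3.11", "packaging"},
    "mode": {"requests", "python-liquid", "tenacity>=8.0.1"},
}

# With the pydantic_no_v2 gate gone, the comprehension collapses to one
# expression per plugin: shared deps union plugin deps.
extras_require = {
    plugin: sorted(framework_common | dependencies)
    for plugin, dependencies in plugins.items()
}
```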
208 changes: 58 additions & 150 deletions metadata-ingestion/src/datahub/api/entities/dataset/dataset.py
@@ -17,6 +17,7 @@
import yaml
from pydantic import (
BaseModel,
ConfigDict,
Field,
StrictStr,
root_validator,
@@ -66,33 +67,17 @@
StructuredPropertyUrn,
TagUrn,
)
from datahub.pydantic.compat import (
PYDANTIC_VERSION,
)
from datahub.specific.dataset import DatasetPatchBuilder
from datahub.utilities.urns.dataset_urn import DatasetUrn

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class StrictModel(BaseModel):
"""
Base model with strict validation.
Compatible with both Pydantic v1 and v2.
"""

if PYDANTIC_VERSION >= 2:
# Pydantic v2 config
model_config = {
"validate_assignment": True,
"extra": "forbid",
}
else:
# Pydantic v1 config
class Config:
validate_assignment = True
extra = "forbid"
model_config = ConfigDict(
validate_assignment=True,
extra="forbid",
)
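The v1 inner `class Config` becomes a single `model_config = ConfigDict(...)` assignment in v2. A small sketch of what the two settings enforce (the `Strict` model is hypothetical):

```python
from pydantic import BaseModel, ConfigDict, ValidationError


class Strict(BaseModel):
    model_config = ConfigDict(validate_assignment=True, extra="forbid")
    name: str


m = Strict(name="orders")
try:
    m.name = 123  # validate_assignment re-runs validation on assignment
except ValidationError:
    print("assignment rejected")

try:
    Strict(name="orders", owner="x")  # extra="forbid" rejects unknown fields
except ValidationError:
    print("extra field rejected")
```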


# Define type aliases for the complex types
@@ -134,7 +119,7 @@ def urn_strip(urn: str) -> str:

class SchemaFieldSpecification(StrictModel):
id: Optional[str] = None
urn: Optional[str] = None
urn: Optional[str] = Field(default=None, validate_default=True)
structured_properties: Optional[StructuredProperties] = None
type: Optional[str] = None
nativeDataType: Optional[str] = None
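`validate_default=True` on `urn` replaces v1's `@validator(..., always=True)`: in v2, validators skip default values unless the field opts in. A hedged sketch with a hypothetical model (the urn format here is illustrative):

```python
from typing import Optional

from pydantic import BaseModel, Field, field_validator


class Entity(BaseModel):
    id: str
    urn: Optional[str] = Field(default=None, validate_default=True)

    @field_validator("urn", mode="before")
    @classmethod
    def derive_urn(cls, v, info):
        # Runs even when urn stays at its default of None.
        return v or f"urn:li:example:{info.data['id']}"


assert Entity(id="x").urn == "urn:li:example:x"
```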
@@ -288,66 +273,33 @@ def _from_datahub_type(
return "record"
raise ValueError(f"Type {input_type} is not a valid primitive type")

if PYDANTIC_VERSION < 2:

def dict(self, **kwargs):
"""Custom dict method for Pydantic v1 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

# If description and doc are identical, exclude doc from the output
if self.description == self.doc and self.description is not None:
exclude.add("doc")

# if nativeDataType and type are identical, exclude nativeDataType from the output
if self.nativeDataType == self.type and self.nativeDataType is not None:
exclude.add("nativeDataType")
def model_dump(self, **kwargs):
"""Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

# if the id is the same as the urn's fieldPath, exclude id from the output
# If description and doc are identical, exclude doc from the output
if self.description == self.doc and self.description is not None:
exclude.add("doc")

if self.urn:
field_urn = SchemaFieldUrn.from_string(self.urn)
if Dataset._simplify_field_path(field_urn.field_path) == self.id:
exclude.add("urn")
# if nativeDataType and type are identical, exclude nativeDataType from the output
if self.nativeDataType == self.type and self.nativeDataType is not None:
exclude.add("nativeDataType")

kwargs.pop("exclude_defaults", None)
# if the id is the same as the urn's fieldPath, exclude id from the output
if self.urn:
field_urn = SchemaFieldUrn.from_string(self.urn)
if Dataset._simplify_field_path(field_urn.field_path) == self.id:
exclude.add("urn")

self.structured_properties = (
StructuredPropertiesHelper.simplify_structured_properties_list(
self.structured_properties
)
self.structured_properties = (
StructuredPropertiesHelper.simplify_structured_properties_list(
self.structured_properties
)

return super().dict(exclude=exclude, exclude_defaults=True, **kwargs)

else:
# For v2, implement model_dump with similar logic as dict
def model_dump(self, **kwargs):
"""Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

# If description and doc are identical, exclude doc from the output
if self.description == self.doc and self.description is not None:
exclude.add("doc")

# if nativeDataType and type are identical, exclude nativeDataType from the output
if self.nativeDataType == self.type and self.nativeDataType is not None:
exclude.add("nativeDataType")

# if the id is the same as the urn's fieldPath, exclude id from the output
if self.urn:
field_urn = SchemaFieldUrn.from_string(self.urn)
if Dataset._simplify_field_path(field_urn.field_path) == self.id:
exclude.add("urn")

self.structured_properties = (
StructuredPropertiesHelper.simplify_structured_properties_list(
self.structured_properties
)
)
if hasattr(super(), "model_dump"):
return super().model_dump( # type: ignore
exclude=exclude, exclude_defaults=True, **kwargs
)
if hasattr(super(), "model_dump"):
return super().model_dump( # type: ignore
exclude=exclude, exclude_defaults=True, **kwargs
)
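The rewrite above is largely mechanical: v1's `dict()`/`json()` become v2's `model_dump()`/`model_dump_json()`, with the same `exclude` and `exclude_defaults` keywords. A quick sketch of the semantics these overrides rely on:

```python
from pydantic import BaseModel


class Point(BaseModel):
    x: int = 0
    y: int = 0


p = Point(y=2)
assert p.model_dump(exclude_defaults=True) == {"y": 2}  # drops default-valued fields
assert p.model_dump(exclude={"x"}) == {"y": 2}          # explicit exclusion
```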


class SchemaSpecification(BaseModel):
@@ -380,9 +332,9 @@ class Dataset(StrictModel):
id: Optional[str] = None
platform: Optional[str] = None
env: str = "PROD"
urn: Optional[str] = None
urn: Optional[str] = Field(default=None, validate_default=True)
description: Optional[str] = None
name: Optional[str] = None
name: Optional[str] = Field(default=None, validate_default=True)
schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema")
downstreams: Optional[List[str]] = None
properties: Optional[Dict[str, str]] = None
@@ -940,80 +892,42 @@ def from_datahub(
downstreams=downstreams if config.include_downstreams else None,
)

if PYDANTIC_VERSION < 2:
def model_dump(self, **kwargs):
"""Custom model_dump method to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

def dict(self, **kwargs):
"""Custom dict method for Pydantic v1 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", set())
# If id and name are identical, exclude name from the output
if self.id == self.name and self.id is not None:
exclude.add("name")

# If id and name are identical, exclude name from the output
if self.id == self.name and self.id is not None:
exclude.add("name")

# if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
if self.subtypes and len(self.subtypes) == 1:
self.subtype = self.subtypes[0]
exclude.add("subtypes")
# if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
if self.subtypes and len(self.subtypes) == 1:
self.subtype = self.subtypes[0]
exclude.add("subtypes")

if hasattr(super(), "model_dump"):
result = super().model_dump(exclude=exclude, **kwargs) # type: ignore
else:
result = super().dict(exclude=exclude, **kwargs)

# Custom handling for schema_metadata/schema
if self.schema_metadata and "schema" in result:
schema_data = result["schema"]

# Handle fields if they exist
if "fields" in schema_data and isinstance(schema_data["fields"], list):
# Process each field using its custom dict method
processed_fields = []
if self.schema_metadata and self.schema_metadata.fields:
for field in self.schema_metadata.fields:
if field:
# Use dict method for Pydantic v1
processed_field = field.dict(**kwargs)
processed_fields.append(processed_field)

# Replace the fields in the result with the processed ones
schema_data["fields"] = processed_fields

return result
else:

def model_dump(self, **kwargs):
"""Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
exclude = kwargs.pop("exclude", None) or set()

# If id and name are identical, exclude name from the output
if self.id == self.name and self.id is not None:
exclude.add("name")

# if subtype and subtypes are identical or subtypes is a singleton list, exclude subtypes from the output
if self.subtypes and len(self.subtypes) == 1:
self.subtype = self.subtypes[0]
exclude.add("subtypes")

if hasattr(super(), "model_dump"):
result = super().model_dump(exclude=exclude, **kwargs) # type: ignore
else:
result = super().dict(exclude=exclude, **kwargs)

# Custom handling for schema_metadata/schema
if self.schema_metadata and "schema" in result:
schema_data = result["schema"]
# Custom handling for schema_metadata/schema
if self.schema_metadata and "schema" in result:
schema_data = result["schema"]

# Handle fields if they exist
if "fields" in schema_data and isinstance(schema_data["fields"], list):
# Process each field using its custom model_dump method
processed_fields = []
if self.schema_metadata and self.schema_metadata.fields:
for field in self.schema_metadata.fields:
if field:
processed_field = field.model_dump(**kwargs)
processed_fields.append(processed_field)
# Handle fields if they exist
if "fields" in schema_data and isinstance(schema_data["fields"], list):
# Process each field using its custom model_dump method
processed_fields = []
if self.schema_metadata and self.schema_metadata.fields:
for field in self.schema_metadata.fields:
if field:
processed_field = field.model_dump(**kwargs)
processed_fields.append(processed_field)

# Replace the fields in the result with the processed ones
schema_data["fields"] = processed_fields
# Replace the fields in the result with the processed ones
schema_data["fields"] = processed_fields

return result
return result
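One subtlety motivates the manual field loop above: pydantic v2 serializes nested models in its Rust core, so a parent's `model_dump()` does not invoke a child's overridden `model_dump()`; nested output has to be re-processed by hand. A minimal sketch of the pattern, using hypothetical `FieldSpec`/`SchemaSpec` models:

```python
from typing import List, Optional

from pydantic import BaseModel


class FieldSpec(BaseModel):
    id: str
    doc: Optional[str] = None

    def model_dump(self, **kwargs):
        exclude = kwargs.pop("exclude", None) or set()
        if self.doc is None:
            exclude.add("doc")  # field-level exclusion logic
        return super().model_dump(exclude=exclude, **kwargs)


class SchemaSpec(BaseModel):
    fields: List[FieldSpec]

    def model_dump(self, **kwargs):
        result = super().model_dump(**kwargs)
        # super() serialized the children generically; re-run each child's
        # custom model_dump() so its exclusions actually apply.
        result["fields"] = [f.model_dump(**kwargs) for f in self.fields]
        return result


assert SchemaSpec(fields=[FieldSpec(id="a")]).model_dump() == {"fields": [{"id": "a"}]}
```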

def to_yaml(
self,
@@ -1025,13 +939,7 @@ def to_yaml(
Returns True if file was written, False if no changes were detected.
"""
# Create new model data
# Create new model data - choose dict() or model_dump() based on Pydantic version
if PYDANTIC_VERSION >= 2:
new_data = self.model_dump(
exclude_none=True, exclude_unset=True, by_alias=True
)
else:
new_data = self.dict(exclude_none=True, exclude_unset=True, by_alias=True)
new_data = self.model_dump(exclude_none=True, exclude_unset=True, by_alias=True)

# Set up ruamel.yaml for preserving comments
yaml_handler = YAML(typ="rt") # round-trip mode
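`to_yaml()` leans on ruamel.yaml's round-trip mode so that rewriting a dataset file keeps hand-written comments and key order; only values that `model_dump()` actually changed are touched. A small sketch of that behavior:

```python
import io

from ruamel.yaml import YAML

yaml_handler = YAML(typ="rt")  # round-trip mode preserves comments and ordering
doc = yaml_handler.load("name: orders  # my dataset\n")
doc["name"] = "orders_v2"

buf = io.StringIO()
yaml_handler.dump(doc, buf)
assert "# my dataset" in buf.getvalue()  # the comment survives the edit
```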