From fbea98e3c3f2ef408e48c4a3f181f10c344726dd Mon Sep 17 00:00:00 2001 From: "Rob J. Caskey" Date: Tue, 2 Dec 2025 08:17:32 -0500 Subject: [PATCH 1/8] perf(ingestion): compile regex patterns for filtering hot path AllowDenyPattern now compiles regex patterns once using cached_property instead of recompiling on every match. This affects database, schema, and table filtering across all SQL connectors and many non-SQL sources including BigQuery, S3, Kafka, Looker, PowerBI, and others. Similarly, Snowflake's temporary_tables_pattern is now compiled once at config initialization rather than on every table check. The optimization reduces regex compilation overhead in the hot path during metadata extraction without changing filtering behavior. --- .../src/datahub/configuration/common.py | 19 ++++++++++--------- .../source/snowflake/snowflake_config.py | 9 +++++++++ .../source/snowflake/snowflake_queries.py | 4 ++-- .../source/snowflake/snowflake_v2.py | 5 ++--- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index d2ad64e661c7b7..d217d0918c554d 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -384,6 +384,14 @@ class AllowDenyPattern(ConfigModel): def regex_flags(self) -> int: return re.IGNORECASE if self.ignoreCase else 0 + @cached_property + def _compiled_allow(self) -> List[re.Pattern]: + return [re.compile(pattern, self.regex_flags) for pattern in self.allow] + + @cached_property + def _compiled_deny(self) -> List[re.Pattern]: + return [re.compile(pattern, self.regex_flags) for pattern in self.deny] + @classmethod def allow_all(cls) -> "AllowDenyPattern": return AllowDenyPattern() @@ -392,17 +400,10 @@ def allowed(self, string: str) -> bool: if self.denied(string): return False - return any( - re.match(allow_pattern, string, self.regex_flags) - for allow_pattern 
in self.allow - ) + return any(pattern.match(string) for pattern in self._compiled_allow) def denied(self, string: str) -> bool: - for deny_pattern in self.deny: - if re.match(deny_pattern, string, self.regex_flags): - return True - - return False + return any(pattern.match(string) for pattern in self._compiled_deny) def is_fully_specified_allow_list(self) -> bool: """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index e08a929092965f..2c7f3bd527d77c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -1,10 +1,12 @@ import logging +import re from collections import defaultdict from dataclasses import dataclass from enum import Enum from typing import Dict, List, Optional, Set import pydantic +from cached_property import cached_property from pydantic import Field, ValidationInfo, field_validator, model_validator from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs @@ -404,6 +406,13 @@ class SnowflakeV2Config( "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.", ) + @cached_property + def _compiled_temporary_tables_pattern(self) -> List[re.Pattern]: + return [ + re.compile(pattern, re.IGNORECASE) + for pattern in self.temporary_tables_pattern + ] + @field_validator("convert_urns_to_lowercase", mode="after") @classmethod def validate_convert_urns_to_lowercase(cls, v): diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index a7babecf048582..bd6334fe20bb5d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -284,8 +284,8 @@ def local_temp_path(self) -> pathlib.Path: def is_temp_table(self, name: str) -> bool: if any( - re.match(pattern, name, flags=re.IGNORECASE) - for pattern in self.config.temporary_tables_pattern + pattern.match(name) + for pattern in self.config._compiled_temporary_tables_pattern ): return True diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index f5558d1d7cb107..82f31d0040a7a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -5,7 +5,6 @@ import os import os.path import platform -import re from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Union @@ -469,8 +468,8 @@ class SnowflakePrivilege: def _is_temp_table(self, name: str) -> bool: if any( - re.match(pattern, name, flags=re.IGNORECASE) - for pattern in self.config.temporary_tables_pattern + pattern.match(name) + for pattern in self.config._compiled_temporary_tables_pattern ): return True From 00cfddb47bd9b6af850b15d316f2f0709900f173 Mon Sep 17 00:00:00 2001 From: "Rob J. Caskey" Date: Tue, 2 Dec 2025 08:17:32 -0500 Subject: [PATCH 2/8] perf(ingestion): compile regex patterns for filtering hot path AllowDenyPattern now compiles regex patterns once using cached_property instead of recompiling on every match. This affects database, schema, and table filtering across all SQL connectors and many non-SQL sources including BigQuery, S3, Kafka, Looker, PowerBI, and others. Similarly, Snowflake's temporary_tables_pattern is now compiled once at config initialization rather than on every table check. The optimization reduces regex compilation overhead in the hot path during metadata extraction without changing filtering behavior. 
--- metadata-ingestion/src/datahub/configuration/common.py | 4 ++-- .../ingestion/source/snowflake/snowflake_config.py | 2 +- .../ingestion/source/snowflake/snowflake_queries.py | 8 ++++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index d217d0918c554d..07c58e5d6db89a 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -385,11 +385,11 @@ def regex_flags(self) -> int: return re.IGNORECASE if self.ignoreCase else 0 @cached_property - def _compiled_allow(self) -> List[re.Pattern]: + def _compiled_allow(self) -> "List[re.Pattern]": return [re.compile(pattern, self.regex_flags) for pattern in self.allow] @cached_property - def _compiled_deny(self) -> List[re.Pattern]: + def _compiled_deny(self) -> "List[re.Pattern]": return [re.compile(pattern, self.regex_flags) for pattern in self.deny] @classmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 2c7f3bd527d77c..8b1064cbed8e6f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -407,7 +407,7 @@ class SnowflakeV2Config( ) @cached_property - def _compiled_temporary_tables_pattern(self) -> List[re.Pattern]: + def _compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": return [ re.compile(pattern, re.IGNORECASE) for pattern in self.temporary_tables_pattern diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index bd6334fe20bb5d..7977794f4d4333 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -11,6 +11,7 @@ from typing import Any, Dict, Iterable, List, Optional, Union import pydantic +from cached_property import cached_property from typing_extensions import Self from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs @@ -147,6 +148,13 @@ class SnowflakeQueriesExtractorConfig(ConfigModel): query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD + @cached_property + def _compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": + return [ + re.compile(pattern, re.IGNORECASE) + for pattern in self.temporary_tables_pattern + ] + class SnowflakeQueriesSourceConfig( SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig From fe614559c4683db2aac9f937be4cc9d8344bcfb2 Mon Sep 17 00:00:00 2001 From: "Rob J. Caskey" Date: Tue, 2 Dec 2025 08:51:39 -0500 Subject: [PATCH 3/8] fix(ingestion): use functools.cached_property for mypy compatibility Switch from cached-property package to stdlib functools.cached_property to fix mypy type checking errors with disallow_untyped_decorators. 
--- metadata-ingestion/src/datahub/configuration/common.py | 2 +- .../src/datahub/ingestion/source/snowflake/snowflake_config.py | 2 +- .../src/datahub/ingestion/source/snowflake/snowflake_queries.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 07c58e5d6db89a..569ac2123f0ec5 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -4,6 +4,7 @@ import unittest.mock from abc import ABC, abstractmethod from enum import auto +from functools import cached_property from typing import ( IO, TYPE_CHECKING, @@ -21,7 +22,6 @@ import pydantic import pydantic_core -from cached_property import cached_property from pydantic import BaseModel, ConfigDict, SecretStr, ValidationError, model_validator from pydantic.fields import Field from typing_extensions import Protocol, Self diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 8b1064cbed8e6f..deb8ce6a6e4b19 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -3,10 +3,10 @@ from collections import defaultdict from dataclasses import dataclass from enum import Enum +from functools import cached_property from typing import Dict, List, Optional, Set import pydantic -from cached_property import cached_property from pydantic import Field, ValidationInfo, field_validator, model_validator from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 7977794f4d4333..bda43ace4b9782 
100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -8,10 +8,10 @@ import tempfile from dataclasses import dataclass from datetime import datetime, timezone +from functools import cached_property from typing import Any, Dict, Iterable, List, Optional, Union import pydantic -from cached_property import cached_property from typing_extensions import Self from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs From 343864c1016505145938cd352af7aad3e0558c14 Mon Sep 17 00:00:00 2001 From: "Rob J. Caskey" Date: Tue, 2 Dec 2025 09:39:02 -0500 Subject: [PATCH 4/8] fix(ingestion): add type ignore comments for cached_property mypy compatibility --- metadata-ingestion/src/datahub/configuration/common.py | 6 +++--- .../datahub/ingestion/source/snowflake/snowflake_config.py | 4 ++-- .../datahub/ingestion/source/snowflake/snowflake_queries.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 569ac2123f0ec5..73685da0784f3b 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -4,7 +4,6 @@ import unittest.mock from abc import ABC, abstractmethod from enum import auto -from functools import cached_property from typing import ( IO, TYPE_CHECKING, @@ -22,6 +21,7 @@ import pydantic import pydantic_core +from cached_property import cached_property from pydantic import BaseModel, ConfigDict, SecretStr, ValidationError, model_validator from pydantic.fields import Field from typing_extensions import Protocol, Self @@ -384,11 +384,11 @@ class AllowDenyPattern(ConfigModel): def regex_flags(self) -> int: return re.IGNORECASE if self.ignoreCase else 0 - @cached_property + @cached_property # type: ignore[misc] def 
_compiled_allow(self) -> "List[re.Pattern]": return [re.compile(pattern, self.regex_flags) for pattern in self.allow] - @cached_property + @cached_property # type: ignore[misc] def _compiled_deny(self) -> "List[re.Pattern]": return [re.compile(pattern, self.regex_flags) for pattern in self.deny] diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index deb8ce6a6e4b19..91f427a2749e18 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -3,10 +3,10 @@ from collections import defaultdict from dataclasses import dataclass from enum import Enum -from functools import cached_property from typing import Dict, List, Optional, Set import pydantic +from cached_property import cached_property from pydantic import Field, ValidationInfo, field_validator, model_validator from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs @@ -406,7 +406,7 @@ class SnowflakeV2Config( "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.", ) - @cached_property + @cached_property # type: ignore[misc] def _compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": return [ re.compile(pattern, re.IGNORECASE) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index bda43ace4b9782..f09eed1dece08b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -8,10 +8,10 @@ import tempfile from dataclasses import dataclass from datetime import datetime, timezone -from functools import cached_property 
from typing import Any, Dict, Iterable, List, Optional, Union import pydantic +from cached_property import cached_property from typing_extensions import Self from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs @@ -148,7 +148,7 @@ class SnowflakeQueriesExtractorConfig(ConfigModel): query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD - @cached_property + @cached_property # type: ignore[misc] def _compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": return [ re.compile(pattern, re.IGNORECASE) From 4a70ed2c4d368480c0f08d1e6c9375b6745ce333 Mon Sep 17 00:00:00 2001 From: "Rob J. Caskey" Date: Tue, 2 Dec 2025 11:09:10 -0500 Subject: [PATCH 5/8] chore: trigger CI From 1a903d6ed6eae5f814068975163979ec974b07e2 Mon Sep 17 00:00:00 2001 From: "Rob J. Caskey" Date: Tue, 2 Dec 2025 12:09:54 -0500 Subject: [PATCH 6/8] fix(ingestion): avoid cached_property to prevent Python 3.11 Pydantic incompatibility Replace @cached_property with manual caching using hasattr/setattr pattern. This avoids triggering stricter Pydantic inspection on Python 3.11. 
--- .../src/datahub/configuration/common.py | 28 +++++++++++++------ .../source/snowflake/snowflake_config.py | 18 +++++++----- .../source/snowflake/snowflake_queries.py | 20 +++++++------ .../source/snowflake/snowflake_v2.py | 2 +- 4 files changed, 43 insertions(+), 25 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 73685da0784f3b..2f3b0f46622a7a 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -384,13 +384,23 @@ class AllowDenyPattern(ConfigModel): def regex_flags(self) -> int: return re.IGNORECASE if self.ignoreCase else 0 - @cached_property # type: ignore[misc] - def _compiled_allow(self) -> "List[re.Pattern]": - return [re.compile(pattern, self.regex_flags) for pattern in self.allow] - - @cached_property # type: ignore[misc] - def _compiled_deny(self) -> "List[re.Pattern]": - return [re.compile(pattern, self.regex_flags) for pattern in self.deny] + def _get_compiled_allow(self) -> "List[re.Pattern]": + if not hasattr(self, "_cached_compiled_allow"): + object.__setattr__( + self, + "_cached_compiled_allow", + [re.compile(pattern, self.regex_flags) for pattern in self.allow], + ) + return self._cached_compiled_allow # type: ignore[attr-defined] + + def _get_compiled_deny(self) -> "List[re.Pattern]": + if not hasattr(self, "_cached_compiled_deny"): + object.__setattr__( + self, + "_cached_compiled_deny", + [re.compile(pattern, self.regex_flags) for pattern in self.deny], + ) + return self._cached_compiled_deny # type: ignore[attr-defined] @classmethod def allow_all(cls) -> "AllowDenyPattern": @@ -400,10 +410,10 @@ def allowed(self, string: str) -> bool: if self.denied(string): return False - return any(pattern.match(string) for pattern in self._compiled_allow) + return any(pattern.match(string) for pattern in self._get_compiled_allow()) def denied(self, string: str) -> bool: - return 
any(pattern.match(string) for pattern in self._compiled_deny) + return any(pattern.match(string) for pattern in self._get_compiled_deny()) def is_fully_specified_allow_list(self) -> bool: """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 91f427a2749e18..f6da182aad6bcd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -6,7 +6,6 @@ from typing import Dict, List, Optional, Set import pydantic -from cached_property import cached_property from pydantic import Field, ValidationInfo, field_validator, model_validator from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs @@ -406,12 +405,17 @@ class SnowflakeV2Config( "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.", ) - @cached_property # type: ignore[misc] - def _compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": - return [ - re.compile(pattern, re.IGNORECASE) - for pattern in self.temporary_tables_pattern - ] + def _get_compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": + if not hasattr(self, "_cached_temp_tables_pattern"): + object.__setattr__( + self, + "_cached_temp_tables_pattern", + [ + re.compile(pattern, re.IGNORECASE) + for pattern in self.temporary_tables_pattern + ], + ) + return self._cached_temp_tables_pattern # type: ignore[attr-defined] @field_validator("convert_urns_to_lowercase", mode="after") @classmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index f09eed1dece08b..09c16717766b5e 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -11,7 +11,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union import pydantic -from cached_property import cached_property from typing_extensions import Self from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs @@ -148,12 +147,17 @@ class SnowflakeQueriesExtractorConfig(ConfigModel): query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD - @cached_property # type: ignore[misc] - def _compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": - return [ - re.compile(pattern, re.IGNORECASE) - for pattern in self.temporary_tables_pattern - ] + def _get_compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": + if not hasattr(self, "_cached_temp_tables_pattern"): + object.__setattr__( + self, + "_cached_temp_tables_pattern", + [ + re.compile(pattern, re.IGNORECASE) + for pattern in self.temporary_tables_pattern + ], + ) + return self._cached_temp_tables_pattern # type: ignore[attr-defined] class SnowflakeQueriesSourceConfig( @@ -293,7 +297,7 @@ def local_temp_path(self) -> pathlib.Path: def is_temp_table(self, name: str) -> bool: if any( pattern.match(name) - for pattern in self.config._compiled_temporary_tables_pattern + for pattern in self.config._get_compiled_temporary_tables_pattern() ): return True diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 82f31d0040a7a7..8e9a8445277872 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -469,7 +469,7 @@ class SnowflakePrivilege: def _is_temp_table(self, name: str) -> bool: if any( pattern.match(name) - for pattern in 
self.config._compiled_temporary_tables_pattern + for pattern in self.config._get_compiled_temporary_tables_pattern() ): return True From f2e966774afd2b2ac16f57420ffc60b5d1c8e509 Mon Sep 17 00:00:00 2001 From: "Rob J. Caskey" Date: Tue, 2 Dec 2025 13:48:50 -0500 Subject: [PATCH 7/8] fix(ingestion): use functools.cached_property with Pydantic compatibility Use stdlib functools.cached_property for new compiled pattern caching, and add it to ConfigModel.ignored_types alongside the cached-property package to ensure Pydantic v2 compatibility on Python 3.11+. --- .../src/datahub/configuration/common.py | 31 +++++++------------ .../source/snowflake/snowflake_config.py | 18 +++++------ .../source/snowflake/snowflake_queries.py | 20 +++++------- .../source/snowflake/snowflake_v2.py | 2 +- 4 files changed, 27 insertions(+), 44 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 2f3b0f46622a7a..7885b394d04662 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -4,6 +4,7 @@ import unittest.mock from abc import ABC, abstractmethod from enum import auto +from functools import cached_property as functools_cached_property from typing import ( IO, TYPE_CHECKING, @@ -133,7 +134,7 @@ def _config_model_schema_extra(schema: Dict[str, Any], model: Type[BaseModel]) - class ConfigModel(BaseModel): model_config = ConfigDict( extra="forbid", - ignored_types=(cached_property,), + ignored_types=(cached_property, functools_cached_property), json_schema_extra=_config_model_schema_extra, hide_input_in_errors=not get_debug(), ) @@ -384,23 +385,13 @@ class AllowDenyPattern(ConfigModel): def regex_flags(self) -> int: return re.IGNORECASE if self.ignoreCase else 0 - def _get_compiled_allow(self) -> "List[re.Pattern]": - if not hasattr(self, "_cached_compiled_allow"): - object.__setattr__( - self, - "_cached_compiled_allow", - 
[re.compile(pattern, self.regex_flags) for pattern in self.allow], - ) - return self._cached_compiled_allow # type: ignore[attr-defined] - - def _get_compiled_deny(self) -> "List[re.Pattern]": - if not hasattr(self, "_cached_compiled_deny"): - object.__setattr__( - self, - "_cached_compiled_deny", - [re.compile(pattern, self.regex_flags) for pattern in self.deny], - ) - return self._cached_compiled_deny # type: ignore[attr-defined] + @functools_cached_property + def _compiled_allow(self) -> "List[re.Pattern]": + return [re.compile(pattern, self.regex_flags) for pattern in self.allow] + + @functools_cached_property + def _compiled_deny(self) -> "List[re.Pattern]": + return [re.compile(pattern, self.regex_flags) for pattern in self.deny] @classmethod def allow_all(cls) -> "AllowDenyPattern": @@ -410,10 +401,10 @@ def allowed(self, string: str) -> bool: if self.denied(string): return False - return any(pattern.match(string) for pattern in self._get_compiled_allow()) + return any(pattern.match(string) for pattern in self._compiled_allow) def denied(self, string: str) -> bool: - return any(pattern.match(string) for pattern in self._get_compiled_deny()) + return any(pattern.match(string) for pattern in self._compiled_deny) def is_fully_specified_allow_list(self) -> bool: """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index f6da182aad6bcd..deb8ce6a6e4b19 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -3,6 +3,7 @@ from collections import defaultdict from dataclasses import dataclass from enum import Enum +from functools import cached_property from typing import Dict, List, Optional, Set import pydantic @@ -405,17 +406,12 @@ class SnowflakeV2Config( "This may be required in the case of _eg_ temporary tables being 
created in a different database than the ones in the database_name patterns.", ) - def _get_compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": - if not hasattr(self, "_cached_temp_tables_pattern"): - object.__setattr__( - self, - "_cached_temp_tables_pattern", - [ - re.compile(pattern, re.IGNORECASE) - for pattern in self.temporary_tables_pattern - ], - ) - return self._cached_temp_tables_pattern # type: ignore[attr-defined] + @cached_property + def _compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": + return [ + re.compile(pattern, re.IGNORECASE) + for pattern in self.temporary_tables_pattern + ] @field_validator("convert_urns_to_lowercase", mode="after") @classmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 09c16717766b5e..bda43ace4b9782 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -8,6 +8,7 @@ import tempfile from dataclasses import dataclass from datetime import datetime, timezone +from functools import cached_property from typing import Any, Dict, Iterable, List, Optional, Union import pydantic @@ -147,17 +148,12 @@ class SnowflakeQueriesExtractorConfig(ConfigModel): query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD - def _get_compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": - if not hasattr(self, "_cached_temp_tables_pattern"): - object.__setattr__( - self, - "_cached_temp_tables_pattern", - [ - re.compile(pattern, re.IGNORECASE) - for pattern in self.temporary_tables_pattern - ], - ) - return self._cached_temp_tables_pattern # type: ignore[attr-defined] + @cached_property + def _compiled_temporary_tables_pattern(self) -> "List[re.Pattern[str]]": + return [ + re.compile(pattern, re.IGNORECASE) + for pattern in 
self.temporary_tables_pattern + ] class SnowflakeQueriesSourceConfig( @@ -297,7 +293,7 @@ def local_temp_path(self) -> pathlib.Path: def is_temp_table(self, name: str) -> bool: if any( pattern.match(name) - for pattern in self.config._get_compiled_temporary_tables_pattern() + for pattern in self.config._compiled_temporary_tables_pattern ): return True diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 8e9a8445277872..82f31d0040a7a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -469,7 +469,7 @@ class SnowflakePrivilege: def _is_temp_table(self, name: str) -> bool: if any( pattern.match(name) - for pattern in self.config._get_compiled_temporary_tables_pattern() + for pattern in self.config._compiled_temporary_tables_pattern ): return True From b614c943eedc94f04957b842f5993327d9e4c282 Mon Sep 17 00:00:00 2001 From: "Rob J. 
Caskey" Date: Wed, 3 Dec 2025 11:27:17 -0500 Subject: [PATCH 8/8] docs(ingestion): add docstrings to cached regex properties --- metadata-ingestion/src/datahub/configuration/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 7885b394d04662..dc0e6366ca119f 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -387,10 +387,12 @@ def regex_flags(self) -> int: @functools_cached_property def _compiled_allow(self) -> "List[re.Pattern]": + """Matching with precompiled allow patterns avoids re's per-call pattern-cache lookup, and this is in the hot path, so we cache them here for the life of this object.""" return [re.compile(pattern, self.regex_flags) for pattern in self.allow] @functools_cached_property def _compiled_deny(self) -> "List[re.Pattern]": + """Matching with precompiled deny patterns avoids re's per-call pattern-cache lookup, and this is in the hot path, so we cache them here for the life of this object.""" return [re.compile(pattern, self.regex_flags) for pattern in self.deny] @classmethod