Skip to content

Commit c273133

Browse files
sgomezvillamorclaude
authored and committed
perf(ingestion): pre-compile regex patterns in hot paths
Extends Rob's regex optimization pattern (#15463) to additional ingestion hot paths: 1. **SqlQueriesSource**: Pre-compile temp_table_patterns using @cached_property - Called for every table during query processing - Eliminates repeated regex compilation overhead 2. **BigQuery**: Pre-compile sharded table & wildcard patterns at module level - get_table_and_shard(): Called for every BigQuery table - get_table_display_name(): Called for table name normalization - is_sharded_table(): Called during table classification 3. **PowerBI ODBC**: Pre-compile platform detection patterns at module level - normalize_platform_from_driver(): Called for every ODBC connection - normalize_platform_name(): Called during platform normalization - Affects 18+ database platform patterns All changes follow the same optimization strategy as #15463: - Compile regex patterns once at initialization - Use compiled Pattern objects in hot path - Maintain exact behavioral equivalence - No config changes or breaking changes Expected impact: Performance improvement for ingestion workloads with: - High volume of temp table checks (SqlQueriesSource) - Large BigQuery datasets with sharded tables - PowerBI sources with many ODBC connections 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <[email protected]>
1 parent a039705 commit c273133

File tree

3 files changed

+32
-22
lines changed

3 files changed

+32
-22
lines changed

metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@
2828
"((.+\\D)[_$]?)?(\\d\\d\\d\\d(?:0[1-9]|1[0-2])(?:0[1-9]|[12][0-9]|3[01]))$"
2929
)
3030

31+
# Pre-compiled regex patterns for performance (used in ingestion hot path)
32+
_COMPILED_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: Pattern = re.compile(
33+
_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX, re.IGNORECASE
34+
)
35+
_COMPILED_BIGQUERY_WILDCARD_REGEX: Pattern = re.compile("((_(\\d+)?)\\*$)|\\*$")
36+
3137

3238
@dataclass(frozen=True, order=True)
3339
class BigqueryTableIdentifier:
@@ -58,11 +64,7 @@ def get_table_and_shard(table_name: str) -> Tuple[Optional[str], Optional[str]]:
5864
In case of sharded tables, returns (<table-prefix>, shard)
5965
"""
6066
new_table_name = table_name
61-
match = re.match(
62-
BigqueryTableIdentifier._BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX,
63-
table_name,
64-
re.IGNORECASE,
65-
)
67+
match = _COMPILED_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX.match(table_name)
6668
if match:
6769
shard: str = match[3]
6870
if shard:
@@ -96,7 +98,7 @@ def get_table_display_name(self) -> str:
9698
- removes partition ids (table$20210101 -> table or table$__UNPARTITIONED__ -> table)
9799
"""
98100
# if table name ends in _* or * or _yyyy* or _yyyymm* then we strip it as that represents a query on a sharded table
99-
shortened_table_name = re.sub(self._BIGQUERY_WILDCARD_REGEX, "", self.table)
101+
shortened_table_name = _COMPILED_BIGQUERY_WILDCARD_REGEX.sub("", self.table)
100102

101103
matches = BigQueryTableRef.SNAPSHOT_TABLE_REGEX.match(shortened_table_name)
102104
if matches:
@@ -133,11 +135,8 @@ def is_sharded_table(self) -> bool:
133135
if shard:
134136
return True
135137

136-
if re.match(
137-
f".*({BigqueryTableIdentifier._BIGQUERY_WILDCARD_REGEX})",
138-
self.raw_table_name(),
139-
re.IGNORECASE,
140-
):
138+
# Check if table name contains wildcard pattern
139+
if _COMPILED_BIGQUERY_WILDCARD_REGEX.search(self.raw_table_name()):
141140
return True
142141

143142
return False

metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/odbc.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@
4444
"databricks": r"(databricks|spark)",
4545
}
4646

47+
# Pre-compiled regex patterns for performance (used in ODBC connection hot path)
48+
_compiled_platform_patterns = {
49+
platform: re.compile(pattern, re.IGNORECASE)
50+
for platform, pattern in platform_patterns.items()
51+
}
52+
4753
powerbi_platform_names = {
4854
"mysql": "MySQL",
4955
"postgres": "PostgreSQL",
@@ -157,8 +163,8 @@ def extract_platform(connection_string: str) -> Tuple[Optional[str], Optional[st
157163

158164
driver_lower = driver_name.lower()
159165

160-
for platform, pattern in platform_patterns.items():
161-
if re.search(pattern, driver_lower):
166+
for platform, compiled_pattern in _compiled_platform_patterns.items():
167+
if compiled_pattern.search(driver_lower):
162168
return platform, powerbi_platform_names.get(platform)
163169

164170
return None, None
@@ -178,8 +184,8 @@ def normalize_platform_name(platform: str) -> Tuple[Optional[str], Optional[str]
178184
"""
179185
platform_lower = platform.lower()
180186

181-
for platform, pattern in platform_patterns.items():
182-
if re.search(pattern, platform_lower):
183-
return platform, powerbi_platform_names.get(platform)
187+
for platform_name, compiled_pattern in _compiled_platform_patterns.items():
188+
if compiled_pattern.search(platform_lower):
189+
return platform_name, powerbi_platform_names.get(platform_name)
184190

185191
return None, None

metadata-ingestion/src/datahub/ingestion/source/sql_queries.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
import re
55
from dataclasses import dataclass, field
66
from datetime import datetime
7-
from functools import partial
8-
from typing import Any, ClassVar, Iterable, List, Optional, Union, cast
7+
from functools import cached_property, partial
8+
from typing import Any, ClassVar, Iterable, List, Optional, Pattern, Union, cast
99

1010
import smart_open
1111
from pydantic import BaseModel, ConfigDict, Field, field_validator
@@ -92,6 +92,11 @@ class SqlQueriesSourceConfig(
9292
default=[],
9393
)
9494

95+
@cached_property
96+
def compiled_temp_table_patterns(self) -> List[Pattern]:
97+
"""Pre-compiled regex patterns for temp table filtering (performance optimization)."""
98+
return [re.compile(pattern, re.IGNORECASE) for pattern in self.temp_table_patterns]
99+
95100
enable_lazy_schema_loading: bool = Field(
96101
default=True,
97102
description="Enable lazy schema loading for better performance. When enabled, schemas are fetched on-demand "
@@ -422,15 +427,15 @@ def is_temp_table(self, name: str) -> bool:
422427
return False
423428

424429
try:
425-
for pattern in self.config.temp_table_patterns:
426-
if re.match(pattern, name, flags=re.IGNORECASE):
430+
for pattern in self.config.compiled_temp_table_patterns:
431+
if pattern.match(name):
427432
logger.debug(
428-
f"Table '{name}' matched temp table pattern: {pattern}"
433+
f"Table '{name}' matched temp table pattern: {pattern.pattern}"
429434
)
430435
self.report.num_temp_tables_detected += 1
431436
return True
432437
except re.error as e:
433-
logger.warning(f"Invalid regex pattern '{pattern}': {e}")
438+
logger.warning(f"Invalid regex pattern: {e}")
434439

435440
return False
436441

0 commit comments

Comments (0)