Commit a241c3e

linter

1 parent 5a1ff2b

6 files changed: +94, -54 lines

pyiceberg/catalog/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -172,6 +172,7 @@ def load_in_memory(name: str, conf: Properties) -> Catalog:
     except ImportError as exc:
         raise NotInstalledError("SQLAlchemy support not installed: pip install 'pyiceberg[sql-sqlite]'") from exc
 
+
 def load_bigquery(name: str, conf: Properties) -> Catalog:
     try:
         from pyiceberg.catalog.bigquery_metastore import BigQueryMetastoreCatalog
@@ -181,7 +182,6 @@ def load_bigquery(name: str, conf: Properties) -> Catalog:
         raise NotInstalledError("BigQuery support not installed: pip install 'pyiceberg[bigquery]'") from exc
 
 
-
 AVAILABLE_CATALOGS: dict[CatalogType, Callable[[str, Properties], Catalog]] = {
     CatalogType.REST: load_rest,
     CatalogType.HIVE: load_hive,
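For context, the loaders in AVAILABLE_CATALOGS are dispatched through pyiceberg.catalog.load_catalog. A minimal usage sketch, assuming the catalog type string "bigquery" resolves to CatalogType.BIGQUERY and thus to load_bigquery; the project and bucket names are placeholders, not values from this commit:

from pyiceberg.catalog import load_catalog

# "bigquery" as the type string and the project/bucket values are
# assumptions for illustration only.
catalog = load_catalog(
    "bq",
    **{
        "type": "bigquery",
        "gcp.project-id": "my-project",
        "warehouse": "gs://my-bucket/",
    },
)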

pyiceberg/catalog/bigquery_metastore.py

Lines changed: 41 additions & 31 deletions

@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import json
-from typing import Any, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple, Union
 
 from google.api_core.exceptions import NotFound
 from google.cloud.bigquery import Client, Dataset, DatasetReference, TableReference
@@ -40,6 +40,9 @@
 from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
 from pyiceberg.utils.config import Config
 
+if TYPE_CHECKING:
+    import pyarrow as pa
+
 GCP_PROJECT_ID = "gcp.project-id"
 GCP_LOCATION = "gcp.location"
 GCP_CREDENTIALS_LOCATION = "gcp.credentials-location"
@@ -54,6 +57,7 @@
 HIVE_FILE_INPUT_FORMAT = "org.apache.iceberg.mr.hive.HiveIcebergInputFormat"
 HIVE_FILE_OUTPUT_FORMAT = "org.apache.iceberg.mr.hive.HiveIcebergOutputFormat"
 
+
 class BigQueryMetastoreCatalog(MetastoreCatalog):
     def __init__(self, name: str, **properties: str):
         super().__init__(name, **properties)
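The constants above are the configuration keys the catalog reads. A short instantiation sketch, mirroring the integration tests further down; the project id and warehouse bucket are placeholders, and the keys correspond to GCP_PROJECT_ID and the WAREHOUSE_LOCATION property referenced later in this file:

from pyiceberg.catalog.bigquery_metastore import BigQueryMetastoreCatalog

# Placeholder project and warehouse values.
catalog = BigQueryMetastoreCatalog(
    "bq_catalog",
    **{"gcp.project-id": "my-project", "warehouse": "gs://my-bucket/"},
)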
@@ -138,7 +142,9 @@ def create_table(
         dataset_ref = DatasetReference(project=self.project_id, dataset_id=dataset_name)
 
         try:
-            table = self._make_new_table(metadata, metadata_location, TableReference(dataset_ref=dataset_ref, table_id=table_name))
+            table = self._make_new_table(
+                metadata, metadata_location, TableReference(dataset_ref=dataset_ref, table_id=table_name)
+            )
             self.client.create_table(table)
         except Conflict as e:
             raise TableAlreadyExistsError(f"Table {table_name} already exists") from e
@@ -161,12 +167,13 @@ def create_namespace(self, namespace: Union[str, Identifier], properties: Proper
         try:
             dataset_ref = DatasetReference(project=self.project_id, dataset_id=database_name)
             dataset = Dataset(dataset_ref=dataset_ref)
-            dataset.external_catalog_dataset_options = self._create_external_catalog_dataset_options(self._get_default_warehouse_location_for_dataset(database_name), properties, dataset_ref)
+            dataset.external_catalog_dataset_options = self._create_external_catalog_dataset_options(
+                self._get_default_warehouse_location_for_dataset(database_name), properties, dataset_ref
+            )
             self.client.create_dataset(dataset)
         except Conflict as e:
             raise NamespaceAlreadyExistsError("Namespace {database_name} already exists") from e
 
-
     def load_table(self, identifier: Union[str, Identifier]) -> Table:
         """
         Load the table's metadata and returns the table instance.
@@ -196,7 +203,6 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table:
         except NotFound as e:
             raise NoSuchTableError(f"Table does not exist: {dataset_name}.{table_name}") from e
 
-
     def drop_table(self, identifier: Union[str, Identifier]) -> None:
         """Drop a table.
 
@@ -222,11 +228,9 @@ def commit_table(
     ) -> CommitTableResponse:
         raise NotImplementedError
 
-
     def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table:
         raise NotImplementedError
 
-
     def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
         database_name = self.identifier_to_database(namespace)
 
@@ -283,7 +287,9 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location:
             metadata = FromInputFile.table_metadata(file)
 
         try:
-            table = self._make_new_table(metadata, metadata_location, TableReference(dataset_ref=dataset_ref, table_id=table_name))
+            table = self._make_new_table(
+                metadata, metadata_location, TableReference(dataset_ref=dataset_ref, table_id=table_name)
+            )
             self.client.create_table(table)
         except Conflict as e:
             raise TableAlreadyExistsError(f"Table {table_name} already exists") from e
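register_table attaches an existing Iceberg metadata file to a new catalog entry without rewriting it. A usage sketch along the lines of test_register_table in the integration tests changed by this commit; the dataset, table, and metadata path are placeholders:

# Placeholders throughout; the flow mirrors test_register_table below.
catalog.create_namespace("my_dataset")
catalog.register_table(
    ("my_dataset", "my_table"),
    metadata_location="gs://my-bucket/my_dataset.db/my_table/metadata/00000-aaaaaaaa-aaaa-4aaa-aaaa-aaaaaaaaaaaa.metadata.json",
)
table = catalog.load_table(("my_dataset", "my_table"))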
@@ -316,21 +322,16 @@ def update_namespace_properties(
     ) -> PropertiesUpdateSummary:
         raise NotImplementedError
 
-
     def _make_new_table(self, metadata: TableMetadata, metadata_file_location: str, table_ref: TableReference) -> BQTable:
-        """
-        To make the table queryable from Hive, the user would likely be setting the HIVE_ENGINE_ENABLED
-        parameter.
-
-        """
+        """To make the table queryable from Hive, the user would likely be setting the HIVE_ENGINE_ENABLED parameter."""
         table = BQTable(table_ref)
 
         # In Python, you typically set the external data configuration directly.
         # BigQueryMetastoreUtils.create_external_catalog_table_options is mapped to
         # constructing the external_data_configuration for the Table object.
         external_config_options = self._create_external_catalog_table_options(
             metadata.location,
-            self._create_table_parameters(metadata_file_location=metadata_file_location, table_metadata=metadata)
+            self._create_table_parameters(metadata_file_location=metadata_file_location, table_metadata=metadata),
         )
 
         # Apply the external configuration to the Table object.
@@ -340,22 +341,27 @@ def _make_new_table(self, metadata: TableMetadata, metadata_file_location: str,
 
         return table
 
-    def _create_external_catalog_table_options(self, location: str, parameters: dict) -> ExternalCatalogTableOptions:
+    def _create_external_catalog_table_options(self, location: str, parameters: dict[str, Any]) -> ExternalCatalogTableOptions:
         # This structure directly maps to what BigQuery's ExternalConfig expects for Hive.
         return ExternalCatalogTableOptions(
             storage_descriptor=StorageDescriptor(
                 location_uri=location,
                 input_format=HIVE_FILE_INPUT_FORMAT,
                 output_format=HIVE_FILE_OUTPUT_FORMAT,
-                serde_info=SerDeInfo(serialization_library=HIVE_SERIALIZATION_LIBRARY)
+                serde_info=SerDeInfo(serialization_library=HIVE_SERIALIZATION_LIBRARY),
             ),
-            parameters=parameters
+            parameters=parameters,
         )
 
-    def _create_external_catalog_dataset_options(self, default_storage_location: str, metadataParameters: dict, dataset_ref: DatasetReference) -> ExternalCatalogDatasetOptions:
-        return ExternalCatalogDatasetOptions(default_storage_location_uri=self._get_default_warehouse_location_for_dataset(dataset_ref.dataset_id), parameters=metadataParameters)
+    def _create_external_catalog_dataset_options(
+        self, default_storage_location: str, metadataParameters: dict[str, Any], dataset_ref: DatasetReference
+    ) -> ExternalCatalogDatasetOptions:
+        return ExternalCatalogDatasetOptions(
+            default_storage_location_uri=self._get_default_warehouse_location_for_dataset(dataset_ref.dataset_id),
+            parameters=metadataParameters,
+        )
 
-    def _convert_bigquery_table_to_iceberg_table(self, identifier: str, table: BQTable) -> Table:
+    def _convert_bigquery_table_to_iceberg_table(self, identifier: Union[str, Identifier], table: BQTable) -> Table:
         dataset_name, table_name = self.identifier_to_database_and_table(identifier, NoSuchTableError)
         metadata_location = ""
         if table.external_catalog_table_options and table.external_catalog_table_options.parameters:
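These helpers store the Iceberg metadata pointer in the BigQuery table's external catalog options on create, and _convert_bigquery_table_to_iceberg_table reads it back on load. A rough sketch of the read side; using "metadata_location" as the parameter key is an assumption, since the exact constant is not visible in this hunk:

# Sketch only: read the metadata pointer back out of a BigQuery table's
# external catalog options; the "metadata_location" key name is assumed.
opts = bq_table.external_catalog_table_options
metadata_location = ""
if opts and opts.parameters:
    metadata_location = opts.parameters.get("metadata_location", "")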
@@ -381,29 +387,33 @@ def _create_table_parameters(self, metadata_file_location: str, table_metadata:
         parameters["EXTERNAL"] = True
 
         # Add Hive-style basic statistics from snapshot metadata if it exists.
-        if table_metadata.current_snapshot():
-
-            if table_metadata.current_snapshot().summary.get(TOTAL_DATA_FILES):
-                parameters["numFiles"] = table_metadata.current_snapshot.summary.get(TOTAL_DATA_FILES)
+        snapshot = table_metadata.current_snapshot()
+        if snapshot:
+            summary = snapshot.summary
+            if summary:
+                if summary.get(TOTAL_DATA_FILES):
+                    parameters["numFiles"] = summary.get(TOTAL_DATA_FILES)
 
-            if table_metadata.current_snapshot().summary.get(TOTAL_RECORDS):
-                parameters["numRows"] = table_metadata.current_snapshot.summary.get(TOTAL_RECORDS)
+                if summary.get(TOTAL_RECORDS):
+                    parameters["numRows"] = summary.get(TOTAL_RECORDS)
 
-            if table_metadata.current_snapshot().summary.get(TOTAL_FILE_SIZE):
-                parameters["totalSize"] = table_metadata.current_snapshot.summary.get(TOTAL_FILE_SIZE)
+                if summary.get(TOTAL_FILE_SIZE):
+                    parameters["totalSize"] = summary.get(TOTAL_FILE_SIZE)
 
         return parameters
 
-    def _default_storage_location(self, location: Optional[str], dataset_ref: DatasetReference) -> str | None:
+    def _default_storage_location(self, location: Optional[str], dataset_ref: DatasetReference) -> Union[str, None]:
         if location:
             return location
         dataset = self.client.get_dataset(dataset_ref)
         if dataset and dataset.external_catalog_dataset_options:
             return dataset.external_catalog_dataset_options.default_storage_location_uri
 
+        raise ValueError("Could not find default storage location")
+
     def _get_default_warehouse_location_for_dataset(self, database_name: str) -> str:
         if warehouse_path := self.properties.get(WAREHOUSE_LOCATION):
             warehouse_path = warehouse_path.rstrip("/")
             return f"{warehouse_path}/{database_name}.db"
 
-        raise ValueError("No default path is set, please specify a location when creating a table")
+        raise ValueError("No default path is set, please specify a location when creating a table")
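Beyond the formatting, this hunk fixes a real bug: the removed branches read table_metadata.current_snapshot.summary without calling current_snapshot(), which would raise an AttributeError whenever a snapshot was present. With the rewrite, the statistics portion of the parameters dict for a table with one snapshot would look roughly like this (illustrative values only):

# Illustrative only: Hive-style statistics keys filled in when a current
# snapshot and its summary are present (values are made up).
{"EXTERNAL": True, "numFiles": "4", "numRows": "1000", "totalSize": "123456"}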

pyproject.toml

Lines changed: 4 additions & 0 deletions

@@ -283,6 +283,10 @@ ignore_missing_imports = true
 module = "pyiceberg_core.*"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "google.*"
+ignore_missing_imports = true
+
 [tool.poetry.scripts]
 pyiceberg = "pyiceberg.cli.console:run"
 

tests/catalog/integration_test_bigquery_metastore.py

Lines changed: 26 additions & 10 deletions

@@ -15,18 +15,20 @@
 # specific language governing permissions and limitations
 # under the License.
 import os
+
 import pytest
 from pytest_mock import MockFixture
 
 from pyiceberg.catalog.bigquery_metastore import BigQueryMetastoreCatalog
-from pyiceberg.exceptions import NamespaceAlreadyExistsError, NoSuchNamespaceError, NoSuchTableError
+from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError
 from pyiceberg.io import load_file_io
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC
 from pyiceberg.schema import Schema
 from pyiceberg.serializers import ToOutputFile
 from pyiceberg.table.metadata import new_table_metadata
 from pyiceberg.table.sorting import UNSORTED_SORT_ORDER
-from tests.conftest import BQ_TABLE_METADATA_LOCATION_REGEX, BUCKET_NAME, TABLE_METADATA_LOCATION_REGEX
+from tests.conftest import BQ_TABLE_METADATA_LOCATION_REGEX
+
 
 def test_create_table_with_database_location(
     mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
@@ -35,7 +37,9 @@ def test_create_table_with_database_location(
 
     catalog_name = "test_ddb_catalog"
     identifier = (gcp_dataset_name, table_name)
-    test_catalog = BigQueryMetastoreCatalog(catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"})
+    test_catalog = BigQueryMetastoreCatalog(
+        catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"}
+    )
     test_catalog.create_namespace(namespace=gcp_dataset_name)
     table = test_catalog.create_table(identifier, table_schema_nested)
     assert table.name() == identifier
@@ -44,16 +48,19 @@ def test_create_table_with_database_location(
     tables_in_namespace = test_catalog.list_tables(namespace=gcp_dataset_name)
     assert identifier in tables_in_namespace
 
+
 def test_drop_table_with_database_location(
     mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
 ) -> None:
     mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})
 
     catalog_name = "test_ddb_catalog"
     identifier = (gcp_dataset_name, table_name)
-    test_catalog = BigQueryMetastoreCatalog(catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"})
+    test_catalog = BigQueryMetastoreCatalog(
+        catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"}
+    )
     test_catalog.create_namespace(namespace=gcp_dataset_name)
-    table = test_catalog.create_table(identifier, table_schema_nested)
+    test_catalog.create_table(identifier, table_schema_nested)
     test_catalog.drop_table(identifier)
 
     tables_in_namespace_after_drop = test_catalog.list_tables(namespace=gcp_dataset_name)
@@ -63,18 +70,20 @@ def test_drop_table_with_database_location(
     try:
         test_catalog.load_table(identifier)
         raise AssertionError()
-    except NoSuchTableError as e:
+    except NoSuchTableError:
         assert True
 
+
 def test_create_and_drop_namespace(
     mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
 ) -> None:
     mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})
 
     # Create namespace.
     catalog_name = "test_ddb_catalog"
-    identifier = (gcp_dataset_name, table_name)
-    test_catalog = BigQueryMetastoreCatalog(catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"})
+    test_catalog = BigQueryMetastoreCatalog(
+        catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"}
+    )
     test_catalog.create_namespace(namespace=gcp_dataset_name)
 
     # Ensure that the namespace exists.
@@ -90,14 +99,15 @@ def test_create_and_drop_namespace(
     with pytest.raises(NoSuchNamespaceError):
         test_catalog.load_namespace_properties(gcp_dataset_name)
 
+
 def test_register_table(
     mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
 ) -> None:
     mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})
 
     catalog_name = "test_bq_register_catalog"
     identifier = (gcp_dataset_name, table_name)
-    warehouse_path = "gs://alexstephen-test-bq-bucket/" # Matches conftest BUCKET_NAME for GCS interaction
+    warehouse_path = "gs://alexstephen-test-bq-bucket/"  # Matches conftest BUCKET_NAME for GCS interaction
     gcp_project_id = "alexstephen-test-1"
 
     test_catalog = BigQueryMetastoreCatalog(catalog_name, **{"gcp.project-id": gcp_project_id, "warehouse": warehouse_path})
@@ -110,7 +120,13 @@ def test_register_table(
     metadata_file_name = "00000-aaaaaaaa-aaaa-4aaa-aaaa-aaaaaaaaaaaa.metadata.json"
     metadata_gcs_path = f"{table_gcs_location}/metadata/{metadata_file_name}"
 
-    metadata = new_table_metadata(location=table_gcs_location, schema=table_schema_nested, properties={}, partition_spec=UNPARTITIONED_PARTITION_SPEC, sort_order=UNSORTED_SORT_ORDER)
+    metadata = new_table_metadata(
+        location=table_gcs_location,
+        schema=table_schema_nested,
+        properties={},
+        partition_spec=UNPARTITIONED_PARTITION_SPEC,
+        sort_order=UNSORTED_SORT_ORDER,
+    )
     io = load_file_io(properties=test_catalog.properties, location=metadata_gcs_path)
     test_catalog._write_metadata(metadata, io, metadata_gcs_path)
     ToOutputFile.table_metadata(metadata, io.new_output(metadata_gcs_path), overwrite=True)
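One style note on the drop-table test: the try/except/assert True pattern the linter touched could also be written with pytest.raises, as the namespace test in this file already does. A sketch, not part of this commit:

# Equivalent assertion using pytest.raises (sketch only).
with pytest.raises(NoSuchTableError):
    test_catalog.load_table(identifier)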
