 # specific language governing permissions and limitations
 # under the License.
 import json
-from typing import Any, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple, Union

 from google.api_core.exceptions import NotFound
 from google.cloud.bigquery import Client, Dataset, DatasetReference, TableReference
...
 from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
 from pyiceberg.utils.config import Config

+if TYPE_CHECKING:
+    import pyarrow as pa
+
 GCP_PROJECT_ID = "gcp.project-id"
 GCP_LOCATION = "gcp.location"
 GCP_CREDENTIALS_LOCATION = "gcp.credentials-location"
...
 HIVE_FILE_INPUT_FORMAT = "org.apache.iceberg.mr.hive.HiveIcebergInputFormat"
 HIVE_FILE_OUTPUT_FORMAT = "org.apache.iceberg.mr.hive.HiveIcebergOutputFormat"

+
 class BigQueryMetastoreCatalog(MetastoreCatalog):
     def __init__(self, name: str, **properties: str):
         super().__init__(name, **properties)
@@ -138,7 +142,9 @@ def create_table(
         dataset_ref = DatasetReference(project=self.project_id, dataset_id=dataset_name)

         try:
-            table = self._make_new_table(metadata, metadata_location, TableReference(dataset_ref=dataset_ref, table_id=table_name))
+            table = self._make_new_table(
+                metadata, metadata_location, TableReference(dataset_ref=dataset_ref, table_id=table_name)
+            )
             self.client.create_table(table)
         except Conflict as e:
             raise TableAlreadyExistsError(f"Table {table_name} already exists") from e
@@ -161,12 +167,13 @@ def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None:
         try:
             dataset_ref = DatasetReference(project=self.project_id, dataset_id=database_name)
             dataset = Dataset(dataset_ref=dataset_ref)
-            dataset.external_catalog_dataset_options = self._create_external_catalog_dataset_options(self._get_default_warehouse_location_for_dataset(database_name), properties, dataset_ref)
+            dataset.external_catalog_dataset_options = self._create_external_catalog_dataset_options(
+                self._get_default_warehouse_location_for_dataset(database_name), properties, dataset_ref
+            )
             self.client.create_dataset(dataset)
         except Conflict as e:
             raise NamespaceAlreadyExistsError(f"Namespace {database_name} already exists") from e

-
     def load_table(self, identifier: Union[str, Identifier]) -> Table:
         """
         Load the table's metadata and return the table instance.
@@ -196,7 +203,6 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table:
         except NotFound as e:
             raise NoSuchTableError(f"Table does not exist: {dataset_name}.{table_name}") from e

-
     def drop_table(self, identifier: Union[str, Identifier]) -> None:
         """Drop a table.

@@ -222,11 +228,9 @@ def commit_table(
     ) -> CommitTableResponse:
         raise NotImplementedError

-
     def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table:
         raise NotImplementedError

-
     def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
         database_name = self.identifier_to_database(namespace)

@@ -283,7 +287,9 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table:
         metadata = FromInputFile.table_metadata(file)

         try:
-            table = self._make_new_table(metadata, metadata_location, TableReference(dataset_ref=dataset_ref, table_id=table_name))
+            table = self._make_new_table(
+                metadata, metadata_location, TableReference(dataset_ref=dataset_ref, table_id=table_name)
+            )
             self.client.create_table(table)
         except Conflict as e:
             raise TableAlreadyExistsError(f"Table {table_name} already exists") from e
@@ -316,21 +322,16 @@ def update_namespace_properties(
     ) -> PropertiesUpdateSummary:
         raise NotImplementedError

-
     def _make_new_table(self, metadata: TableMetadata, metadata_file_location: str, table_ref: TableReference) -> BQTable:
-        """
-        To make the table queryable from Hive, the user would likely be setting the HIVE_ENGINE_ENABLED
-        parameter.
-
-        """
+        """To make the table queryable from Hive, the user would likely be setting the HIVE_ENGINE_ENABLED parameter."""
         table = BQTable(table_ref)

         # In Python, you typically set the external data configuration directly.
         # BigQueryMetastoreUtils.create_external_catalog_table_options is mapped to
         # constructing the external_data_configuration for the Table object.
         external_config_options = self._create_external_catalog_table_options(
             metadata.location,
-            self._create_table_parameters(metadata_file_location=metadata_file_location, table_metadata=metadata)
+            self._create_table_parameters(metadata_file_location=metadata_file_location, table_metadata=metadata),
         )

         # Apply the external configuration to the Table object.
@@ -340,22 +341,27 @@ def _make_new_table(self, metadata: TableMetadata, metadata_file_location: str, table_ref: TableReference) -> BQTable:

         return table

-    def _create_external_catalog_table_options(self, location: str, parameters: dict) -> ExternalCatalogTableOptions:
+    def _create_external_catalog_table_options(self, location: str, parameters: dict[str, Any]) -> ExternalCatalogTableOptions:
         # This structure directly maps to what BigQuery's ExternalConfig expects for Hive.
         return ExternalCatalogTableOptions(
             storage_descriptor=StorageDescriptor(
                 location_uri=location,
                 input_format=HIVE_FILE_INPUT_FORMAT,
                 output_format=HIVE_FILE_OUTPUT_FORMAT,
-                serde_info=SerDeInfo(serialization_library=HIVE_SERIALIZATION_LIBRARY)
+                serde_info=SerDeInfo(serialization_library=HIVE_SERIALIZATION_LIBRARY),
             ),
-            parameters=parameters
+            parameters=parameters,
         )

-    def _create_external_catalog_dataset_options(self, default_storage_location: str, metadataParameters: dict, dataset_ref: DatasetReference) -> ExternalCatalogDatasetOptions:
-        return ExternalCatalogDatasetOptions(default_storage_location_uri=self._get_default_warehouse_location_for_dataset(dataset_ref.dataset_id), parameters=metadataParameters)
+    def _create_external_catalog_dataset_options(
+        self, default_storage_location: str, metadataParameters: dict[str, Any], dataset_ref: DatasetReference
+    ) -> ExternalCatalogDatasetOptions:
+        return ExternalCatalogDatasetOptions(
+            default_storage_location_uri=self._get_default_warehouse_location_for_dataset(dataset_ref.dataset_id),
+            parameters=metadataParameters,
+        )

-    def _convert_bigquery_table_to_iceberg_table(self, identifier: str, table: BQTable) -> Table:
+    def _convert_bigquery_table_to_iceberg_table(self, identifier: Union[str, Identifier], table: BQTable) -> Table:
         dataset_name, table_name = self.identifier_to_database_and_table(identifier, NoSuchTableError)
         metadata_location = ""
         if table.external_catalog_table_options and table.external_catalog_table_options.parameters:
@@ -381,29 +387,33 @@ def _create_table_parameters(self, metadata_file_location: str, table_metadata: TableMetadata
         parameters["EXTERNAL"] = True

         # Add Hive-style basic statistics from snapshot metadata if it exists.
-        if table_metadata.current_snapshot():
-
-            if table_metadata.current_snapshot().summary.get(TOTAL_DATA_FILES):
-                parameters["numFiles"] = table_metadata.current_snapshot.summary.get(TOTAL_DATA_FILES)
+        snapshot = table_metadata.current_snapshot()
+        if snapshot:
+            summary = snapshot.summary
+            if summary:
+                if summary.get(TOTAL_DATA_FILES):
+                    parameters["numFiles"] = summary.get(TOTAL_DATA_FILES)

-            if table_metadata.current_snapshot().summary.get(TOTAL_RECORDS):
-                parameters["numRows"] = table_metadata.current_snapshot.summary.get(TOTAL_RECORDS)
+                if summary.get(TOTAL_RECORDS):
+                    parameters["numRows"] = summary.get(TOTAL_RECORDS)

-            if table_metadata.current_snapshot().summary.get(TOTAL_FILE_SIZE):
-                parameters["totalSize"] = table_metadata.current_snapshot.summary.get(TOTAL_FILE_SIZE)
+                if summary.get(TOTAL_FILE_SIZE):
+                    parameters["totalSize"] = summary.get(TOTAL_FILE_SIZE)

         return parameters

-    def _default_storage_location(self, location: Optional[str], dataset_ref: DatasetReference) -> str | None:
+    def _default_storage_location(self, location: Optional[str], dataset_ref: DatasetReference) -> Union[str, None]:
         if location:
             return location
         dataset = self.client.get_dataset(dataset_ref)
         if dataset and dataset.external_catalog_dataset_options:
             return dataset.external_catalog_dataset_options.default_storage_location_uri

+        raise ValueError("Could not find default storage location")
+
     def _get_default_warehouse_location_for_dataset(self, database_name: str) -> str:
         if warehouse_path := self.properties.get(WAREHOUSE_LOCATION):
             warehouse_path = warehouse_path.rstrip("/")
             return f"{warehouse_path}/{database_name}.db"

-        raise ValueError("No default path is set, please specify a location when creating a table")
+        raise ValueError("No default path is set, please specify a location when creating a table")
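
For orientation, a minimal usage sketch of the catalog this diff touches. It is not part of the change: the project, location, and bucket values below are hypothetical, and it assumes pyiceberg's standard Schema/NestedField types plus the "gcp.*" property keys and the "warehouse" (WAREHOUSE_LOCATION) key referenced above.

from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

# Hypothetical values; only the property keys come from the constants in the diff.
catalog = BigQueryMetastoreCatalog(
    "bq",
    **{
        "gcp.project-id": "my-project",
        "gcp.location": "us-central1",
        "warehouse": "gs://my-bucket/warehouse",  # consumed by _get_default_warehouse_location_for_dataset
    },
)

catalog.create_namespace("analytics")  # backed by a BigQuery dataset

schema = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
)
table = catalog.create_table(("analytics", "events"), schema=schema)
print(table.metadata_location)  # the metadata file registered on the BigQuery table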