Commit 3c94c83

Added BigQuery Metastore Catalog
1 parent a67c559 commit 3c94c83

File tree

8 files changed: +1,061 additions, −94 deletions


mkdocs/docs/index.md

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ You can mix and match optional dependencies depending on your needs:
 | hive-kerberos | Support for Hive metastore in Kerberos environment |
 | glue | Support for AWS Glue |
 | dynamodb | Support for AWS DynamoDB |
+| bigquery | Support for Google Cloud BigQuery |
 | sql-postgres | Support for SQL Catalog backed by Postgresql |
 | sql-sqlite | Support for SQL Catalog backed by SQLite |
 | pyarrow | PyArrow as a FileIO implementation to interact with the object store |
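
With this extra in place, BigQuery support installs like the other optional backends:

    pip install 'pyiceberg[bigquery]'

This is the same hint the new NotInstalledError message in pyiceberg/catalog/__init__.py (below) prints when the dependency is missing.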

poetry.lock

Lines changed: 306 additions & 94 deletions
Some generated files are not rendered by default.

pyiceberg/catalog/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -117,6 +117,7 @@ class CatalogType(Enum):
     DYNAMODB = "dynamodb"
     SQL = "sql"
     IN_MEMORY = "in-memory"
+    BIGQUERY = "bigquery"


 def load_rest(name: str, conf: Properties) -> Catalog:
@@ -171,6 +172,15 @@ def load_in_memory(name: str, conf: Properties) -> Catalog:
     except ImportError as exc:
         raise NotInstalledError("SQLAlchemy support not installed: pip install 'pyiceberg[sql-sqlite]'") from exc

+def load_bigquery(name: str, conf: Properties) -> Catalog:
+    try:
+        from pyiceberg.catalog.bigquery_metastore import BigQueryMetastoreCatalog
+
+        return BigQueryMetastoreCatalog(name, **conf)
+    except ImportError as exc:
+        raise NotInstalledError("BigQuery support not installed: pip install 'pyiceberg[bigquery]'") from exc
+
+
 AVAILABLE_CATALOGS: dict[CatalogType, Callable[[str, Properties], Catalog]] = {
     CatalogType.REST: load_rest,
@@ -179,6 +189,7 @@ def load_in_memory(name: str, conf: Properties) -> Catalog:
     CatalogType.DYNAMODB: load_dynamodb,
     CatalogType.SQL: load_sql,
     CatalogType.IN_MEMORY: load_in_memory,
+    CatalogType.BIGQUERY: load_bigquery,
 }
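For orientation, a minimal usage sketch of the new wiring (the catalog name, project ID, and bucket below are placeholders; "type": "bigquery" is the CatalogType value registered above, and "gcp.project-id"/"warehouse" are the properties the tests in this commit pass):

    from pyiceberg.catalog import load_catalog

    # "type": "bigquery" resolves to CatalogType.BIGQUERY and dispatches to load_bigquery.
    catalog = load_catalog(
        "bq",  # placeholder catalog name
        **{
            "type": "bigquery",
            "gcp.project-id": "my-gcp-project",  # placeholder GCP project
            "warehouse": "gs://my-warehouse-bucket/",  # placeholder GCS warehouse
        },
    )
    print(catalog.list_namespaces())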

pyiceberg/catalog/bigquery_metastore.py

Lines changed: 409 additions & 0 deletions
Large diffs are not rendered by default.
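
The 409-line module itself is not rendered, but its public surface can be read off the tests in this commit. A hedged sketch, listing only names the tests actually touch (the docstring and comments are inferences, not the module's own):

    from pyiceberg.catalog import Catalog

    class BigQueryMetastoreCatalog(Catalog):
        """BigQuery datasets act as Iceberg namespaces; BigQuery external tables carry
        the Iceberg metadata pointer (per the mocks: ExternalCatalogTableOptions with
        a "metadata_location" parameter and TABLE_TYPE_PROP/ICEBERG_TABLE_TYPE_VALUE)."""

        def __init__(self, name: str, **properties: str) -> None:
            # Tests construct it with "gcp.project-id" and optionally "warehouse";
            # internally it builds a google.cloud.bigquery Client (which the unit tests patch).
            ...

        # Methods exercised by the tests below: create_namespace, drop_namespace,
        # list_namespaces, load_namespace_properties, create_table, register_table,
        # load_table, drop_table, list_tables.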

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -73,6 +73,7 @@ python-snappy = { version = ">=0.6.0,<1.0.0", optional = true }
 thrift = { version = ">=0.13.0,<1.0.0", optional = true }
 mypy-boto3-glue = { version = ">=1.28.18", optional = true }
 boto3 = { version = ">=1.24.59", optional = true }
+google-cloud-bigquery = { version = "^3.33.0", optional = true }
 s3fs = { version = ">=2023.1.0", optional = true }
 adlfs = { version = ">=2023.1.0", optional = true }
 gcsfs = { version = ">=2023.1.0", optional = true }
@@ -302,6 +303,7 @@ s3fs = ["s3fs"]
 glue = ["boto3", "mypy-boto3-glue"]
 adlfs = ["adlfs"]
 dynamodb = ["boto3"]
+bigquery = ["google-cloud-bigquery"]
 zstandard = ["zstandard"]
 sql-postgres = ["sqlalchemy", "psycopg2-binary"]
 sql-sqlite = ["sqlalchemy"]
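
For development installs from a source checkout, the new extra resolves through Poetry's [tool.poetry.extras] table shown here, e.g.:

    poetry install --extras "bigquery"

Published wheels pick up the same optional dependency via pip install 'pyiceberg[bigquery]'.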
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os

import pytest
from pytest_mock import MockFixture

from pyiceberg.catalog.bigquery_metastore import BigQueryMetastoreCatalog
from pyiceberg.exceptions import NamespaceAlreadyExistsError, NoSuchNamespaceError, NoSuchTableError
from pyiceberg.io import load_file_io
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC
from pyiceberg.schema import Schema
from pyiceberg.serializers import ToOutputFile
from pyiceberg.table.metadata import new_table_metadata
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER
from tests.conftest import BQ_TABLE_METADATA_LOCATION_REGEX, BUCKET_NAME, TABLE_METADATA_LOCATION_REGEX


def test_create_table_with_database_location(
    mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
) -> None:
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    catalog_name = "test_ddb_catalog"
    identifier = (gcp_dataset_name, table_name)
    test_catalog = BigQueryMetastoreCatalog(
        catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"}
    )
    test_catalog.create_namespace(namespace=gcp_dataset_name)
    table = test_catalog.create_table(identifier, table_schema_nested)
    assert table.name() == identifier
    assert BQ_TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)

    tables_in_namespace = test_catalog.list_tables(namespace=gcp_dataset_name)
    assert identifier in tables_in_namespace


def test_drop_table_with_database_location(
    mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
) -> None:
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    catalog_name = "test_ddb_catalog"
    identifier = (gcp_dataset_name, table_name)
    test_catalog = BigQueryMetastoreCatalog(
        catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"}
    )
    test_catalog.create_namespace(namespace=gcp_dataset_name)
    test_catalog.create_table(identifier, table_schema_nested)
    test_catalog.drop_table(identifier)

    tables_in_namespace_after_drop = test_catalog.list_tables(namespace=gcp_dataset_name)
    assert identifier not in tables_in_namespace_after_drop

    # Expect that the table no longer exists.
    with pytest.raises(NoSuchTableError):
        test_catalog.load_table(identifier)


def test_create_and_drop_namespace(
    mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
) -> None:
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    # Create namespace.
    catalog_name = "test_ddb_catalog"
    test_catalog = BigQueryMetastoreCatalog(
        catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"}
    )
    test_catalog.create_namespace(namespace=gcp_dataset_name)

    # Ensure that the namespace exists.
    namespaces = test_catalog.list_namespaces()
    assert (gcp_dataset_name,) in namespaces

    # Drop the namespace and ensure it does not exist.
    test_catalog.drop_namespace(namespace=gcp_dataset_name)
    namespaces_after_drop = test_catalog.list_namespaces()
    assert (gcp_dataset_name,) not in namespaces_after_drop

    # Verify with load_namespace_properties as well.
    with pytest.raises(NoSuchNamespaceError):
        test_catalog.load_namespace_properties(gcp_dataset_name)


def test_register_table(
    mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
) -> None:
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    catalog_name = "test_bq_register_catalog"
    identifier = (gcp_dataset_name, table_name)
    warehouse_path = "gs://alexstephen-test-bq-bucket/"  # Matches conftest BUCKET_NAME for GCS interaction
    gcp_project_id = "alexstephen-test-1"

    test_catalog = BigQueryMetastoreCatalog(catalog_name, **{"gcp.project-id": gcp_project_id, "warehouse": warehouse_path})

    test_catalog.create_namespace(namespace=gcp_dataset_name)

    # Manually create a metadata file in GCS.
    table_gcs_location = f"{warehouse_path.rstrip('/')}/{gcp_dataset_name}.db/{table_name}"
    # Construct a unique metadata file name similar to how pyiceberg would.
    metadata_file_name = "00000-aaaaaaaa-aaaa-4aaa-aaaa-aaaaaaaaaaaa.metadata.json"
    metadata_gcs_path = f"{table_gcs_location}/metadata/{metadata_file_name}"

    metadata = new_table_metadata(
        location=table_gcs_location,
        schema=table_schema_nested,
        properties={},
        partition_spec=UNPARTITIONED_PARTITION_SPEC,
        sort_order=UNSORTED_SORT_ORDER,
    )
    io = load_file_io(properties=test_catalog.properties, location=metadata_gcs_path)
    test_catalog._write_metadata(metadata, io, metadata_gcs_path)
    ToOutputFile.table_metadata(metadata, io.new_output(metadata_gcs_path), overwrite=True)

    # Register the table.
    registered_table = test_catalog.register_table(identifier, metadata_gcs_path)

    assert registered_table.name() == identifier
    assert registered_table.metadata_location == metadata_gcs_path
    assert registered_table.metadata.location == table_gcs_location
    assert BQ_TABLE_METADATA_LOCATION_REGEX.match(registered_table.metadata_location)

    # Verify the table exists and is loadable.
    loaded_table = test_catalog.load_table(identifier)
    assert loaded_table.name() == registered_table.name()
    assert loaded_table.metadata_location == metadata_gcs_path

    # Clean up.
    test_catalog.drop_table(identifier)
    test_catalog.drop_namespace(gcp_dataset_name)
Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
from unittest.mock import MagicMock

from google.api_core.exceptions import NotFound
from google.cloud.bigquery import Dataset, DatasetReference, Table, TableReference
from google.cloud.bigquery.external_config import ExternalCatalogDatasetOptions, ExternalCatalogTableOptions
from pytest_mock import MockFixture

from pyiceberg.catalog.bigquery_metastore import ICEBERG_TABLE_TYPE_VALUE, TABLE_TYPE_PROP, BigQueryMetastoreCatalog
from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.schema import Schema


def dataset_mock() -> Dataset:
    d = Dataset(DatasetReference(dataset_id="my-dataset", project="my-project"))
    d.external_catalog_dataset_options = ExternalCatalogDatasetOptions(
        default_storage_location_uri="gs://test-bucket/iceberg-dataset"
    )
    return d


def table_mock() -> Table:
    t = Table(TableReference(dataset_ref=DatasetReference(dataset_id="my-dataset", project="my-project"), table_id="my-table"))
    t.external_catalog_table_options = ExternalCatalogTableOptions(
        parameters={
            "metadata_location": "gs://alexstephen-test-bq-bucket/my_iceberg_database_aaaaaaaaaaaaaaaaaaaa.db/my_iceberg_table-bbbbbbbbbbbbbbbbbbbb/metadata/12343-aaaaaaaa-aaaa-4aaa-8aaa-aaaaaaaaaaaa.metadata",
            TABLE_TYPE_PROP: ICEBERG_TABLE_TYPE_VALUE,
        }
    )
    return t


def test_create_table_with_database_location(
    mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
) -> None:
    # Set up mocks for GCP.
    client_mock = MagicMock()
    client_mock.get_dataset.return_value = dataset_mock()
    client_mock.get_table.return_value = table_mock()

    # Set up mocks for GCS.
    file_mock = MagicMock()

    mocker.patch("pyiceberg.catalog.bigquery_metastore.Client", return_value=client_mock)
    mocker.patch("pyiceberg.catalog.bigquery_metastore.FromInputFile.table_metadata", return_value=file_mock)
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    catalog_name = "test_ddb_catalog"
    identifier = (gcp_dataset_name, table_name)
    test_catalog = BigQueryMetastoreCatalog(
        catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"}
    )
    test_catalog.create_namespace(namespace=gcp_dataset_name)
    table = test_catalog.create_table(identifier, table_schema_nested)
    assert table.name() == identifier


def test_drop_table_with_database_location(
    mocker: MockFixture, _bucket_initialize: None, table_schema_nested: Schema, gcp_dataset_name: str, table_name: str
) -> None:
    # Set up mocks for GCP.
    client_mock = MagicMock()
    client_mock.get_dataset.return_value = dataset_mock()
    client_mock.get_table.return_value = table_mock()

    # Set up mocks for GCS.
    file_mock = MagicMock()

    mocker.patch("pyiceberg.catalog.bigquery_metastore.Client", return_value=client_mock)
    mocker.patch("pyiceberg.catalog.bigquery_metastore.FromInputFile.table_metadata", return_value=file_mock)
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    catalog_name = "test_ddb_catalog"
    identifier = (gcp_dataset_name, table_name)
    test_catalog = BigQueryMetastoreCatalog(
        catalog_name, **{"gcp.project-id": "alexstephen-test-1", "warehouse": "gs://alexstephen-test-bq-bucket/"}
    )
    test_catalog.create_namespace(namespace=gcp_dataset_name)
    test_catalog.create_table(identifier, table_schema_nested)
    test_catalog.drop_table(identifier)

    client_mock.get_table.side_effect = NotFound("Table Not Found")
    mocker.patch("pyiceberg.catalog.bigquery_metastore.Client", return_value=client_mock)

    # Expect that the table no longer exists.
    try:
        test_catalog.load_table(identifier)
        raise AssertionError("Expected NoSuchTableError")
    except NoSuchTableError:
        pass


def test_drop_namespace(mocker: MockFixture, gcp_dataset_name: str) -> None:
    client_mock = MagicMock()
    mocker.patch("pyiceberg.catalog.bigquery_metastore.Client", return_value=client_mock)
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    catalog_name = "test_catalog"
    test_catalog = BigQueryMetastoreCatalog(catalog_name, **{"gcp.project-id": "alexstephen-test-1"})

    test_catalog.drop_namespace(gcp_dataset_name)
    client_mock.delete_dataset.assert_called_once()
    args, _ = client_mock.delete_dataset.call_args
    assert isinstance(args[0], Dataset)
    assert args[0].dataset_id == gcp_dataset_name


def test_list_tables(mocker: MockFixture, gcp_dataset_name: str) -> None:
    client_mock = MagicMock()

    # Mock list_tables to return an iterator of TableListItem.
    table_list_item_1 = MagicMock()
    table_list_item_1.table_id = "iceberg_table_A"
    table_list_item_1.reference = TableReference(
        dataset_ref=DatasetReference(project="my-project", dataset_id=gcp_dataset_name), table_id="iceberg_table_A"
    )

    table_list_item_2 = MagicMock()
    table_list_item_2.table_id = "iceberg_table_B"
    table_list_item_2.reference = TableReference(
        dataset_ref=DatasetReference(project="my-project", dataset_id=gcp_dataset_name), table_id="iceberg_table_B"
    )

    client_mock.list_tables.return_value = iter([table_list_item_1, table_list_item_2])

    # Mock get_table to always return a table that is considered an Iceberg table.
    # The table_mock() function already creates a table with the necessary Iceberg properties.
    client_mock.get_table.return_value = table_mock()

    mocker.patch("pyiceberg.catalog.bigquery_metastore.Client", return_value=client_mock)
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    catalog_name = "test_catalog"
    test_catalog = BigQueryMetastoreCatalog(catalog_name, **{"gcp.project-id": "my-project"})

    tables = test_catalog.list_tables(gcp_dataset_name)

    # Assert that all tables returned by client.list_tables are listed.
    assert len(tables) == 2
    assert (gcp_dataset_name, "iceberg_table_A") in tables
    assert (gcp_dataset_name, "iceberg_table_B") in tables

    client_mock.list_tables.assert_called_once_with(dataset=DatasetReference(project="my-project", dataset_id=gcp_dataset_name))


def test_list_namespaces(mocker: MockFixture) -> None:
    client_mock = MagicMock()
    dataset_item_1 = Dataset(DatasetReference(project="my-project", dataset_id="dataset1"))
    dataset_item_2 = Dataset(DatasetReference(project="my-project", dataset_id="dataset2"))
    client_mock.list_datasets.return_value = iter([dataset_item_1, dataset_item_2])

    mocker.patch("pyiceberg.catalog.bigquery_metastore.Client", return_value=client_mock)
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    catalog_name = "test_catalog"
    test_catalog = BigQueryMetastoreCatalog(catalog_name, **{"gcp.project-id": "my-project"})

    namespaces = test_catalog.list_namespaces()
    assert len(namespaces) == 2
    assert ("dataset1",) in namespaces
    assert ("dataset2",) in namespaces
    client_mock.list_datasets.assert_called_once()
