From b9e8317cba09912aa68da3b3cd8dfdd468d68abf Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 14:10:45 -0700 Subject: [PATCH 001/118] `poetry add airbyte-api` --- poetry.lock | 105 +++++++++++++++++++++++++++++++++++++++++++------ pyproject.toml | 1 + 2 files changed, 93 insertions(+), 13 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5a6f3f35..bb8d4804 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,34 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. + +[[package]] +name = "airbyte-api" +version = "0.47.3" +description = "Python Client SDK for Airbyte API" +optional = false +python-versions = ">=3.8" +files = [ + {file = "airbyte-api-0.47.3.tar.gz", hash = "sha256:1ae86209bd3996f37d192bd0022155d2cb0dd8df9c4bfc0d81a7d5d46feffb1e"}, + {file = "airbyte_api-0.47.3-py3-none-any.whl", hash = "sha256:68606b5be11ce2e4538ed6ab106095495bd8bc4dc23fc6bd02b940f8d63e0337"}, +] + +[package.dependencies] +certifi = ">=2023.7.22" +charset-normalizer = ">=3.2.0" +dataclasses-json-speakeasy = ">=0.5.11" +idna = ">=3.4" +jsonpath-python = ">=1.0.6" +marshmallow = ">=3.19.0" +mypy-extensions = ">=1.0.0" +packaging = ">=23.1" +python-dateutil = ">=2.8.2" +requests = ">=2.31.0" +six = ">=1.16.0" +typing-extensions = ">=4.7.1" +typing-inspect = ">=0.9.0" +urllib3 = ">=1.26.18" + +[package.extras] +dev = ["pylint (==2.16.2)"] [[package]] name = "airbyte-cdk" @@ -387,6 +417,21 @@ ssh = ["bcrypt (>=3.1.5)"] test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] +[[package]] +name = "dataclasses-json-speakeasy" +version = "0.5.11" +description = "Easily serialize dataclasses to and from JSON." +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "dataclasses_json_speakeasy-0.5.11-py3-none-any.whl", hash = "sha256:ac52a069a01e8521015d682f37849bfdf056c36fa3f81497055e201fec684104"}, + {file = "dataclasses_json_speakeasy-0.5.11.tar.gz", hash = "sha256:418a987cea2ccf4e4be662f39faa5cc79b47b147c9d1a69d6928d6a27e0c17e8"}, +] + +[package.dependencies] +marshmallow = ">=3.18.0,<4.0.0" +typing-inspect = ">=0.4.0,<1" + [[package]] name = "deprecated" version = "1.2.14" @@ -1026,6 +1071,17 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "jsonpath-python" +version = "1.0.6" +description = "A more powerful JSONPath implementation in modern python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "jsonpath-python-1.0.6.tar.gz", hash = "sha256:dd5be4a72d8a2995c3f583cf82bf3cd1a9544cfdabf2d22595b67aff07349666"}, + {file = "jsonpath_python-1.0.6-py3-none-any.whl", hash = "sha256:1e3b78df579f5efc23565293612decee04214609208a2335884b3ee3f786b575"}, +] + [[package]] name = "jsonref" version = "0.3.0" @@ -1151,6 +1207,25 @@ files = [ {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, ] +[[package]] +name = "marshmallow" +version = "3.21.1" +description = "A lightweight library for converting complex datatypes to and from native Python datatypes." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "marshmallow-3.21.1-py3-none-any.whl", hash = "sha256:f085493f79efb0644f270a9bf2892843142d80d7174bbbd2f3713f2a589dc633"}, + {file = "marshmallow-3.21.1.tar.gz", hash = "sha256:4e65e9e0d80fc9e609574b9983cf32579f305c718afb30d7233ab818571768c3"}, +] + +[package.dependencies] +packaging = ">=17.0" + +[package.extras] +dev = ["marshmallow[tests]", "pre-commit (>=3.5,<4.0)", "tox"] +docs = ["alabaster (==0.7.16)", "autodocsumm (==0.2.12)", "sphinx (==7.2.6)", "sphinx-issues (==4.0.0)", "sphinx-version-warning (==1.1.2)"] +tests = ["pytest", "pytz", "simplejson"] + [[package]] name = "mdurl" version = "0.1.2" @@ -1998,7 +2073,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -2006,16 +2080,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = 
"sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -2032,7 +2098,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -2040,7 +2105,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -2606,6 +2670,21 @@ files = [ {file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"}, ] +[[package]] +name = "typing-inspect" +version = "0.9.0" +description = "Runtime inspection utilities for typing module." 
+optional = false +python-versions = "*" +files = [ + {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, + {file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"}, +] + +[package.dependencies] +mypy-extensions = ">=0.3.0" +typing-extensions = ">=3.7.4" + [[package]] name = "tzdata" version = "2024.1" @@ -2753,4 +2832,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "d0bc626fee5bbc8e83793abc5cdc4f29c21ad90f78324757afa6381e0f4a82ca" +content-hash = "e92eecde380c5a1cd9adc0621e7e051b80db4f0eb4cc949d3c84b320d4faf23d" diff --git a/pyproject.toml b/pyproject.toml index c7506838..5c40ae75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ ulid = "^1.1" # TODO: Remove this arbitrary python constraint once `sqlalchemy-bigquery` has done so. sqlalchemy-bigquery = { version = "1.9.0", python = "<3.13" } +airbyte-api = "^0.47.3" [tool.poetry.group.dev.dependencies] docker = "^7.0.0" From 93abc6f270d68c9e5a92a5abb30dffd9095d0c33 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 15:23:53 -0700 Subject: [PATCH 002/118] install `airbyte-api` from remote branch --- poetry.lock | 18 +++++++++++------- pyproject.toml | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/poetry.lock b/poetry.lock index bb8d4804..d34d4ea9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3,13 +3,11 @@ [[package]] name = "airbyte-api" version = "0.47.3" -description = "Python Client SDK for Airbyte API" +description = "" optional = false python-versions = ">=3.8" -files = [ - {file = "airbyte-api-0.47.3.tar.gz", hash = "sha256:1ae86209bd3996f37d192bd0022155d2cb0dd8df9c4bfc0d81a7d5d46feffb1e"}, - {file = "airbyte_api-0.47.3-py3-none-any.whl", hash = "sha256:68606b5be11ce2e4538ed6ab106095495bd8bc4dc23fc6bd02b940f8d63e0337"}, -] +files = [] +develop = false [package.dependencies] certifi = ">=2023.7.22" @@ -23,13 +21,19 @@ packaging = ">=23.1" python-dateutil = ">=2.8.2" requests = ">=2.31.0" six = ">=1.16.0" -typing-extensions = ">=4.7.1" +typing_extensions = ">=4.7.1" typing-inspect = ">=0.9.0" urllib3 = ">=1.26.18" [package.extras] dev = ["pylint (==2.16.2)"] +[package.source] +type = "git" +url = "https://github.com/airbytehq/airbyte-api-python-sdk.git" +reference = "aj/manual_rename_dir" +resolved_reference = "856599a4861ee1f0ee4e994feff22e44ffb4cbd4" + [[package]] name = "airbyte-cdk" version = "0.73.0" @@ -2832,4 +2836,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "e92eecde380c5a1cd9adc0621e7e051b80db4f0eb4cc949d3c84b320d4faf23d" +content-hash = "900d34579c968518dd45a05103ee7a5d6efbca181c3c1509bc7273e7b553886e" diff --git a/pyproject.toml b/pyproject.toml index 5c40ae75..40c7ed99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ ulid = "^1.1" # TODO: Remove this arbitrary python constraint once `sqlalchemy-bigquery` has done so. 
sqlalchemy-bigquery = { version = "1.9.0", python = "<3.13" } -airbyte-api = "^0.47.3" +airbyte-api = {git = "https://github.com/airbytehq/airbyte-api-python-sdk.git", rev = "aj/manual_rename_dir"} [tool.poetry.group.dev.dependencies] docker = "^7.0.0" From 4d6432624e9d173841ba449bc7362dd8b039b723 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 15:23:58 -0700 Subject: [PATCH 003/118] import and revise code from https://github.com/airbytehq/airbyte/pull/34315 --- airbyte/_util/api_duck_types.py | 21 ++ airbyte/_util/api_util.py | 342 ++++++++++++++++++++++++++++++++ airbyte/exceptions.py | 37 +++- 3 files changed, 399 insertions(+), 1 deletion(-) create mode 100644 airbyte/_util/api_duck_types.py create mode 100644 airbyte/_util/api_util.py diff --git a/airbyte/_util/api_duck_types.py b/airbyte/_util/api_duck_types.py new file mode 100644 index 00000000..fe0a8c20 --- /dev/null +++ b/airbyte/_util/api_duck_types.py @@ -0,0 +1,21 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""A set of duck-typed classes for working with the Airbyte API.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Protocol + + +if TYPE_CHECKING: + import requests + + +class AirbyteApiResponseDuckType(Protocol): + """Used for duck-typing various Airbyte API responses.""" + + content_type: str + r"""HTTP response content type for this operation""" + status_code: int + r"""HTTP response status code for this operation""" + raw_response: requests.Response + r"""Raw HTTP response; suitable for custom response parsing""" diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py new file mode 100644 index 00000000..c0283fdb --- /dev/null +++ b/airbyte/_util/api_util.py @@ -0,0 +1,342 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""These internal functions are used to interact with the Airbyte API (module named `airbyte`). + +In order to insulate users from breaking changes and to avoid general confusion around naming +and design inconsistencies, we do not expose these functions or other Airbyte API classes within +AirbyteLib. Classes and functions from the Airbyte API external library should always be wrapped in +AirbyteLib classes - unless there's a very compelling reason to surface these models intentionally. 
+""" + +from __future__ import annotations + +import os +from time import sleep +from typing import Any + +import airbyte_api +from airbyte_api.models import operations as api_operations +from airbyte_api.models import shared as api_models +from airbyte_api.models.shared.jobcreaterequest import JobCreateRequest, JobTypeEnum + +from airbyte.exceptions import ( + HostedAirbyteError, + HostedConnectionSyncError, + MissingResourceError, + MultipleResourcesError, +) + + +JOB_WAIT_INTERVAL_SECS = 2.0 + + +def status_ok(status_code: int) -> bool: + """Check if a status code is OK.""" + return status_code >= 200 and status_code < 300 # noqa: PLR2004 # allow inline magic numbers + + +def get_default_bearer_token() -> str | None: + """Get the default bearer token from env variables.""" + return os.environ.get("AIRBYTE_API_KEY", None) + + +def get_airbyte_server_instance( + *, + api_key: str | None = None, + api_root: str = "https://api.airbyte.com/v1", +) -> airbyte_api.Airbyte: + """Get an Airbyte instance.""" + api_key = api_key or get_default_bearer_token() + return airbyte_api.Airbyte( + api_models.Security( + bearer_auth=api_key, + ), + api_root=api_root, + ) + + +def get_workspace( + workspace_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, +) -> api_models.WorkspaceResponse: + """Get a connection.""" + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.workspaces.get_workspace( + api_operations.GetWorkspaceRequest( + workspace_id=workspace_id, + ), + ) + if status_ok(response.status_code) and response.workspace_response: + return response.workspace_response + + raise MissingResourceError( + resource_type="workspace", + context={ + "workspace_id": workspace_id, + "response": response, + }, + ) + + +def list_connections( + workspace_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, +) -> list[api_models.ConnectionResponse]: + """Get a connection.""" + _ = workspace_id # Not used (yet) + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.connections.list_connections( + api_operations.ListConnectionsRequest()( + workspace_ids=[workspace_id], + ), + ) + + if status_ok(response.status_code) and response.connections_response: + return response.connections_response.data + + raise HostedAirbyteError( + context={ + "workspace_id": workspace_id, + "response": response, + } + ) + + +def get_connection( + workspace_id: str, + connection_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, +) -> api_models.ConnectionResponse: + """Get a connection.""" + _ = workspace_id # Not used (yet) + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.connections.get_connection( + api_models.GetConnectionRequest( + connection_id=connection_id, + ), + ) + if status_ok(response.status_code) and response.connection_response: + return response.connection_response + + raise MissingResourceError(connection_id, "connection", response.text) + + +def run_connection( + workspace_id: str, + connection_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, + wait_for_job: bool = True, + raise_on_failure: bool = 
True, +) -> api_models.ConnectionResponse: + """Get a connection. + + If block is True, this will block until the connection is finished running. + + If raise_on_failure is True, this will raise an exception if the connection fails. + """ + _ = workspace_id # Not used (yet) + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.jobs.create_job( + JobCreateRequest( + connection_id=connection_id, + job_type=JobTypeEnum.SYNC, + ), + ) + if status_ok(response.status_code) and response.job_response: + if wait_for_job: + job_info = wait_for_airbyte_job( + workspace_id=workspace_id, + job_id=response.job_response.job_id, + api_key=api_key, + api_root=api_root, + raise_on_failure=raise_on_failure, + ) + + return job_info + + raise HostedConnectionSyncError( + context={ + "workspace_id": workspace_id, + "connection_id": connection_id, + }, + response=response, + ) + + +def wait_for_airbyte_job( + workspace_id: str, + job_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, + raise_on_failure: bool = True, +) -> api_models.JobInfo: + """Wait for a job to finish running.""" + _ = workspace_id # Not used (yet) + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + while True: + sleep(JOB_WAIT_INTERVAL_SECS) + response = airbyte_instance.jobs.get_job( + api_operations.GetJobRequest( + job_id=job_id, + ), + ) + if status_ok(response.status_code) and response.job_info: + job_info = response.job_info + if job_info.status == api_models.StatusEnum.succeeded: + return job_info + + if job_info.status == api_models.StatusEnum.failed: + if raise_on_failure: + raise HostedConnectionSyncError( + context={ + "job_status": job_info.status, + "workspace_id": workspace_id, + "job_id": job_id, + "message": job_info.message, + }, + ) + + return job_info + + # Else: Job is still running + pass + else: + raise MissingResourceError(job_id, "job", response.text) + + +def get_connection_by_name( + workspace_id: str, + connection_name: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, +) -> api_models.ConnectionResponse: + """Get a connection.""" + connections = list_connections( + workspace_id=workspace_id, + api_key=api_key, + api_root=api_root, + ) + found: list[api_models.ConnectionResponse] = [ + connection for connection in connections if connection.name == connection_name + ] + if len(found) == 0: + raise MissingResourceError(connection_name, "connection", f"Workspace: {workspace_id}") + + if len(found) > 1: + raise MultipleResourcesError( + resource_type="connection", + resource_name_or_id=connection_name, + context={ + "workspace_id": workspace_id, + "multiples": found, + }, + ) + + return found[0] + + +def get_source( + source_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, +) -> api_models.SourceResponse: + """Get a connection.""" + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.sources.get_source( + api_operations.GetSourceRequest( + source_id=source_id, + ), + ) + if status_ok(response.status_code) and response.connection_response: + return response.connection_response + + raise MissingResourceError(source_id, "source", response.text) + + +def create_source( + 
name: str, + *, + workspace_id: str, + config: dict[str, Any], + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, +) -> api_models.SourceResponse: + """Get a connection.""" + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response: api_operations.CreateSourceResponse = airbyte_instance.sources.create_source( + api_models.SourceCreateRequest( + name=name, + workspace_id=workspace_id, + configuration=config, # TODO: wrap in a proper configuration object + definition_id=None, # Not used alternative to config.sourceType. + secret_id=None, # For OAuth, not yet supported + ), + ) + if status_ok(response.status_code) and response.connection_response: + return response.source_response + + raise HostedAirbyteError( + message="Could not create source.", + response=response, + ) + + +def get_destination( + destination_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, +) -> api_models.DestinationResponse: + """Get a connection.""" + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.sources.get_destination( + api_operations.GetDestinationRequest( + destination_id=destination_id, + ), + ) + if status_ok(response.status_code) and response.connection_response: + return response.connection_response + + raise MissingResourceError(destination_id, "destination", response.text) diff --git a/airbyte/exceptions.py b/airbyte/exceptions.py index 38464c69..2820b54a 100644 --- a/airbyte/exceptions.py +++ b/airbyte/exceptions.py @@ -39,7 +39,11 @@ from dataclasses import dataclass from textwrap import indent -from typing import Any +from typing import TYPE_CHECKING, Any + + +if TYPE_CHECKING: + from airbyte._util.api_duck_types import AirbyteApiResponseDuckType NEW_ISSUE_URL = "https://github.com/airbytehq/airbyte/issues/new/choose" @@ -303,3 +307,34 @@ class AirbyteLibSecretNotFoundError(AirbyteError): secret_name: str | None = None sources: list[str] | None = None + + +# Airbyte API Errors + + +@dataclass +class HostedAirbyteError(AirbyteError): + """An error occurred while communicating with the hosted Airbyte instance.""" + + response: AirbyteApiResponseDuckType | None = None + """The API response from the failed request.""" + + +@dataclass +class MissingResourceError(HostedAirbyteError): + """Remote Airbyte resources does not exist.""" + + resource_type: str | None = None + resource_name_or_id: str | None = None + + +@dataclass +class MultipleResourcesError(HostedAirbyteError): + """Could not locate the resource because multiple matching resources were found.""" + + resource_type: str | None = None + resource_name_or_id: str | None = None + + +class HostedConnectionSyncError(HostedAirbyteError): + """An error occurred while executing the remote Airbyte job.""" From 4f4a5969eb5243ad6ad4672948987b0580854b85 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 17:57:42 -0700 Subject: [PATCH 004/118] add created/delete integration test for sources and destinations --- tests/integration_tests/test_api_crud.py | 115 +++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 tests/integration_tests/test_api_crud.py diff --git a/tests/integration_tests/test_api_crud.py b/tests/integration_tests/test_api_crud.py new file mode 100644 index 00000000..44118a15 --- /dev/null +++ 
b/tests/integration_tests/test_api_crud.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +"""Integration tests which test CRUD operations on the Airbyte API. + +These tests are designed to be run against a running instance of the Airbyte API. +""" +from __future__ import annotations +import os + +from airbyte_api.models.shared.sourceresponse import SourceResponse +import pytest +import ulid + +import airbyte as ab +from airbyte._util import api_util, api_duck_types +from airbyte_api.models.shared import SourceFaker, DestinationDevNull, DestinationDuckdb +from dotenv import dotenv_values + +from airbyte.caches.duckdb import DuckDBCache + +CLOUD_API_ROOT = "https://api.airbyte.com/v1" +ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" +ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" +ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" + + +@pytest.fixture +def workspace_id() -> str: + return os.environ[ENV_AIRBYTE_API_WORKSPACE_ID] + + +@pytest.fixture +def api_root() -> str: + return CLOUD_API_ROOT + + +@pytest.fixture +def api_key() -> str: + dotenv_vars: dict[str, str | None] = dotenv_values() + if ENV_AIRBYTE_API_KEY in dotenv_vars: + return dotenv_vars[ENV_AIRBYTE_API_KEY] + + if ENV_AIRBYTE_API_KEY not in os.environ: + raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + + return os.environ[ENV_AIRBYTE_API_KEY] + + +@pytest.fixture +def motherduck_api_key() -> str: + dotenv_vars: dict[str, str | None] = dotenv_values() + if ENV_MOTHERDUCK_API_KEY in dotenv_vars: + return dotenv_vars[ENV_MOTHERDUCK_API_KEY] + + if ENV_MOTHERDUCK_API_KEY not in os.environ: + raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + + return os.environ[ENV_MOTHERDUCK_API_KEY] + + +def test_create_and_delete_source( + workspace_id: str, + api_root: str, + api_key: str, +) -> None: + new_resource_name = "deleteme-source-faker" + str(ulid.ULID()).lower()[-6:] + source_config = SourceFaker() + source: SourceResponse = api_util.create_source( + name=new_resource_name, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + config=source_config, + ) + assert source.name == new_resource_name + assert source.source_type == "faker" + assert source.source_id + + api_util.delete_source( + source_id=source.source_id, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + ) + + +def test_create_and_delete_destination( + workspace_id: str, + api_root: str, + api_key: str, + motherduck_api_key: str, +) -> None: + new_resource_name = "deleteme-destination-faker" + str(ulid.ULID()).lower()[-6:] + destination_config = DestinationDuckdb( + destination_path="temp_db", + motherduck_api_key=motherduck_api_key, + ) + + destination: SourceResponse = api_util.create_destination( + name=new_resource_name, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + config=destination_config, + ) + assert destination.name == new_resource_name + assert destination.destination_type == "duckdb" + assert destination.destination_id + + api_util.delete_destination( + destination_id=destination.destination_id, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + ) From 72d522612242137271b52ac8ec71093bac6bd61a Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 17:58:03 -0700 Subject: [PATCH 005/118] fix tests --- airbyte/_util/api_util.py | 111 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 3 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py 
index c0283fdb..99dad5a9 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -47,10 +47,10 @@ def get_airbyte_server_instance( """Get an Airbyte instance.""" api_key = api_key or get_default_bearer_token() return airbyte_api.Airbyte( - api_models.Security( + security=api_models.Security( bearer_auth=api_key, ), - api_root=api_root, + server_url=api_root, ) @@ -310,7 +310,7 @@ def create_source( secret_id=None, # For OAuth, not yet supported ), ) - if status_ok(response.status_code) and response.connection_response: + if status_ok(response.status_code) and response.source_response: return response.source_response raise HostedAirbyteError( @@ -319,6 +319,111 @@ def create_source( ) +def delete_source( + source_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, + workspace_id: str | None = None, +) -> None: + """Delete a source.""" + _ = workspace_id # Not used (yet) + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.sources.delete_source( + api_operations.DeleteSourceRequest( + source_id=source_id, + ), + ) + if not status_ok(response.status_code): + raise HostedAirbyteError( + context={ + "source_id": source_id, + "response": response, + }, + ) + + +def create_destination( + name: str, + *, + workspace_id: str, + config: dict[str, Any], + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, +) -> api_models.SourceResponse: + """Get a connection.""" + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response: api_operations.CreateDestinationResponse = ( + airbyte_instance.destinations.create_destination( + api_models.DestinationCreateRequest( + name=name, + workspace_id=workspace_id, + configuration=config, # TODO: wrap in a proper configuration object + # definition_id="a7bcc9d8-13b3-4e49-b80d-d020b90045e3", # Not used alternative to config.destinationType. + ), + ) + ) + if status_ok(response.status_code) and response.destination_response: + return response.destination_response + + raise HostedAirbyteError( + message="Could not create destination.", + response=response, + ) + + +def delete_destination( + destination_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, + workspace_id: str | None = None, +) -> None: + """Delete a destination.""" + _ = workspace_id # Not used (yet) + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.destinations.delete_destination( + api_operations.DeleteDestinationRequest( + destination_id=destination_id, + ), + ) + if not status_ok(response.status_code): + raise HostedAirbyteError( + context={ + "destination_id": destination_id, + "response": response, + }, + ) + + +def check_source( + source_id: str, + *, + api_root: str = "https://api.airbyte.com/v1", + api_key: str | None = None, + workspace_id: str | None = None, +) -> api_models.SourceCheckResponse: + """Check a source. 
+ + # TODO: Need to use legacy Configuration API for this: + # https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#post-/v1/sources/check_connection + """ + _ = source_id, workspace_id, api_root, api_key + raise NotImplementedError + + def get_destination( destination_id: str, *, From 68e9df1f17f17d0b95c88058a369960446f240d4 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 18:00:09 -0700 Subject: [PATCH 006/118] remove type hints --- tests/integration_tests/test_api_crud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/test_api_crud.py b/tests/integration_tests/test_api_crud.py index 44118a15..65d7323d 100644 --- a/tests/integration_tests/test_api_crud.py +++ b/tests/integration_tests/test_api_crud.py @@ -65,7 +65,7 @@ def test_create_and_delete_source( ) -> None: new_resource_name = "deleteme-source-faker" + str(ulid.ULID()).lower()[-6:] source_config = SourceFaker() - source: SourceResponse = api_util.create_source( + source = api_util.create_source( name=new_resource_name, api_root=api_root, api_key=api_key, @@ -96,7 +96,7 @@ def test_create_and_delete_destination( motherduck_api_key=motherduck_api_key, ) - destination: SourceResponse = api_util.create_destination( + destination = api_util.create_destination( name=new_resource_name, api_root=api_root, api_key=api_key, From 90c17b5143a744851f08470705a8de3d61178578 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 18:29:57 -0700 Subject: [PATCH 007/118] add tests to add/delete connections (passing) --- airbyte/_util/api_util.py | 64 ++++++++++++++++++++++ tests/integration_tests/test_api_crud.py | 67 ++++++++++++++++++++++++ 2 files changed, 131 insertions(+) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 99dad5a9..461c53f9 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -408,6 +408,70 @@ def delete_destination( ) +def create_connection( + name: str, + *, + source_id: str, + destination_id: str, + api_root: str, + api_key: str | None = None, + workspace_id: str | None = None, +) -> api_models.ConnectionResponse: + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + stream_configuration = api_models.StreamConfiguration( + name="users", + ) + stream_configurations = api_models.StreamConfigurations([stream_configuration]) + response = airbyte_instance.connections.create_connection( + api_models.ConnectionCreateRequest( + name=name, + source_id=source_id, + destination_id=destination_id, + configurations=stream_configurations, + ), + ) + if not status_ok(response.status_code): + raise HostedAirbyteError( + context={ + "source_id": source_id, + "destination_id": destination_id, + "response": response, + }, + ) + + return response.connection_response + + +def delete_connection( + connection_id: str, + api_root: str, + workspace_id: str | None = None, + api_key: str | None = None, +) -> None: + _ = workspace_id # Not used (yet) + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.connections.delete_connection( + api_operations.DeleteConnectionRequest( + connection_id=connection_id, + ), + ) + if not status_ok(response.status_code): + raise HostedAirbyteError( + context={ + "connection_id": connection_id, + "response": response, + }, + ) + + def check_source( source_id: str, 
*, diff --git a/tests/integration_tests/test_api_crud.py b/tests/integration_tests/test_api_crud.py index 65d7323d..0e9d0f6f 100644 --- a/tests/integration_tests/test_api_crud.py +++ b/tests/integration_tests/test_api_crud.py @@ -113,3 +113,70 @@ def test_create_and_delete_destination( api_key=api_key, workspace_id=workspace_id, ) + + + +def test_create_and_delete_connection( + workspace_id: str, + api_root: str, + api_key: str, + motherduck_api_key: str, +) -> None: + new_source_name = "deleteme-source-faker" + str(ulid.ULID()).lower()[-6:] + new_destination_name = "deleteme-destination-dummy" + str(ulid.ULID()).lower()[-6:] + new_connection_name = "deleteme-connection-dummy" + str(ulid.ULID()).lower()[-6:] + source = api_util.create_source( + name=new_source_name, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + config=SourceFaker(), + ) + assert source.name == new_source_name + assert source.source_type == "faker" + assert source.source_id + + destination = api_util.create_destination( + name=new_destination_name, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + config=DestinationDuckdb( + destination_path="temp_db", + motherduck_api_key=motherduck_api_key, + ), + ) + assert destination.name == new_destination_name + assert destination.destination_type == "duckdb" + assert destination.destination_id + + connection = api_util.create_connection( + name=new_connection_name, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + source_id=source.source_id, + destination_id=destination.destination_id, + ) + assert connection.source_id == source.source_id + assert connection.destination_id == destination.destination_id + assert connection.connection_id + + api_util.delete_connection( + connection_id=connection.connection_id, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + ) + api_util.delete_source( + source_id=source.source_id, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + ) + api_util.delete_destination( + destination_id=destination.destination_id, + api_root=api_root, + api_key=api_key, + workspace_id=workspace_id, + ) From 1740374f47c4362ded6e113f0614e06ae48a949f Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 21:25:07 -0700 Subject: [PATCH 008/118] fixes --- airbyte/_util/api_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 461c53f9..0120732b 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -354,7 +354,7 @@ def create_destination( config: dict[str, Any], api_root: str = "https://api.airbyte.com/v1", api_key: str | None = None, -) -> api_models.SourceResponse: +) -> api_models.DestinationResponse: """Get a connection.""" api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( @@ -417,6 +417,7 @@ def create_connection( api_key: str | None = None, workspace_id: str | None = None, ) -> api_models.ConnectionResponse: + _ = workspace_id # Not used (yet) api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, From 7585106148ed0cc1b4f3d6ba8ec7177786e735cc Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 21:25:27 -0700 Subject: [PATCH 009/118] fix missing bigquery cache import --- airbyte/caches/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/airbyte/caches/__init__.py b/airbyte/caches/__init__.py index c565a976..1b1df33f 100644 --- 
a/airbyte/caches/__init__.py +++ b/airbyte/caches/__init__.py @@ -4,6 +4,7 @@ from airbyte.caches import bigquery, duckdb, motherduck, postgres, snowflake, util from airbyte.caches.base import CacheBase +from airbyte.caches.bigquery import BigQueryCache from airbyte.caches.duckdb import DuckDBCache from airbyte.caches.motherduck import MotherDuckCache from airbyte.caches.postgres import PostgresCache @@ -17,6 +18,7 @@ "get_default_cache", "new_local_cache", # Classes + "BigQueryCache", "CacheBase", "DuckDBCache", "MotherDuckCache", From e571903338c310f8f80d28370fe664296c104cb6 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 21:25:54 -0700 Subject: [PATCH 010/118] add placeholder deployment ids --- airbyte/caches/base.py | 4 ++++ airbyte/sources/base.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 7f5f34f6..a8b7f10f 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -47,6 +47,10 @@ class CacheBase(BaseModel): _sql_processor_class: type[SqlProcessorBase] = PrivateAttr() _sql_processor: Optional[SqlProcessorBase] = PrivateAttr(default=None) + _deployed_api_root: str | None = None + _deployed_workspace_id: str | None = None + _deployed_destination_id: str | None = None + @final @property def processor(self) -> SqlProcessorBase: diff --git a/airbyte/sources/base.py b/airbyte/sources/base.py index 5951e2fd..d99d1543 100644 --- a/airbyte/sources/base.py +++ b/airbyte/sources/base.py @@ -103,6 +103,10 @@ def __init__( if streams is not None: self.select_streams(streams) + self._deployed_api_root: str | None = None + self._deployed_workspace_id: str | None = None + self._deployed_source_id: str | None = None + def set_streams(self, streams: list[str]) -> None: """Deprecated. See select_streams().""" warnings.warn( From ee8057b2674fecab79c8b120c6673d5e94b35dcb Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 21:26:42 -0700 Subject: [PATCH 011/118] add missing copyright msg --- airbyte/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte/__init__.py b/airbyte/__init__.py index 64f6255e..b7a61c9a 100644 --- a/airbyte/__init__.py +++ b/airbyte/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. """PyAirbyte brings Airbyte ELT to every Python developer. .. 
include:: ../README.md From 9712855fda1989176ab37023843fdb89d3498782 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 21:34:51 -0700 Subject: [PATCH 012/118] rename test file --- .../{test_api_crud.py => test_cloud_api_util.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/integration_tests/{test_api_crud.py => test_cloud_api_util.py} (100%) diff --git a/tests/integration_tests/test_api_crud.py b/tests/integration_tests/test_cloud_api_util.py similarity index 100% rename from tests/integration_tests/test_api_crud.py rename to tests/integration_tests/test_cloud_api_util.py From a42d76071f212744d1188a587671aec37ba3c9e7 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 22:19:22 -0700 Subject: [PATCH 013/118] use constant --- airbyte/_util/api_util.py | 29 ++++++++++--------- .../integration_tests/test_cloud_api_util.py | 3 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 0120732b..7dd3e09b 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -27,6 +27,7 @@ JOB_WAIT_INTERVAL_SECS = 2.0 +CLOUD_API_ROOT = "https://api.airbyte.com/v1" def status_ok(status_code: int) -> bool: @@ -42,7 +43,7 @@ def get_default_bearer_token() -> str | None: def get_airbyte_server_instance( *, api_key: str | None = None, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, ) -> airbyte_api.Airbyte: """Get an Airbyte instance.""" api_key = api_key or get_default_bearer_token() @@ -57,7 +58,7 @@ def get_airbyte_server_instance( def get_workspace( workspace_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, ) -> api_models.WorkspaceResponse: """Get a connection.""" @@ -86,7 +87,7 @@ def get_workspace( def list_connections( workspace_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, ) -> list[api_models.ConnectionResponse]: """Get a connection.""" @@ -117,7 +118,7 @@ def get_connection( workspace_id: str, connection_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, ) -> api_models.ConnectionResponse: """Get a connection.""" @@ -142,7 +143,7 @@ def run_connection( workspace_id: str, connection_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, wait_for_job: bool = True, raise_on_failure: bool = True, @@ -190,7 +191,7 @@ def wait_for_airbyte_job( workspace_id: str, job_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, raise_on_failure: bool = True, ) -> api_models.JobInfo: @@ -236,7 +237,7 @@ def get_connection_by_name( workspace_id: str, connection_name: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, ) -> api_models.ConnectionResponse: """Get a connection.""" @@ -267,7 +268,7 @@ def get_connection_by_name( def get_source( source_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, ) -> api_models.SourceResponse: """Get a connection.""" @@ -292,7 +293,7 @@ def create_source( *, workspace_id: str, config: dict[str, Any], - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, ) -> 
api_models.SourceResponse: """Get a connection.""" @@ -322,7 +323,7 @@ def create_source( def delete_source( source_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, workspace_id: str | None = None, ) -> None: @@ -352,7 +353,7 @@ def create_destination( *, workspace_id: str, config: dict[str, Any], - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, ) -> api_models.DestinationResponse: """Get a connection.""" @@ -383,7 +384,7 @@ def create_destination( def delete_destination( destination_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, workspace_id: str | None = None, ) -> None: @@ -476,7 +477,7 @@ def delete_connection( def check_source( source_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, workspace_id: str | None = None, ) -> api_models.SourceCheckResponse: @@ -492,7 +493,7 @@ def check_source( def get_destination( destination_id: str, *, - api_root: str = "https://api.airbyte.com/v1", + api_root: str = CLOUD_API_ROOT, api_key: str | None = None, ) -> api_models.DestinationResponse: """Get a connection.""" diff --git a/tests/integration_tests/test_cloud_api_util.py b/tests/integration_tests/test_cloud_api_util.py index 0e9d0f6f..8ef57342 100644 --- a/tests/integration_tests/test_cloud_api_util.py +++ b/tests/integration_tests/test_cloud_api_util.py @@ -13,12 +13,12 @@ import airbyte as ab from airbyte._util import api_util, api_duck_types +from airbyte._util.api_util import CLOUD_API_ROOT from airbyte_api.models.shared import SourceFaker, DestinationDevNull, DestinationDuckdb from dotenv import dotenv_values from airbyte.caches.duckdb import DuckDBCache -CLOUD_API_ROOT = "https://api.airbyte.com/v1" ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" @@ -115,7 +115,6 @@ def test_create_and_delete_destination( ) - def test_create_and_delete_connection( workspace_id: str, api_root: str, From c077081c8a9beb1b19cd58969e16e09941a88a31 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 22:59:38 -0700 Subject: [PATCH 014/118] working deploy methods and tests --- airbyte/_util/api_util.py | 2 +- airbyte/caches/base.py | 9 +- airbyte/cloud/__init__.py | 31 ++ airbyte/cloud/_destinations.py | 200 +++++++++++++ airbyte/cloud/_workspaces.py | 265 ++++++++++++++++++ airbyte/sources/base.py | 1 + .../test_cloud_workspaces.py | 145 ++++++++++ tests/integration_tests/test_duckdb_cache.py | 3 +- 8 files changed, 649 insertions(+), 7 deletions(-) create mode 100644 airbyte/cloud/__init__.py create mode 100644 airbyte/cloud/_destinations.py create mode 100644 airbyte/cloud/_workspaces.py create mode 100644 tests/integration_tests/test_cloud_workspaces.py diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 7dd3e09b..a92a66c5 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -129,7 +129,7 @@ def get_connection( api_root=api_root, ) response = airbyte_instance.connections.get_connection( - api_models.GetConnectionRequest( + api_operations.GetConnectionRequest( connection_id=connection_id, ), ) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index a8b7f10f..af24ab98 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -44,13 +44,14 @@ class 
CacheBase(BaseModel): table_suffix: str = "" """A suffix to add to all table names.""" + _deployed_api_root: str | None = PrivateAttr(default=None) + _deployed_workspace_id: str | None = PrivateAttr(default=None) + _deployed_destination_id: str | None = PrivateAttr(default=None) + _deployed_connection_id: str | None = PrivateAttr(default=None) + _sql_processor_class: type[SqlProcessorBase] = PrivateAttr() _sql_processor: Optional[SqlProcessorBase] = PrivateAttr(default=None) - _deployed_api_root: str | None = None - _deployed_workspace_id: str | None = None - _deployed_destination_id: str | None = None - @final @property def processor(self) -> SqlProcessorBase: diff --git a/airbyte/cloud/__init__.py b/airbyte/cloud/__init__.py new file mode 100644 index 00000000..913892cb --- /dev/null +++ b/airbyte/cloud/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""PyAirbyte classes and methods for interacting with the Airbyte Cloud API. + +You can use this module to interact with Airbyte Cloud, OSS, and Enterprise. + +Usage example: + +```python +import airbyte as ab +from airbyte import cloud + +workspace = cloud.CloudWorkspace( + workspace_id="123", + api_key=ab.get_secret("AIRBYTE_API_KEY"), +) + +source = ab.get_source("source-faker", config={}) +source.check() + +workspace.deploy_source(source) +``` +""" + +from __future__ import annotations + +from airbyte.cloud._workspaces import CloudWorkspace + + +__all__ = [ + "CloudWorkspace", +] diff --git a/airbyte/cloud/_destinations.py b/airbyte/cloud/_destinations.py new file mode 100644 index 00000000..a8226643 --- /dev/null +++ b/airbyte/cloud/_destinations.py @@ -0,0 +1,200 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Cloud destinations for Airbyte.""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from airbyte_api.models.shared import ( + DestinationBigquery, + DestinationDuckdb, + DestinationPostgres, + DestinationSnowflake, +) + +from airbyte.caches import ( + BigQueryCache, + DuckDBCache, + MotherDuckCache, + PostgresCache, + SnowflakeCache, +) +from airbyte.secrets import get_secret + + +if TYPE_CHECKING: + from collections.abc import Callable + + from airbyte.caches.base import CacheBase + + +def get_destination_config_from_cache( + cache: CacheBase, +) -> dict[str, str]: + """Get the destination configuration from the cache.""" + conversion_fn_map: dict[str, Callable[[Any], dict[str, str]]] = { + "BigQueryCache": get_bigquery_destination_config, + "DuckDBCache": get_duckdb_destination_config, + "MotherDuckCache": get_motherduck_destination_config, + "PostgresCache": get_postgres_destination_config, + "SnowflakeCache": get_snowflake_destination_config, + } + cache_class_name = cache.__class__.__name__ + if cache_class_name not in conversion_fn_map: + raise ValueError( # noqa: TRY003 + "Cannot convert cache type to destination configuration. Cache type not supported. 
", + f"Supported cache types: {list(conversion_fn_map.keys())}", + ) + + conversion_fn = conversion_fn_map[cache_class_name] + return conversion_fn(cache) + + +def get_duckdb_destination_config( + cache: DuckDBCache, +) -> dict[str, str]: + """Get the destination configuration from the DuckDB cache.""" + return DestinationDuckdb( + destination_path=cache.db_path, + schema=cache.schema_name, + ).to_dict() + + +def get_motherduck_destination_config( + cache: MotherDuckCache, +) -> dict[str, str]: + """Get the destination configuration from the DuckDB cache.""" + return DestinationDuckdb( + destination_path=cache.db_path, + schema=cache.schema_name, + motherduck_api_key=cache.api_key, + ).to_dict() + + +def get_postgres_destination_config( + cache: PostgresCache, +) -> dict[str, str]: + """Get the destination configuration from the Postgres cache.""" + return DestinationPostgres( + database=cache.database, + host=cache.host, + password=cache.password, + port=cache.port, + schema=cache.schema_name, + username=cache.username, + ).to_dict() + + +def get_snowflake_destination_config( + cache: SnowflakeCache, +) -> dict[str, str]: + """Get the destination configuration from the Snowflake cache.""" + return DestinationSnowflake( + account=cache.account, + database=cache.database, + password=cache.password, + role=cache.role, + schema=cache.schema_name, + username=cache.username, + warehouse=cache.warehouse, + ).to_dict() + + +def get_bigquery_destination_config( + cache: BigQueryCache, +) -> dict[str, str]: + """Get the destination configuration from the BigQuery cache.""" + return DestinationBigquery( + project_id=cache.project_name, + dataset_id=cache.dataset_name, + schema=cache.schema_name, + credentials_json=Path(cache.credentials_path).read_text(), + ).to_dict() + + +def create_bigquery_cache( + destination_configuration: dict[str, str], +) -> BigQueryCache: + """Create a new BigQuery cache from the destination configuration.""" + credentials_path = get_secret("BIGQUERY_CREDENTIALS_PATH") + return BigQueryCache( + project_name=destination_configuration["project_id"], + dataset_name=destination_configuration["dataset_id"], + schema_name=destination_configuration["schema"], + credentials_path=credentials_path, + ) + + +def create_duckdb_cache( + destination_configuration: dict[str, str], +) -> DuckDBCache: + """Create a new DuckDB cache from the destination configuration.""" + return DuckDBCache( + db_path=destination_configuration["destination_path"], + schema_name=destination_configuration["schema"], + ) + + +def create_motherduck_cache( + destination_configuration: dict[str, str], +) -> MotherDuckCache: + """Create a new DuckDB cache from the destination configuration.""" + return MotherDuckCache( + database=destination_configuration["destination_path"], + schema_name=destination_configuration["schema"], + api_key=destination_configuration["motherduck_api_key"], + ) + + +def create_postgres_cache( + destination_configuration: dict[str, str], +) -> PostgresCache: + """Create a new Postgres cache from the destination configuration.""" + port: int = ( + int(destination_configuration["port"]) if "port" in destination_configuration else 5432 + ) + return PostgresCache( + database=destination_configuration["database"], + host=destination_configuration["host"], + password=destination_configuration["password"], + port=port, + schema_name=destination_configuration["schema"], + username=destination_configuration["username"], + ) + + +def create_snowflake_cache( + destination_configuration: dict[str, 
str], +) -> SnowflakeCache: + """Create a new Snowflake cache from the destination configuration.""" + return SnowflakeCache( + account=destination_configuration["account"], + database=destination_configuration["database"], + password=destination_configuration["password"], + role=destination_configuration["role"], + schema_name=destination_configuration["schema"], + username=destination_configuration["username"], + warehouse=destination_configuration["warehouse"], + ) + + +def create_cache_from_destination( + destination_configuration: dict[str, str], +) -> CacheBase: + """Create a new cache from the destination.""" + conversion_fn_map: dict[str, Callable[[dict[str, str]], CacheBase]] = { + "DestinationBigquery": create_bigquery_cache, + "DestinationDuckdb": create_duckdb_cache, + "DestinationPostgres": create_postgres_cache, + "DestinationSnowflake": create_snowflake_cache, + } + destination_class_name = destination_configuration["destination_type"] + if destination_class_name not in conversion_fn_map: + raise ValueError( # noqa: TRY003 + "Cannot convert destination configuration to cache. Destination type not supported. ", + f"Supported destination types: {list(conversion_fn_map.keys())}", + ) + + conversion_fn = conversion_fn_map[destination_class_name] + return conversion_fn(destination_configuration) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py new file mode 100644 index 00000000..f87a0395 --- /dev/null +++ b/airbyte/cloud/_workspaces.py @@ -0,0 +1,265 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""PyAirbyte classes and methods for interacting with the Airbyte Cloud API. + +By overriding `api_root`, you can use this module to interact with self-managed Airbyte instances, +both OSS and Enterprise. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import ulid + +from airbyte._util.api_util import ( + CLOUD_API_ROOT, + create_connection, + create_destination, + create_source, + delete_connection, + delete_destination, + delete_source, + get_connection, + get_workspace, +) +from airbyte.cloud._destinations import get_destination_config_from_cache +from airbyte.sources.base import Source + + +if TYPE_CHECKING: + from airbyte_api.models.shared.connectionresponse import ConnectionResponse + from airbyte_api.models.shared.destinationresponse import DestinationResponse + + from airbyte.caches.base import CacheBase + + +def _get_deploy_name( + resource_desc: str, + deploy_key: str, +) -> str: + """Get the name of the source to deploy.""" + return f"{resource_desc} (id={deploy_key})" + + +def _new_deploy_key() -> str: + """Generate a new deploy key.""" + return str(ulid.ULID()) + + +def _get_deploy_key(deploy_name: str) -> str: + """Get the deploy key from a deployed resource name.""" + if " (id=" not in deploy_name: + raise ValueError( # noqa: TRY003 + f"Could not extract deploy key from {deploy_name}.", + ) + + return deploy_name.split(" (id=")[-1].split(")")[0] + + +@dataclass +class CloudWorkspace: + """A remote workspace on the Airbyte Cloud. + + By overriding `api_root`, you can use this class to interact with self-managed Airbyte + instances, both OSS and Enterprise. 
+    """
+
+    workspace_id: str
+    api_key: str
+    api_root: str = CLOUD_API_ROOT
+
+    _deploy_key: str | None = None
+
+    def connect(self) -> None:
+        """Check that the workspace is reachable and raise an exception otherwise."""
+        _ = get_workspace(
+            api_root=self.api_root,
+            api_key=self.api_key,
+            workspace_id=self.workspace_id,
+        )
+
+    def deploy_source(
+        self,
+        source: Source,
+    ) -> str:
+        """Deploy a source to the workspace.
+
+        Returns the newly deployed source ID.
+        """
+        if self._deploy_key is None:
+            self._deploy_key = str(ulid.ULID())
+
+        source_configuration = source.get_config().copy()
+        source_configuration["sourceType"] = source.name.replace("source-", "")
+
+        deployed_source = create_source(
+            name=_get_deploy_name(
+                resource_desc=f"Source {source.name.replace('-', ' ').title()}",
+                deploy_key=self._deploy_key,
+            ),
+            api_root=self.api_root,
+            api_key=self.api_key,
+            workspace_id=self.workspace_id,
+            config=source_configuration,
+        )
+
+        # Set the deployment IDs on the source object
+        source._deployed_api_root = self.api_root  # noqa: SLF001  # Accessing non-public API
+        source._deployed_workspace_id = self.workspace_id  # noqa: SLF001  # Accessing non-public API
+        source._deployed_source_id = deployed_source.source_id  # noqa: SLF001  # Accessing non-public API
+
+        return deployed_source.source_id
+
+    def delete_source(
+        self,
+        source: str | Source,
+    ) -> None:
+        """Delete a source from the workspace.
+
+        You can pass either the source ID `str` or a deployed `Source` object.
+        """
+        if not isinstance(source, (str, Source)):
+            raise ValueError(f"Invalid source type: {type(source)}")  # noqa: TRY004, TRY003
+
+        if isinstance(source, Source):
+            if not source._deployed_source_id:  # noqa: SLF001
+                raise ValueError("Source has not been deployed.")  # noqa: TRY003
+
+            source_id = source._deployed_source_id  # noqa: SLF001
+
+        elif isinstance(source, str):
+            source_id = source
+
+        delete_source(
+            source_id=source_id,
+            api_root=self.api_root,
+            api_key=self.api_key,
+        )
+
+    def deploy_cache_as_destination(
+        self,
+        cache: CacheBase,
+    ) -> str:
+        """Deploy a cache to the workspace as a new destination.
+
+        Returns the newly deployed destination ID.
+        """
+        if self._deploy_key is None:
+            self._deploy_key = str(ulid.ULID())
+
+        cache_type_name = cache.__class__.__name__.replace("Cache", "")
+
+        deployed_destination: DestinationResponse = create_destination(
+            name=_get_deploy_name(
+                resource_desc=f"Destination {cache_type_name} (Deployed by PyAirbyte)",
+                deploy_key=self._deploy_key,
+            ),
+            api_root=self.api_root,
+            api_key=self.api_key,
+            workspace_id=self.workspace_id,
+            config=get_destination_config_from_cache(cache),
+        )
+
+        # Set the deployment IDs on the cache object
+        cache._deployed_api_root = self.api_root  # noqa: SLF001  # Accessing non-public API
+        cache._deployed_workspace_id = self.workspace_id  # noqa: SLF001  # Accessing non-public API
+        cache._deployed_destination_id = deployed_destination.destination_id  # noqa: SLF001  # Accessing non-public API
+
+        return deployed_destination.destination_id
+
+    def delete_destination(
+        self,
+        *,
+        destination_id: str | None = None,
+        cache: CacheBase | None = None,
+    ) -> None:
+        """Delete a deployed destination from the workspace.
+
+        You can pass either the `Cache` class or the deployed destination ID as a `str`.
+ """ + if destination_id is None and cache is None: + raise ValueError("You must provide either a destination ID or a cache object.") # noqa: TRY003 + if destination_id is not None and cache is not None: + raise ValueError( # noqa: TRY003 + "You must provide either a destination ID or a cache object, not both." + ) + + if cache: + if not cache._deployed_destination_id: # noqa: SLF001 + raise ValueError("Cache has not been deployed.") # noqa: TRY003 + + destination_id = cache._deployed_destination_id # noqa: SLF001 + + if destination_id is None: + raise ValueError("No destination ID provided.") # noqa: TRY003 + + delete_destination( + destination_id=destination_id, + api_root=self.api_root, + api_key=self.api_key, + ) + + def deploy_connection( + self, + source: Source, + cache: CacheBase, + ) -> str: + """Deploy a source and cache to the workspace as a new connection. + + Returns the newly deployed connection ID as a `str`. + """ + if self._deploy_key is None: + self._deploy_key = str(ulid.ULID()) + + self.deploy_source(source) + self.deploy_cache_as_destination(cache) + + assert source._deployed_source_id is not None # noqa: SLF001 # Accessing nn-public API + assert cache._deployed_destination_id is not None # noqa: SLF001 # Accessing nn-public API + + deployed_connection = create_connection( + name=_get_deploy_name( + resource_desc=f"Connection {source.name.replace('-', ' ').title()}", + deploy_key=self._deploy_key, + ), + source_id=source._deployed_source_id, # noqa: SLF001 # Accessing nn-public API + destination_id=cache._deployed_destination_id, # noqa: SLF001 # Accessing nn-public API + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + ) + + source._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 + cache._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 + + return deployed_connection.connection_id + + def delete_connection( + self, + connection_id: str | None, + *, + delete_source: bool = False, + delete_destination: bool = False, + ) -> None: + """Delete a deployed connection from the workspace.""" + if connection_id is None: + raise ValueError("No connection ID provided.") # noqa: TRY003 + + connection: ConnectionResponse = get_connection( + connection_id=connection_id, + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + ) + delete_connection( + connection_id=connection_id, + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + ) + if delete_source: + self.delete_source(source=connection.source_id) + + if delete_destination: + self.delete_destination(destination_id=connection.destination_id) diff --git a/airbyte/sources/base.py b/airbyte/sources/base.py index d99d1543..b8309fd8 100644 --- a/airbyte/sources/base.py +++ b/airbyte/sources/base.py @@ -106,6 +106,7 @@ def __init__( self._deployed_api_root: str | None = None self._deployed_workspace_id: str | None = None self._deployed_source_id: str | None = None + self._deployed_connection_id: str | None = None def set_streams(self, streams: list[str]) -> None: """Deprecated. See select_streams().""" diff --git a/tests/integration_tests/test_cloud_workspaces.py b/tests/integration_tests/test_cloud_workspaces.py new file mode 100644 index 00000000..722f2bb6 --- /dev/null +++ b/tests/integration_tests/test_cloud_workspaces.py @@ -0,0 +1,145 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Cloud Workspace integration tests. 
+ +These tests are designed to be run against a running instance of the Airbyte API. +""" +from __future__ import annotations +import os +from pathlib import Path +import sys + +from dotenv import dotenv_values +import pytest + +import airbyte as ab +from airbyte.caches import MotherDuckCache +from airbyte.cloud import CloudWorkspace +from airbyte._executor import _get_bin_dir +from airbyte._util.api_util import CLOUD_API_ROOT, delete_destination + +ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" +ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" +ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" + + +@pytest.fixture(autouse=True) +def add_venv_bin_to_path(monkeypatch): + """Patch the PATH to include the virtual environment's bin directory.""" + # Get the path to the bin directory of the virtual environment + venv_bin_path = str(_get_bin_dir(Path(sys.prefix))) + + # Add the bin directory to the PATH + new_path = f"{venv_bin_path}{os.pathsep}{os.environ['PATH']}" + monkeypatch.setenv('PATH', new_path) + + +@pytest.fixture +def workspace_id() -> str: + return os.environ[ENV_AIRBYTE_API_WORKSPACE_ID] + + +@pytest.fixture +def api_root() -> str: + return CLOUD_API_ROOT + + +@pytest.fixture +def api_key() -> str: + dotenv_vars: dict[str, str | None] = dotenv_values() + if ENV_AIRBYTE_API_KEY in dotenv_vars: + return dotenv_vars[ENV_AIRBYTE_API_KEY] + + if ENV_AIRBYTE_API_KEY not in os.environ: + raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + + return os.environ[ENV_AIRBYTE_API_KEY] + + +@pytest.fixture +def motherduck_api_key() -> str: + dotenv_vars: dict[str, str | None] = dotenv_values() + if ENV_MOTHERDUCK_API_KEY in dotenv_vars: + return dotenv_vars[ENV_MOTHERDUCK_API_KEY] + + if ENV_MOTHERDUCK_API_KEY not in os.environ: + raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + + return os.environ[ENV_MOTHERDUCK_API_KEY] + + +@pytest.fixture +def cloud_workspace( + workspace_id: str, + api_key: str, + api_root: str, +) -> CloudWorkspace: + return CloudWorkspace( + workspace_id=workspace_id, + api_key=api_key, + api_root=api_root, + ) + + +def test_deploy_source( + cloud_workspace: CloudWorkspace, +) -> None: + """Test deploying a source to a workspace.""" + source = ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + source.check() + source_id: str = cloud_workspace.deploy_source(source) + + cloud_workspace.delete_source(source=source_id) + + +def test_deploy_cache_as_destination( + workspace_id: str, + api_key: str, + motherduck_api_key: str, +) -> None: + """Test deploying a cache to a workspace as a destination.""" + workspace = CloudWorkspace( + workspace_id=workspace_id, + api_key=api_key, + ) + + cache = MotherDuckCache( + api_key=motherduck_api_key, + database="temp", + schema_name="public", + ) + destination_id: str = workspace.deploy_cache_as_destination(cache=cache) + workspace.delete_destination(destination_id=destination_id) + + +def test_deploy_connection( + workspace_id: str, + api_key: str, + motherduck_api_key: str, +) -> None: + """Test deploying a source and cache to a workspace as a new connection.""" + workspace = CloudWorkspace( + workspace_id=workspace_id, + api_key=api_key, + ) + + source = ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + source.check() + + cache = MotherDuckCache( + api_key=motherduck_api_key, + database="temp", + schema_name="public", + ) + + connection_id: 
str = workspace.deploy_connection(source=source, cache=cache) + workspace.delete_connection(connection_id=connection_id) diff --git a/tests/integration_tests/test_duckdb_cache.py b/tests/integration_tests/test_duckdb_cache.py index abdcaf11..203fb24c 100644 --- a/tests/integration_tests/test_duckdb_cache.py +++ b/tests/integration_tests/test_duckdb_cache.py @@ -34,10 +34,9 @@ FAKER_SCALE_B = 300 -# Patch PATH to include the source-faker executable. - @pytest.fixture(autouse=True) def add_venv_bin_to_path(monkeypatch): + """Patch the PATH to include the virtual environment's bin directory.""" # Get the path to the bin directory of the virtual environment venv_bin_path = str(_get_bin_dir(Path(sys.prefix))) From 9440bc73671e36aeda7b890177a0cbf0b5275e9a Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 23:00:43 -0700 Subject: [PATCH 015/118] remove commented code --- airbyte/_util/api_util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index a92a66c5..1fb178e4 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -368,7 +368,6 @@ def create_destination( name=name, workspace_id=workspace_id, configuration=config, # TODO: wrap in a proper configuration object - # definition_id="a7bcc9d8-13b3-4e49-b80d-d020b90045e3", # Not used alternative to config.destinationType. ), ) ) From 44cf945fd26c982b301a226b9214e6cf8f2b1294 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 23:06:20 -0700 Subject: [PATCH 016/118] remove deploy key --- airbyte/cloud/_workspaces.py | 49 +++--------------------------------- 1 file changed, 3 insertions(+), 46 deletions(-) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index f87a0395..ec37785d 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -34,29 +34,6 @@ from airbyte.caches.base import CacheBase -def _get_deploy_name( - resource_desc: str, - deploy_key: str, -) -> str: - """Get the name of the source to deploy.""" - return f"{resource_desc} (id={deploy_key})" - - -def _new_deploy_key() -> str: - """Generate a new deploy key.""" - return str(ulid.ULID()) - - -def _get_deploy_key(deploy_name: str) -> str: - """Get the deploy key from a deployed resource name.""" - if " (id=" not in deploy_name: - raise ValueError( # noqa: TRY003 - f"Could not extract deploy key from {deploy_name}.", - ) - - return deploy_name.split(" (id=")[-1].split(")")[0] - - @dataclass class CloudWorkspace: """A remote workspace on the Airbyte Cloud. @@ -69,8 +46,6 @@ class CloudWorkspace: api_key: str api_root: str = CLOUD_API_ROOT - _deploy_key: str | None = None - def connect(self) -> None: """Check that the workspace is reachable and raise an exception otherwise.""" _ = get_workspace( @@ -87,17 +62,11 @@ def deploy_source( Returns the newly deployed source ID. """ - if self._deploy_key is None: - self._deploy_key = str(ulid.ULID()) - source_configuration = source.get_config().copy() source_configuration["sourceType"] = source.name.replace("source-", "") deployed_source = create_source( - name=_get_deploy_name( - resource_desc=f"Source {source.name.replace('-', ' ').title()}", - deploy_key=self._deploy_key, - ), + name=f"{source.name.replace('-', ' ').title()} (Deployed by PyAirbyte)", api_root=self.api_root, api_key=self.api_key, workspace_id=self.workspace_id, @@ -145,16 +114,10 @@ def deploy_cache_as_destination( Returns the newly deployed destination ID. 
""" - if self._deploy_key is None: - self._deploy_key = str(ulid.ULID()) - cache_type_name = cache.__class__.__name__.replace("Cache", "") deployed_destination: DestinationResponse = create_destination( - name=_get_deploy_name( - resource_desc=f"Destination {cache_type_name} (Deployed by PyAirbyte)", - deploy_key=self._deploy_key, - ), + name=f"Destination {cache_type_name} (Deployed by PyAirbyte)", api_root=self.api_root, api_key=self.api_key, workspace_id=self.workspace_id, @@ -209,9 +172,6 @@ def deploy_connection( Returns the newly deployed connection ID as a `str`. """ - if self._deploy_key is None: - self._deploy_key = str(ulid.ULID()) - self.deploy_source(source) self.deploy_cache_as_destination(cache) @@ -219,10 +179,7 @@ def deploy_connection( assert cache._deployed_destination_id is not None # noqa: SLF001 # Accessing nn-public API deployed_connection = create_connection( - name=_get_deploy_name( - resource_desc=f"Connection {source.name.replace('-', ' ').title()}", - deploy_key=self._deploy_key, - ), + name=f"Connection {source.name.replace('-', ' ').title()} (Deployed by PyAirbyte)", source_id=source._deployed_source_id, # noqa: SLF001 # Accessing nn-public API destination_id=cache._deployed_destination_id, # noqa: SLF001 # Accessing nn-public API api_root=self.api_root, From 288e3df0ac466fb72ad6f0b2e79b008c699bc986 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 25 Mar 2024 23:08:13 -0700 Subject: [PATCH 017/118] fix extra import --- airbyte/cloud/_workspaces.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index ec37785d..0c707e02 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -10,8 +10,6 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -import ulid - from airbyte._util.api_util import ( CLOUD_API_ROOT, create_connection, From 4038444f9f85fd5ef69a453361124038fdb30987 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 26 Mar 2024 00:06:54 -0700 Subject: [PATCH 018/118] fix lint issues, skip hanging test --- airbyte/_util/api_util.py | 40 +++++++++++++++---- airbyte/cloud/_workspaces.py | 34 ++++++++++++++++ .../test_cloud_workspaces.py | 31 ++++++++++++++ 3 files changed, 98 insertions(+), 7 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 1fb178e4..afb125b1 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -187,6 +187,29 @@ def run_connection( ) +def get_job_info( + job_id: str, + *, + api_root: str = CLOUD_API_ROOT, + api_key: str | None = None, +) -> api_models.JobResponse: + """Get a job.""" + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.jobs.get_job( + api_operations.GetJobRequest( + job_id=job_id, + ), + ) + if status_ok(response.status_code) and response.job_response: + return response.job_response + + raise MissingResourceError(job_id, "job", response.text) + + def wait_for_airbyte_job( workspace_id: str, job_id: str, @@ -194,7 +217,7 @@ def wait_for_airbyte_job( api_root: str = CLOUD_API_ROOT, api_key: str | None = None, raise_on_failure: bool = True, -) -> api_models.JobInfo: +) -> api_models.JobResponse: """Wait for a job to finish running.""" _ = workspace_id # Not used (yet) api_key = api_key or get_default_bearer_token() @@ -204,23 +227,26 @@ def wait_for_airbyte_job( ) while True: sleep(JOB_WAIT_INTERVAL_SECS) - response = 
airbyte_instance.jobs.get_job( + response: api_operations.GetJobResponse = airbyte_instance.jobs.get_job( api_operations.GetJobRequest( job_id=job_id, ), ) - if status_ok(response.status_code) and response.job_info: - job_info = response.job_info - if job_info.status == api_models.StatusEnum.succeeded: + if status_ok(response.status_code) and response.job_response: + job_info = response.job_response + if job_info.status == api_models.JobStatusEnum.SUCCEEDED: return job_info - if job_info.status == api_models.StatusEnum.failed: + if job_info.status in ( + api_models.JobStatusEnum.FAILED, + api_models.JobStatusEnum.CANCELLED, + ): if raise_on_failure: raise HostedConnectionSyncError( context={ + "job_id": job_id, "job_status": job_info.status, "workspace_id": workspace_id, - "job_id": job_id, "message": job_info.message, }, ) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index 0c707e02..2cb074f1 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -10,6 +10,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING +from airbyte._util import api_util from airbyte._util.api_util import ( CLOUD_API_ROOT, create_connection, @@ -32,6 +33,24 @@ from airbyte.caches.base import CacheBase +@dataclass +class SyncResult: + """The result of a sync operation.""" + + workspace: CloudWorkspace + connection_id: str + job_id: str + + @property + def is_running(self) -> bool: + """Check if the sync job is still running.""" + return api_util.get_job_info( + job_id=self.job_id, + api_root=CLOUD_API_ROOT, + api_key=self.workspace.api_key, + ) + + @dataclass class CloudWorkspace: """A remote workspace on the Airbyte Cloud. @@ -218,3 +237,18 @@ def delete_connection( if delete_destination: self.delete_destination(destination_id=connection.destination_id) + + def run_sync( + self, + connection_id: str, + ) -> str: + """Run a sync on a deployed connection.""" + connection_response = api_util.run_connection( + connection_id=connection_id, + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + wait_for_job=True, + raise_on_failure=True, + ) + return connection_response.job_id diff --git a/tests/integration_tests/test_cloud_workspaces.py b/tests/integration_tests/test_cloud_workspaces.py index 722f2bb6..5f73d562 100644 --- a/tests/integration_tests/test_cloud_workspaces.py +++ b/tests/integration_tests/test_cloud_workspaces.py @@ -143,3 +143,34 @@ def test_deploy_connection( connection_id: str = workspace.deploy_connection(source=source, cache=cache) workspace.delete_connection(connection_id=connection_id) + +@pytest.mark.skip(reason="This test is not yet complete. 
It is hanging currently.") +def test_deploy_and_run_connection( + workspace_id: str, + api_key: str, + motherduck_api_key: str, +) -> None: + """Test deploying a source and cache to a workspace as a new connection.""" + workspace = CloudWorkspace( + workspace_id=workspace_id, + api_key=api_key, + ) + + source = ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + source.check() + + cache = MotherDuckCache( + api_key=motherduck_api_key, + database="temp", + schema_name="public", + ) + + connection_id: str = workspace.deploy_connection(source=source, cache=cache) + sync_result = workspace.run_sync(connection_id=connection_id) + + workspace.delete_connection(connection_id=connection_id) From 4c76c788f60cceb2a0bb0a6375874997379a6c47 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 26 Mar 2024 10:02:15 -0700 Subject: [PATCH 019/118] working run sync --- airbyte/cloud/_sync_results.py | 51 +++++ airbyte/cloud/_workspaces.py | 27 +-- tests/integration_tests/cloud/__init__.py | 0 tests/integration_tests/cloud/conftest.py | 109 +++++++++++ .../{ => cloud}/test_cloud_api_util.py | 50 +---- .../cloud/test_cloud_sync.py | 57 ++++++ .../cloud/test_cloud_workspaces.py | 69 +++++++ .../test_cloud_workspaces.py | 176 ------------------ 8 files changed, 295 insertions(+), 244 deletions(-) create mode 100644 airbyte/cloud/_sync_results.py create mode 100644 tests/integration_tests/cloud/__init__.py create mode 100644 tests/integration_tests/cloud/conftest.py rename tests/integration_tests/{ => cloud}/test_cloud_api_util.py (73%) create mode 100644 tests/integration_tests/cloud/test_cloud_sync.py create mode 100644 tests/integration_tests/cloud/test_cloud_workspaces.py delete mode 100644 tests/integration_tests/test_cloud_workspaces.py diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py new file mode 100644 index 00000000..c497ab4c --- /dev/null +++ b/airbyte/cloud/_sync_results.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+"""Sync results for Airbyte Cloud workspaces.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from airbyte_api.models.shared import JobStatusEnum, JobTypeEnum + +from airbyte._util import api_util + + +if TYPE_CHECKING: + from airbyte.cloud._workspaces import CloudWorkspace + + +FINAL_STATUSES = { + JobStatusEnum.SUCCEEDED, + JobStatusEnum.FAILED, + JobStatusEnum.CANCELLED, +} + + +@dataclass +class SyncResult: + """The result of a sync operation.""" + + workspace: CloudWorkspace + connection_id: str + job_id: str + _final_status: JobStatusEnum | None = None + + def is_job_complete(self) -> bool: + """Check if the sync job is complete.""" + return self.get_job_status() in FINAL_STATUSES + + def get_job_status(self) -> JobStatusEnum: + """Check if the sync job is still running.""" + if self._final_status: + return self._final_status + + job_info = api_util.get_job_info( + job_id=self.job_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + ) + if job_info.status in FINAL_STATUSES: + self._final_status = job_info.status + + return job_info.status diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index 2cb074f1..6917574e 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -23,6 +23,7 @@ get_workspace, ) from airbyte.cloud._destinations import get_destination_config_from_cache +from airbyte.cloud._sync_results import SyncResult from airbyte.sources.base import Source @@ -33,24 +34,6 @@ from airbyte.caches.base import CacheBase -@dataclass -class SyncResult: - """The result of a sync operation.""" - - workspace: CloudWorkspace - connection_id: str - job_id: str - - @property - def is_running(self) -> bool: - """Check if the sync job is still running.""" - return api_util.get_job_info( - job_id=self.job_id, - api_root=CLOUD_API_ROOT, - api_key=self.workspace.api_key, - ) - - @dataclass class CloudWorkspace: """A remote workspace on the Airbyte Cloud. @@ -241,7 +224,7 @@ def delete_connection( def run_sync( self, connection_id: str, - ) -> str: + ) -> SyncResult: """Run a sync on a deployed connection.""" connection_response = api_util.run_connection( connection_id=connection_id, @@ -251,4 +234,8 @@ def run_sync( wait_for_job=True, raise_on_failure=True, ) - return connection_response.job_id + return SyncResult( + workspace=self, + connection_id=connection_response.connection_id, + job_id=connection_response.job_id, + ) diff --git a/tests/integration_tests/cloud/__init__.py b/tests/integration_tests/cloud/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration_tests/cloud/conftest.py b/tests/integration_tests/cloud/conftest.py new file mode 100644 index 00000000..37a7e2ed --- /dev/null +++ b/tests/integration_tests/cloud/conftest.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+"""Fixtures for Cloud Workspace integration tests.""" +from __future__ import annotations + +import os +from pathlib import Path +import sys +import pytest +from airbyte._util.api_util import CLOUD_API_ROOT +from dotenv import dotenv_values +from airbyte._executor import _get_bin_dir +from airbyte.cloud import CloudWorkspace + + +ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" +ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" +ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" + + +@pytest.fixture(autouse=True) +def add_venv_bin_to_path(monkeypatch: pytest.MonkeyPatch) -> None: + """Patch the PATH to include the virtual environment's bin directory.""" + # Get the path to the bin directory of the virtual environment + venv_bin_path = str(_get_bin_dir(Path(sys.prefix))) + + # Add the bin directory to the PATH + new_path = f"{venv_bin_path}{os.pathsep}{os.environ['PATH']}" + monkeypatch.setenv('PATH', new_path) + + +@pytest.fixture +def workspace_id() -> str: + return os.environ[ENV_AIRBYTE_API_WORKSPACE_ID] + + +@pytest.fixture +def api_root() -> str: + return CLOUD_API_ROOT + + +@pytest.fixture +def api_key() -> str: + dotenv_vars: dict[str, str | None] = dotenv_values() + if ENV_AIRBYTE_API_KEY in dotenv_vars: + return dotenv_vars[ENV_AIRBYTE_API_KEY] + + if ENV_AIRBYTE_API_KEY not in os.environ: + raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + + return os.environ[ENV_AIRBYTE_API_KEY] + + +@pytest.fixture +def motherduck_api_key() -> str: + dotenv_vars: dict[str, str | None] = dotenv_values() + if ENV_MOTHERDUCK_API_KEY in dotenv_vars: + return dotenv_vars[ENV_MOTHERDUCK_API_KEY] + + if ENV_MOTHERDUCK_API_KEY not in os.environ: + raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + + return os.environ[ENV_MOTHERDUCK_API_KEY] + + +@pytest.fixture +def cloud_workspace( + workspace_id: str, + api_key: str, + api_root: str, +) -> CloudWorkspace: + return CloudWorkspace( + workspace_id=workspace_id, + api_key=api_key, + api_root=api_root, + ) + + +@pytest.fixture +def workspace_id() -> str: + return os.environ[ENV_AIRBYTE_API_WORKSPACE_ID] + + +@pytest.fixture +def api_root() -> str: + return CLOUD_API_ROOT + + +@pytest.fixture +def api_key() -> str: + dotenv_vars: dict[str, str | None] = dotenv_values() + if ENV_AIRBYTE_API_KEY in dotenv_vars: + return dotenv_vars[ENV_AIRBYTE_API_KEY] + + if ENV_AIRBYTE_API_KEY not in os.environ: + raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + + return os.environ[ENV_AIRBYTE_API_KEY] + + +@pytest.fixture +def motherduck_api_key() -> str: + dotenv_vars: dict[str, str | None] = dotenv_values() + if ENV_MOTHERDUCK_API_KEY in dotenv_vars: + return dotenv_vars[ENV_MOTHERDUCK_API_KEY] + + if ENV_MOTHERDUCK_API_KEY not in os.environ: + raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + + return os.environ[ENV_MOTHERDUCK_API_KEY] diff --git a/tests/integration_tests/test_cloud_api_util.py b/tests/integration_tests/cloud/test_cloud_api_util.py similarity index 73% rename from tests/integration_tests/test_cloud_api_util.py rename to tests/integration_tests/cloud/test_cloud_api_util.py index 8ef57342..c769eb15 100644 --- a/tests/integration_tests/test_cloud_api_util.py +++ b/tests/integration_tests/cloud/test_cloud_api_util.py @@ -5,57 +5,11 @@ These tests are designed to be run against a running instance of the Airbyte API. 
""" from __future__ import annotations -import os -from airbyte_api.models.shared.sourceresponse import SourceResponse -import pytest import ulid -import airbyte as ab -from airbyte._util import api_util, api_duck_types -from airbyte._util.api_util import CLOUD_API_ROOT -from airbyte_api.models.shared import SourceFaker, DestinationDevNull, DestinationDuckdb -from dotenv import dotenv_values - -from airbyte.caches.duckdb import DuckDBCache - -ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" -ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" -ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" - - -@pytest.fixture -def workspace_id() -> str: - return os.environ[ENV_AIRBYTE_API_WORKSPACE_ID] - - -@pytest.fixture -def api_root() -> str: - return CLOUD_API_ROOT - - -@pytest.fixture -def api_key() -> str: - dotenv_vars: dict[str, str | None] = dotenv_values() - if ENV_AIRBYTE_API_KEY in dotenv_vars: - return dotenv_vars[ENV_AIRBYTE_API_KEY] - - if ENV_AIRBYTE_API_KEY not in os.environ: - raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") - - return os.environ[ENV_AIRBYTE_API_KEY] - - -@pytest.fixture -def motherduck_api_key() -> str: - dotenv_vars: dict[str, str | None] = dotenv_values() - if ENV_MOTHERDUCK_API_KEY in dotenv_vars: - return dotenv_vars[ENV_MOTHERDUCK_API_KEY] - - if ENV_MOTHERDUCK_API_KEY not in os.environ: - raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") - - return os.environ[ENV_MOTHERDUCK_API_KEY] +from airbyte._util import api_util +from airbyte_api.models.shared import SourceFaker, DestinationDuckdb def test_create_and_delete_source( diff --git a/tests/integration_tests/cloud/test_cloud_sync.py b/tests/integration_tests/cloud/test_cloud_sync.py new file mode 100644 index 00000000..24a870cf --- /dev/null +++ b/tests/integration_tests/cloud/test_cloud_sync.py @@ -0,0 +1,57 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Cloud Workspace integration tests. + +These tests are designed to be run against a running instance of the Airbyte API. +""" +from __future__ import annotations + +import pytest + +import airbyte as ab +from airbyte.caches import MotherDuckCache +from airbyte.cloud import CloudWorkspace + +ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" +ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" +ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" + + +@pytest.fixture +def pre_created_connection_id() -> str: + return "80857d37-1f21-4500-a802-f5ac08d1a3dd" + + +def test_run_connection( + cloud_workspace: CloudWorkspace, + pre_created_connection_id: str, +): + """Test running a connection.""" + sync_result = cloud_workspace.run_sync(connection_id=pre_created_connection_id) + _ = sync_result + + +@pytest.mark.skip(reason="This test is not yet complete. 
It is hanging currently.") +def test_deploy_and_run_connection( + cloud_workspace: CloudWorkspace, + motherduck_api_key: str, +) -> None: + """Test deploying a source and cache to a workspace as a new connection.""" + source = ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + source.check() + + cache = MotherDuckCache( + api_key=motherduck_api_key, + database="temp", + schema_name="public", + ) + + connection_id: str = cloud_workspace.deploy_connection(source=source, cache=cache) + sync_result = cloud_workspace.run_sync(connection_id=connection_id) + _ = sync_result + + cloud_workspace.delete_connection(connection_id=connection_id) diff --git a/tests/integration_tests/cloud/test_cloud_workspaces.py b/tests/integration_tests/cloud/test_cloud_workspaces.py new file mode 100644 index 00000000..664b0c7b --- /dev/null +++ b/tests/integration_tests/cloud/test_cloud_workspaces.py @@ -0,0 +1,69 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Cloud Workspace integration tests. + +These tests are designed to be run against a running instance of the Airbyte API. +""" +from __future__ import annotations + +import pytest + +import airbyte as ab +from airbyte.caches import MotherDuckCache +from airbyte.cloud import CloudWorkspace + +ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" +ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" +ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" + + +def test_deploy_source( + cloud_workspace: CloudWorkspace, +) -> None: + """Test deploying a source to a workspace.""" + source = ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + source.check() + source_id: str = cloud_workspace.deploy_source(source) + + cloud_workspace.delete_source(source=source_id) + + +def test_deploy_cache_as_destination( + cloud_workspace: CloudWorkspace, + motherduck_api_key: str, +) -> None: + """Test deploying a cache to a workspace as a destination.""" + cache = MotherDuckCache( + api_key=motherduck_api_key, + database="temp", + schema_name="public", + ) + destination_id: str = cloud_workspace.deploy_cache_as_destination(cache=cache) + cloud_workspace.delete_destination(destination_id=destination_id) + + +def test_deploy_connection( + cloud_workspace: CloudWorkspace, + motherduck_api_key: str, +) -> None: + """Test deploying a source and cache to a workspace as a new connection.""" + source = ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + source.check() + + cache = MotherDuckCache( + api_key=motherduck_api_key, + database="temp", + schema_name="public", + ) + + connection_id: str = cloud_workspace.deploy_connection(source=source, cache=cache) + cloud_workspace.delete_connection(connection_id=connection_id) diff --git a/tests/integration_tests/test_cloud_workspaces.py b/tests/integration_tests/test_cloud_workspaces.py deleted file mode 100644 index 5f73d562..00000000 --- a/tests/integration_tests/test_cloud_workspaces.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -"""Cloud Workspace integration tests. - -These tests are designed to be run against a running instance of the Airbyte API. 
-""" -from __future__ import annotations -import os -from pathlib import Path -import sys - -from dotenv import dotenv_values -import pytest - -import airbyte as ab -from airbyte.caches import MotherDuckCache -from airbyte.cloud import CloudWorkspace -from airbyte._executor import _get_bin_dir -from airbyte._util.api_util import CLOUD_API_ROOT, delete_destination - -ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" -ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" -ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" - - -@pytest.fixture(autouse=True) -def add_venv_bin_to_path(monkeypatch): - """Patch the PATH to include the virtual environment's bin directory.""" - # Get the path to the bin directory of the virtual environment - venv_bin_path = str(_get_bin_dir(Path(sys.prefix))) - - # Add the bin directory to the PATH - new_path = f"{venv_bin_path}{os.pathsep}{os.environ['PATH']}" - monkeypatch.setenv('PATH', new_path) - - -@pytest.fixture -def workspace_id() -> str: - return os.environ[ENV_AIRBYTE_API_WORKSPACE_ID] - - -@pytest.fixture -def api_root() -> str: - return CLOUD_API_ROOT - - -@pytest.fixture -def api_key() -> str: - dotenv_vars: dict[str, str | None] = dotenv_values() - if ENV_AIRBYTE_API_KEY in dotenv_vars: - return dotenv_vars[ENV_AIRBYTE_API_KEY] - - if ENV_AIRBYTE_API_KEY not in os.environ: - raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") - - return os.environ[ENV_AIRBYTE_API_KEY] - - -@pytest.fixture -def motherduck_api_key() -> str: - dotenv_vars: dict[str, str | None] = dotenv_values() - if ENV_MOTHERDUCK_API_KEY in dotenv_vars: - return dotenv_vars[ENV_MOTHERDUCK_API_KEY] - - if ENV_MOTHERDUCK_API_KEY not in os.environ: - raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") - - return os.environ[ENV_MOTHERDUCK_API_KEY] - - -@pytest.fixture -def cloud_workspace( - workspace_id: str, - api_key: str, - api_root: str, -) -> CloudWorkspace: - return CloudWorkspace( - workspace_id=workspace_id, - api_key=api_key, - api_root=api_root, - ) - - -def test_deploy_source( - cloud_workspace: CloudWorkspace, -) -> None: - """Test deploying a source to a workspace.""" - source = ab.get_source( - "source-faker", - local_executable="source-faker", - config={"count": 100}, - install_if_missing=False, - ) - source.check() - source_id: str = cloud_workspace.deploy_source(source) - - cloud_workspace.delete_source(source=source_id) - - -def test_deploy_cache_as_destination( - workspace_id: str, - api_key: str, - motherduck_api_key: str, -) -> None: - """Test deploying a cache to a workspace as a destination.""" - workspace = CloudWorkspace( - workspace_id=workspace_id, - api_key=api_key, - ) - - cache = MotherDuckCache( - api_key=motherduck_api_key, - database="temp", - schema_name="public", - ) - destination_id: str = workspace.deploy_cache_as_destination(cache=cache) - workspace.delete_destination(destination_id=destination_id) - - -def test_deploy_connection( - workspace_id: str, - api_key: str, - motherduck_api_key: str, -) -> None: - """Test deploying a source and cache to a workspace as a new connection.""" - workspace = CloudWorkspace( - workspace_id=workspace_id, - api_key=api_key, - ) - - source = ab.get_source( - "source-faker", - local_executable="source-faker", - config={"count": 100}, - install_if_missing=False, - ) - source.check() - - cache = MotherDuckCache( - api_key=motherduck_api_key, - database="temp", - schema_name="public", - ) - - connection_id: str = workspace.deploy_connection(source=source, cache=cache) - 
workspace.delete_connection(connection_id=connection_id) - -@pytest.mark.skip(reason="This test is not yet complete. It is hanging currently.") -def test_deploy_and_run_connection( - workspace_id: str, - api_key: str, - motherduck_api_key: str, -) -> None: - """Test deploying a source and cache to a workspace as a new connection.""" - workspace = CloudWorkspace( - workspace_id=workspace_id, - api_key=api_key, - ) - - source = ab.get_source( - "source-faker", - local_executable="source-faker", - config={"count": 100}, - install_if_missing=False, - ) - source.check() - - cache = MotherDuckCache( - api_key=motherduck_api_key, - database="temp", - schema_name="public", - ) - - connection_id: str = workspace.deploy_connection(source=source, cache=cache) - sync_result = workspace.run_sync(connection_id=connection_id) - - workspace.delete_connection(connection_id=connection_id) From 079621921f9c9d1e3abd6d94f25b599caa8b6e89 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 26 Mar 2024 13:02:45 -0700 Subject: [PATCH 020/118] improve error and timeout handling --- airbyte/_util/api_util.py | 65 ++-------------------------- airbyte/cloud/_sync_results.py | 77 +++++++++++++++++++++++++++++++--- airbyte/cloud/_workspaces.py | 14 +++++-- airbyte/exceptions.py | 68 ++++++++++++++++++++++++++++-- 4 files changed, 149 insertions(+), 75 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index afb125b1..1c3acc0b 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -27,6 +27,7 @@ JOB_WAIT_INTERVAL_SECS = 2.0 +JOB_WAIT_TIMEOUT_SECS_DEFAULT = 60 * 60 # 1 hour CLOUD_API_ROOT = "https://api.airbyte.com/v1" @@ -145,8 +146,6 @@ def run_connection( *, api_root: str = CLOUD_API_ROOT, api_key: str | None = None, - wait_for_job: bool = True, - raise_on_failure: bool = True, ) -> api_models.ConnectionResponse: """Get a connection. 
@@ -167,21 +166,12 @@ def run_connection( ), ) if status_ok(response.status_code) and response.job_response: - if wait_for_job: - job_info = wait_for_airbyte_job( - workspace_id=workspace_id, - job_id=response.job_response.job_id, - api_key=api_key, - api_root=api_root, - raise_on_failure=raise_on_failure, - ) - - return job_info + return response.job_response raise HostedConnectionSyncError( + connection_id=connection_id, context={ "workspace_id": workspace_id, - "connection_id": connection_id, }, response=response, ) @@ -210,55 +200,6 @@ def get_job_info( raise MissingResourceError(job_id, "job", response.text) -def wait_for_airbyte_job( - workspace_id: str, - job_id: str, - *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, - raise_on_failure: bool = True, -) -> api_models.JobResponse: - """Wait for a job to finish running.""" - _ = workspace_id # Not used (yet) - api_key = api_key or get_default_bearer_token() - airbyte_instance = get_airbyte_server_instance( - api_key=api_key, - api_root=api_root, - ) - while True: - sleep(JOB_WAIT_INTERVAL_SECS) - response: api_operations.GetJobResponse = airbyte_instance.jobs.get_job( - api_operations.GetJobRequest( - job_id=job_id, - ), - ) - if status_ok(response.status_code) and response.job_response: - job_info = response.job_response - if job_info.status == api_models.JobStatusEnum.SUCCEEDED: - return job_info - - if job_info.status in ( - api_models.JobStatusEnum.FAILED, - api_models.JobStatusEnum.CANCELLED, - ): - if raise_on_failure: - raise HostedConnectionSyncError( - context={ - "job_id": job_id, - "job_status": job_info.status, - "workspace_id": workspace_id, - "message": job_info.message, - }, - ) - - return job_info - - # Else: Job is still running - pass - else: - raise MissingResourceError(job_id, "job", response.text) - - def get_connection_by_name( workspace_id: str, connection_name: str, diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py index c497ab4c..461cd487 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/_sync_results.py @@ -3,12 +3,17 @@ from __future__ import annotations +import time from dataclasses import dataclass from typing import TYPE_CHECKING -from airbyte_api.models.shared import JobStatusEnum, JobTypeEnum +from airbyte_api.models.shared import JobStatusEnum from airbyte._util import api_util +from airbyte.exceptions import HostedConnectionSyncError, HostedConnectionSyncTimeoutError + + +DEFAULT_SYNC_TIMEOUT_SECONDS = 30 * 60 # 30 minutes if TYPE_CHECKING: @@ -20,6 +25,10 @@ JobStatusEnum.FAILED, JobStatusEnum.CANCELLED, } +FAILED_STATUSES = { + JobStatusEnum.FAILED, + JobStatusEnum.CANCELLED, +} @dataclass @@ -29,7 +38,7 @@ class SyncResult: workspace: CloudWorkspace connection_id: str job_id: str - _final_status: JobStatusEnum | None = None + _latest_status: JobStatusEnum | None = None def is_job_complete(self) -> bool: """Check if the sync job is complete.""" @@ -37,15 +46,71 @@ def is_job_complete(self) -> bool: def get_job_status(self) -> JobStatusEnum: """Check if the sync job is still running.""" - if self._final_status: - return self._final_status + if self._latest_status and self._latest_status in FINAL_STATUSES: + return self._latest_status job_info = api_util.get_job_info( job_id=self.job_id, api_root=self.workspace.api_root, api_key=self.workspace.api_key, ) - if job_info.status in FINAL_STATUSES: - self._final_status = job_info.status + self._latest_status = job_info.status return job_info.status + + def raise_failure_status( + self, + *, + 
refresh_status: bool = False, + ) -> None: + """Raise an exception if the sync job failed. + + By default, this method will use the latest status available. If you want to refresh the status + before checking for failure, set `refresh_status=True`. If the job has failed, this method will + raise a `HostedConnectionSyncError`. + + Otherwise, do nothing. + """ + latest_status = self._latest_status + if refresh_status: + latest_status = self.get_job_status() + + if latest_status in FAILED_STATUSES: + raise HostedConnectionSyncError( + workspace=self.workspace, + connection_id=self.connection_id, + job_id=self.job_id, + job_status=self._latest_status, + ) + + def wait_for_completion( + self, + *, + wait_timeout: int = DEFAULT_SYNC_TIMEOUT_SECONDS, + raise_timeout: bool = True, + raise_failure: bool = False, + ) -> JobStatusEnum: + """Wait for a job to finish running.""" + start_time = time.time() + while True: + latest_status = self.get_job_status() + if latest_status in FINAL_STATUSES: + if raise_failure: + # No-op if the job succeeded or is still running: + self.raise_failure_status() + + return latest_status + + if time.time() - start_time > wait_timeout: + if raise_timeout: + raise HostedConnectionSyncTimeoutError( + workspace=self.workspace, + connection_id=self.connection_id, + job_id=self.job_id, + job_status=latest_status, + timeout=wait_timeout, + ) + + return latest_status # This will be a non-final status + + time.sleep(api_util.JOB_WAIT_INTERVAL_SECS) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index 6917574e..82dcd8e6 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -224,6 +224,8 @@ def delete_connection( def run_sync( self, connection_id: str, + wait_for_job: bool = True, + wait_timeout: int = 300, ) -> SyncResult: """Run a sync on a deployed connection.""" connection_response = api_util.run_connection( @@ -231,11 +233,17 @@ def run_sync( api_root=self.api_root, api_key=self.api_key, workspace_id=self.workspace_id, - wait_for_job=True, - raise_on_failure=True, ) - return SyncResult( + sync_result = SyncResult( workspace=self, connection_id=connection_response.connection_id, job_id=connection_response.job_id, ) + if wait_for_job: + sync_result.wait_for_completion( + wait_timeout=wait_timeout, + raise_failure=True, + raise_timeout=True, + ) + + return sync_result diff --git a/airbyte/exceptions.py b/airbyte/exceptions.py index 2820b54a..5224ed39 100644 --- a/airbyte/exceptions.py +++ b/airbyte/exceptions.py @@ -37,6 +37,7 @@ """ from __future__ import annotations +from ast import Not from dataclasses import dataclass from textwrap import indent from typing import TYPE_CHECKING, Any @@ -44,6 +45,8 @@ if TYPE_CHECKING: from airbyte._util.api_duck_types import AirbyteApiResponseDuckType + from airbyte.cloud._sync_results import SyncResult + from airbyte.cloud._workspaces import CloudWorkspace NEW_ISSUE_URL = "https://github.com/airbytehq/airbyte/issues/new/choose" @@ -319,6 +322,67 @@ class HostedAirbyteError(AirbyteError): response: AirbyteApiResponseDuckType | None = None """The API response from the failed request.""" + workspace: CloudWorkspace | None = None + """The workspace where the error occurred.""" + + @property + def workspace_url(self) -> str | None: + if self.workspace: + return f"{self.workspace.api_root}/workspaces/{self.workspace.workspace_id}" + + return None + + +@dataclass +class HostedAirbyteConnectionError(HostedAirbyteError): + """An connection error occurred while communicating with the hosted Airbyte 
instance.""" + + connection_id: str | None = None + """The connection ID where the error occurred.""" + + job_id: str | None = None + """The job ID where the error occurred (if applicable).""" + + job_status: str | None = None + """The latest status of the job where the error occurred (if applicable).""" + + @property + def connection_url(self) -> str | None: + if self.workspace_url and self.connection_id: + return f"{self.workspace_url}/connections/{self.connection_id}" + + return None + + @property + def job_history_url(self) -> str | None: + if self.connection_url: + return f"{self.connection_url}/job-history" + + return None + + @property + def job_url(self) -> str | None: + if self.job_history_url and self.job_id: + return f"{self.job_history_url}#{self.job_id}::0" + + return None + + +@dataclass +class HostedConnectionSyncError(HostedAirbyteConnectionError): + """An error occurred while executing the remote Airbyte job.""" + + +@dataclass +class HostedConnectionSyncTimeoutError(HostedConnectionSyncError): + """An timeout occurred while waiting for the remote Airbyte job to complete.""" + + timeout: int | None = None + """The timeout in seconds that was reached.""" + + +# Airbyte Resource Errors (General) + @dataclass class MissingResourceError(HostedAirbyteError): @@ -334,7 +398,3 @@ class MultipleResourcesError(HostedAirbyteError): resource_type: str | None = None resource_name_or_id: str | None = None - - -class HostedConnectionSyncError(HostedAirbyteError): - """An error occurred while executing the remote Airbyte job.""" From ee2da5ef6e37ea6f13cfc728b99557227034c146 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 26 Mar 2024 16:09:14 -0700 Subject: [PATCH 021/118] rename arg to 'wait' --- airbyte/cloud/_workspaces.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index 82dcd8e6..bfaa2eff 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -224,7 +224,8 @@ def delete_connection( def run_sync( self, connection_id: str, - wait_for_job: bool = True, + *, + wait: bool = True, wait_timeout: int = 300, ) -> SyncResult: """Run a sync on a deployed connection.""" @@ -239,7 +240,7 @@ def run_sync( connection_id=connection_response.connection_id, job_id=connection_response.job_id, ) - if wait_for_job: + if wait: sync_result.wait_for_completion( wait_timeout=wait_timeout, raise_failure=True, From 3ab045a21d1205b31a5840c309f0df349e3d427b Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 26 Mar 2024 16:24:20 -0700 Subject: [PATCH 022/118] tidy up, add comments --- airbyte/_util/api_util.py | 174 ++++++++++++++++++++--------------- airbyte/cloud/_workspaces.py | 69 +++++++++++++- 2 files changed, 167 insertions(+), 76 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 1c3acc0b..046c1cc4 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -31,6 +31,9 @@ CLOUD_API_ROOT = "https://api.airbyte.com/v1" +# Helper functions + + def status_ok(status_code: int) -> bool: """Check if a status code is OK.""" return status_code >= 200 and status_code < 300 # noqa: PLR2004 # allow inline magic numbers @@ -56,6 +59,9 @@ def get_airbyte_server_instance( ) +# Get workspace + + def get_workspace( workspace_id: str, *, @@ -85,6 +91,9 @@ def get_workspace( ) +# List, get, and run connections + + def list_connections( workspace_id: str, *, @@ -177,6 +186,9 @@ def run_connection( ) +# Get job info (logs) + + def get_job_info( job_id: str, 
*, @@ -200,59 +212,8 @@ def get_job_info( raise MissingResourceError(job_id, "job", response.text) -def get_connection_by_name( - workspace_id: str, - connection_name: str, - *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, -) -> api_models.ConnectionResponse: - """Get a connection.""" - connections = list_connections( - workspace_id=workspace_id, - api_key=api_key, - api_root=api_root, - ) - found: list[api_models.ConnectionResponse] = [ - connection for connection in connections if connection.name == connection_name - ] - if len(found) == 0: - raise MissingResourceError(connection_name, "connection", f"Workspace: {workspace_id}") - - if len(found) > 1: - raise MultipleResourcesError( - resource_type="connection", - resource_name_or_id=connection_name, - context={ - "workspace_id": workspace_id, - "multiples": found, - }, - ) - return found[0] - - -def get_source( - source_id: str, - *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, -) -> api_models.SourceResponse: - """Get a connection.""" - api_key = api_key or get_default_bearer_token() - airbyte_instance = get_airbyte_server_instance( - api_key=api_key, - api_root=api_root, - ) - response = airbyte_instance.sources.get_source( - api_operations.GetSourceRequest( - source_id=source_id, - ), - ) - if status_ok(response.status_code) and response.connection_response: - return response.connection_response - - raise MissingResourceError(source_id, "source", response.text) +# Create, get, and delete sources def create_source( @@ -286,6 +247,28 @@ def create_source( response=response, ) +def get_source( + source_id: str, + *, + api_root: str = CLOUD_API_ROOT, + api_key: str | None = None, +) -> api_models.SourceResponse: + """Get a connection.""" + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.sources.get_source( + api_operations.GetSourceRequest( + source_id=source_id, + ), + ) + if status_ok(response.status_code) and response.connection_response: + return response.connection_response + + raise MissingResourceError(source_id, "source", response.text) + def delete_source( source_id: str, @@ -315,6 +298,9 @@ def delete_source( ) +# Create, get, and delete destinations + + def create_destination( name: str, *, @@ -347,6 +333,29 @@ def create_destination( ) +def get_destination( + destination_id: str, + *, + api_root: str = CLOUD_API_ROOT, + api_key: str | None = None, +) -> api_models.DestinationResponse: + """Get a connection.""" + api_key = api_key or get_default_bearer_token() + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.sources.get_destination( + api_operations.GetDestinationRequest( + destination_id=destination_id, + ), + ) + if status_ok(response.status_code) and response.connection_response: + return response.connection_response + + raise MissingResourceError(destination_id, "destination", response.text) + + def delete_destination( destination_id: str, *, @@ -375,6 +384,9 @@ def delete_destination( ) +# Create and delete connections + + def create_connection( name: str, *, @@ -414,6 +426,38 @@ def create_connection( return response.connection_response +def get_connection_by_name( + workspace_id: str, + connection_name: str, + *, + api_root: str = CLOUD_API_ROOT, + api_key: str | None = None, +) -> api_models.ConnectionResponse: + """Get a connection.""" + connections = list_connections( + 
workspace_id=workspace_id, + api_key=api_key, + api_root=api_root, + ) + found: list[api_models.ConnectionResponse] = [ + connection for connection in connections if connection.name == connection_name + ] + if len(found) == 0: + raise MissingResourceError(connection_name, "connection", f"Workspace: {workspace_id}") + + if len(found) > 1: + raise MultipleResourcesError( + resource_type="connection", + resource_name_or_id=connection_name, + context={ + "workspace_id": workspace_id, + "multiples": found, + }, + ) + + return found[0] + + def delete_connection( connection_id: str, api_root: str, @@ -440,6 +484,9 @@ def delete_connection( ) +# Not yet implemented + + def check_source( source_id: str, *, @@ -454,26 +501,3 @@ def check_source( """ _ = source_id, workspace_id, api_root, api_key raise NotImplementedError - - -def get_destination( - destination_id: str, - *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, -) -> api_models.DestinationResponse: - """Get a connection.""" - api_key = api_key or get_default_bearer_token() - airbyte_instance = get_airbyte_server_instance( - api_key=api_key, - api_root=api_root, - ) - response = airbyte_instance.sources.get_destination( - api_operations.GetDestinationRequest( - destination_id=destination_id, - ), - ) - if status_ok(response.status_code) and response.connection_response: - return response.connection_response - - raise MissingResourceError(destination_id, "destination", response.text) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index bfaa2eff..d58f0123 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -46,14 +46,23 @@ class CloudWorkspace: api_key: str api_root: str = CLOUD_API_ROOT + # Test connection and creds + def connect(self) -> None: - """Check that the workspace is reachable and raise an exception otherwise.""" + """Check that the workspace is reachable and raise an exception otherwise. + + Note: It is not necessary to call this method before calling other operations. It + serves primarily as a simple check to ensure that the workspace is reachable + and credentials are correct. + """ _ = get_workspace( api_root=self.api_root, api_key=self.api_key, workspace_id=self.workspace_id, ) + # Deploy and delete sources + def deploy_source( self, source: Source, @@ -106,6 +115,8 @@ def delete_source( api_key=self.api_key, ) + # Deploy and delete destinations + def deploy_cache_as_destination( self, cache: CacheBase, @@ -163,6 +174,8 @@ def delete_destination( api_key=self.api_key, ) + # Deploy and delete connections + def deploy_connection( self, source: Source, @@ -221,6 +234,8 @@ def delete_connection( if delete_destination: self.delete_destination(destination_id=connection.destination_id) + # Run syncs + def run_sync( self, connection_id: str, @@ -248,3 +263,55 @@ def run_sync( ) return sync_result + + # Get sync results and previous sync logs + + def get_sync_result( + self, + connection_id: str, + job_id: str | None, + ) -> SyncResult | None: + """Get the sync result for a connection job. + + If `job_id` is not provided, the most recent sync job will be used. + + Returns `None` if job_id is omitted and no previous jobs are found. 
+ """ + if job_id is None: + results = self.get_previous_sync_logs( + connection_id=connection_id, + num_sync_logs=1, + ) + if results: + return results[0] + + return None + + return SyncResult( + workspace=self, + connection_id=connection_id, + job_id=job_id, + ) + + def get_previous_sync_logs( + self, + connection_id: str, + *, + num_sync_logs: int = 10, + ) -> list[SyncResult]: + """Get the previous sync logs for a connection.""" + sync_logs = api_util.get_connection_sync_logs( + connection_id=connection_id, + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + num_sync_logs=num_sync_logs, + ) + return [ + SyncResult( + workspace=self, + connection_id=sync_log.connection_id, + job_id=sync_log.job_id, + ) + for sync_log in sync_logs + ] From a5356d83422ea5571235f23c47a3d0ec69de47fe Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 26 Mar 2024 16:39:00 -0700 Subject: [PATCH 023/118] remove defaults in low-level functions --- airbyte/_util/api_util.py | 110 ++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 046c1cc4..1ef1a0d3 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -10,7 +10,6 @@ from __future__ import annotations import os -from time import sleep from typing import Any import airbyte_api @@ -46,11 +45,10 @@ def get_default_bearer_token() -> str | None: def get_airbyte_server_instance( *, - api_key: str | None = None, - api_root: str = CLOUD_API_ROOT, + api_key: str, + api_root: str, ) -> airbyte_api.Airbyte: """Get an Airbyte instance.""" - api_key = api_key or get_default_bearer_token() return airbyte_api.Airbyte( security=api_models.Security( bearer_auth=api_key, @@ -65,11 +63,10 @@ def get_airbyte_server_instance( def get_workspace( workspace_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> api_models.WorkspaceResponse: """Get a connection.""" - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -97,12 +94,11 @@ def get_workspace( def list_connections( workspace_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> list[api_models.ConnectionResponse]: """Get a connection.""" _ = workspace_id # Not used (yet) - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -128,12 +124,11 @@ def get_connection( workspace_id: str, connection_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> api_models.ConnectionResponse: """Get a connection.""" _ = workspace_id # Not used (yet) - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -153,8 +148,8 @@ def run_connection( workspace_id: str, connection_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> api_models.ConnectionResponse: """Get a connection. @@ -163,7 +158,6 @@ def run_connection( If raise_on_failure is True, this will raise an exception if the connection fails. 
""" _ = workspace_id # Not used (yet) - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -189,14 +183,46 @@ def run_connection( # Get job info (logs) +def get_job_logs( + workspace_id: str, + connection_id: str, + limit: int = 20, + *, + api_root: str, + api_key: str, +) -> list[api_models.JobResponse]: + """Get a job's logs.""" + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response: api_operations.ListJobsResponse = airbyte_instance.jobs.list_jobs( + api_operations.ListJobsRequest( + workspace_ids=[workspace_id], + connection_id=connection_id, + limit=limit, + ), + ) + if status_ok(response.status_code) and response.jobs_response: + return response.jobs_response.data + + raise MissingResourceError( + response=response, + resource_type="job", + context={ + "workspace_id": workspace_id, + "connection_id": connection_id, + }, + ) + + def get_job_info( job_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> api_models.JobResponse: """Get a job.""" - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -221,11 +247,10 @@ def create_source( *, workspace_id: str, config: dict[str, Any], - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> api_models.SourceResponse: """Get a connection.""" - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -250,11 +275,10 @@ def create_source( def get_source( source_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> api_models.SourceResponse: """Get a connection.""" - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -273,13 +297,12 @@ def get_source( def delete_source( source_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, workspace_id: str | None = None, ) -> None: """Delete a source.""" _ = workspace_id # Not used (yet) - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -306,11 +329,10 @@ def create_destination( *, workspace_id: str, config: dict[str, Any], - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> api_models.DestinationResponse: """Get a connection.""" - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -336,11 +358,10 @@ def create_destination( def get_destination( destination_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> api_models.DestinationResponse: """Get a connection.""" - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -359,13 +380,12 @@ def get_destination( def delete_destination( destination_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, workspace_id: str | None = None, ) -> None: """Delete a destination.""" _ = workspace_id # Not used (yet) - api_key = api_key or get_default_bearer_token() 
airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -393,11 +413,10 @@ def create_connection( source_id: str, destination_id: str, api_root: str, - api_key: str | None = None, + api_key: str, workspace_id: str | None = None, ) -> api_models.ConnectionResponse: _ = workspace_id # Not used (yet) - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -430,8 +449,8 @@ def get_connection_by_name( workspace_id: str, connection_name: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, ) -> api_models.ConnectionResponse: """Get a connection.""" connections = list_connections( @@ -461,11 +480,10 @@ def get_connection_by_name( def delete_connection( connection_id: str, api_root: str, - workspace_id: str | None = None, - api_key: str | None = None, + workspace_id: str, + api_key: str, ) -> None: _ = workspace_id # Not used (yet) - api_key = api_key or get_default_bearer_token() airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, @@ -490,8 +508,8 @@ def delete_connection( def check_source( source_id: str, *, - api_root: str = CLOUD_API_ROOT, - api_key: str | None = None, + api_root: str, + api_key: str, workspace_id: str | None = None, ) -> api_models.SourceCheckResponse: """Check a source. From d1c4f3fd87898e50243905d2f49ac3e67a7616b7 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 26 Mar 2024 16:39:44 -0700 Subject: [PATCH 024/118] implement logs lookup --- airbyte/cloud/_sync_results.py | 6 +++--- airbyte/cloud/_workspaces.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py index 461cd487..3a8c9384 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/_sync_results.py @@ -65,9 +65,9 @@ def raise_failure_status( ) -> None: """Raise an exception if the sync job failed. - By default, this method will use the latest status available. If you want to refresh the status - before checking for failure, set `refresh_status=True`. If the job has failed, this method will - raise a `HostedConnectionSyncError`. + By default, this method will use the latest status available. If you want to refresh the + status before checking for failure, set `refresh_status=True`. If the job has failed, this + method will raise a `HostedConnectionSyncError`. Otherwise, do nothing. 
""" diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index d58f0123..694f45a6 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -30,6 +30,7 @@ if TYPE_CHECKING: from airbyte_api.models.shared.connectionresponse import ConnectionResponse from airbyte_api.models.shared.destinationresponse import DestinationResponse + from airbyte_api.models.shared.jobresponse import JobResponse from airbyte.caches.base import CacheBase @@ -300,18 +301,19 @@ def get_previous_sync_logs( num_sync_logs: int = 10, ) -> list[SyncResult]: """Get the previous sync logs for a connection.""" - sync_logs = api_util.get_connection_sync_logs( + sync_logs: list[JobResponse] = api_util.get_job_logs( connection_id=connection_id, api_root=self.api_root, api_key=self.api_key, workspace_id=self.workspace_id, - num_sync_logs=num_sync_logs, + limit=num_sync_logs, ) return [ SyncResult( workspace=self, connection_id=sync_log.connection_id, job_id=sync_log.job_id, + _latest_status=sync_log.status, ) for sync_log in sync_logs ] From 142d7f653bb295c1dcb7923e4618552b8d5289c7 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 26 Mar 2024 16:39:51 -0700 Subject: [PATCH 025/118] remove extra import --- airbyte/exceptions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/airbyte/exceptions.py b/airbyte/exceptions.py index 5224ed39..ea2130cd 100644 --- a/airbyte/exceptions.py +++ b/airbyte/exceptions.py @@ -37,7 +37,6 @@ """ from __future__ import annotations -from ast import Not from dataclasses import dataclass from textwrap import indent from typing import TYPE_CHECKING, Any @@ -45,7 +44,6 @@ if TYPE_CHECKING: from airbyte._util.api_duck_types import AirbyteApiResponseDuckType - from airbyte.cloud._sync_results import SyncResult from airbyte.cloud._workspaces import CloudWorkspace From 24c462c1f6f6ad8131af6f475d631a73e5f51244 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Fri, 29 Mar 2024 22:36:43 -0700 Subject: [PATCH 026/118] fix get_destination() --- airbyte/_util/api_util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 1ef1a0d3..8ad2bc01 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -366,13 +366,13 @@ def get_destination( api_key=api_key, api_root=api_root, ) - response = airbyte_instance.sources.get_destination( + response = airbyte_instance.destinations.get_destination( api_operations.GetDestinationRequest( destination_id=destination_id, ), ) - if status_ok(response.status_code) and response.connection_response: - return response.connection_response + if status_ok(response.status_code) and response.destination_response: + return response.destination_response raise MissingResourceError(destination_id, "destination", response.text) From 6e7f635e2ad19aaed417a2a3d97455a14fad752a Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Fri, 29 Mar 2024 22:39:32 -0700 Subject: [PATCH 027/118] split destination util, add destinations module --- airbyte/cloud/_destination_util.py | 200 ++++++++++++++++++++++++ airbyte/cloud/_destinations.py | 234 ++++++----------------------- airbyte/cloud/_workspaces.py | 9 +- 3 files changed, 254 insertions(+), 189 deletions(-) create mode 100644 airbyte/cloud/_destination_util.py diff --git a/airbyte/cloud/_destination_util.py b/airbyte/cloud/_destination_util.py new file mode 100644 index 00000000..a8226643 --- /dev/null +++ b/airbyte/cloud/_destination_util.py @@ -0,0 +1,200 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights 
reserved. +"""Cloud destinations for Airbyte.""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from airbyte_api.models.shared import ( + DestinationBigquery, + DestinationDuckdb, + DestinationPostgres, + DestinationSnowflake, +) + +from airbyte.caches import ( + BigQueryCache, + DuckDBCache, + MotherDuckCache, + PostgresCache, + SnowflakeCache, +) +from airbyte.secrets import get_secret + + +if TYPE_CHECKING: + from collections.abc import Callable + + from airbyte.caches.base import CacheBase + + +def get_destination_config_from_cache( + cache: CacheBase, +) -> dict[str, str]: + """Get the destination configuration from the cache.""" + conversion_fn_map: dict[str, Callable[[Any], dict[str, str]]] = { + "BigQueryCache": get_bigquery_destination_config, + "DuckDBCache": get_duckdb_destination_config, + "MotherDuckCache": get_motherduck_destination_config, + "PostgresCache": get_postgres_destination_config, + "SnowflakeCache": get_snowflake_destination_config, + } + cache_class_name = cache.__class__.__name__ + if cache_class_name not in conversion_fn_map: + raise ValueError( # noqa: TRY003 + "Cannot convert cache type to destination configuration. Cache type not supported. ", + f"Supported cache types: {list(conversion_fn_map.keys())}", + ) + + conversion_fn = conversion_fn_map[cache_class_name] + return conversion_fn(cache) + + +def get_duckdb_destination_config( + cache: DuckDBCache, +) -> dict[str, str]: + """Get the destination configuration from the DuckDB cache.""" + return DestinationDuckdb( + destination_path=cache.db_path, + schema=cache.schema_name, + ).to_dict() + + +def get_motherduck_destination_config( + cache: MotherDuckCache, +) -> dict[str, str]: + """Get the destination configuration from the DuckDB cache.""" + return DestinationDuckdb( + destination_path=cache.db_path, + schema=cache.schema_name, + motherduck_api_key=cache.api_key, + ).to_dict() + + +def get_postgres_destination_config( + cache: PostgresCache, +) -> dict[str, str]: + """Get the destination configuration from the Postgres cache.""" + return DestinationPostgres( + database=cache.database, + host=cache.host, + password=cache.password, + port=cache.port, + schema=cache.schema_name, + username=cache.username, + ).to_dict() + + +def get_snowflake_destination_config( + cache: SnowflakeCache, +) -> dict[str, str]: + """Get the destination configuration from the Snowflake cache.""" + return DestinationSnowflake( + account=cache.account, + database=cache.database, + password=cache.password, + role=cache.role, + schema=cache.schema_name, + username=cache.username, + warehouse=cache.warehouse, + ).to_dict() + + +def get_bigquery_destination_config( + cache: BigQueryCache, +) -> dict[str, str]: + """Get the destination configuration from the BigQuery cache.""" + return DestinationBigquery( + project_id=cache.project_name, + dataset_id=cache.dataset_name, + schema=cache.schema_name, + credentials_json=Path(cache.credentials_path).read_text(), + ).to_dict() + + +def create_bigquery_cache( + destination_configuration: dict[str, str], +) -> BigQueryCache: + """Create a new BigQuery cache from the destination configuration.""" + credentials_path = get_secret("BIGQUERY_CREDENTIALS_PATH") + return BigQueryCache( + project_name=destination_configuration["project_id"], + dataset_name=destination_configuration["dataset_id"], + schema_name=destination_configuration["schema"], + credentials_path=credentials_path, + ) + + +def create_duckdb_cache( + 
destination_configuration: dict[str, str], +) -> DuckDBCache: + """Create a new DuckDB cache from the destination configuration.""" + return DuckDBCache( + db_path=destination_configuration["destination_path"], + schema_name=destination_configuration["schema"], + ) + + +def create_motherduck_cache( + destination_configuration: dict[str, str], +) -> MotherDuckCache: + """Create a new DuckDB cache from the destination configuration.""" + return MotherDuckCache( + database=destination_configuration["destination_path"], + schema_name=destination_configuration["schema"], + api_key=destination_configuration["motherduck_api_key"], + ) + + +def create_postgres_cache( + destination_configuration: dict[str, str], +) -> PostgresCache: + """Create a new Postgres cache from the destination configuration.""" + port: int = ( + int(destination_configuration["port"]) if "port" in destination_configuration else 5432 + ) + return PostgresCache( + database=destination_configuration["database"], + host=destination_configuration["host"], + password=destination_configuration["password"], + port=port, + schema_name=destination_configuration["schema"], + username=destination_configuration["username"], + ) + + +def create_snowflake_cache( + destination_configuration: dict[str, str], +) -> SnowflakeCache: + """Create a new Snowflake cache from the destination configuration.""" + return SnowflakeCache( + account=destination_configuration["account"], + database=destination_configuration["database"], + password=destination_configuration["password"], + role=destination_configuration["role"], + schema_name=destination_configuration["schema"], + username=destination_configuration["username"], + warehouse=destination_configuration["warehouse"], + ) + + +def create_cache_from_destination( + destination_configuration: dict[str, str], +) -> CacheBase: + """Create a new cache from the destination.""" + conversion_fn_map: dict[str, Callable[[dict[str, str]], CacheBase]] = { + "DestinationBigquery": create_bigquery_cache, + "DestinationDuckdb": create_duckdb_cache, + "DestinationPostgres": create_postgres_cache, + "DestinationSnowflake": create_snowflake_cache, + } + destination_class_name = destination_configuration["destination_type"] + if destination_class_name not in conversion_fn_map: + raise ValueError( # noqa: TRY003 + "Cannot convert destination configuration to cache. Destination type not supported. 
", + f"Supported destination types: {list(conversion_fn_map.keys())}", + ) + + conversion_fn = conversion_fn_map[destination_class_name] + return conversion_fn(destination_configuration) diff --git a/airbyte/cloud/_destinations.py b/airbyte/cloud/_destinations.py index a8226643..c3cc3bc8 100644 --- a/airbyte/cloud/_destinations.py +++ b/airbyte/cloud/_destinations.py @@ -3,198 +3,56 @@ from __future__ import annotations -from pathlib import Path -from typing import TYPE_CHECKING, Any - -from airbyte_api.models.shared import ( - DestinationBigquery, - DestinationDuckdb, - DestinationPostgres, - DestinationSnowflake, -) - -from airbyte.caches import ( - BigQueryCache, - DuckDBCache, - MotherDuckCache, - PostgresCache, - SnowflakeCache, -) -from airbyte.secrets import get_secret +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from airbyte._util import api_util +from airbyte.cloud import _destination_util as dest_util if TYPE_CHECKING: - from collections.abc import Callable + from airbyte_api.models.shared.destinationresponse import DestinationResponse + from sqlalchemy.engine import Engine from airbyte.caches.base import CacheBase + from airbyte.cloud._workspaces import CloudWorkspace + + +@dataclass +class CloudDestination: + """A cloud destination for Airbyte.""" + + workspace: CloudWorkspace + destination_id: str + destination_type: str + + _destination_response: DestinationResponse | None = None + _as_cache: CacheBase | None = None + + def _get_destination_response(self, *, force_refresh: bool = False) -> DestinationResponse: + """Get the destination response.""" + if self._destination_response is None or force_refresh: + self._destination_response = api_util.get_destination( + destination_id=self.destination_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + ) + + return self._destination_response + + def get_destination_config(self) -> dict[str, str]: + """Get the destination configuration.""" + return self._get_destination_response().configuration + + def as_cache(self) -> CacheBase: + """Get the cache for the destination.""" + if self._as_cache is None: + self._as_cache = dest_util.create_cache_from_destination( + destination_configuration=self.get_destination_config(), + ) + return self._as_cache -def get_destination_config_from_cache( - cache: CacheBase, -) -> dict[str, str]: - """Get the destination configuration from the cache.""" - conversion_fn_map: dict[str, Callable[[Any], dict[str, str]]] = { - "BigQueryCache": get_bigquery_destination_config, - "DuckDBCache": get_duckdb_destination_config, - "MotherDuckCache": get_motherduck_destination_config, - "PostgresCache": get_postgres_destination_config, - "SnowflakeCache": get_snowflake_destination_config, - } - cache_class_name = cache.__class__.__name__ - if cache_class_name not in conversion_fn_map: - raise ValueError( # noqa: TRY003 - "Cannot convert cache type to destination configuration. Cache type not supported. 
", - f"Supported cache types: {list(conversion_fn_map.keys())}", - ) - - conversion_fn = conversion_fn_map[cache_class_name] - return conversion_fn(cache) - - -def get_duckdb_destination_config( - cache: DuckDBCache, -) -> dict[str, str]: - """Get the destination configuration from the DuckDB cache.""" - return DestinationDuckdb( - destination_path=cache.db_path, - schema=cache.schema_name, - ).to_dict() - - -def get_motherduck_destination_config( - cache: MotherDuckCache, -) -> dict[str, str]: - """Get the destination configuration from the DuckDB cache.""" - return DestinationDuckdb( - destination_path=cache.db_path, - schema=cache.schema_name, - motherduck_api_key=cache.api_key, - ).to_dict() - - -def get_postgres_destination_config( - cache: PostgresCache, -) -> dict[str, str]: - """Get the destination configuration from the Postgres cache.""" - return DestinationPostgres( - database=cache.database, - host=cache.host, - password=cache.password, - port=cache.port, - schema=cache.schema_name, - username=cache.username, - ).to_dict() - - -def get_snowflake_destination_config( - cache: SnowflakeCache, -) -> dict[str, str]: - """Get the destination configuration from the Snowflake cache.""" - return DestinationSnowflake( - account=cache.account, - database=cache.database, - password=cache.password, - role=cache.role, - schema=cache.schema_name, - username=cache.username, - warehouse=cache.warehouse, - ).to_dict() - - -def get_bigquery_destination_config( - cache: BigQueryCache, -) -> dict[str, str]: - """Get the destination configuration from the BigQuery cache.""" - return DestinationBigquery( - project_id=cache.project_name, - dataset_id=cache.dataset_name, - schema=cache.schema_name, - credentials_json=Path(cache.credentials_path).read_text(), - ).to_dict() - - -def create_bigquery_cache( - destination_configuration: dict[str, str], -) -> BigQueryCache: - """Create a new BigQuery cache from the destination configuration.""" - credentials_path = get_secret("BIGQUERY_CREDENTIALS_PATH") - return BigQueryCache( - project_name=destination_configuration["project_id"], - dataset_name=destination_configuration["dataset_id"], - schema_name=destination_configuration["schema"], - credentials_path=credentials_path, - ) - - -def create_duckdb_cache( - destination_configuration: dict[str, str], -) -> DuckDBCache: - """Create a new DuckDB cache from the destination configuration.""" - return DuckDBCache( - db_path=destination_configuration["destination_path"], - schema_name=destination_configuration["schema"], - ) - - -def create_motherduck_cache( - destination_configuration: dict[str, str], -) -> MotherDuckCache: - """Create a new DuckDB cache from the destination configuration.""" - return MotherDuckCache( - database=destination_configuration["destination_path"], - schema_name=destination_configuration["schema"], - api_key=destination_configuration["motherduck_api_key"], - ) - - -def create_postgres_cache( - destination_configuration: dict[str, str], -) -> PostgresCache: - """Create a new Postgres cache from the destination configuration.""" - port: int = ( - int(destination_configuration["port"]) if "port" in destination_configuration else 5432 - ) - return PostgresCache( - database=destination_configuration["database"], - host=destination_configuration["host"], - password=destination_configuration["password"], - port=port, - schema_name=destination_configuration["schema"], - username=destination_configuration["username"], - ) - - -def create_snowflake_cache( - destination_configuration: dict[str, 
str], -) -> SnowflakeCache: - """Create a new Snowflake cache from the destination configuration.""" - return SnowflakeCache( - account=destination_configuration["account"], - database=destination_configuration["database"], - password=destination_configuration["password"], - role=destination_configuration["role"], - schema_name=destination_configuration["schema"], - username=destination_configuration["username"], - warehouse=destination_configuration["warehouse"], - ) - - -def create_cache_from_destination( - destination_configuration: dict[str, str], -) -> CacheBase: - """Create a new cache from the destination.""" - conversion_fn_map: dict[str, Callable[[dict[str, str]], CacheBase]] = { - "DestinationBigquery": create_bigquery_cache, - "DestinationDuckdb": create_duckdb_cache, - "DestinationPostgres": create_postgres_cache, - "DestinationSnowflake": create_snowflake_cache, - } - destination_class_name = destination_configuration["destination_type"] - if destination_class_name not in conversion_fn_map: - raise ValueError( # noqa: TRY003 - "Cannot convert destination configuration to cache. Destination type not supported. ", - f"Supported destination types: {list(conversion_fn_map.keys())}", - ) - - conversion_fn = conversion_fn_map[destination_class_name] - return conversion_fn(destination_configuration) + def get_sql_engine(self) -> Engine: + """Get the SQL engine for the destination.""" + return self.as_cache().get_sql_engine() diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index 694f45a6..a9de3753 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -22,7 +22,7 @@ get_connection, get_workspace, ) -from airbyte.cloud._destinations import get_destination_config_from_cache +from airbyte.cloud._destination_util import get_destination_config_from_cache from airbyte.cloud._sync_results import SyncResult from airbyte.sources.base import Source @@ -317,3 +317,10 @@ def get_previous_sync_logs( ) for sync_log in sync_logs ] + + def _get_destination_sql_engine( + self, + destination_id: str, + ) -> Engine: + """Get the SQL engine for a deployed destination.""" + return cache.processor.get_sql_engine() From f5480078a400076cdfb302186d92bc38b555d678 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Fri, 29 Mar 2024 22:44:14 -0700 Subject: [PATCH 028/118] remove unused --- airbyte/cloud/_workspaces.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index a9de3753..08aadbd0 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -317,10 +317,3 @@ def get_previous_sync_logs( ) for sync_log in sync_logs ] - - def _get_destination_sql_engine( - self, - destination_id: str, - ) -> Engine: - """Get the SQL engine for a deployed destination.""" - return cache.processor.get_sql_engine() From 12170f3a017cb7b83ee3b9b8ca4fa7f42dfe480e Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 1 Apr 2024 09:33:10 -0700 Subject: [PATCH 029/118] add read-from-destination-cache scaffold and failing test --- airbyte/cloud/_sync_results.py | 101 +++++++++++++++++- .../cloud/test_cloud_sync.py | 4 + 2 files changed, 103 insertions(+), 2 deletions(-) diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py index 3a8c9384..87c54d05 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/_sync_results.py @@ -4,12 +4,15 @@ from __future__ import annotations import time +from collections.abc import Iterator, Mapping from dataclasses import dataclass -from 
typing import TYPE_CHECKING +from typing import TYPE_CHECKING, final -from airbyte_api.models.shared import JobStatusEnum +from airbyte_api.models.shared import ConnectionResponse, JobStatusEnum from airbyte._util import api_util +from airbyte.cloud._destination_util import create_cache_from_destination +from airbyte.datasets import CachedDataset from airbyte.exceptions import HostedConnectionSyncError, HostedConnectionSyncTimeoutError @@ -17,6 +20,9 @@ if TYPE_CHECKING: + import sqlalchemy + + from airbyte.caches.base import CacheBase from airbyte.cloud._workspaces import CloudWorkspace @@ -39,6 +45,29 @@ class SyncResult: connection_id: str job_id: str _latest_status: JobStatusEnum | None = None + _connection_response: ConnectionResponse | None = None + + def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResponse: + """TODO""" + if self._connection_response and not force_refresh: + return self._connection_response + + self._connection_response = api_util.get_connection( + workspace_id=self.workspace.workspace_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + connection_id=self.connection_id, + ) + return self._connection_response + + def _get_destination_configuration(self, *, force_refresh: bool = False) -> dict[str, Any]: + connection_info: ConnectionResponse = self._get_connection_info(force_refresh=force_refresh) + destination_response = api_util.get_destination( + destination_id=connection_info.destination_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + ) + return destination_response.configuration def is_job_complete(self) -> bool: """Check if the sync job is complete.""" @@ -114,3 +143,71 @@ def wait_for_completion( return latest_status # This will be a non-final status time.sleep(api_util.JOB_WAIT_INTERVAL_SECS) + + def get_sql_cache(self) -> CacheBase: + """Return a SQL Cache object for working with the data in a SQL-based destination's.""" + # TODO: Implement + return create_cache_from_destination(destination_configuration) + + def get_sql_engine(self) -> sqlalchemy.engine.Engine: + """Return a SQL Engine for querying a SQL-based destination.""" + self.get_sql_cache().get_sql_engine() + + def get_sql_table_name(self, stream_name: str) -> str: + """Return the SQL table name of the named stream.""" + return self.get_sql_cache().processor.get_sql_table_name(stream_name=stream_name) + + def get_sql_table( + self, + stream_name: str, + ) -> sqlalchemy.Table: + """Return a SQLAlchemy table object for the named stream.""" + self.get_sql_cache().processor.get_sql_table(stream_name) + + def get_dataset(self, stream_name: str) -> CachedDataset: + """Return cached dataset.""" + cache = self.get_sql_cache() + return cache.streams[stream_name] + + def get_sql_database_name(self) -> str: + """Return the SQL database name.""" + cache = self.get_sql_cache() + return cache.get_database_name() + + def get_sql_schema_name(self) -> str: + """Return the SQL schema name.""" + cache = self.get_sql_cache() + return cache.schema_name + + @property + def stream_names(self) -> set[str]: + """TODO""" + return self.get_sql_cache().processor.expected_streams + + @final + @property + def streams( + self, + ) -> SyncResultStreams: + """Return a temporary table name.""" + return self.SyncResultStreams(self) + + class SyncResultStreams(Mapping[str, CachedDataset]): + """TODO""" + + def __init__( + self, + parent: SyncResult, + /, + ) -> None: + self.parent: SyncResult = parent + + def __getitem__(self, key: str) -> CachedDataset: + 
return self.parent.get_dataset(stream_name=key) + + def __iter__(self) -> Iterator[str]: + """TODO""" + return iter(self.parent.stream_names) + + def __len__(self) -> int: + return len(self.parent.stream_names) diff --git a/tests/integration_tests/cloud/test_cloud_sync.py b/tests/integration_tests/cloud/test_cloud_sync.py index 24a870cf..ca48e4b3 100644 --- a/tests/integration_tests/cloud/test_cloud_sync.py +++ b/tests/integration_tests/cloud/test_cloud_sync.py @@ -54,4 +54,8 @@ def test_deploy_and_run_connection( sync_result = cloud_workspace.run_sync(connection_id=connection_id) _ = sync_result + cache = sync_result.get_sql_cache() + assert cache.stream_names + assert cache.streams["users"].to_pandas() + cloud_workspace.delete_connection(connection_id=connection_id) From 4aaedbf5374e182cc9cd05bf49d1c876e1805e2a Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 3 Apr 2024 21:35:33 -0700 Subject: [PATCH 030/118] format fix --- airbyte/_util/api_util.py | 2 +- airbyte/cloud/_sync_results.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 8ad2bc01..f4cc93ec 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -238,7 +238,6 @@ def get_job_info( raise MissingResourceError(job_id, "job", response.text) - # Create, get, and delete sources @@ -272,6 +271,7 @@ def create_source( response=response, ) + def get_source( source_id: str, *, diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py index 87c54d05..501eb720 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/_sync_results.py @@ -6,7 +6,7 @@ import time from collections.abc import Iterator, Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, final +from typing import TYPE_CHECKING, Any, final from airbyte_api.models.shared import ConnectionResponse, JobStatusEnum From b616fa35c0dc8fb8c23461c5983543cfd5e44dfb Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 3 Apr 2024 21:44:09 -0700 Subject: [PATCH 031/118] implement missing parts --- airbyte/cloud/_sync_results.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py index 501eb720..cd3508b6 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/_sync_results.py @@ -48,7 +48,7 @@ class SyncResult: _connection_response: ConnectionResponse | None = None def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResponse: - """TODO""" + """Return connection info for the sync job.""" if self._connection_response and not force_refresh: return self._connection_response @@ -61,6 +61,7 @@ def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResp return self._connection_response def _get_destination_configuration(self, *, force_refresh: bool = False) -> dict[str, Any]: + """Return the destination configuration for the sync job.""" connection_info: ConnectionResponse = self._get_connection_info(force_refresh=force_refresh) destination_response = api_util.get_destination( destination_id=connection_info.destination_id, @@ -146,7 +147,7 @@ def wait_for_completion( def get_sql_cache(self) -> CacheBase: """Return a SQL Cache object for working with the data in a SQL-based destination's.""" - # TODO: Implement + destination_configuration = self._get_destination_configuration() return create_cache_from_destination(destination_configuration) def get_sql_engine(self) -> sqlalchemy.engine.Engine: @@ -181,7 
+182,7 @@ def get_sql_schema_name(self) -> str: @property def stream_names(self) -> set[str]: - """TODO""" + """Return the set of stream names.""" return self.get_sql_cache().processor.expected_streams @final @@ -193,7 +194,7 @@ def streams( return self.SyncResultStreams(self) class SyncResultStreams(Mapping[str, CachedDataset]): - """TODO""" + """A mapping of stream names to cached datasets.""" def __init__( self, From 07f55492b023f4f69f32ae4aac116e4a599b8e87 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 3 Apr 2024 21:50:55 -0700 Subject: [PATCH 032/118] adapt nullable credentials --- airbyte/cloud/_destination_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/airbyte/cloud/_destination_util.py b/airbyte/cloud/_destination_util.py index a8226643..ac0acaa4 100644 --- a/airbyte/cloud/_destination_util.py +++ b/airbyte/cloud/_destination_util.py @@ -105,11 +105,14 @@ def get_bigquery_destination_config( cache: BigQueryCache, ) -> dict[str, str]: """Get the destination configuration from the BigQuery cache.""" + credentials_json: str | None = ( + Path(cache.credentials_path).read_text() if cache.credentials_path else None + ) return DestinationBigquery( project_id=cache.project_name, dataset_id=cache.dataset_name, schema=cache.schema_name, - credentials_json=Path(cache.credentials_path).read_text(), + credentials_json=credentials_json, ).to_dict() From 0da393bc151c95089b6aa051af4a3312cdf6a6dd Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 3 Apr 2024 21:55:43 -0700 Subject: [PATCH 033/118] fix pydantic "|" compat issue --- airbyte/caches/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index af24ab98..bc8cc7c9 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -44,10 +44,10 @@ class CacheBase(BaseModel): table_suffix: str = "" """A suffix to add to all table names.""" - _deployed_api_root: str | None = PrivateAttr(default=None) - _deployed_workspace_id: str | None = PrivateAttr(default=None) - _deployed_destination_id: str | None = PrivateAttr(default=None) - _deployed_connection_id: str | None = PrivateAttr(default=None) + _deployed_api_root: Optional[str] = PrivateAttr(default=None) + _deployed_workspace_id: Optional[str] = PrivateAttr(default=None) + _deployed_destination_id: Optional[str] = PrivateAttr(default=None) + _deployed_connection_id: Optional[str] = PrivateAttr(default=None) _sql_processor_class: type[SqlProcessorBase] = PrivateAttr() _sql_processor: Optional[SqlProcessorBase] = PrivateAttr(default=None) From ad3378e8c54eb2e7ae807e4eb32f97d6b93e61b4 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 3 Apr 2024 22:21:19 -0700 Subject: [PATCH 034/118] rename exceptions --- airbyte/_executor.py | 4 +-- airbyte/_processors/base.py | 8 +++--- airbyte/_processors/file/base.py | 2 +- airbyte/_processors/sql/base.py | 16 ++++++------ airbyte/_processors/sql/bigquery.py | 6 ++--- airbyte/_util/api_util.py | 4 +-- airbyte/caches/_catalog_manager.py | 8 +++--- airbyte/caches/base.py | 2 +- airbyte/caches/util.py | 4 +-- airbyte/exceptions.py | 26 +++++++++---------- airbyte/secrets.py | 2 +- airbyte/sources/base.py | 12 ++++----- airbyte/sources/registry.py | 4 +-- airbyte/sources/util.py | 4 +-- airbyte/validate.py | 2 +- docs/generate.py | 4 +-- .../test_source_test_fixture.py | 2 +- tests/unit_tests/test_exceptions.py | 4 ++- 18 files changed, 58 insertions(+), 56 deletions(-) diff --git a/airbyte/_executor.py b/airbyte/_executor.py index 
139d3ef5..3792fdb7 100644 --- a/airbyte/_executor.py +++ b/airbyte/_executor.py @@ -47,7 +47,7 @@ def __init__( The 'name' param is required if 'metadata' is None. """ if not name and not metadata: - raise exc.AirbyteLibInternalError(message="Either name or metadata must be provided.") + raise exc.PyAirbyteInternalError(message="Either name or metadata must be provided.") self.name: str = name or cast(ConnectorMetadata, metadata).name # metadata is not None here self.metadata: ConnectorMetadata | None = metadata @@ -270,7 +270,7 @@ def _get_installed_version( if not self.interpreter_path.exists(): # No point in trying to detect the version if the interpreter does not exist if raise_on_error: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Connector's virtual environment interpreter could not be found.", context={ "interpreter_path": self.interpreter_path, diff --git a/airbyte/_processors/base.py b/airbyte/_processors/base.py index 6c1d35a7..84234dcf 100644 --- a/airbyte/_processors/base.py +++ b/airbyte/_processors/base.py @@ -60,7 +60,7 @@ def __init__( self._expected_streams: set[str] | None = None self.cache: CacheBase = cache if not isinstance(self.cache, CacheBase): - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message=( f"Expected config class of type 'CacheBase'. " f"Instead received type '{type(self.cache).__name__}'." @@ -92,7 +92,7 @@ def register_source( ) -> None: """Register the source name and catalog.""" if not self._catalog_manager: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Catalog manager should exist but does not.", ) self._catalog_manager.register_source( @@ -226,7 +226,7 @@ def _finalize_state_messages( ) -> None: """Handle state messages by passing them to the catalog manager.""" if not self._catalog_manager: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Catalog manager should exist but does not.", ) if state_messages and self._source_name: @@ -251,7 +251,7 @@ def _get_stream_config( ) -> ConfiguredAirbyteStream: """Return the definition of the given stream.""" if not self._catalog_manager: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Catalog manager should exist but does not.", ) diff --git a/airbyte/_processors/file/base.py b/airbyte/_processors/file/base.py index 52267bc1..5c644371 100644 --- a/airbyte/_processors/file/base.py +++ b/airbyte/_processors/file/base.py @@ -162,7 +162,7 @@ def process_record_message( batch_handle = self._new_batch(stream_name=stream_name) if batch_handle.open_file_writer is None: - raise exc.AirbyteLibInternalError(message="Expected open file writer.") + raise exc.PyAirbyteInternalError(message="Expected open file writer.") self._write_record_dict( record_dict=StreamRecord.from_record_message( diff --git a/airbyte/_processors/sql/base.py b/airbyte/_processors/sql/base.py index bb67a696..db2bb55c 100644 --- a/airbyte/_processors/sql/base.py +++ b/airbyte/_processors/sql/base.py @@ -296,7 +296,7 @@ def _get_table_by_name( query. To ignore the cache and force a refresh, set 'force_refresh' to True. """ if force_refresh and shallow_okay: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Cannot force refresh and use shallow query at the same time." 
) @@ -453,7 +453,7 @@ def _ensure_compatible_table_schema( ] if missing_columns: if raise_on_error: - raise exc.AirbyteLibCacheTableValidationError( + raise exc.PyAirbyteCacheTableValidationError( violation="Cache table is missing expected columns.", context={ "stream_column_names": stream_column_names, @@ -666,7 +666,7 @@ def _write_files_to_new_table( # Pandas will auto-create the table if it doesn't exist, which we don't want. if not self._table_exists(temp_table_name): - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Table does not exist after creation.", context={ "temp_table_name": temp_table_name, @@ -727,7 +727,7 @@ def _write_temp_table_to_final_table( has_pks: bool = bool(self._get_primary_keys(stream_name)) has_incremental_key: bool = bool(self._get_incremental_key(stream_name)) if write_strategy == WriteStrategy.MERGE and not has_pks: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Cannot use merge strategy on a stream with no primary keys.", context={ "stream_name": stream_name, @@ -783,7 +783,7 @@ def _write_temp_table_to_final_table( ) return - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Write strategy is not supported.", context={ "write_strategy": write_strategy, @@ -843,9 +843,9 @@ def _swap_temp_table_with_final_table( Databases that do not support this syntax can override this method. """ if final_table_name is None: - raise exc.AirbyteLibInternalError(message="Arg 'final_table_name' cannot be None.") + raise exc.PyAirbyteInternalError(message="Arg 'final_table_name' cannot be None.") if temp_table_name is None: - raise exc.AirbyteLibInternalError(message="Arg 'temp_table_name' cannot be None.") + raise exc.PyAirbyteInternalError(message="Arg 'temp_table_name' cannot be None.") _ = stream_name deletion_name = f"{final_table_name}_deleteme" @@ -909,7 +909,7 @@ def _get_column_by_name(self, table: str | Table, column_name: str) -> Column: # Try to get the column in a case-insensitive manner return next(col for col in table.c if col.name.lower() == column_name.lower()) except StopIteration: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Could not find matching column.", context={ "table": table, diff --git a/airbyte/_processors/sql/bigquery.py b/airbyte/_processors/sql/bigquery.py index 6c4ec4ad..7c821ce0 100644 --- a/airbyte/_processors/sql/bigquery.py +++ b/airbyte/_processors/sql/bigquery.py @@ -175,7 +175,7 @@ def _table_exists( return False except ValueError as ex: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Invalid project name or dataset name.", context={ "table_id": table_id, @@ -225,9 +225,9 @@ def _swap_temp_table_with_final_table( ALTER TABLE my_schema.my_old_table_name RENAME TO my_new_table_name; """ if final_table_name is None: - raise exc.AirbyteLibInternalError(message="Arg 'final_table_name' cannot be None.") + raise exc.PyAirbyteInternalError(message="Arg 'final_table_name' cannot be None.") if temp_table_name is None: - raise exc.AirbyteLibInternalError(message="Arg 'temp_table_name' cannot be None.") + raise exc.PyAirbyteInternalError(message="Arg 'temp_table_name' cannot be None.") _ = stream_name deletion_name = f"{final_table_name}_deleteme" diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index f4cc93ec..64417908 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -3,8 +3,8 @@ In order to insulate users from breaking changes and to avoid general 
confusion around naming and design inconsistencies, we do not expose these functions or other Airbyte API classes within -AirbyteLib. Classes and functions from the Airbyte API external library should always be wrapped in -AirbyteLib classes - unless there's a very compelling reason to surface these models intentionally. +PyAirbyte. Classes and functions from the Airbyte API external library should always be wrapped in +PyAirbyte classes - unless there's a very compelling reason to surface these models intentionally. """ from __future__ import annotations diff --git a/airbyte/caches/_catalog_manager.py b/airbyte/caches/_catalog_manager.py index 6306b26d..ea26d521 100644 --- a/airbyte/caches/_catalog_manager.py +++ b/airbyte/caches/_catalog_manager.py @@ -85,10 +85,10 @@ def source_catalog(self) -> ConfiguredAirbyteCatalog: """Return the source catalog. Raises: - AirbyteLibInternalError: If the source catalog is not set. + PyAirbyteInternalError: If the source catalog is not set. """ if not self._source_catalog: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Source catalog should be initialized but is not.", ) @@ -231,7 +231,7 @@ def get_stream_config( ) -> ConfiguredAirbyteStream: """Return the column definitions for the given stream.""" if not self.source_catalog: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Cannot get stream JSON schema without a catalog.", ) @@ -249,7 +249,7 @@ def get_stream_config( ) if len(matching_streams) > 1: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Multiple streams found with same name.", context={ "stream_name": stream_name, diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index bc8cc7c9..a6f566f9 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -113,7 +113,7 @@ def _catalog_manager( self, ) -> CatalogManager: if not self._has_catalog_manager: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Catalog manager should exist but does not.", ) diff --git a/airbyte/caches/util.py b/airbyte/caches/util.py index 8e3a6e81..d1cf2128 100644 --- a/airbyte/caches/util.py +++ b/airbyte/caches/util.py @@ -42,13 +42,13 @@ def new_local_cache( """ if cache_name: if " " in cache_name: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Cache name cannot contain spaces.", input_value=cache_name, ) if not cache_name.replace("_", "").isalnum(): - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Cache name can only contain alphanumeric characters and underscores.", input_value=cache_name, ) diff --git a/airbyte/exceptions.py b/airbyte/exceptions.py index ea2130cd..9fa43bdf 100644 --- a/airbyte/exceptions.py +++ b/airbyte/exceptions.py @@ -135,7 +135,7 @@ def safe_logging_dict(self) -> dict[str, Any]: @dataclass -class AirbyteLibInternalError(AirbyteError): +class PyAirbyteInternalError(AirbyteError): """An internal error occurred in PyAirbyte.""" guidance = "Please consider reporting this error to the Airbyte team." @@ -146,7 +146,7 @@ class AirbyteLibInternalError(AirbyteError): @dataclass -class AirbyteLibInputError(AirbyteError, ValueError): +class PyAirbyteInputError(AirbyteError, ValueError): """The input provided to PyAirbyte did not match expected validation rules. 
This inherits from ValueError so that it can be used as a drop-in replacement for @@ -160,7 +160,7 @@ class AirbyteLibInputError(AirbyteError, ValueError): @dataclass -class AirbyteLibNoStreamsSelectedError(AirbyteLibInputError): +class PyAirbyteNoStreamsSelectedError(PyAirbyteInputError): """No streams were selected for the source.""" guidance = ( @@ -174,19 +174,19 @@ class AirbyteLibNoStreamsSelectedError(AirbyteLibInputError): # PyAirbyte Cache Errors -class AirbyteLibCacheError(AirbyteError): +class PyAirbyteCacheError(AirbyteError): """Error occurred while accessing the cache.""" @dataclass -class AirbyteLibCacheTableValidationError(AirbyteLibCacheError): +class PyAirbyteCacheTableValidationError(PyAirbyteCacheError): """Cache table validation failed.""" violation: str | None = None @dataclass -class AirbyteConnectorConfigurationMissingError(AirbyteLibCacheError): +class AirbyteConnectorConfigurationMissingError(PyAirbyteCacheError): """Connector is missing configuration.""" connector_name: str | None = None @@ -298,7 +298,7 @@ class AirbyteStreamNotFoundError(AirbyteConnectorError): @dataclass -class AirbyteLibSecretNotFoundError(AirbyteError): +class PyAirbyteSecretNotFoundError(AirbyteError): """Secret not found.""" guidance = "Please ensure that the secret is set." @@ -314,7 +314,7 @@ class AirbyteLibSecretNotFoundError(AirbyteError): @dataclass -class HostedAirbyteError(AirbyteError): +class AirbyteError(AirbyteError): """An error occurred while communicating with the hosted Airbyte instance.""" response: AirbyteApiResponseDuckType | None = None @@ -332,7 +332,7 @@ def workspace_url(self) -> str | None: @dataclass -class HostedAirbyteConnectionError(HostedAirbyteError): +class AirbyteConnectionError(AirbyteError): """An connection error occurred while communicating with the hosted Airbyte instance.""" connection_id: str | None = None @@ -367,12 +367,12 @@ def job_url(self) -> str | None: @dataclass -class HostedConnectionSyncError(HostedAirbyteConnectionError): +class AirbyteConnectionSyncError(AirbyteConnectionError): """An error occurred while executing the remote Airbyte job.""" @dataclass -class HostedConnectionSyncTimeoutError(HostedConnectionSyncError): +class AirbyteConnectionSyncTimeoutError(AirbyteConnectionSyncError): """An timeout occurred while waiting for the remote Airbyte job to complete.""" timeout: int | None = None @@ -383,7 +383,7 @@ class HostedConnectionSyncTimeoutError(HostedConnectionSyncError): @dataclass -class MissingResourceError(HostedAirbyteError): +class AirbyteMissingResourceError(AirbyteError): """Remote Airbyte resources does not exist.""" resource_type: str | None = None @@ -391,7 +391,7 @@ class MissingResourceError(HostedAirbyteError): @dataclass -class MultipleResourcesError(HostedAirbyteError): +class AirbyteMultipleResourcesError(AirbyteError): """Could not locate the resource because multiple matching resources were found.""" resource_type: str | None = None diff --git a/airbyte/secrets.py b/airbyte/secrets.py index f0a4d11c..48e2c517 100644 --- a/airbyte/secrets.py +++ b/airbyte/secrets.py @@ -122,7 +122,7 @@ def get_secret( if val: return val - raise exc.AirbyteLibSecretNotFoundError( + raise exc.PyAirbyteSecretNotFoundError( secret_name=secret_name, sources=[str(s) for s in sources], ) diff --git a/airbyte/sources/base.py b/airbyte/sources/base.py index 21d10d2f..dcc029da 100644 --- a/airbyte/sources/base.py +++ b/airbyte/sources/base.py @@ -288,7 +288,7 @@ def print_config_spec( it will be printed to the console. 
""" if format not in ["yaml", "json"]: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Invalid format. Expected 'yaml' or 'json'", input_value=format, ) @@ -382,13 +382,13 @@ def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]: ] if len(found) == 0: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Stream name does not exist in catalog.", input_value=stream_name, ) if len(found) > 1: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Duplicate streams found with the same name.", context={ "found_streams": found, @@ -421,7 +421,7 @@ def get_records(self, stream: str) -> LazyDataset: ], ) if len(configured_catalog.streams) == 0: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Requested stream does not exist.", context={ "stream": stream, @@ -706,7 +706,7 @@ def read( try: write_strategy = WriteStrategy(write_strategy) except ValueError: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Invalid strategy", context={ "write_strategy": write_strategy, @@ -718,7 +718,7 @@ def read( self.select_streams(streams) if not self._selected_stream_names: - raise exc.AirbyteLibNoStreamsSelectedError( + raise exc.PyAirbyteNoStreamsSelectedError( connector_name=self.name, available_streams=self.get_available_streams(), ) diff --git a/airbyte/sources/registry.py b/airbyte/sources/registry.py index 4d825937..8b05566a 100644 --- a/airbyte/sources/registry.py +++ b/airbyte/sources/registry.py @@ -79,7 +79,7 @@ def _get_registry_cache(*, force_refresh: bool = False) -> dict[str, ConnectorMe new_cache[connector_metadata.name] = connector_metadata if len(new_cache) == 0: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Connector registry is empty.", context={ "registry_url": _get_registry_url(), @@ -97,7 +97,7 @@ def get_connector_metadata(name: str) -> ConnectorMetadata: """ cache = copy(_get_registry_cache()) if not cache: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Connector registry could not be loaded.", context={ "registry_url": _get_registry_url(), diff --git a/airbyte/sources/util.py b/airbyte/sources/util.py index 518ea132..51ee952c 100644 --- a/airbyte/sources/util.py +++ b/airbyte/sources/util.py @@ -74,11 +74,11 @@ def get_source( """ if local_executable: if pip_url: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Param 'pip_url' is not supported when 'local_executable' is set." ) if version: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Param 'version' is not supported when 'local_executable' is set." ) diff --git a/airbyte/validate.py b/airbyte/validate.py index 89789a80..9a4650d2 100644 --- a/airbyte/validate.py +++ b/airbyte/validate.py @@ -154,7 +154,7 @@ def validate(connector_dir: str, sample_config: str, *, validate_install_only: b install_only_test(connector_name) else: if not sample_config: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( input_value="--sample-config is required without --validate-install-only set" ) full_tests(connector_name, sample_config) diff --git a/docs/generate.py b/docs/generate.py index dbcea00d..1b95663c 100755 --- a/docs/generate.py +++ b/docs/generate.py @@ -1,5 +1,5 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. -"""Generate docs for all public modules in AirbyteLib and save them to docs/generated. 
+"""Generate docs for all public modules in PyAirbyte and save them to docs/generated. Usage: poetry run python docs/generate.py @@ -19,7 +19,7 @@ def run() -> None: - """Generate docs for all public modules in AirbyteLib and save them to docs/generated.""" + """Generate docs for all public modules in PyAirbyte and save them to docs/generated.""" public_modules = ["airbyte"] # recursively delete the docs/generated folder if it exists diff --git a/tests/integration_tests/test_source_test_fixture.py b/tests/integration_tests/test_source_test_fixture.py index 5672ebe6..e42c93e4 100644 --- a/tests/integration_tests/test_source_test_fixture.py +++ b/tests/integration_tests/test_source_test_fixture.py @@ -607,7 +607,7 @@ def test_lazy_dataset_from_source( pop_internal_columns_from_dataset(list_from_iter_b) # Make sure that we get a key error if we try to access a stream that doesn't exist - with pytest.raises(exc.AirbyteLibInputError): + with pytest.raises(exc.PyAirbyteInputError): source.get_records(not_a_stream_name) # Make sure we can iterate on all available streams diff --git a/tests/unit_tests/test_exceptions.py b/tests/unit_tests/test_exceptions.py index 919ea7ed..23e9a4b4 100644 --- a/tests/unit_tests/test_exceptions.py +++ b/tests/unit_tests/test_exceptions.py @@ -21,7 +21,9 @@ def test_exceptions(): assert message.count("\n") == 0 assert message != "" assert message.strip() == message - assert name.startswith("Airbyte") + assert any( + [name.startswith(prefix) for prefix in ["Airbyte", "HostedAirbyte", "PyAirbyte"]] + ), f"{name} does not start with Airbyte, HostedAirbyte, or PyAirbyte" assert name.endswith("Error") From b0f89964153978cdf388854bd80da817666cf9c0 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 3 Apr 2024 22:35:56 -0700 Subject: [PATCH 035/118] update exception names --- airbyte/_util/api_util.py | 42 +++++++++++++++-------------- airbyte/cloud/_sync_results.py | 8 +++--- airbyte/exceptions.py | 18 ++++++------- tests/unit_tests/test_exceptions.py | 4 +-- 4 files changed, 37 insertions(+), 35 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 64417908..25774cd3 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -18,10 +18,10 @@ from airbyte_api.models.shared.jobcreaterequest import JobCreateRequest, JobTypeEnum from airbyte.exceptions import ( - HostedAirbyteError, - HostedConnectionSyncError, - MissingResourceError, - MultipleResourcesError, + AirbyteConnectionSyncError, + AirbyteError, + AirbyteMissingResourceError, + AirbyteMultipleResourcesError, ) @@ -79,7 +79,7 @@ def get_workspace( if status_ok(response.status_code) and response.workspace_response: return response.workspace_response - raise MissingResourceError( + raise AirbyteMissingResourceError( resource_type="workspace", context={ "workspace_id": workspace_id, @@ -112,7 +112,7 @@ def list_connections( if status_ok(response.status_code) and response.connections_response: return response.connections_response.data - raise HostedAirbyteError( + raise AirbyteError( context={ "workspace_id": workspace_id, "response": response, @@ -141,7 +141,7 @@ def get_connection( if status_ok(response.status_code) and response.connection_response: return response.connection_response - raise MissingResourceError(connection_id, "connection", response.text) + raise AirbyteMissingResourceError(connection_id, "connection", response.text) def run_connection( @@ -171,7 +171,7 @@ def run_connection( if status_ok(response.status_code) and response.job_response: return 
response.job_response - raise HostedConnectionSyncError( + raise AirbyteConnectionSyncError( connection_id=connection_id, context={ "workspace_id": workspace_id, @@ -206,7 +206,7 @@ def get_job_logs( if status_ok(response.status_code) and response.jobs_response: return response.jobs_response.data - raise MissingResourceError( + raise AirbyteMissingResourceError( response=response, resource_type="job", context={ @@ -235,7 +235,7 @@ def get_job_info( if status_ok(response.status_code) and response.job_response: return response.job_response - raise MissingResourceError(job_id, "job", response.text) + raise AirbyteMissingResourceError(job_id, "job", response.text) # Create, get, and delete sources @@ -266,7 +266,7 @@ def create_source( if status_ok(response.status_code) and response.source_response: return response.source_response - raise HostedAirbyteError( + raise AirbyteError( message="Could not create source.", response=response, ) @@ -291,7 +291,7 @@ def get_source( if status_ok(response.status_code) and response.connection_response: return response.connection_response - raise MissingResourceError(source_id, "source", response.text) + raise AirbyteMissingResourceError(source_id, "source", response.text) def delete_source( @@ -313,7 +313,7 @@ def delete_source( ), ) if not status_ok(response.status_code): - raise HostedAirbyteError( + raise AirbyteError( context={ "source_id": source_id, "response": response, @@ -349,7 +349,7 @@ def create_destination( if status_ok(response.status_code) and response.destination_response: return response.destination_response - raise HostedAirbyteError( + raise AirbyteError( message="Could not create destination.", response=response, ) @@ -374,7 +374,7 @@ def get_destination( if status_ok(response.status_code) and response.destination_response: return response.destination_response - raise MissingResourceError(destination_id, "destination", response.text) + raise AirbyteMissingResourceError(destination_id, "destination", response.text) def delete_destination( @@ -396,7 +396,7 @@ def delete_destination( ), ) if not status_ok(response.status_code): - raise HostedAirbyteError( + raise AirbyteError( context={ "destination_id": destination_id, "response": response, @@ -434,7 +434,7 @@ def create_connection( ), ) if not status_ok(response.status_code): - raise HostedAirbyteError( + raise AirbyteError( context={ "source_id": source_id, "destination_id": destination_id, @@ -462,10 +462,12 @@ def get_connection_by_name( connection for connection in connections if connection.name == connection_name ] if len(found) == 0: - raise MissingResourceError(connection_name, "connection", f"Workspace: {workspace_id}") + raise AirbyteMissingResourceError( + connection_name, "connection", f"Workspace: {workspace_id}" + ) if len(found) > 1: - raise MultipleResourcesError( + raise AirbyteMultipleResourcesError( resource_type="connection", resource_name_or_id=connection_name, context={ @@ -494,7 +496,7 @@ def delete_connection( ), ) if not status_ok(response.status_code): - raise HostedAirbyteError( + raise AirbyteError( context={ "connection_id": connection_id, "response": response, diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py index cd3508b6..81a3d4aa 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/_sync_results.py @@ -13,7 +13,7 @@ from airbyte._util import api_util from airbyte.cloud._destination_util import create_cache_from_destination from airbyte.datasets import CachedDataset -from airbyte.exceptions import 
HostedConnectionSyncError, HostedConnectionSyncTimeoutError +from airbyte.exceptions import AirbyteConnectionSyncError, AirbyteConnectionSyncTimeoutError DEFAULT_SYNC_TIMEOUT_SECONDS = 30 * 60 # 30 minutes @@ -97,7 +97,7 @@ def raise_failure_status( By default, this method will use the latest status available. If you want to refresh the status before checking for failure, set `refresh_status=True`. If the job has failed, this - method will raise a `HostedConnectionSyncError`. + method will raise a `AirbyteConnectionSyncError`. Otherwise, do nothing. """ @@ -106,7 +106,7 @@ def raise_failure_status( latest_status = self.get_job_status() if latest_status in FAILED_STATUSES: - raise HostedConnectionSyncError( + raise AirbyteConnectionSyncError( workspace=self.workspace, connection_id=self.connection_id, job_id=self.job_id, @@ -133,7 +133,7 @@ def wait_for_completion( if time.time() - start_time > wait_timeout: if raise_timeout: - raise HostedConnectionSyncTimeoutError( + raise AirbyteConnectionSyncTimeoutError( workspace=self.workspace, connection_id=self.connection_id, job_id=self.job_id, diff --git a/airbyte/exceptions.py b/airbyte/exceptions.py index 9fa43bdf..a87410d0 100644 --- a/airbyte/exceptions.py +++ b/airbyte/exceptions.py @@ -55,7 +55,7 @@ @dataclass -class AirbyteError(Exception): +class PyAirbyteError(Exception): """Base class for exceptions in Airbyte.""" guidance: str | None = None @@ -135,7 +135,7 @@ def safe_logging_dict(self) -> dict[str, Any]: @dataclass -class PyAirbyteInternalError(AirbyteError): +class PyAirbyteInternalError(PyAirbyteError): """An internal error occurred in PyAirbyte.""" guidance = "Please consider reporting this error to the Airbyte team." @@ -146,7 +146,7 @@ class PyAirbyteInternalError(AirbyteError): @dataclass -class PyAirbyteInputError(AirbyteError, ValueError): +class PyAirbyteInputError(PyAirbyteError, ValueError): """The input provided to PyAirbyte did not match expected validation rules. This inherits from ValueError so that it can be used as a drop-in replacement for @@ -174,7 +174,7 @@ class PyAirbyteNoStreamsSelectedError(PyAirbyteInputError): # PyAirbyte Cache Errors -class PyAirbyteCacheError(AirbyteError): +class PyAirbyteCacheError(PyAirbyteError): """Error occurred while accessing the cache.""" @@ -196,7 +196,7 @@ class AirbyteConnectorConfigurationMissingError(PyAirbyteCacheError): @dataclass -class AirbyteSubprocessError(AirbyteError): +class AirbyteSubprocessError(PyAirbyteError): """Error when running subprocess.""" run_args: list[str] | None = None @@ -212,7 +212,7 @@ class AirbyteSubprocessFailedError(AirbyteSubprocessError): # Connector Registry Errors -class AirbyteConnectorRegistryError(AirbyteError): +class AirbyteConnectorRegistryError(PyAirbyteError): """Error when accessing the connector registry.""" @@ -236,7 +236,7 @@ class AirbyteConnectorNotPyPiPublishedError(AirbyteConnectorRegistryError): @dataclass -class AirbyteConnectorError(AirbyteError): +class AirbyteConnectorError(PyAirbyteError): """Error when running the connector.""" connector_name: str | None = None @@ -298,7 +298,7 @@ class AirbyteStreamNotFoundError(AirbyteConnectorError): @dataclass -class PyAirbyteSecretNotFoundError(AirbyteError): +class PyAirbyteSecretNotFoundError(PyAirbyteError): """Secret not found.""" guidance = "Please ensure that the secret is set." 
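For reference, a minimal sketch of how downstream code might catch the secret-lookup error under its new name once this rename lands (the secret name "MY_API_KEY" is only a placeholder, not a value from this series):

    import airbyte as ab
    from airbyte import exceptions as exc

    try:
        api_key = ab.get_secret("MY_API_KEY", prompt=False)
    except exc.PyAirbyteSecretNotFoundError:
        api_key = None  # fall back to an explicit value or skip cloud features
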
@@ -314,7 +314,7 @@ class PyAirbyteSecretNotFoundError(AirbyteError): @dataclass -class AirbyteError(AirbyteError): +class AirbyteError(PyAirbyteError): """An error occurred while communicating with the hosted Airbyte instance.""" response: AirbyteApiResponseDuckType | None = None diff --git a/tests/unit_tests/test_exceptions.py b/tests/unit_tests/test_exceptions.py index 23e9a4b4..11f0a1cb 100644 --- a/tests/unit_tests/test_exceptions.py +++ b/tests/unit_tests/test_exceptions.py @@ -22,8 +22,8 @@ def test_exceptions(): assert message != "" assert message.strip() == message assert any( - [name.startswith(prefix) for prefix in ["Airbyte", "HostedAirbyte", "PyAirbyte"]] - ), f"{name} does not start with Airbyte, HostedAirbyte, or PyAirbyte" + [name.startswith(prefix) for prefix in ["Airbyte", "PyAirbyte"]] + ), f"{name} does not start with Airbyte or PyAirbyte" assert name.endswith("Error") From a22c36e9ea78603a6014480bcda0f1386c9540be Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Thu, 4 Apr 2024 13:34:27 -0700 Subject: [PATCH 036/118] improve deploy_connection() --- airbyte/cloud/_workspaces.py | 56 +++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index 08aadbd0..7c72d295 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -10,6 +10,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING +from airbyte import exceptions as exc from airbyte._util import api_util from airbyte._util.api_util import ( CLOUD_API_ROOT, @@ -179,30 +180,63 @@ def delete_destination( def deploy_connection( self, - source: Source, - cache: CacheBase, + source: Source | str, + cache: CacheBase | None = None, + destination: str | None = None, ) -> str: """Deploy a source and cache to the workspace as a new connection. Returns the newly deployed connection ID as a `str`. + + Args: + source (Source | str): The source to deploy. You can pass either an already deployed + source ID `str` or a PyAirbyte `Source` object. If you pass a `Source` object, + it will be deployed automatically. + cache (CacheBase, optional): The cache to deploy as a new destination. You can provide + `cache` or `destination`, but not both. + destination (str, optional): The destination ID to use. You can provide + `cache` or `destination`, but not both. """ - self.deploy_source(source) - self.deploy_cache_as_destination(cache) + # Resolve source ID + source_id: str + if isinstance(source, Source): + if source._deployed_source_id: # noqa: SLF001 + source_id = source._deployed_source_id # noqa: SLF001 + else: + source_id = self.deploy_source(source) + else: + source_id = source - assert source._deployed_source_id is not None # noqa: SLF001 # Accessing nn-public API - assert cache._deployed_destination_id is not None # noqa: SLF001 # Accessing nn-public API + # Resolve destination ID + destination_id: str + if destination: + destination_id = destination + elif cache: + if not cache._deployed_destination_id: # noqa: SLF001 + destination_id = self.deploy_cache_as_destination(cache) + else: + destination_id = cache._deployed_destination_id # noqa: SLF001 + else: + raise exc.PyAirbyteInputError( + guidance="You must provide either a destination ID or a cache object." 
+ ) + + assert source_id is not None + assert destination_id is not None deployed_connection = create_connection( - name=f"Connection {source.name.replace('-', ' ').title()} (Deployed by PyAirbyte)", - source_id=source._deployed_source_id, # noqa: SLF001 # Accessing nn-public API - destination_id=cache._deployed_destination_id, # noqa: SLF001 # Accessing nn-public API + name="Connection (Deployed by PyAirbyte)", + source_id=source_id, + destination_id=destination_id, api_root=self.api_root, api_key=self.api_key, workspace_id=self.workspace_id, ) - source._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 - cache._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 + if isinstance(source, Source): + source._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 + if cache: + cache._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 return deployed_connection.connection_id From c7967acb9447d06fe93f10452c8c820d63059bf8 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Thu, 4 Apr 2024 13:34:53 -0700 Subject: [PATCH 037/118] use standard inserts option for the bigquery dest --- airbyte/cloud/_destination_util.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/airbyte/cloud/_destination_util.py b/airbyte/cloud/_destination_util.py index ac0acaa4..95f8939f 100644 --- a/airbyte/cloud/_destination_util.py +++ b/airbyte/cloud/_destination_util.py @@ -11,6 +11,7 @@ DestinationDuckdb, DestinationPostgres, DestinationSnowflake, + StandardInserts, ) from airbyte.caches import ( @@ -108,12 +109,14 @@ def get_bigquery_destination_config( credentials_json: str | None = ( Path(cache.credentials_path).read_text() if cache.credentials_path else None ) - return DestinationBigquery( + destination = DestinationBigquery( project_id=cache.project_name, dataset_id=cache.dataset_name, - schema=cache.schema_name, + dataset_location="us-west1", credentials_json=credentials_json, - ).to_dict() + loading_method=StandardInserts, + ) + return destination.to_dict() def create_bigquery_cache( From 2b3874096ecf25bfd42a37492a4e935877191d22 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Thu, 4 Apr 2024 13:35:13 -0700 Subject: [PATCH 038/118] add test for read-from-destination --- .../cloud/test_cloud_sql_reads.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/integration_tests/cloud/test_cloud_sql_reads.py diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py new file mode 100644 index 00000000..60720468 --- /dev/null +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -0,0 +1,66 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+"""Integration tests for reading from cache.""" +from __future__ import annotations +from contextlib import suppress + +import pytest + +import airbyte as ab +from airbyte import cloud +from airbyte.cloud._sync_results import SyncResult +from tests.conftest import new_bigquery_cache + + +@pytest.fixture +def deployable_cache(new_bigquery_cache) -> ab.BigQueryCache | ab.SnowflakeCache: + # TODO: Add Snowflake here as well + return new_bigquery_cache + + +@pytest.fixture +def deployable_source() -> ab.Source: + return ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + + +def test_read_cache( + cloud_workspace: cloud.CloudWorkspace, + deployable_cache: ab.BigQueryCache | ab.SnowflakeCache, + deployable_source: ab.Source, +) -> None: + """Test reading from a cache.""" + + # Deploy source, destination, and connection: + source_id = cloud_workspace.deploy_source(source=deployable_source) + destination_id = cloud_workspace.deploy_cache_as_destination(cache=deployable_cache) + connection_id = cloud_workspace.deploy_connection( + source=source_id, + destination=destination_id, + ) + + # Run sync and get result: + sync_result: SyncResult = cloud_workspace.run_sync(connection_id=connection_id) + + # Test sync result: + assert sync_result.success + assert sync_result.stream_names == ["users", "products", "purchases"] + dataset: ab.CachedDataset = sync_result.get_dataset("users") + assert dataset.stream_name == "users" + data_as_list = list(dataset) + assert len(data_as_list) == 100 + + # Cleanup + with suppress(Exception): + cloud_workspace.delete_connection( + connection_id=connection_id, + delete_source=True, + delete_destination=True, + ) + with suppress(Exception): + cloud_workspace.delete_source(source_id=source_id) + with suppress(Exception): + cloud_workspace.delete_destination(destination_id=destination_id) From 4470603ff8419560d8378c923c101df5aa4d6f05 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Thu, 4 Apr 2024 13:35:45 -0700 Subject: [PATCH 039/118] remove redundant conftest import --- tests/integration_tests/cloud/test_cloud_sql_reads.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 60720468..f4db985f 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -8,7 +8,6 @@ import airbyte as ab from airbyte import cloud from airbyte.cloud._sync_results import SyncResult -from tests.conftest import new_bigquery_cache @pytest.fixture From 0983dd12a6f6234b2ce49a671338528527a938f5 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Fri, 5 Apr 2024 21:13:46 -0700 Subject: [PATCH 040/118] =?UTF-8?q?snowflake=20passing=20tests=20?= =?UTF-8?q?=F0=9F=8E=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- airbyte/_util/api_util.py | 25 +++++- airbyte/cloud/_destination_util.py | 80 ++++++++++--------- airbyte/cloud/_destinations.py | 16 +++- airbyte/cloud/_sync_results.py | 11 ++- airbyte/cloud/_workspaces.py | 8 +- airbyte/datasets/_sql.py | 35 ++++++-- tests/conftest.py | 2 + .../cloud/test_cloud_sql_reads.py | 66 +++++++++++++-- 8 files changed, 183 insertions(+), 60 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 25774cd3..292d6113 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -9,6 +9,7 @@ from __future__ import 
annotations +import json import os from typing import Any @@ -371,7 +372,29 @@ def get_destination( destination_id=destination_id, ), ) - if status_ok(response.status_code) and response.destination_response: + if status_ok(response.status_code): + # TODO: This is a temporary workaround to resolve an issue where + # the destination API response is of the wrong type. + raw_response: dict[str, Any] = json.loads(response.raw_response.text) + raw_configuration: dict[str, Any] = raw_response["configuration"] + destination_type = raw_response.get("destinationType") + if destination_type == "snowflake": + response.destination_response.configuration = api_models.DestinationSnowflake.from_dict( + raw_configuration, + ) + if destination_type == "bigquery": + response.destination_response.configuration = api_models.DestinationBigquery.from_dict( + raw_configuration, + ) + if destination_type == "postgres": + response.destination_response.configuration = api_models.DestinationPostgres.from_dict( + raw_configuration, + ) + if destination_type == "duckdb": + response.destination_response.configuration = api_models.DestinationDuckdb.from_dict( + raw_configuration, + ) + return response.destination_response raise AirbyteMissingResourceError(destination_id, "destination", response.text) diff --git a/airbyte/cloud/_destination_util.py b/airbyte/cloud/_destination_util.py index 95f8939f..d9292ed3 100644 --- a/airbyte/cloud/_destination_util.py +++ b/airbyte/cloud/_destination_util.py @@ -12,6 +12,7 @@ DestinationPostgres, DestinationSnowflake, StandardInserts, + UsernameAndPassword, ) from airbyte.caches import ( @@ -30,6 +31,9 @@ from airbyte.caches.base import CacheBase +SNOWFLAKE_PASSWORD_SECRET_NAME = "SNOWFLAKE_PASSWORD" + + def get_destination_config_from_cache( cache: CacheBase, ) -> dict[str, str]: @@ -92,13 +96,15 @@ def get_snowflake_destination_config( ) -> dict[str, str]: """Get the destination configuration from the Snowflake cache.""" return DestinationSnowflake( - account=cache.account, - database=cache.database, - password=cache.password, + host=f"{cache.account}.snowflakecomputing.com", + database=cache.get_database_name().upper(), + schema=cache.schema_name.upper(), + warehouse=cache.warehouse, role=cache.role, - schema=cache.schema_name, username=cache.username, - warehouse=cache.warehouse, + credentials=UsernameAndPassword( + password=cache.password, + ), ).to_dict() @@ -120,73 +126,75 @@ def get_bigquery_destination_config( def create_bigquery_cache( - destination_configuration: dict[str, str], + destination_configuration: DestinationBigquery, ) -> BigQueryCache: """Create a new BigQuery cache from the destination configuration.""" credentials_path = get_secret("BIGQUERY_CREDENTIALS_PATH") return BigQueryCache( - project_name=destination_configuration["project_id"], - dataset_name=destination_configuration["dataset_id"], - schema_name=destination_configuration["schema"], + project_name=destination_configuration.project_id, + dataset_name=destination_configuration.dataset_id, + schema_name=destination_configuration.schema, credentials_path=credentials_path, ) def create_duckdb_cache( - destination_configuration: dict[str, str], + destination_configuration: DestinationDuckdb, ) -> DuckDBCache: """Create a new DuckDB cache from the destination configuration.""" return DuckDBCache( - db_path=destination_configuration["destination_path"], - schema_name=destination_configuration["schema"], + db_path=destination_configuration.destination_path, + schema_name=destination_configuration.schema, ) 
def create_motherduck_cache( - destination_configuration: dict[str, str], + destination_configuration: DestinationDuckdb, ) -> MotherDuckCache: """Create a new DuckDB cache from the destination configuration.""" return MotherDuckCache( - database=destination_configuration["destination_path"], - schema_name=destination_configuration["schema"], - api_key=destination_configuration["motherduck_api_key"], + database=destination_configuration.destination_path, + schema_name=destination_configuration.schema, + api_key=destination_configuration.motherduck_api_key, ) def create_postgres_cache( - destination_configuration: dict[str, str], + destination_configuration: DestinationPostgres, ) -> PostgresCache: """Create a new Postgres cache from the destination configuration.""" - port: int = ( - int(destination_configuration["port"]) if "port" in destination_configuration else 5432 - ) + port: int = int(destination_configuration.port) if "port" in destination_configuration else 5432 return PostgresCache( - database=destination_configuration["database"], - host=destination_configuration["host"], - password=destination_configuration["password"], + database=destination_configuration.database, + host=destination_configuration.host, + password=destination_configuration.password, port=port, - schema_name=destination_configuration["schema"], - username=destination_configuration["username"], + schema_name=destination_configuration.schema, + username=destination_configuration.username, ) def create_snowflake_cache( - destination_configuration: dict[str, str], + destination_configuration: DestinationSnowflake, + password_secret_name: str = SNOWFLAKE_PASSWORD_SECRET_NAME, ) -> SnowflakeCache: """Create a new Snowflake cache from the destination configuration.""" return SnowflakeCache( - account=destination_configuration["account"], - database=destination_configuration["database"], - password=destination_configuration["password"], - role=destination_configuration["role"], - schema_name=destination_configuration["schema"], - username=destination_configuration["username"], - warehouse=destination_configuration["warehouse"], + account=destination_configuration.host.split(".snowflakecomputing")[0], + database=destination_configuration.database, + schema_name=destination_configuration.schema, + warehouse=destination_configuration.warehouse, + role=destination_configuration.role, + username=destination_configuration.username, + password=get_secret(password_secret_name), ) -def create_cache_from_destination( - destination_configuration: dict[str, str], +def create_cache_from_destination_config( + destination_configuration: DestinationBigquery + | DestinationDuckdb + | DestinationPostgres + | DestinationSnowflake, ) -> CacheBase: """Create a new cache from the destination.""" conversion_fn_map: dict[str, Callable[[dict[str, str]], CacheBase]] = { @@ -195,7 +203,7 @@ def create_cache_from_destination( "DestinationPostgres": create_postgres_cache, "DestinationSnowflake": create_snowflake_cache, } - destination_class_name = destination_configuration["destination_type"] + destination_class_name = type(destination_config).__name__ if destination_class_name not in conversion_fn_map: raise ValueError( # noqa: TRY003 "Cannot convert destination configuration to cache. Destination type not supported. 
", diff --git a/airbyte/cloud/_destinations.py b/airbyte/cloud/_destinations.py index c3cc3bc8..0ffa4584 100644 --- a/airbyte/cloud/_destinations.py +++ b/airbyte/cloud/_destinations.py @@ -4,14 +4,20 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from airbyte._util import api_util from airbyte.cloud import _destination_util as dest_util if TYPE_CHECKING: - from airbyte_api.models.shared.destinationresponse import DestinationResponse + from airbyte_api.models.shared import ( + DestinationBigquery, + DestinationDuckdb, + DestinationPostgres, + DestinationResponse, + DestinationSnowflake, + ) from sqlalchemy.engine import Engine from airbyte.caches.base import CacheBase @@ -40,14 +46,16 @@ def _get_destination_response(self, *, force_refresh: bool = False) -> Destinati return self._destination_response - def get_destination_config(self) -> dict[str, str]: + def get_destination_config( + self, + ) -> DestinationBigquery | DestinationDuckdb | DestinationPostgres | DestinationSnowflake | Any: """Get the destination configuration.""" return self._get_destination_response().configuration def as_cache(self) -> CacheBase: """Get the cache for the destination.""" if self._as_cache is None: - self._as_cache = dest_util.create_cache_from_destination( + self._as_cache = dest_util.create_cache_from_destination_config( destination_configuration=self.get_destination_config(), ) diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py index 81a3d4aa..9896aab1 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/_sync_results.py @@ -11,7 +11,7 @@ from airbyte_api.models.shared import ConnectionResponse, JobStatusEnum from airbyte._util import api_util -from airbyte.cloud._destination_util import create_cache_from_destination +from airbyte.cloud._destination_util import create_cache_from_destination_config from airbyte.datasets import CachedDataset from airbyte.exceptions import AirbyteConnectionSyncError, AirbyteConnectionSyncTimeoutError @@ -44,6 +44,8 @@ class SyncResult: workspace: CloudWorkspace connection_id: str job_id: str + table_name_prefix: str = "" + table_name_suffix: str = "" _latest_status: JobStatusEnum | None = None _connection_response: ConnectionResponse | None = None @@ -148,7 +150,9 @@ def wait_for_completion( def get_sql_cache(self) -> CacheBase: """Return a SQL Cache object for working with the data in a SQL-based destination's.""" destination_configuration = self._get_destination_configuration() - return create_cache_from_destination(destination_configuration) + return create_cache_from_destination_config( + destination_configuration, + ) def get_sql_engine(self) -> sqlalchemy.engine.Engine: """Return a SQL Engine for querying a SQL-based destination.""" @@ -167,8 +171,7 @@ def get_sql_table( def get_dataset(self, stream_name: str) -> CachedDataset: """Return cached dataset.""" - cache = self.get_sql_cache() - return cache.streams[stream_name] + return CachedDataset(self.get_sql_cache(), stream_name=stream_name) def get_sql_database_name(self) -> str: """Return the SQL database name.""" diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index 7c72d295..1c35ffb9 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -304,7 +304,7 @@ def run_sync( def get_sync_result( self, connection_id: str, - job_id: str | None, + job_id: str | None = None, ) -> SyncResult | None: """Get the sync result for a connection job. 
@@ -315,7 +315,7 @@ def get_sync_result( if job_id is None: results = self.get_previous_sync_logs( connection_id=connection_id, - num_sync_logs=1, + limit=1, ) if results: return results[0] @@ -332,7 +332,7 @@ def get_previous_sync_logs( self, connection_id: str, *, - num_sync_logs: int = 10, + limit: int = 10, ) -> list[SyncResult]: """Get the previous sync logs for a connection.""" sync_logs: list[JobResponse] = api_util.get_job_logs( @@ -340,7 +340,7 @@ def get_previous_sync_logs( api_root=self.api_root, api_key=self.api_key, workspace_id=self.workspace_id, - limit=num_sync_logs, + limit=limit, ) return [ SyncResult( diff --git a/airbyte/datasets/_sql.py b/airbyte/datasets/_sql.py index b23cccad..311d6e7d 100644 --- a/airbyte/datasets/_sql.py +++ b/airbyte/datasets/_sql.py @@ -7,6 +7,9 @@ from overrides import overrides from sqlalchemy import and_, func, select, text +from typing_extensions import Literal + +from airbyte_protocol.models.airbyte_protocol import ConfiguredAirbyteStream from airbyte.datasets._base import DatasetBase @@ -18,6 +21,8 @@ from sqlalchemy import Selectable, Table from sqlalchemy.sql import ClauseElement + from airbyte_protocol.models import ConfiguredAirbyteStream + from airbyte.caches.base import CacheBase @@ -33,16 +38,36 @@ def __init__( cache: CacheBase, stream_name: str, query_statement: Selectable, + stream_configuration: ConfiguredAirbyteStream | None | Literal[False] = None, ) -> None: + """Initialize the dataset with a cache, stream name, and query statement. + + The query statement should be a SQLAlchemy Selectable object that can be executed to + retrieve records from the dataset. + + If stream_configuration is not provided, we attempt to retrieve the stream configuration + from the cache processor. This is useful when constructing a dataset from a CachedDataset + object, which already has the stream configuration. + + If stream_configuration is set to False, we skip the stream configuration retrieval. + """ self._length: int | None = None self._cache: CacheBase = cache self._stream_name: str = stream_name self._query_statement: Selectable = query_statement - super().__init__( - stream_metadata=cache.processor._get_stream_config( # noqa: SLF001 # Member is private until we have a public API for it. - stream_name=stream_name - ), - ) + if stream_configuration is None: + try: + stream_configuration = cache.processor._get_stream_config( # noqa: SLF001 # Member is private until we have a public API for it. 
+ stream_name=stream_name + ) + except Exception as ex: + Warning(f"Failed to get stream configuration for {stream_name}: {ex}") + + stream_configuration: ConfiguredAirbyteStream | None = ( + stream_configuration or None # Coalesce False to None + ) + + super().__init__(stream_metadata=stream_configuration) @property def stream_name(self) -> str: diff --git a/tests/conftest.py b/tests/conftest.py index 30d8a7ed..05fa5125 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,7 @@ import time from requests.exceptions import HTTPError +import sqlalchemy import ulid from airbyte._util.google_secrets import get_gcp_secret from airbyte._util.meta import is_windows @@ -225,6 +226,7 @@ def new_snowflake_cache(): role=secret["role"], schema_name=f"test{str(ulid.ULID()).lower()[-6:]}", ) + sqlalchemy_url = config.get_sql_alchemy_url() yield config diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index f4db985f..51cc2cc0 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -4,6 +4,8 @@ from contextlib import suppress import pytest +import sqlalchemy +from sqlalchemy.engine.base import Engine import airbyte as ab from airbyte import cloud @@ -11,9 +13,13 @@ @pytest.fixture -def deployable_cache(new_bigquery_cache) -> ab.BigQueryCache | ab.SnowflakeCache: +def deployable_cache( + new_bigquery_cache, + new_snowflake_cache, +) -> ab.BigQueryCache | ab.SnowflakeCache: # TODO: Add Snowflake here as well - return new_bigquery_cache + return new_snowflake_cache + # return new_bigquery_cache @pytest.fixture @@ -26,7 +32,17 @@ def deployable_source() -> ab.Source: ) -def test_read_cache( +@pytest.fixture +def deployed_connection_id() -> str: + return "c7b4d838-a612-495a-9d91-a14e477add51" + + +@pytest.fixture +def previous_job_run_id() -> str: + return "10136196" + + +def test_deploy_and_run_and_read( cloud_workspace: cloud.CloudWorkspace, deployable_cache: ab.BigQueryCache | ab.SnowflakeCache, deployable_source: ab.Source, @@ -45,9 +61,19 @@ def test_read_cache( sync_result: SyncResult = cloud_workspace.run_sync(connection_id=connection_id) # Test sync result: - assert sync_result.success - assert sync_result.stream_names == ["users", "products", "purchases"] - dataset: ab.CachedDataset = sync_result.get_dataset("users") + assert sync_result.is_job_complete() + + # TODO: Remove this after Destination bug is resolved: + # https://github.com/airbytehq/airbyte/issues/36875 + sync_result: SyncResult = cloud_workspace.run_sync(connection_id=connection_id) + + # Test sync result: + assert sync_result.is_job_complete() + + # TODO: Rebuild streams property from connection's configured streams API endpoint + # assert sync_result.stream_names == ["users", "products", "purchases"] + + dataset: ab.CachedDataset = sync_result.get_dataset(stream_name="users") assert dataset.stream_name == "users" data_as_list = list(dataset) assert len(data_as_list) == 100 @@ -63,3 +89,31 @@ def test_read_cache( cloud_workspace.delete_source(source_id=source_id) with suppress(Exception): cloud_workspace.delete_destination(destination_id=destination_id) + + +def test_read_from_deployed_connection( + cloud_workspace: cloud.CloudWorkspace, + deployed_connection_id: str, +) -> None: + """Test reading from a cache.""" + # Run sync and get result: + sync_result: SyncResult = cloud_workspace.get_sync_result(connection_id=deployed_connection_id) + + # Test sync result: + assert 
sync_result.is_job_complete() + + cache = sync_result.get_sql_cache() + sqlalchemy_url = cache.get_sql_alchemy_url() + engine: Engine = sync_result.get_sql_engine() + # assert sync_result.stream_names == ["users", "products", "purchases"] + + dataset: ab.CachedDataset = sync_result.get_dataset(stream_name="users") + assert dataset.stream_name == "users" + data_as_list = list(dataset) + assert len(data_as_list) == 100 + + pandas_df = dataset.to_pandas() + assert pandas_df.shape == (100, 20) + for col in pandas_df.columns: + # Check that no values are null + assert pandas_df[col].notnull().all() From 98f376bee3b173ead70f0ade2b8117f8a8369839 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Fri, 5 Apr 2024 22:18:00 -0700 Subject: [PATCH 041/118] add motherduck test --- airbyte/secrets.py | 1 + tests/conftest.py | 17 +++++++++++++++++ .../cloud/test_cloud_sql_reads.py | 13 +++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/airbyte/secrets.py b/airbyte/secrets.py index 48e2c517..8b776fd7 100644 --- a/airbyte/secrets.py +++ b/airbyte/secrets.py @@ -90,6 +90,7 @@ def _get_secret_from_prompt( def get_secret( secret_name: str, + /, source: SecretSource | list[SecretSource] = SecretSource.ANY, *, prompt: bool = True, diff --git a/tests/conftest.py b/tests/conftest.py index 05fa5125..ff020366 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,6 +20,7 @@ from airbyte.caches.base import CacheBase from airbyte.caches.bigquery import BigQueryCache from airbyte.caches.duckdb import DuckDBCache +from airbyte.caches.motherduck import MotherDuckCache from airbyte.caches.snowflake import SnowflakeCache import docker @@ -33,6 +34,8 @@ from airbyte.caches.util import new_local_cache from airbyte.sources.base import as_temp_files +import airbyte as ab + logger = logging.getLogger(__name__) @@ -93,6 +96,11 @@ def test_priority(item: Item) -> int: if True or not is_docker_available(): item.add_marker(pytest.mark.skip(reason="Skipping tests (Docker not available)")) + # Every test in the cloud directory is slow abd requires credentials + if "integration_tests/cloud" in str(item.fspath): + item.add_marker(pytest.mark.slow) + item.add_marker(pytest.mark.requires_creds) + def is_port_in_use(port): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: @@ -314,6 +322,15 @@ def new_duckdb_cache() -> DuckDBCache: return new_local_cache() +@pytest.fixture(scope="function") +def new_motherduck_cache() -> MotherDuckCache: + return MotherDuckCache( + api_key=ab.get_secret("MOTHERDUCK_API_KEY"), + schema_name=f"test{str(ulid.ULID()).lower()[-6:]}", + database="integration_tests_deleteany", + ) + + @pytest.fixture(scope="function") def new_generic_cache(request) -> CacheBase: """This is a placeholder fixture that will be overridden by pytest_generate_tests().""" diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 51cc2cc0..16bba87b 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -9,17 +9,22 @@ import airbyte as ab from airbyte import cloud +from airbyte.caches.postgres import PostgresCache from airbyte.cloud._sync_results import SyncResult @pytest.fixture def deployable_cache( - new_bigquery_cache, - new_snowflake_cache, -) -> ab.BigQueryCache | ab.SnowflakeCache: + # new_bigquery_cache, + # new_snowflake_cache, + # new_remote_postgres_cache: PostgresCache, + new_motherduck_cache: ab.MotherDuckCache, +) -> ab.BigQueryCache | 
ab.SnowflakeCache | ab.MotherDuckCache | ab.PostgresCache: # TODO: Add Snowflake here as well - return new_snowflake_cache + # return new_snowflake_cache # return new_bigquery_cache + return new_motherduck_cache + # return new_remote_postgres_cache @pytest.fixture From 87b3876cad247da915dddb9b90bfb1926a85d3cc Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 06:39:08 -0700 Subject: [PATCH 042/118] fix name --- airbyte/cloud/_destination_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte/cloud/_destination_util.py b/airbyte/cloud/_destination_util.py index d9292ed3..512c1857 100644 --- a/airbyte/cloud/_destination_util.py +++ b/airbyte/cloud/_destination_util.py @@ -203,7 +203,7 @@ def create_cache_from_destination_config( "DestinationPostgres": create_postgres_cache, "DestinationSnowflake": create_snowflake_cache, } - destination_class_name = type(destination_config).__name__ + destination_class_name = type(destination_configuration).__name__ if destination_class_name not in conversion_fn_map: raise ValueError( # noqa: TRY003 "Cannot convert destination configuration to cache. Destination type not supported. ", From 10407f569f0c7fa0f4e5ab91fe4e8e4cf88a7d73 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 06:41:46 -0700 Subject: [PATCH 043/118] improve tests, add parameterized test fixture for deployable cache --- tests/integration_tests/cloud/conftest.py | 31 +++++++++++ .../cloud/test_cloud_sql_reads.py | 51 ++++++++++++------- .../cloud/test_cloud_workspaces.py | 3 +- 3 files changed, 66 insertions(+), 19 deletions(-) diff --git a/tests/integration_tests/cloud/conftest.py b/tests/integration_tests/cloud/conftest.py index 37a7e2ed..3ae558a6 100644 --- a/tests/integration_tests/cloud/conftest.py +++ b/tests/integration_tests/cloud/conftest.py @@ -9,6 +9,7 @@ from airbyte._util.api_util import CLOUD_API_ROOT from dotenv import dotenv_values from airbyte._executor import _get_bin_dir +from airbyte.caches.base import CacheBase from airbyte.cloud import CloudWorkspace @@ -107,3 +108,33 @@ def motherduck_api_key() -> str: raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") return os.environ[ENV_MOTHERDUCK_API_KEY] + + +@pytest.fixture(scope="function") +def new_deployable_cache(request) -> CacheBase: + """This is a placeholder fixture that will be overridden by pytest_generate_tests().""" + return request.getfixturevalue(request.param) + + +def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: + """Override default pytest behavior, parameterizing our tests based on the available cache types. + + This is useful for running the same tests with different cache types, to ensure that the tests + can pass across all cache types. 
+ """ + deployable_cache_fixtures: dict[str, str] = { + # Ordered by priority (fastest first) + # "DuckDB": "new_duckdb_cache", + # "Postgres": "new_remote_postgres_cache", + # "BigQuery": "new_bigquery_cache", + "Snowflake": "new_snowflake_cache", + } + + if "new_deployable_cache" in metafunc.fixturenames: + metafunc.parametrize( + "new_deployable_cache", + deployable_cache_fixtures.values(), + ids=deployable_cache_fixtures.keys(), + indirect=True, + scope="function", + ) diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 16bba87b..572ac5b4 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -9,24 +9,9 @@ import airbyte as ab from airbyte import cloud -from airbyte.caches.postgres import PostgresCache from airbyte.cloud._sync_results import SyncResult -@pytest.fixture -def deployable_cache( - # new_bigquery_cache, - # new_snowflake_cache, - # new_remote_postgres_cache: PostgresCache, - new_motherduck_cache: ab.MotherDuckCache, -) -> ab.BigQueryCache | ab.SnowflakeCache | ab.MotherDuckCache | ab.PostgresCache: - # TODO: Add Snowflake here as well - # return new_snowflake_cache - # return new_bigquery_cache - return new_motherduck_cache - # return new_remote_postgres_cache - - @pytest.fixture def deployable_source() -> ab.Source: return ab.get_source( @@ -49,14 +34,14 @@ def previous_job_run_id() -> str: def test_deploy_and_run_and_read( cloud_workspace: cloud.CloudWorkspace, - deployable_cache: ab.BigQueryCache | ab.SnowflakeCache, + new_deployable_cache: ab.BigQueryCache | ab.SnowflakeCache, deployable_source: ab.Source, ) -> None: """Test reading from a cache.""" # Deploy source, destination, and connection: source_id = cloud_workspace.deploy_source(source=deployable_source) - destination_id = cloud_workspace.deploy_cache_as_destination(cache=deployable_cache) + destination_id = cloud_workspace.deploy_cache_as_destination(cache=new_deployable_cache) connection_id = cloud_workspace.deploy_connection( source=source_id, destination=destination_id, @@ -122,3 +107,35 @@ def test_read_from_deployed_connection( for col in pandas_df.columns: # Check that no values are null assert pandas_df[col].notnull().all() + + +def test_read_from_previous_job( + cloud_workspace: cloud.CloudWorkspace, + deployed_connection_id: str, + previous_job_run_id: str, +) -> None: + """Test reading from a cache.""" + # Run sync and get result: + sync_result: SyncResult = cloud_workspace.get_sync_result( + connection_id=deployed_connection_id, + job_id=previous_job_run_id, + ) + + # Test sync result: + assert sync_result.is_job_complete() + + cache = sync_result.get_sql_cache() + sqlalchemy_url = cache.get_sql_alchemy_url() + engine: Engine = sync_result.get_sql_engine() + # assert sync_result.stream_names == ["users", "products", "purchases"] + + dataset: ab.CachedDataset = sync_result.get_dataset(stream_name="users") + assert dataset.stream_name == "users" + data_as_list = list(dataset) + assert len(data_as_list) == 100 + + pandas_df = dataset.to_pandas() + assert pandas_df.shape == (100, 20) + for col in pandas_df.columns: + # Check that no values are null + assert pandas_df[col].notnull().all() diff --git a/tests/integration_tests/cloud/test_cloud_workspaces.py b/tests/integration_tests/cloud/test_cloud_workspaces.py index 664b0c7b..4676622c 100644 --- a/tests/integration_tests/cloud/test_cloud_workspaces.py +++ 
b/tests/integration_tests/cloud/test_cloud_workspaces.py @@ -5,12 +5,11 @@ """ from __future__ import annotations -import pytest - import airbyte as ab from airbyte.caches import MotherDuckCache from airbyte.cloud import CloudWorkspace + ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" From 28bd4890f2febde2d39c17ccd84be27980d11446 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 07:15:47 -0700 Subject: [PATCH 044/118] handle table prefix and stream names --- airbyte/_util/api_util.py | 16 ++++++++++++---- airbyte/cloud/_workspaces.py | 10 ++++++++++ .../cloud/test_cloud_sql_reads.py | 8 +++----- .../cloud/test_cloud_workspaces.py | 5 ++++- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 292d6113..d65cfb95 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -438,22 +438,30 @@ def create_connection( api_root: str, api_key: str, workspace_id: str | None = None, + prefix: str, + selected_stream_names: list[str], ) -> api_models.ConnectionResponse: _ = workspace_id # Not used (yet) airbyte_instance = get_airbyte_server_instance( api_key=api_key, api_root=api_root, ) - stream_configuration = api_models.StreamConfiguration( - name="users", - ) - stream_configurations = api_models.StreamConfigurations([stream_configuration]) + stream_configurations: list[api_models.StreamConfiguration] = [] + if selected_stream_names: + for stream_name in selected_stream_names: + stream_configuration = api_models.StreamConfiguration( + name=stream_name, + ) + stream_configurations.append(stream_configuration) + + stream_configurations = api_models.StreamConfigurations(stream_configurations) response = airbyte_instance.connections.create_connection( api_models.ConnectionCreateRequest( name=name, source_id=source_id, destination_id=destination_id, configurations=stream_configurations, + prefix=prefix, ), ) if not status_ok(response.status_code): diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index 1c35ffb9..ec06961b 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -183,6 +183,8 @@ def deploy_connection( source: Source | str, cache: CacheBase | None = None, destination: str | None = None, + table_prefix: str | None = None, + selected_streams: list[str] | None = None, ) -> str: """Deploy a source and cache to the workspace as a new connection. @@ -200,18 +202,24 @@ def deploy_connection( # Resolve source ID source_id: str if isinstance(source, Source): + selected_streams = selected_streams or source.get_selected_streams() if source._deployed_source_id: # noqa: SLF001 source_id = source._deployed_source_id # noqa: SLF001 else: source_id = self.deploy_source(source) else: source_id = source + if not selected_streams: + raise exc.PyAirbyteInputError( + guidance="You must provide `selected_streams` when deploying a source ID." 
+ ) # Resolve destination ID destination_id: str if destination: destination_id = destination elif cache: + table_prefix = table_prefix if table_prefix is not None else (cache.table_prefix or "") if not cache._deployed_destination_id: # noqa: SLF001 destination_id = self.deploy_cache_as_destination(cache) else: @@ -231,6 +239,8 @@ def deploy_connection( api_root=self.api_root, api_key=self.api_key, workspace_id=self.workspace_id, + selected_stream_names=selected_streams, + prefix=table_prefix or "", ) if isinstance(source, Source): diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 572ac5b4..4f10d8f8 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -4,7 +4,6 @@ from contextlib import suppress import pytest -import sqlalchemy from sqlalchemy.engine.base import Engine import airbyte as ab @@ -45,15 +44,14 @@ def test_deploy_and_run_and_read( connection_id = cloud_workspace.deploy_connection( source=source_id, destination=destination_id, + table_prefix=cache.table_prefix, + selected_streams=source.get_selected_streams(), ) # Run sync and get result: sync_result: SyncResult = cloud_workspace.run_sync(connection_id=connection_id) - # Test sync result: - assert sync_result.is_job_complete() - - # TODO: Remove this after Destination bug is resolved: + # TODO: Remove this second run after Destination bug is resolved: # https://github.com/airbytehq/airbyte/issues/36875 sync_result: SyncResult = cloud_workspace.run_sync(connection_id=connection_id) diff --git a/tests/integration_tests/cloud/test_cloud_workspaces.py b/tests/integration_tests/cloud/test_cloud_workspaces.py index 4676622c..955ef961 100644 --- a/tests/integration_tests/cloud/test_cloud_workspaces.py +++ b/tests/integration_tests/cloud/test_cloud_workspaces.py @@ -64,5 +64,8 @@ def test_deploy_connection( schema_name="public", ) - connection_id: str = cloud_workspace.deploy_connection(source=source, cache=cache) + connection_id: str = cloud_workspace.deploy_connection( + source=source, + cache=cache, + ) cloud_workspace.delete_connection(connection_id=connection_id) From b209ea36523429c435187ba39e152994f32d7bb6 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 08:19:53 -0700 Subject: [PATCH 045/118] parameterize read test --- .../cloud/test_cloud_sql_reads.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 4f10d8f8..147487c2 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -21,11 +21,6 @@ def deployable_source() -> ab.Source: ) -@pytest.fixture -def deployed_connection_id() -> str: - return "c7b4d838-a612-495a-9d91-a14e477add51" - - @pytest.fixture def previous_job_run_id() -> str: return "10136196" @@ -78,7 +73,15 @@ def test_deploy_and_run_and_read( with suppress(Exception): cloud_workspace.delete_destination(destination_id=destination_id) - +@pytest.mark.parametrize( + "deployed_connection_id", + [ + pytest.param("c7b4d838-a612-495a-9d91-a14e477add51", id="Faker->Snowflake"), + pytest.param("", id="Faker->BigQuery", marks=pytest.mark.skip(reason="Not yet supported")), + pytest.param("", id="Faker->Postgres", marks=pytest.mark.skip(reason="Not yet supported")), + pytest.param("", id="Faker->MotherDuck", marks=pytest.mark.skip(reason="Not 
yet supported")), + ], +) def test_read_from_deployed_connection( cloud_workspace: cloud.CloudWorkspace, deployed_connection_id: str, From c7996e60fed6660bdb67d3d59c4392828794efff Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:29:46 -0700 Subject: [PATCH 046/118] use constant for secret name --- airbyte/_util/api_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index d65cfb95..1997d66e 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -30,6 +30,7 @@ JOB_WAIT_TIMEOUT_SECS_DEFAULT = 60 * 60 # 1 hour CLOUD_API_ROOT = "https://api.airbyte.com/v1" +AIRBYTE_API_KEY_SECRET_NAME = "AIRBYTE_CLOUD_API_KEY" # Helper functions @@ -41,7 +42,7 @@ def status_ok(status_code: int) -> bool: def get_default_bearer_token() -> str | None: """Get the default bearer token from env variables.""" - return os.environ.get("AIRBYTE_API_KEY", None) + return os.environ.get(AIRBYTE_API_KEY_SECRET_NAME, None) def get_airbyte_server_instance( From f10a84cac0889e6033cc03ce8704db39c77aee9d Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:30:13 -0700 Subject: [PATCH 047/118] add `is_interactive()` check --- airbyte/_util/meta.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/airbyte/_util/meta.py b/airbyte/_util/meta.py index 40283697..190634ce 100644 --- a/airbyte/_util/meta.py +++ b/airbyte/_util/meta.py @@ -53,6 +53,20 @@ def is_colab() -> bool: return bool(get_colab_release_version()) +@lru_cache +def is_interactive() -> bool: + if is_colab() or is_jupyter(): + return True + + if is_ci(): + return False + + if sys.__stdin__.isatty() and sys.__stdout__.isatty(): + return True + + return False + + @lru_cache def is_jupyter() -> bool: """Return True if running in a Jupyter notebook or qtconsole. 
From e18271d3cdcf21a89779f8e327f0bfc0d95293f4 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:30:32 -0700 Subject: [PATCH 048/118] use constant --- airbyte/cloud/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte/cloud/__init__.py b/airbyte/cloud/__init__.py index 913892cb..6ce04e52 100644 --- a/airbyte/cloud/__init__.py +++ b/airbyte/cloud/__init__.py @@ -11,7 +11,7 @@ workspace = cloud.CloudWorkspace( workspace_id="123", - api_key=ab.get_secret("AIRBYTE_API_KEY"), + api_key=ab.get_secret("AIRBYTE_CLOUD_API_KEY"), ) source = ab.get_source("source-faker", config={}) From 787b7b037937bfbb1926fc63079979a7b746ea0a Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:30:44 -0700 Subject: [PATCH 049/118] fix bigquery region --- airbyte/cloud/_destination_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte/cloud/_destination_util.py b/airbyte/cloud/_destination_util.py index 512c1857..08366398 100644 --- a/airbyte/cloud/_destination_util.py +++ b/airbyte/cloud/_destination_util.py @@ -118,7 +118,7 @@ def get_bigquery_destination_config( destination = DestinationBigquery( project_id=cache.project_name, dataset_id=cache.dataset_name, - dataset_location="us-west1", + dataset_location="US", credentials_json=credentials_json, loading_method=StandardInserts, ) From cac40dd351feb808e5cd6ebdd663a4f6f67c98a2 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:31:21 -0700 Subject: [PATCH 050/118] add caching for SyncResult cache --- airbyte/cloud/_sync_results.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py index 9896aab1..5b3b76cb 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/_sync_results.py @@ -48,6 +48,7 @@ class SyncResult: table_name_suffix: str = "" _latest_status: JobStatusEnum | None = None _connection_response: ConnectionResponse | None = None + _cache: CacheBase | None = None def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResponse: """Return connection info for the sync job.""" @@ -149,10 +150,14 @@ def wait_for_completion( def get_sql_cache(self) -> CacheBase: """Return a SQL Cache object for working with the data in a SQL-based destination's.""" - destination_configuration = self._get_destination_configuration() - return create_cache_from_destination_config( - destination_configuration, + if self._cache: + return self._cache + + destination_configuration: dict[str, Any] = self._get_destination_configuration() + self._cache = create_cache_from_destination_config( + destination_configuration=destination_configuration ) + return self._cache def get_sql_engine(self) -> sqlalchemy.engine.Engine: """Return a SQL Engine for querying a SQL-based destination.""" From 37389dd18877bd0363d02c640b0606201839f5e5 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:32:03 -0700 Subject: [PATCH 051/118] re-implement secrets management --- airbyte/secrets.py | 335 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 265 insertions(+), 70 deletions(-) diff --git a/airbyte/secrets.py b/airbyte/secrets.py index 8b776fd7..382db05a 100644 --- a/airbyte/secrets.py +++ b/airbyte/secrets.py @@ -4,13 +4,16 @@ import contextlib import os -from enum import Enum, auto +import warnings +from abc import ABC, abstractmethod +from enum import Enum from getpass import getpass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, 
Any, cast from dotenv import dotenv_values from airbyte import exceptions as exc +from airbyte._util import meta if TYPE_CHECKING: @@ -23,113 +26,305 @@ colab_userdata = None -class SecretSource(Enum): - ENV = auto() - DOTENV = auto() - GOOGLE_COLAB = auto() - ANY = auto() +class SecretSourceEnum(str, Enum): + ENV = "env" + DOTENV = "dotenv" + GOOGLE_COLAB = "google_colab" - PROMPT = auto() + PROMPT = "prompt" -def _get_secret_from_env( - secret_name: str, -) -> str | None: - if secret_name not in os.environ: - return None +_SECRETS_SOURCES: list[SecretManager] = [] - return os.environ[secret_name] +class SecretManager(ABC): + """Abstract base class for secret managers. -def _get_secret_from_dotenv( - secret_name: str, -) -> str | None: - try: - dotenv_vars: dict[str, str | None] = dotenv_values() - except Exception: - # Can't locate or parse a .env file - return None + Secret managers are used to retrieve secrets from a secret store. - if secret_name not in dotenv_vars: - # Secret not found - return None + By registering a secret manager, PyAirbyte can automatically locate and + retrieve secrets from the secret store when needed. This allows you to + securely store and access sensitive information such as API keys, passwords, + and other credentials without hardcoding them in your code. - return dotenv_vars[secret_name] + To create a custom secret manager, subclass this class and implement the + `get_secret` method. By default, the secret manager will be automatically + registered as a global secret source, but will not replace any existing + secret sources. To customize this behavior, override the `auto_register` and + `replace_existing` attributes in your subclass as needed. + Note: Registered secrets managers always have priority over the default + secret sources such as environment variables, dotenv files, and Google Colab + secrets. If multiple secret managers are registered, the last one registered + will take priority. + """ -def _get_secret_from_colab( - secret_name: str, -) -> str | None: - if colab_userdata is None: - # The module doesn't exist. We probably aren't in Colab. - return None + replace_existing = False + as_backup = False + + def __init__(self, name: str | None = None) -> None: + """Instantiate the new secret manager.""" + + self.name: str = ( # Default to the class name if no name is provided + name or self.__class__.__name__ + ) + + @abstractmethod + def get_secret(self, secret_name: str) -> str | None: + """Get a named secret from the secret manager. + + This method should be implemented by subclasses to retrieve secrets from + the secret store. If the secret is not found, the method should return `None`. + """ + ... + + def __str__(self) -> str: + return self.name + + def __eq__(self, value: object) -> bool: + if isinstance(value, SecretManager): + return self.name == value.name + + if isinstance(value, str): + return self.name == value + + if isinstance(value, SecretSourceEnum): + return self.name == str(value) + + return super().__eq__(value) + + +class CustomSecretManager(SecretManager, ABC): + """Custom secret manager that retrieves secrets from a custom source. + + This class is a convenience class that can be used to create custom secret + managers. By default, custom secrets managers are auto-registered during + creation. 
+ """ + + auto_register = True + replace_existing = False + as_backup = False + + def __init__(self, name: str | None = None) -> None: + super().__init__(name) + if self.auto_register: + self.register() + + def register(self, *, replace_existing: bool | None = None) -> None: + """Register the secret manager as global secret source. + + This makes the secret manager available to the `get_secret` function and + allows it to be used automatically as a source for secrets. + + If `replace_existing` is `True`, the secret manager will replace all existing + secrets sources, including the default secret managers such as environment + variables, dotenv files, and Google Colab secrets. If `replace_existing` is + None or not provided, the default behavior will be used from the `replace_existing` + of the class (`False` unless overridden by the subclass). + """ + if replace_existing is None: + replace_existing = self.replace_existing + + if replace_existing: + _SECRETS_SOURCES.clear() + + if self.as_backup: + # Add to end of list + _SECRETS_SOURCES.append(self) + else: + # Add to beginning of list + _SECRETS_SOURCES.insert(0, self) + + +class EnvVarSecretManager(CustomSecretManager): + """Secret manager that retrieves secrets from environment variables.""" + + name = str(SecretSourceEnum.ENV) + + def get_secret(self, secret_name: str) -> str | None: + """Get a named secret from the environment.""" + if secret_name not in os.environ: + return None + + return os.environ[secret_name] + + +class DotenvSecretManager(CustomSecretManager): + """Secret manager that retrieves secrets from a `.env` file.""" + + name = str(SecretSourceEnum.DOTENV) + + def get_secret(self, secret_name: str) -> str | None: + """Get a named secret from the `.env` file.""" + try: + dotenv_vars: dict[str, str | None] = dotenv_values() + except Exception: + # Can't locate or parse a .env file + return None + + if secret_name not in dotenv_vars: + # Secret not found + return None + + return dotenv_vars[secret_name] + + +class ColabSecretManager(CustomSecretManager): + """Secret manager that retrieves secrets from Google Colab user secrets.""" + + name = str(SecretSourceEnum.GOOGLE_COLAB) + + def get_secret(self, secret_name: str) -> str | None: + """Get a named secret from Google Colab user secrets.""" + if colab_userdata is None: + # The module doesn't exist. We probably aren't in Colab. + return None + + try: + return colab_userdata.get(secret_name) + except Exception: + # Secret name not found. Continue. + return None + + +class SecretsPrompt(CustomSecretManager): + """Secret manager that prompts the user to enter a secret.""" + + name = str(SecretSourceEnum.PROMPT) + + def get_secret( + self, + secret_name: str, + ) -> str | None: + with contextlib.suppress(Exception): + return getpass(f"Enter the value for secret '{secret_name}': ") - try: - return colab_userdata.get(secret_name) - except Exception: - # Secret name not found. Continue. 
return None -def _get_secret_from_prompt( - secret_name: str, -) -> str | None: - with contextlib.suppress(Exception): - return getpass(f"Enter the value for secret '{secret_name}': ") +def _get_secret_sources() -> list[SecretManager]: + """Initialize the default secret sources.""" + if len(_SECRETS_SOURCES) == 0: + # Initialize the default secret sources + _SECRETS_SOURCES.extend( + [ + EnvVarSecretManager(), + DotenvSecretManager(), + ] + ) + if meta.is_colab(): + _SECRETS_SOURCES.append(ColabSecretManager()) + + if meta.is_interactive(): + _SECRETS_SOURCES.append(SecretsPrompt()) + + return _SECRETS_SOURCES.copy() + + +# Ensure the default secret sources are initialized +_ = _get_secret_sources() - return None +def register_secret_manager(secret_manager: CustomSecretManager) -> None: + """Register a custom secret manager.""" + secret_manager.register() -_SOURCE_FUNCTIONS: dict[SecretSource, Callable] = { - SecretSource.ENV: _get_secret_from_env, - SecretSource.DOTENV: _get_secret_from_dotenv, - SecretSource.GOOGLE_COLAB: _get_secret_from_colab, - SecretSource.PROMPT: _get_secret_from_prompt, -} + +def disable_secret_source(source: SecretManager | SecretSourceEnum) -> None: + """Disable one of the default secrets sources. + + This function can accept either a `SecretManager` instance, a `SecretSourceEnum` enum value, or a + string representing the name of the source to disable. + """ + if isinstance(source, SecretManager) and source in _SECRETS_SOURCES: + _SECRETS_SOURCES.remove(source) + return + + # Else, remove by name + for s in _SECRETS_SOURCES: + if s.name == str(source): + _SECRETS_SOURCES.remove(s) def get_secret( secret_name: str, /, - source: SecretSource | list[SecretSource] = SecretSource.ANY, *, - prompt: bool = True, + sources: list[SecretManager | SecretSourceEnum] | None = None, + allow_prompt: bool = True, + **kwargs: dict[str, Any], ) -> str: """Get a secret from the environment. - The optional `source` argument of enum type `SecretSource` or list of `SecretSource` options. - If left blank, the `source` arg will be `SecretSource.ANY`. If `source` is set to a specific - source, then only that source will be checked. If a list of `SecretSource` entries is passed, + The optional `sources` argument of enum type `SecretSourceEnum` or list of `SecretSourceEnum` options. + If left blank, the `sources` arg will be `SecretSourceEnum.ANY`. If `source` is set to a specific + source, then only that source will be checked. If a list of `SecretSourceEnum` entries is passed, then the sources will be checked using the provided ordering. - If `prompt` to `True` or if SecretSource.PROMPT is declared in the `source` arg, then the + If `prompt` to `True` or if SecretSourceEnum.PROMPT is declared in the `source` arg, then the user will be prompted to enter the secret if it is not found in any of the other sources. """ - sources = [source] if not isinstance(source, list) else source - all_sources = set(_SOURCE_FUNCTIONS.keys()) - {SecretSource.PROMPT} - if SecretSource.ANY in sources: - sources += [s for s in all_sources if s not in sources] - sources.remove(SecretSource.ANY) - - if prompt or SecretSource.PROMPT in sources: - if SecretSource.PROMPT in sources: - sources.remove(SecretSource.PROMPT) - - sources.append(SecretSource.PROMPT) # Always check prompt last - + if "source" in kwargs: + warnings.warn( + message="The `source` argument is deprecated. 
Use the `sources` argument instead.", + category=DeprecationWarning, + stacklevel=2, + ) + sources = kwargs.pop("source") + + available_sources: dict[str, SecretManager] = {} + for available_source in _get_secret_sources(): + # Add available sources to the dict. Order matters. + available_sources[available_source.name] = available_source + + if sources is None: + # If ANY is in the list, then we don't need to check any other sources. + # This is the default behavior. + sources = list(available_sources.values()) + + elif not isinstance(sources, list): + sources = [sources] + + # Replace any SecretSourceEnum strings with the matching SecretManager object for source in sources: - fn = _SOURCE_FUNCTIONS[source] # Get the matching function for this source - val = fn(secret_name) + if isinstance(source, SecretSourceEnum): + if source not in available_sources: + raise exc.PyAirbyteInputError( + guidance="Invalid secret source name.", + input_value=source, + context={ + "Available Sources": list(available_sources.keys()), + }, + ) + + sources[sources.index(source)] = available_sources[source] + + secret_managers = cast(list[SecretManager], sources) + + if SecretSourceEnum.PROMPT in secret_managers: + prompt_source = secret_managers.pop( + secret_managers.index(SecretSourceEnum.PROMPT), + ) + + if allow_prompt: + # Always check prompt last. Add it to the end of the list. + secret_managers.append(prompt_source) + + for secret_mgr in secret_managers: + val = secret_mgr.get_secret(secret_name) if val: return val raise exc.PyAirbyteSecretNotFoundError( secret_name=secret_name, - sources=[str(s) for s in sources], + sources=[str(s) for s in available_sources], ) __all__ = [ "get_secret", - "SecretSource", + "SecretSourceEnum", + "SecretManager", + "CustomSecretManager", ] From be7c04447f36d090df59998380c8ec630e4bcb02 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:34:04 -0700 Subject: [PATCH 052/118] improve fixtures --- tests/conftest.py | 126 ++++++++++++++++++++-- tests/integration_tests/cloud/conftest.py | 27 ++--- 2 files changed, 128 insertions(+), 25 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index ff020366..98f7f417 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,9 +11,9 @@ import socket import subprocess import time +from ci_credentials import RemoteSecret, get_connector_secrets from requests.exceptions import HTTPError -import sqlalchemy import ulid from airbyte._util.google_secrets import get_gcp_secret from airbyte._util.meta import is_windows @@ -32,6 +32,7 @@ from airbyte.caches import PostgresCache from airbyte._executor import _get_bin_dir from airbyte.caches.util import new_local_cache +from airbyte.secrets import CustomSecretManager from airbyte.sources.base import as_temp_files import airbyte as ab @@ -62,6 +63,90 @@ def get_ci_secret_json( return json.loads(get_ci_secret(secret_name=secret_name, project_name=project_name)) +class AirbyteIntegrationTestSecretManager(CustomSecretManager): + """Custom secret manager for Airbyte integration tests. + + This class is used to auto-retrieve needed secrets from GSM. + """ + auto_register = True + replace_existing = False + as_backup = True + + def get_secret( + self, + secret_name: str, + *, + required: bool = False, + ) -> str | None: + """This method attempts to find matching properties within the integration test config. + + If `required` is `True`, this method will raise an exception if the secret is not found. + Otherwise, it will return None. 
+ """ + system_name = secret_name.split("_")[0].lower() + property_name = "_".join(secret_name.split("_")[1:]).lower() + + mapping = { + "snowflake": "destination-snowflake", + "bigquery": "destination-bigquery", + "postgres": "destination-postgres", + "duckdb": "destination-duckdb", + } + if system_name not in mapping: + return None + + connector_name = mapping[system_name] + connector_config = self.get_connector_config(connector_name) + if "credentials" in connector_config: + if property_name in connector_config["credentials"]: + return connector_config["credentials"][property_name] + + if property_name in connector_config: + return connector_config[property_name] + + if not required: + return None + + raise KeyError( + f"Property '{property_name}' not found in '{connector_name}' connector config. " + f"\nAvailable config keys: {', '.join(connector_config.keys())} " + f"\nAvailable 'credential' keys: {', '.join(connector_config.get('credentials', {}).keys())} " + ) + + + def get_connector_config(self, connector_name: str, index: int = 0) -> dict | None: + assert connector_name is not None and connector_name != "all", \ + "We can only retrieve one connector config at a time." + + gcp_gsm_credentials = ab.get_secret("GCP_GSM_CREDENTIALS") + secrets: list[RemoteSecret] = [] + secrets, _ = get_connector_secrets( + connector_name=connector_name, + gcp_gsm_credentials=gcp_gsm_credentials, + disable_masking=True, + ) + + if len(secrets) > 1: + print( + f"Found {len(secrets)} secrets for connector '{connector_name}'." + ) + else: + print( + f"Found '{connector_name}' credentials." + ) + + if index >= len(secrets): + raise IndexError(f"Index {index} is out of range for connector '{connector_name}'.") + + return secrets[index].value_dict + + +@pytest.fixture(autouse=True, scope="session") +def airbyte_integration_test_secrets_manager() -> AirbyteIntegrationTestSecretManager: + """Create a new instance of the custom secret manager.""" + + return AirbyteIntegrationTestSecretManager() + def pytest_collection_modifyitems(items: list[Item]) -> None: """Override default pytest behavior, sorting our tests in a sensible execution order. 
@@ -76,13 +161,13 @@ def pytest_collection_modifyitems(items: list[Item]) -> None: def test_priority(item: Item) -> int: if item.get_closest_marker(name="slow"): return 9 # slow tests have the lowest priority - elif 'lint_tests' in str(item.fspath): + elif "lint_tests" in str(item.fspath): return 1 # lint tests have high priority - elif 'unit_tests' in str(item.fspath): + elif "unit_tests" in str(item.fspath): return 2 # unit tests have highest priority - elif 'docs_tests' in str(item.fspath): + elif "docs_tests" in str(item.fspath): return 3 # doc tests have medium priority - elif 'integration_tests' in str(item.fspath): + elif "integration_tests" in str(item.fspath): return 4 # integration tests have the lowest priority else: return 5 # all other tests have lower priority @@ -220,6 +305,20 @@ def new_postgres_cache(): postgres.remove() +@pytest.fixture +def new_motherduck_cache( + airbyte_integration_test_secrets_manager: AirbyteIntegrationTestSecretManager, +) -> MotherDuckCache: + config = airbyte_integration_test_secrets_manager.get_connector_config( + connector_name="destination-duckdb", + ) + return MotherDuckCache( + database="integration_tests_deleteany", + schema_name=f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}", + api_key=config["motherduck_api_key"], + ) + + @pytest.fixture def new_snowflake_cache(): secret = get_ci_secret_json( @@ -243,6 +342,21 @@ def new_snowflake_cache(): connection.execute(f"DROP SCHEMA IF EXISTS {config.schema_name}") +@pytest.fixture(autouse=True, scope="session") +def with_bigquery_credentials_path_env_var(): + dest_bigquery_config = get_ci_secret_json( + secret_name="SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" + ) + credentials_json = dest_bigquery_config["credentials_json"] + + with as_temp_files([credentials_json]) as (credentials_path,): + os.environ["BIGQUERY_CREDENTIALS_PATH"] = credentials_path + + yield + + return + + @pytest.fixture @pytest.mark.requires_creds def new_bigquery_cache(): @@ -256,7 +370,7 @@ def new_bigquery_cache(): cache = BigQueryCache( credentials_path=credentials_path, project_name=dest_bigquery_config["project_id"], - dataset_name=dataset_name + dataset_name=dataset_name, ) yield cache diff --git a/tests/integration_tests/cloud/conftest.py b/tests/integration_tests/cloud/conftest.py index 3ae558a6..891ffd61 100644 --- a/tests/integration_tests/cloud/conftest.py +++ b/tests/integration_tests/cloud/conftest.py @@ -2,6 +2,7 @@ """Fixtures for Cloud Workspace integration tests.""" from __future__ import annotations +from enum import auto import os from pathlib import Path import sys @@ -13,8 +14,8 @@ from airbyte.cloud import CloudWorkspace -ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" -ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" +ENV_AIRBYTE_API_KEY = "AIRBYTE_CLOUD_API_KEY" +ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_CLOUD_API_WORKSPACE_ID" ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" @@ -46,7 +47,7 @@ def api_key() -> str: return dotenv_vars[ENV_AIRBYTE_API_KEY] if ENV_AIRBYTE_API_KEY not in os.environ: - raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + raise ValueError(f"Please set the '{ENV_AIRBYTE_API_KEY}' environment variable.") return os.environ[ENV_AIRBYTE_API_KEY] @@ -58,7 +59,7 @@ def motherduck_api_key() -> str: return dotenv_vars[ENV_MOTHERDUCK_API_KEY] if ENV_MOTHERDUCK_API_KEY not in os.environ: - raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + raise ValueError(f"Please set the '{ENV_MOTHERDUCK_API_KEY}' environment variable.") return 
os.environ[ENV_MOTHERDUCK_API_KEY] @@ -93,23 +94,11 @@ def api_key() -> str: return dotenv_vars[ENV_AIRBYTE_API_KEY] if ENV_AIRBYTE_API_KEY not in os.environ: - raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") + raise ValueError(f"Please set the {ENV_AIRBYTE_API_KEY} environment variable.") return os.environ[ENV_AIRBYTE_API_KEY] -@pytest.fixture -def motherduck_api_key() -> str: - dotenv_vars: dict[str, str | None] = dotenv_values() - if ENV_MOTHERDUCK_API_KEY in dotenv_vars: - return dotenv_vars[ENV_MOTHERDUCK_API_KEY] - - if ENV_MOTHERDUCK_API_KEY not in os.environ: - raise ValueError("Please set the AIRBYTE_API_KEY environment variable.") - - return os.environ[ENV_MOTHERDUCK_API_KEY] - - @pytest.fixture(scope="function") def new_deployable_cache(request) -> CacheBase: """This is a placeholder fixture that will be overridden by pytest_generate_tests().""" @@ -124,9 +113,9 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: """ deployable_cache_fixtures: dict[str, str] = { # Ordered by priority (fastest first) - # "DuckDB": "new_duckdb_cache", + # "MotherDuck": "new_motherduck_cache", # "Postgres": "new_remote_postgres_cache", - # "BigQuery": "new_bigquery_cache", + "BigQuery": "new_bigquery_cache", "Snowflake": "new_snowflake_cache", } From 5b3bcaf035a4c86b44ed7e5f64e22aba6bed7703 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:35:11 -0700 Subject: [PATCH 053/118] improved tests --- .../cloud/test_cloud_api_util.py | 2 ++ .../cloud/test_cloud_sql_reads.py | 23 +++++++++++-------- .../cloud/test_cloud_sync.py | 4 ---- .../cloud/test_cloud_workspaces.py | 5 ---- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/tests/integration_tests/cloud/test_cloud_api_util.py b/tests/integration_tests/cloud/test_cloud_api_util.py index c769eb15..dfe90d2c 100644 --- a/tests/integration_tests/cloud/test_cloud_api_util.py +++ b/tests/integration_tests/cloud/test_cloud_api_util.py @@ -110,6 +110,8 @@ def test_create_and_delete_connection( workspace_id=workspace_id, source_id=source.source_id, destination_id=destination.destination_id, + prefix="", + selected_stream_names=["users", "purchases", "products"], ) assert connection.source_id == source.source_id assert connection.destination_id == destination.destination_id diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 147487c2..d4359fd2 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -4,13 +4,13 @@ from contextlib import suppress import pytest +import pandas as pd from sqlalchemy.engine.base import Engine import airbyte as ab from airbyte import cloud from airbyte.cloud._sync_results import SyncResult - @pytest.fixture def deployable_source() -> ab.Source: return ab.get_source( @@ -37,10 +37,10 @@ def test_deploy_and_run_and_read( source_id = cloud_workspace.deploy_source(source=deployable_source) destination_id = cloud_workspace.deploy_cache_as_destination(cache=new_deployable_cache) connection_id = cloud_workspace.deploy_connection( - source=source_id, - destination=destination_id, - table_prefix=cache.table_prefix, - selected_streams=source.get_selected_streams(), + source=deployable_source, + cache=new_deployable_cache, + table_prefix=new_deployable_cache.table_prefix, + selected_streams=deployable_source.get_selected_streams(), ) # Run sync and get result: @@ -50,7 +50,7 @@ def test_deploy_and_run_and_read( # 
https://github.com/airbytehq/airbyte/issues/36875 sync_result: SyncResult = cloud_workspace.run_sync(connection_id=connection_id) - # Test sync result: + # Check sync result: assert sync_result.is_job_complete() # TODO: Rebuild streams property from connection's configured streams API endpoint @@ -73,11 +73,12 @@ def test_deploy_and_run_and_read( with suppress(Exception): cloud_workspace.delete_destination(destination_id=destination_id) + @pytest.mark.parametrize( "deployed_connection_id", [ pytest.param("c7b4d838-a612-495a-9d91-a14e477add51", id="Faker->Snowflake"), - pytest.param("", id="Faker->BigQuery", marks=pytest.mark.skip(reason="Not yet supported")), + pytest.param("0e1d6b32-b8e3-4b68-91a3-3a314599c782", id="Faker->BigQuery"), pytest.param("", id="Faker->Postgres", marks=pytest.mark.skip(reason="Not yet supported")), pytest.param("", id="Faker->MotherDuck", marks=pytest.mark.skip(reason="Not yet supported")), ], @@ -103,10 +104,14 @@ def test_read_from_deployed_connection( data_as_list = list(dataset) assert len(data_as_list) == 100 - pandas_df = dataset.to_pandas() + # TODO: Debug why this is super slow: + # pandas_df = dataset.to_pandas() + pandas_df = pd.DataFrame(data_as_list) + assert pandas_df.shape == (100, 20) + + # Check that no values are null for col in pandas_df.columns: - # Check that no values are null assert pandas_df[col].notnull().all() diff --git a/tests/integration_tests/cloud/test_cloud_sync.py b/tests/integration_tests/cloud/test_cloud_sync.py index ca48e4b3..72d04a20 100644 --- a/tests/integration_tests/cloud/test_cloud_sync.py +++ b/tests/integration_tests/cloud/test_cloud_sync.py @@ -11,10 +11,6 @@ from airbyte.caches import MotherDuckCache from airbyte.cloud import CloudWorkspace -ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" -ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" -ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" - @pytest.fixture def pre_created_connection_id() -> str: diff --git a/tests/integration_tests/cloud/test_cloud_workspaces.py b/tests/integration_tests/cloud/test_cloud_workspaces.py index 955ef961..e1a1a697 100644 --- a/tests/integration_tests/cloud/test_cloud_workspaces.py +++ b/tests/integration_tests/cloud/test_cloud_workspaces.py @@ -10,11 +10,6 @@ from airbyte.cloud import CloudWorkspace -ENV_AIRBYTE_API_KEY = "AIRBYTE_API_KEY" -ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_API_WORKSPACE_ID" -ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" - - def test_deploy_source( cloud_workspace: CloudWorkspace, ) -> None: From 0863a131284a19740ff005c61e3b61793a9bfc43 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:35:51 -0700 Subject: [PATCH 054/118] rename enum --- README.md | 4 ++-- airbyte/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7ed76cbf..f30c3c5b 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ _Note: Additional secret store options may be supported in the future. [More inf ### Retrieving Secrets ```python -from airbyte import get_secret, SecretSource +from airbyte import get_secret, SecretSourceEnum source = get_source("source-github") source.set_config( @@ -44,7 +44,7 @@ source.set_config( ) ``` -The `get_secret()` function accepts an optional `source` argument of enum type `SecretSource`. If omitted or set to `SecretSource.ANY`, PyAirbyte will search all available secrets sources. If `source` is set to a specific source, then only that source will be checked. 
If a list of `SecretSource` entries is passed, then the sources will be checked using the provided ordering. +The `get_secret()` function accepts an optional `source` argument of enum type `SecretSourceEnum`. If omitted or set to `SecretSourceEnum.ANY`, PyAirbyte will search all available secrets sources. If `source` is set to a specific source, then only that source will be checked. If a list of `SecretSourceEnum` entries is passed, then the sources will be checked using the provided ordering. By default, PyAirbyte will prompt the user for any requested secrets that are not provided via other secret managers. You can disable this prompt by passing `prompt=False` to `get_secret()`. diff --git a/airbyte/__init__.py b/airbyte/__init__.py index 8f729a43..31861588 100644 --- a/airbyte/__init__.py +++ b/airbyte/__init__.py @@ -15,7 +15,7 @@ from airbyte.datasets import CachedDataset from airbyte.records import StreamRecord from airbyte.results import ReadResult -from airbyte.secrets import SecretSource, get_secret +from airbyte.secrets import SecretSourceEnum, get_secret from airbyte.sources import registry from airbyte.sources.base import Source from airbyte.sources.registry import get_available_connectors @@ -44,7 +44,7 @@ "CachedDataset", "DuckDBCache", "ReadResult", - "SecretSource", + "SecretSourceEnum", "Source", "StreamRecord", ] From 3808fd73fcb9819b7051eeca2b774da34f1a8e13 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 6 Apr 2024 23:37:00 -0700 Subject: [PATCH 055/118] add `ci_credentials` dev dependency --- poetry.lock | 67 +++++++++++++++++++++++++++++++++++++++++++++++--- pyproject.toml | 6 +++++ 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0ffada01..c15f1aa1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -382,6 +382,42 @@ files = [ {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] +[[package]] +name = "ci-credentials" +version = "1.1.0" +description = "CLI tooling to read and manage GSM secrets" +optional = false +python-versions = "^3.10" +files = [] +develop = false + +[package.dependencies] +click = "^8.1.3" +common_utils = {git = "https://github.com/airbytehq/airbyte.git", subdirectory = "airbyte-ci/connectors/common_utils"} +pyyaml = "^6.0" +requests = "^2.28.2" + +[package.source] +type = "git" +url = "https://github.com/airbytehq/airbyte.git" +reference = "aj/ci_credentials/make-portable-as-library" +resolved_reference = "13ba054ccf14df74d2bb7a07f8ff81f7ee4d2992" +subdirectory = "airbyte-ci/connectors/ci_credentials" + +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -393,6 +429,26 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "common-utils" +version = "0.0.0" +description = "Suite of all often used classes and common functions" +optional = false +python-versions = "^3.10" +files = [] +develop = false + +[package.dependencies] 
+pyjwt = "^2.8.0" +requests = "^2.31.0" + +[package.source] +type = "git" +url = "https://github.com/airbytehq/airbyte.git" +reference = "HEAD" +resolved_reference = "27e851c5caf9bfc8d9e555370a0aeedef959c67d" +subdirectory = "airbyte-ci/connectors/common_utils" + [[package]] name = "cryptography" version = "41.0.7" @@ -2699,13 +2755,13 @@ files = [ [[package]] name = "typing-extensions" -version = "4.10.0" +version = "4.11.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.10.0-py3-none-any.whl", hash = "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475"}, - {file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"}, + {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, + {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, ] [[package]] @@ -2867,7 +2923,10 @@ files = [ {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, ] +[extras] +integ-testing = [] + [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "9a031cf5b629604d4b79ad9aa2e93f86a0cc6eb1a07d3ffaf3c6aff29acc7d21" +content-hash = "81ff8fcf1213593612fbd388bfc7fab9d3de77e3bc6988df773ab62d8f9b1203" diff --git a/pyproject.toml b/pyproject.toml index 6f119072..da7afb12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,12 @@ responses = "^0.25.0" airbyte-source-pokeapi = "^0.2.0" pytest-mock = "^3.14.0" +# TODO: Move to 'main' branch dependencies once merged: https://github.com/airbytehq/airbyte/pull/35938 +ci_credentials = { python = "^3.10", git = "https://github.com/airbytehq/airbyte.git", branch = "aj/ci_credentials/make-portable-as-library", subdirectory="airbyte-ci/connectors/ci_credentials" } + +[tool.poetry.extras] +integ-testing = ["ci_credentials"] + [build-system] requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"] build-backend = "poetry_dynamic_versioning.backend" From 7ee3c72d0212a5ff0755b1919f9ca2ca37811e7e Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 12:07:19 -0700 Subject: [PATCH 056/118] chore: require pytest marks to be declared, add missing mark --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index da7afb12..d6002d4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,9 +74,11 @@ requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"] build-backend = "poetry_dynamic_versioning.backend" [tool.pytest.ini_options] +addopts = "--strict-markers" markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", - "requires_creds: marks a test as requiring credentials (skip when secrets unavailable)" + "requires_creds: marks a test as requiring credentials (skip when secrets unavailable)", + "linting: marks a test as a linting test", ] [tool.ruff.pylint] From 6cbd360ba9f4f35929fa34f4beb5c285966f9482 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 12:14:18 -0700 Subject: [PATCH 057/118] add 'super_slow' pytest mark and skip these in ci --- .github/workflows/python_pytest.yml | 2 +- .github/workflows/test-pr-command.yml | 2 +- pyproject.toml | 1 + tests/conftest.py | 3 +++ 
tests/integration_tests/cloud/test_cloud_sql_reads.py | 1 + tests/integration_tests/cloud/test_cloud_sync.py | 2 ++ 6 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index 395bea36..a75d51a4 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -114,4 +114,4 @@ jobs: - name: Run Pytest env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} - run: poetry run pytest -m "not linting" + run: poetry run pytest -m "not linting and not super_slow" diff --git a/.github/workflows/test-pr-command.yml b/.github/workflows/test-pr-command.yml index 7937311b..7a71a9e4 100644 --- a/.github/workflows/test-pr-command.yml +++ b/.github/workflows/test-pr-command.yml @@ -82,7 +82,7 @@ jobs: - name: Run Pytest env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} - run: poetry run pytest + run: poetry run pytest -m "not super_slow" log-success-comment: name: Append 'Success' Comment diff --git a/pyproject.toml b/pyproject.toml index d6002d4a..f8abff5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,7 @@ markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", "requires_creds: marks a test as requiring credentials (skip when secrets unavailable)", "linting: marks a test as a linting test", + "super_slow: these super slow tests will not run in CI; they will only ever run on-demand", ] [tool.ruff.pylint] diff --git a/tests/conftest.py b/tests/conftest.py index 98f7f417..ebecc671 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -186,6 +186,9 @@ def test_priority(item: Item) -> int: item.add_marker(pytest.mark.slow) item.add_marker(pytest.mark.requires_creds) + if "super_slow" in item.keywords: + # Super slow tests are also slow + item.add_marker("slow") def is_port_in_use(port): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index d4359fd2..4aa369c6 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -26,6 +26,7 @@ def previous_job_run_id() -> str: return "10136196" +@pytest.mark.super_slow def test_deploy_and_run_and_read( cloud_workspace: cloud.CloudWorkspace, new_deployable_cache: ab.BigQueryCache | ab.SnowflakeCache, diff --git a/tests/integration_tests/cloud/test_cloud_sync.py b/tests/integration_tests/cloud/test_cloud_sync.py index 72d04a20..4e4bae95 100644 --- a/tests/integration_tests/cloud/test_cloud_sync.py +++ b/tests/integration_tests/cloud/test_cloud_sync.py @@ -17,6 +17,7 @@ def pre_created_connection_id() -> str: return "80857d37-1f21-4500-a802-f5ac08d1a3dd" +@pytest.mark.super_slow def test_run_connection( cloud_workspace: CloudWorkspace, pre_created_connection_id: str, @@ -26,6 +27,7 @@ def test_run_connection( _ = sync_result +@pytest.mark.super_slow @pytest.mark.skip(reason="This test is not yet complete. 
It is hanging currently.") def test_deploy_and_run_connection( cloud_workspace: CloudWorkspace, From 997d01429b2490dfc54c010dbf448c523787736a Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 12:20:51 -0700 Subject: [PATCH 058/118] docs: update text in secrets readme --- README.md | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f30c3c5b..50e4b015 100644 --- a/README.md +++ b/README.md @@ -29,24 +29,34 @@ PyAirbyte can auto-import secrets from the following sources: 3. [Google Colab secrets](https://medium.com/@parthdasawant/how-to-use-secrets-in-google-colab-450c38e3ec75). 4. Manual entry via [`getpass`](https://docs.python.org/3.9/library/getpass.html). -_Note: Additional secret store options may be supported in the future. [More info here.](https://github.com/airbytehq/airbyte-lib-private-beta/discussions/5)_ +_Note: You can also build your own secret manager by subclassing the `CustomSecretManager` implementation. For more information, see the `airbyte.secrets.CustomSecretManager` class definiton._ ### Retrieving Secrets ```python -from airbyte import get_secret, SecretSourceEnum +import airbyte as ab -source = get_source("source-github") +source = ab.get_source("source-github") source.set_config( "credentials": { - "personal_access_token": get_secret("GITHUB_PERSONAL_ACCESS_TOKEN"), + "personal_access_token": ab.get_secret("GITHUB_PERSONAL_ACCESS_TOKEN"), } ) ``` -The `get_secret()` function accepts an optional `source` argument of enum type `SecretSourceEnum`. If omitted or set to `SecretSourceEnum.ANY`, PyAirbyte will search all available secrets sources. If `source` is set to a specific source, then only that source will be checked. If a list of `SecretSourceEnum` entries is passed, then the sources will be checked using the provided ordering. +By default, PyAirbyte will search all available secrets sources. The `get_secret()` function also accepts an optional `sources` argument of specific source names (`SecretSourceEnum`) and/or secret manager objects to check. -By default, PyAirbyte will prompt the user for any requested secrets that are not provided via other secret managers. You can disable this prompt by passing `prompt=False` to `get_secret()`. +By default, PyAirbyte will prompt the user for any requested secrets that are not provided via other secret managers. You can disable this prompt by passing `allow_prompt=False` to `get_secret()`. + +For more information, see the `airbyte.secrets` module. + +### Secrets Auto-Discovery + +If you have a secret matching an expected name, PyAirbyte will automatically use it. For example, if you have a secret named `GITHUB_PERSONAL_ACCESS_TOKEN`, PyAirbyte will automatically use it when configuring the GitHub source. + +The naming convention for secrets is as `{CONNECTOR_NAME}_{PROPERTY_NAME}`, for instance `SNOWFLAKE_PASSWORD` and `BIGQUERY_CREDENTIALS_PATH`. + +PyAirbyte will also auto-discover secrets for interop with hosted Airbyte: `AIRBYTE_CLOUD_API_URL`, `AIRBYTE_CLOUD_API_KEY`, etc. ## Connector compatibility @@ -120,7 +130,6 @@ Yes. Just pick the cache type matching the destination - like SnowflakeCache for **6. Can PyAirbyte import a connector from a local directory that has python project files, or does it have to be pip install** Yes, PyAirbyte can use any local install that has a CLI - and will automatically find connectors by name if they are on PATH. 
- ## Changelog and Release Notes For a version history and list of all changes, please see our [GitHub Releases](https://github.com/airbytehq/PyAirbyte/releases) page. From d93e867024e365200140ce06dbf0ee0631f73a48 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Sun, 7 Apr 2024 20:37:18 +0000 Subject: [PATCH 059/118] Auto-fix lint and format issues --- airbyte/secrets.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/airbyte/secrets.py b/airbyte/secrets.py index 382db05a..b2518d89 100644 --- a/airbyte/secrets.py +++ b/airbyte/secrets.py @@ -8,7 +8,7 @@ from abc import ABC, abstractmethod from enum import Enum from getpass import getpass -from typing import TYPE_CHECKING, Any, cast +from typing import Any, cast from dotenv import dotenv_values @@ -16,10 +16,6 @@ from airbyte._util import meta -if TYPE_CHECKING: - from collections.abc import Callable - - try: from google.colab import userdata as colab_userdata except ImportError: From 9ac8cfb2b2870d0f2e414651c18aea7f9a59ef96 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 13:49:39 -0700 Subject: [PATCH 060/118] chore: install all extras in ci --- .github/workflows/autofix.yml | 2 +- .github/workflows/fix-pr-command.yml | 2 +- .github/workflows/pydoc_preview.yml | 2 +- .github/workflows/pydoc_publish.yml | 2 +- .github/workflows/python_pytest.yml | 6 +++--- .github/workflows/test-pr-command.yml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/autofix.yml b/.github/workflows/autofix.yml index 4e9ecd51..0b4976b3 100644 --- a/.github/workflows/autofix.yml +++ b/.github/workflows/autofix.yml @@ -25,7 +25,7 @@ jobs: cache: 'poetry' - name: Install dependencies - run: poetry install + run: poetry install --all-extras - name: Format code run: poetry run ruff format . 
diff --git a/.github/workflows/fix-pr-command.yml b/.github/workflows/fix-pr-command.yml index 712fd852..00206423 100644 --- a/.github/workflows/fix-pr-command.yml +++ b/.github/workflows/fix-pr-command.yml @@ -79,7 +79,7 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'poetry' - name: Install dependencies - run: poetry install + run: poetry install --all-extras # Fix any lint or format issues diff --git a/.github/workflows/pydoc_preview.yml b/.github/workflows/pydoc_preview.yml index 8284dfde..ee052ecb 100644 --- a/.github/workflows/pydoc_preview.yml +++ b/.github/workflows/pydoc_preview.yml @@ -27,7 +27,7 @@ jobs: cache: 'poetry' - name: Install dependencies - run: poetry install + run: poetry install --all-extras - name: Generate documentation run: | diff --git a/.github/workflows/pydoc_publish.yml b/.github/workflows/pydoc_publish.yml index 0d719dbb..9f2df8ec 100644 --- a/.github/workflows/pydoc_publish.yml +++ b/.github/workflows/pydoc_publish.yml @@ -46,7 +46,7 @@ jobs: uses: actions/configure-pages@v4 - name: Install dependencies - run: poetry install + run: poetry install --all-extras - name: Generate documentation run: | diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index a75d51a4..20418e0a 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -37,7 +37,7 @@ jobs: python-version: '3.10' cache: 'poetry' - name: Install dependencies - run: poetry install + run: poetry install --all-extras # Job-specific step(s): - name: Run Pytest (Fast Tests Only) @@ -65,7 +65,7 @@ jobs: python-version: '3.10' cache: 'poetry' - name: Install dependencies - run: poetry install + run: poetry install --all-extras # Job-specific step(s): - name: Run Pytest (No-Creds) @@ -108,7 +108,7 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'poetry' - name: Install dependencies - run: poetry install + run: poetry install --all-extras # Job-specific step(s): - name: Run Pytest diff --git a/.github/workflows/test-pr-command.yml b/.github/workflows/test-pr-command.yml index 7a71a9e4..d1bf2fce 100644 --- a/.github/workflows/test-pr-command.yml +++ b/.github/workflows/test-pr-command.yml @@ -77,7 +77,7 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'poetry' - name: Install dependencies - run: poetry install + run: poetry install --all-extras - name: Run Pytest env: From e38d90e46f223fe0d9a6821138b66e4b85c33e22 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 13:54:43 -0700 Subject: [PATCH 061/118] lint: fix --- airbyte/cloud/_destinations.py | 2 +- airbyte/secrets.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/airbyte/cloud/_destinations.py b/airbyte/cloud/_destinations.py index 0ffa4584..f5f177c0 100644 --- a/airbyte/cloud/_destinations.py +++ b/airbyte/cloud/_destinations.py @@ -48,7 +48,7 @@ def _get_destination_response(self, *, force_refresh: bool = False) -> Destinati def get_destination_config( self, - ) -> DestinationBigquery | DestinationDuckdb | DestinationPostgres | DestinationSnowflake | Any: + ) -> DestinationBigquery | DestinationDuckdb | DestinationPostgres | DestinationSnowflake | Any: # noqa: ANN401 """Get the destination configuration.""" return self._get_destination_response().configuration diff --git a/airbyte/secrets.py b/airbyte/secrets.py index b2518d89..86afe366 100644 --- a/airbyte/secrets.py +++ b/airbyte/secrets.py @@ -230,8 +230,8 @@ def register_secret_manager(secret_manager: CustomSecretManager) -> None: def 
disable_secret_source(source: SecretManager | SecretSourceEnum) -> None: """Disable one of the default secrets sources. - This function can accept either a `SecretManager` instance, a `SecretSourceEnum` enum value, or a - string representing the name of the source to disable. + This function can accept either a `SecretManager` instance, a `SecretSourceEnum` enum value, or + a string representing the name of the source to disable. """ if isinstance(source, SecretManager) and source in _SECRETS_SOURCES: _SECRETS_SOURCES.remove(source) @@ -253,10 +253,10 @@ def get_secret( ) -> str: """Get a secret from the environment. - The optional `sources` argument of enum type `SecretSourceEnum` or list of `SecretSourceEnum` options. - If left blank, the `sources` arg will be `SecretSourceEnum.ANY`. If `source` is set to a specific - source, then only that source will be checked. If a list of `SecretSourceEnum` entries is passed, - then the sources will be checked using the provided ordering. + The optional `sources` argument of enum type `SecretSourceEnum` or list of `SecretSourceEnum` + options. If left blank, the `sources` arg will be `SecretSourceEnum.ANY`. If `source` is set to + a specific source, then only that source will be checked. If a list of `SecretSourceEnum` + entries is passed, then the sources will be checked using the provided ordering. If `prompt` to `True` or if SecretSourceEnum.PROMPT is declared in the `source` arg, then the user will be prompted to enter the secret if it is not found in any of the other sources. From 36f5c84a70d64d523a19f6d64216ed082c3f8936 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 14:37:19 -0700 Subject: [PATCH 062/118] move conftest into cloud subfolder --- tests/conftest.py | 15 --------------- tests/integration_tests/cloud/conftest.py | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index ebecc671..e571b44f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -345,21 +345,6 @@ def new_snowflake_cache(): connection.execute(f"DROP SCHEMA IF EXISTS {config.schema_name}") -@pytest.fixture(autouse=True, scope="session") -def with_bigquery_credentials_path_env_var(): - dest_bigquery_config = get_ci_secret_json( - secret_name="SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" - ) - credentials_json = dest_bigquery_config["credentials_json"] - - with as_temp_files([credentials_json]) as (credentials_path,): - os.environ["BIGQUERY_CREDENTIALS_PATH"] = credentials_path - - yield - - return - - @pytest.fixture @pytest.mark.requires_creds def new_bigquery_cache(): diff --git a/tests/integration_tests/cloud/conftest.py b/tests/integration_tests/cloud/conftest.py index 891ffd61..dd353508 100644 --- a/tests/integration_tests/cloud/conftest.py +++ b/tests/integration_tests/cloud/conftest.py @@ -52,6 +52,22 @@ def api_key() -> str: return os.environ[ENV_AIRBYTE_API_KEY] +@pytest.mark.requires_creds +@pytest.fixture(autouse=True, scope="session") +def bigquery_credentials_file(): + dest_bigquery_config = get_ci_secret_json( + secret_name="SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" + ) + credentials_json = dest_bigquery_config["credentials_json"] + + with as_temp_files([credentials_json]) as (credentials_path,): + os.environ["BIGQUERY_CREDENTIALS_PATH"] = credentials_path + + yield + + return + + @pytest.fixture def motherduck_api_key() -> str: dotenv_vars: dict[str, str | None] = dotenv_values() From 9337537b2dcfb66728ce01c21ec6491a34cdcf4d Mon Sep 17 00:00:00 2001 From: 
Aaron Steers Date: Sun, 7 Apr 2024 14:42:14 -0700 Subject: [PATCH 063/118] fix no-creds test filter --- .github/workflows/python_pytest.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index 20418e0a..1a133462 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -72,7 +72,9 @@ jobs: env: # Force this to an invalid value to ensure tests that no creds are required are run. GCP_GSM_CREDENTIALS: "no-creds" - run: poetry run pytest -m "not requires_creds" + run: > + poetry run pytest -m + "not requires_creds and not linting and not super_slow" pytest: name: Pytest (All, Python ${{ matrix.python-version }}, ${{ matrix.os }}) From 3f5f87985f82699cfeb0f2fc61e3497b4a324c91 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 14:48:45 -0700 Subject: [PATCH 064/118] skip requires_creds tests on 3.9 --- .github/workflows/python_pytest.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index 1a133462..0f4e1ced 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -113,7 +113,11 @@ jobs: run: poetry install --all-extras # Job-specific step(s): - - name: Run Pytest + - name: "Run Pytest ${{ matrix.python-version == '3.9' && '(No Creds)' || '' }}" env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} - run: poetry run pytest -m "not linting and not super_slow" + # We have to exclude `requires_creds` tests on 3.9, because the `airbyte_ci` package + # is not available for 3.9, and the tests that require creds will fail. + run: > + poetry run pytest -m "not linting and not super_slow + ${{ matrix.python-version == '3.9' && 'and not requires_creds' || '' }}" From dd360e1d31205b54e2679855d013964260066dae Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 15:15:18 -0700 Subject: [PATCH 065/118] chore: refactor fixture hierarchy --- tests/conftest.py | 207 +--------------- tests/integration_tests/conftest.py | 226 ++++++++++++++++++ .../test_bigquery_cache.py | 0 3 files changed, 227 insertions(+), 206 deletions(-) create mode 100644 tests/integration_tests/conftest.py rename tests/{unit_tests => integration_tests}/test_bigquery_cache.py (100%) diff --git a/tests/conftest.py b/tests/conftest.py index e571b44f..60ec62ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,6 @@ import socket import subprocess import time -from ci_credentials import RemoteSecret, get_connector_secrets from requests.exceptions import HTTPError import ulid @@ -27,7 +26,6 @@ import psycopg2 as psycopg import pytest from _pytest.nodes import Item -from sqlalchemy import create_engine from airbyte.caches import PostgresCache from airbyte._executor import _get_bin_dir @@ -46,107 +44,6 @@ LOCAL_TEST_REGISTRY_URL = "./tests/integration_tests/fixtures/registry.json" -AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" - - -def get_ci_secret( - secret_name, - project_name: str = AIRBYTE_INTERNAL_GCP_PROJECT, -) -> str: - return get_gcp_secret(project_name=project_name, secret_name=secret_name) - - -def get_ci_secret_json( - secret_name, - project_name: str = AIRBYTE_INTERNAL_GCP_PROJECT, -) -> dict: - return json.loads(get_ci_secret(secret_name=secret_name, project_name=project_name)) - - -class AirbyteIntegrationTestSecretManager(CustomSecretManager): - """Custom secret manager for Airbyte integration 
tests. - - This class is used to auto-retrieve needed secrets from GSM. - """ - auto_register = True - replace_existing = False - as_backup = True - - def get_secret( - self, - secret_name: str, - *, - required: bool = False, - ) -> str | None: - """This method attempts to find matching properties within the integration test config. - - If `required` is `True`, this method will raise an exception if the secret is not found. - Otherwise, it will return None. - """ - system_name = secret_name.split("_")[0].lower() - property_name = "_".join(secret_name.split("_")[1:]).lower() - - mapping = { - "snowflake": "destination-snowflake", - "bigquery": "destination-bigquery", - "postgres": "destination-postgres", - "duckdb": "destination-duckdb", - } - if system_name not in mapping: - return None - - connector_name = mapping[system_name] - connector_config = self.get_connector_config(connector_name) - if "credentials" in connector_config: - if property_name in connector_config["credentials"]: - return connector_config["credentials"][property_name] - - if property_name in connector_config: - return connector_config[property_name] - - if not required: - return None - - raise KeyError( - f"Property '{property_name}' not found in '{connector_name}' connector config. " - f"\nAvailable config keys: {', '.join(connector_config.keys())} " - f"\nAvailable 'credential' keys: {', '.join(connector_config.get('credentials', {}).keys())} " - ) - - - def get_connector_config(self, connector_name: str, index: int = 0) -> dict | None: - assert connector_name is not None and connector_name != "all", \ - "We can only retrieve one connector config at a time." - - gcp_gsm_credentials = ab.get_secret("GCP_GSM_CREDENTIALS") - secrets: list[RemoteSecret] = [] - secrets, _ = get_connector_secrets( - connector_name=connector_name, - gcp_gsm_credentials=gcp_gsm_credentials, - disable_masking=True, - ) - - if len(secrets) > 1: - print( - f"Found {len(secrets)} secrets for connector '{connector_name}'." - ) - else: - print( - f"Found '{connector_name}' credentials." - ) - - if index >= len(secrets): - raise IndexError(f"Index {index} is out of range for connector '{connector_name}'.") - - return secrets[index].value_dict - - -@pytest.fixture(autouse=True, scope="session") -def airbyte_integration_test_secrets_manager() -> AirbyteIntegrationTestSecretManager: - """Create a new instance of the custom secret manager.""" - - return AirbyteIntegrationTestSecretManager() - def pytest_collection_modifyitems(items: list[Item]) -> None: """Override default pytest behavior, sorting our tests in a sensible execution order. 
@@ -190,6 +87,7 @@ def test_priority(item: Item) -> int: # Super slow tests are also slow item.add_marker("slow") + def is_port_in_use(port): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: return s.connect_ex(("localhost", port)) == 0 @@ -308,67 +206,6 @@ def new_postgres_cache(): postgres.remove() -@pytest.fixture -def new_motherduck_cache( - airbyte_integration_test_secrets_manager: AirbyteIntegrationTestSecretManager, -) -> MotherDuckCache: - config = airbyte_integration_test_secrets_manager.get_connector_config( - connector_name="destination-duckdb", - ) - return MotherDuckCache( - database="integration_tests_deleteany", - schema_name=f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}", - api_key=config["motherduck_api_key"], - ) - - -@pytest.fixture -def new_snowflake_cache(): - secret = get_ci_secret_json( - "AIRBYTE_LIB_SNOWFLAKE_CREDS", - ) - config = SnowflakeCache( - account=secret["account"], - username=secret["username"], - password=secret["password"], - database=secret["database"], - warehouse=secret["warehouse"], - role=secret["role"], - schema_name=f"test{str(ulid.ULID()).lower()[-6:]}", - ) - sqlalchemy_url = config.get_sql_alchemy_url() - - yield config - - engine = create_engine(config.get_sql_alchemy_url()) - with engine.begin() as connection: - connection.execute(f"DROP SCHEMA IF EXISTS {config.schema_name}") - - -@pytest.fixture -@pytest.mark.requires_creds -def new_bigquery_cache(): - dest_bigquery_config = get_ci_secret_json( - "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" - ) - - dataset_name = f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}" - credentials_json = dest_bigquery_config["credentials_json"] - with as_temp_files([credentials_json]) as (credentials_path,): - cache = BigQueryCache( - credentials_path=credentials_path, - project_name=dest_bigquery_config["project_id"], - dataset_name=dataset_name, - ) - yield cache - - url = cache.get_sql_alchemy_url() - engine = create_engine(url) - with suppress(Exception): - with engine.begin() as connection: - connection.execute(f"DROP SCHEMA IF EXISTS {cache.schema_name}") - - @pytest.fixture(autouse=True) def source_test_registry(monkeypatch): """ @@ -422,45 +259,3 @@ def source_test_installation(): @pytest.fixture(scope="function") def new_duckdb_cache() -> DuckDBCache: return new_local_cache() - - -@pytest.fixture(scope="function") -def new_motherduck_cache() -> MotherDuckCache: - return MotherDuckCache( - api_key=ab.get_secret("MOTHERDUCK_API_KEY"), - schema_name=f"test{str(ulid.ULID()).lower()[-6:]}", - database="integration_tests_deleteany", - ) - - -@pytest.fixture(scope="function") -def new_generic_cache(request) -> CacheBase: - """This is a placeholder fixture that will be overridden by pytest_generate_tests().""" - return request.getfixturevalue(request.param) - - -def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: - """Override default pytest behavior, parameterizing our tests based on the available cache types. - - This is useful for running the same tests with different cache types, to ensure that the tests - can pass across all cache types. 
- """ - all_cache_type_fixtures: dict[str, str] = { - # Ordered by priority (fastest first) - "DuckDB": "new_duckdb_cache", - "Postgres": "new_postgres_cache", - "BigQuery": "new_bigquery_cache", - "Snowflake": "new_snowflake_cache", - } - if is_windows(): - # Postgres tests require Linux containers - all_cache_type_fixtures.pop("Postgres") - - if "new_generic_cache" in metafunc.fixturenames: - metafunc.parametrize( - "new_generic_cache", - all_cache_type_fixtures.values(), - ids=all_cache_type_fixtures.keys(), - indirect=True, - scope="function", - ) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py new file mode 100644 index 00000000..6dccb691 --- /dev/null +++ b/tests/integration_tests/conftest.py @@ -0,0 +1,226 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Fixtures for integration tests.""" + +from __future__ import annotations +import json + +import pytest +import ulid +from sqlalchemy import create_engine + +from airbyte._util import meta +from airbyte._util.google_secrets import get_gcp_secret +from airbyte.caches.motherduck import MotherDuckCache +from airbyte.caches.snowflake import SnowflakeCache +from airbyte.secrets import CustomSecretManager + +import airbyte as ab + +AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" + + +def get_ci_secret( + secret_name, + project_name: str = AIRBYTE_INTERNAL_GCP_PROJECT, +) -> str: + return get_gcp_secret(project_name=project_name, secret_name=secret_name) + + +def get_ci_secret_json( + secret_name, + project_name: str = AIRBYTE_INTERNAL_GCP_PROJECT, +) -> dict: + return json.loads(get_ci_secret(secret_name=secret_name, project_name=project_name)) + + +def get_connector_config(self, connector_name: str, index: int = 0) -> dict | None: + # Import here because `airbyte_ci` may not be available in all environments: + from ci_credentials import RemoteSecret, get_connector_secrets + + assert connector_name is not None and connector_name != "all", \ + "We can only retrieve one connector config at a time." + + gcp_gsm_credentials = ab.get_secret("GCP_GSM_CREDENTIALS") + secrets: list[RemoteSecret] = [] + secrets, _ = get_connector_secrets( + connector_name=connector_name, + gcp_gsm_credentials=gcp_gsm_credentials, + disable_masking=True, + ) + + if len(secrets) > 1: + print( + f"Found {len(secrets)} secrets for connector '{connector_name}'." + ) + else: + print( + f"Found '{connector_name}' credentials." + ) + + if index >= len(secrets): + raise IndexError(f"Index {index} is out of range for connector '{connector_name}'.") + + return secrets[index].value_dict + + +class AirbyteIntegrationTestSecretManager(CustomSecretManager): + """Custom secret manager for Airbyte integration tests. + + This class is used to auto-retrieve needed secrets from GSM. + """ + auto_register = True + replace_existing = False + as_backup = True + + def get_secret( + self, + secret_name: str, + *, + required: bool = False, + ) -> str | None: + """This method attempts to find matching properties within the integration test config. + + If `required` is `True`, this method will raise an exception if the secret is not found. + Otherwise, it will return None. 
+ """ + system_name = secret_name.split("_")[0].lower() + property_name = "_".join(secret_name.split("_")[1:]).lower() + + mapping = { + "snowflake": "destination-snowflake", + "bigquery": "destination-bigquery", + "postgres": "destination-postgres", + "duckdb": "destination-duckdb", + } + if system_name not in mapping: + return None + + connector_name = mapping[system_name] + connector_config = self.get_connector_config(connector_name) + if "credentials" in connector_config: + if property_name in connector_config["credentials"]: + return connector_config["credentials"][property_name] + + if property_name in connector_config: + return connector_config[property_name] + + if not required: + return None + + raise KeyError( + f"Property '{property_name}' not found in '{connector_name}' connector config. " + f"\nAvailable config keys: {', '.join(connector_config.keys())} " + f"\nAvailable 'credential' keys: {', '.join(connector_config.get('credentials', {}).keys())} " + ) + + +@pytest.fixture(autouse=True, scope="session") +def airbyte_integration_test_secrets_manager() -> AirbyteIntegrationTestSecretManager: + """Create a new instance of the custom secret manager.""" + + return AirbyteIntegrationTestSecretManager() + + +@pytest.fixture +def new_motherduck_cache( + airbyte_integration_test_secrets_manager: AirbyteIntegrationTestSecretManager, +) -> MotherDuckCache: + config = airbyte_integration_test_secrets_manager.get_connector_config( + connector_name="destination-duckdb", + ) + return MotherDuckCache( + database="integration_tests_deleteany", + schema_name=f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}", + api_key=config["motherduck_api_key"], + ) + + +@pytest.fixture +def new_snowflake_cache(): + secret = get_ci_secret_json( + "AIRBYTE_LIB_SNOWFLAKE_CREDS", + ) + config = SnowflakeCache( + account=secret["account"], + username=secret["username"], + password=secret["password"], + database=secret["database"], + warehouse=secret["warehouse"], + role=secret["role"], + schema_name=f"test{str(ulid.ULID()).lower()[-6:]}", + ) + sqlalchemy_url = config.get_sql_alchemy_url() + + yield config + + engine = create_engine(config.get_sql_alchemy_url()) + with engine.begin() as connection: + connection.execute(f"DROP SCHEMA IF EXISTS {config.schema_name}") + + +@pytest.fixture +@pytest.mark.requires_creds +def new_bigquery_cache(): + dest_bigquery_config = get_ci_secret_json( + "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" + ) + + dataset_name = f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}" + credentials_json = dest_bigquery_config["credentials_json"] + with as_temp_files([credentials_json]) as (credentials_path,): + cache = BigQueryCache( + credentials_path=credentials_path, + project_name=dest_bigquery_config["project_id"], + dataset_name=dataset_name, + ) + yield cache + + url = cache.get_sql_alchemy_url() + engine = create_engine(url) + with suppress(Exception): + with engine.begin() as connection: + connection.execute(f"DROP SCHEMA IF EXISTS {cache.schema_name}") + + +@pytest.fixture(scope="function") +def new_motherduck_cache() -> MotherDuckCache: + return MotherDuckCache( + api_key=ab.get_secret("MOTHERDUCK_API_KEY"), + schema_name=f"test{str(ulid.ULID()).lower()[-6:]}", + database="integration_tests_deleteany", + ) + + + + +@pytest.fixture(scope="function") +def new_generic_cache(request) -> CacheBase: + """This is a placeholder fixture that will be overridden by pytest_generate_tests().""" + return request.getfixturevalue(request.param) + + +def pytest_generate_tests(metafunc: 
pytest.Metafunc) -> None: + """Override default pytest behavior, parameterizing our tests based on the available cache types. + + This is useful for running the same tests with different cache types, to ensure that the tests + can pass across all cache types. + """ + all_cache_type_fixtures: dict[str, str] = { + # Ordered by priority (fastest first) + "DuckDB": "new_duckdb_cache", + "Postgres": "new_postgres_cache", + "BigQuery": "new_bigquery_cache", + "Snowflake": "new_snowflake_cache", + } + if meta.is_windows(): + # Postgres tests require Linux containers + all_cache_type_fixtures.pop("Postgres") + + if "new_generic_cache" in metafunc.fixturenames: + metafunc.parametrize( + "new_generic_cache", + all_cache_type_fixtures.values(), + ids=all_cache_type_fixtures.keys(), + indirect=True, + scope="function", + ) diff --git a/tests/unit_tests/test_bigquery_cache.py b/tests/integration_tests/test_bigquery_cache.py similarity index 100% rename from tests/unit_tests/test_bigquery_cache.py rename to tests/integration_tests/test_bigquery_cache.py From e62c6d41f5b3fc0ae2b17cfc1ad79fbb2b32226c Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 15:24:24 -0700 Subject: [PATCH 066/118] chore: fix tests --- airbyte/_util/temp_files.py | 33 +++++++++++++++++++++++ airbyte/sources/base.py | 22 +-------------- tests/conftest.py | 1 - tests/integration_tests/cloud/conftest.py | 17 +----------- tests/integration_tests/conftest.py | 23 ++++++++++++++-- 5 files changed, 56 insertions(+), 40 deletions(-) create mode 100644 airbyte/_util/temp_files.py diff --git a/airbyte/_util/temp_files.py b/airbyte/_util/temp_files.py new file mode 100644 index 00000000..a1a56532 --- /dev/null +++ b/airbyte/_util/temp_files.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+"""___""" + +from __future__ import annotations + +import json +import tempfile +from contextlib import contextmanager, suppress +from pathlib import Path +from typing import TYPE_CHECKING, Any + + +if TYPE_CHECKING: + from collections.abc import Generator + + +@contextmanager +def as_temp_files(files_contents: list[dict | str]) -> Generator[list[str], Any, None]: + """Write the given contents to temporary files and yield the file paths as strings.""" + temp_files: list[Any] = [] + try: + for content in files_contents: + temp_file = tempfile.NamedTemporaryFile(mode="w+t", delete=False) + temp_file.write( + json.dumps(content) if isinstance(content, dict) else content, + ) + temp_file.flush() + temp_files.append(temp_file) + yield [file.name for file in temp_files] + finally: + for temp_file in temp_files: + with suppress(Exception): + Path(temp_file.name).unlink() diff --git a/airbyte/sources/base.py b/airbyte/sources/base.py index dcc029da..7e7041e0 100644 --- a/airbyte/sources/base.py +++ b/airbyte/sources/base.py @@ -2,9 +2,7 @@ from __future__ import annotations import json -import tempfile import warnings -from contextlib import contextmanager, suppress from pathlib import Path from typing import TYPE_CHECKING, Any, cast @@ -37,6 +35,7 @@ log_source_check_result, send_telemetry, ) +from airbyte._util.temp_files import as_temp_files from airbyte.caches.util import get_default_cache from airbyte.datasets._lazy import LazyDataset from airbyte.progress import progress @@ -56,25 +55,6 @@ from airbyte.documents import Document -@contextmanager -def as_temp_files(files_contents: list[dict | str]) -> Generator[list[str], Any, None]: - """Write the given contents to temporary files and yield the file paths as strings.""" - temp_files: list[Any] = [] - try: - for content in files_contents: - temp_file = tempfile.NamedTemporaryFile(mode="w+t", delete=False) - temp_file.write( - json.dumps(content) if isinstance(content, dict) else content, - ) - temp_file.flush() - temp_files.append(temp_file) - yield [file.name for file in temp_files] - finally: - for temp_file in temp_files: - with suppress(Exception): - Path(temp_file.name).unlink() - - class Source: """A class representing a source that can be called.""" diff --git a/tests/conftest.py b/tests/conftest.py index 60ec62ed..8fcf5702 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,7 +31,6 @@ from airbyte._executor import _get_bin_dir from airbyte.caches.util import new_local_cache from airbyte.secrets import CustomSecretManager -from airbyte.sources.base import as_temp_files import airbyte as ab diff --git a/tests/integration_tests/cloud/conftest.py b/tests/integration_tests/cloud/conftest.py index dd353508..23b5ba1f 100644 --- a/tests/integration_tests/cloud/conftest.py +++ b/tests/integration_tests/cloud/conftest.py @@ -12,6 +12,7 @@ from airbyte._executor import _get_bin_dir from airbyte.caches.base import CacheBase from airbyte.cloud import CloudWorkspace +from airbyte._util.temp_files import as_temp_files ENV_AIRBYTE_API_KEY = "AIRBYTE_CLOUD_API_KEY" @@ -52,22 +53,6 @@ def api_key() -> str: return os.environ[ENV_AIRBYTE_API_KEY] -@pytest.mark.requires_creds -@pytest.fixture(autouse=True, scope="session") -def bigquery_credentials_file(): - dest_bigquery_config = get_ci_secret_json( - secret_name="SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" - ) - credentials_json = dest_bigquery_config["credentials_json"] - - with as_temp_files([credentials_json]) as (credentials_path,): - os.environ["BIGQUERY_CREDENTIALS_PATH"] = 
credentials_path - - yield - - return - - @pytest.fixture def motherduck_api_key() -> str: dotenv_vars: dict[str, str | None] = dotenv_values() diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 6dccb691..55853abc 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -2,7 +2,9 @@ """Fixtures for integration tests.""" from __future__ import annotations +from contextlib import suppress import json +import os import pytest import ulid @@ -10,9 +12,12 @@ from airbyte._util import meta from airbyte._util.google_secrets import get_gcp_secret +from airbyte.caches.base import CacheBase +from airbyte.caches.bigquery import BigQueryCache from airbyte.caches.motherduck import MotherDuckCache from airbyte.caches.snowflake import SnowflakeCache from airbyte.secrets import CustomSecretManager +from airbyte._util.temp_files import as_temp_files import airbyte as ab @@ -182,6 +187,22 @@ def new_bigquery_cache(): connection.execute(f"DROP SCHEMA IF EXISTS {cache.schema_name}") +@pytest.mark.requires_creds +@pytest.fixture(autouse=True, scope="session") +def bigquery_credentials_file(): + dest_bigquery_config = get_ci_secret_json( + secret_name="SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" + ) + credentials_json = dest_bigquery_config["credentials_json"] + + with as_temp_files([credentials_json]) as (credentials_path,): + os.environ["BIGQUERY_CREDENTIALS_PATH"] = credentials_path + + yield + + return + + @pytest.fixture(scope="function") def new_motherduck_cache() -> MotherDuckCache: return MotherDuckCache( @@ -191,8 +212,6 @@ def new_motherduck_cache() -> MotherDuckCache: ) - - @pytest.fixture(scope="function") def new_generic_cache(request) -> CacheBase: """This is a placeholder fixture that will be overridden by pytest_generate_tests().""" From 5a92b57080b05559029effb39afd7b9357ff4473 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 16:02:12 -0700 Subject: [PATCH 067/118] fix: only use class name as last result --- airbyte/secrets.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/airbyte/secrets.py b/airbyte/secrets.py index 86afe366..c25ceb90 100644 --- a/airbyte/secrets.py +++ b/airbyte/secrets.py @@ -58,12 +58,11 @@ class SecretManager(ABC): replace_existing = False as_backup = False - def __init__(self, name: str | None = None) -> None: + def __init__(self) -> None: """Instantiate the new secret manager.""" - - self.name: str = ( # Default to the class name if no name is provided - name or self.__class__.__name__ - ) + if not hasattr(self, "name"): + # Default to the class name if no name is provided + self.name: str = self.__class__.__name__ @abstractmethod def get_secret(self, secret_name: str) -> str | None: @@ -102,8 +101,8 @@ class CustomSecretManager(SecretManager, ABC): replace_existing = False as_backup = False - def __init__(self, name: str | None = None) -> None: - super().__init__(name) + def __init__(self) -> None: + super().__init__() if self.auto_register: self.register() From 89402d1464debb60cfa7c18b0f9fb9ca2e25b11f Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 16:12:23 -0700 Subject: [PATCH 068/118] fix secret manager names --- airbyte/secrets.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/airbyte/secrets.py b/airbyte/secrets.py index c25ceb90..76797c4e 100644 --- a/airbyte/secrets.py +++ b/airbyte/secrets.py @@ -135,7 +135,7 @@ def register(self, *, replace_existing: bool | None = None) -> 
None: class EnvVarSecretManager(CustomSecretManager): """Secret manager that retrieves secrets from environment variables.""" - name = str(SecretSourceEnum.ENV) + name = SecretSourceEnum.ENV.value def get_secret(self, secret_name: str) -> str | None: """Get a named secret from the environment.""" @@ -148,7 +148,7 @@ def get_secret(self, secret_name: str) -> str | None: class DotenvSecretManager(CustomSecretManager): """Secret manager that retrieves secrets from a `.env` file.""" - name = str(SecretSourceEnum.DOTENV) + name = SecretSourceEnum.DOTENV.value def get_secret(self, secret_name: str) -> str | None: """Get a named secret from the `.env` file.""" @@ -168,7 +168,7 @@ def get_secret(self, secret_name: str) -> str | None: class ColabSecretManager(CustomSecretManager): """Secret manager that retrieves secrets from Google Colab user secrets.""" - name = str(SecretSourceEnum.GOOGLE_COLAB) + name = SecretSourceEnum.GOOGLE_COLAB.value def get_secret(self, secret_name: str) -> str | None: """Get a named secret from Google Colab user secrets.""" @@ -186,7 +186,7 @@ def get_secret(self, secret_name: str) -> str | None: class SecretsPrompt(CustomSecretManager): """Secret manager that prompts the user to enter a secret.""" - name = str(SecretSourceEnum.PROMPT) + name = SecretSourceEnum.PROMPT.value def get_secret( self, @@ -253,12 +253,11 @@ def get_secret( """Get a secret from the environment. The optional `sources` argument of enum type `SecretSourceEnum` or list of `SecretSourceEnum` - options. If left blank, the `sources` arg will be `SecretSourceEnum.ANY`. If `source` is set to - a specific source, then only that source will be checked. If a list of `SecretSourceEnum` + options. If left blank, all available sources will be checked. If a list of `SecretSourceEnum` entries is passed, then the sources will be checked using the provided ordering. - If `prompt` to `True` or if SecretSourceEnum.PROMPT is declared in the `source` arg, then the - user will be prompted to enter the secret if it is not found in any of the other sources. + If `allow_prompt` is `True` or if SecretSourceEnum.PROMPT is declared in the `source` arg, then + the user will be prompted to enter the secret if it is not found in any of the other sources. """ if "source" in kwargs: warnings.warn( From 29a19bf4c37d74d5eb97668455e5371ce930a89c Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 16:31:00 -0700 Subject: [PATCH 069/118] declare new SecretString class --- airbyte/secrets.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/airbyte/secrets.py b/airbyte/secrets.py index 76797c4e..64268d65 100644 --- a/airbyte/secrets.py +++ b/airbyte/secrets.py @@ -33,6 +33,19 @@ class SecretSourceEnum(str, Enum): _SECRETS_SOURCES: list[SecretManager] = [] +class SecretString(str): + """A string that represents a secret. + + This class is used to mark a string as a secret. When a secret is printed, it + will be masked to prevent accidental exposure of sensitive information. + """ + + __slots__ = () + + def __repr__(self) -> str: + return "" + + class SecretManager(ABC): """Abstract base class for secret managers. @@ -65,7 +78,7 @@ def __init__(self) -> None: self.name: str = self.__class__.__name__ @abstractmethod - def get_secret(self, secret_name: str) -> str | None: + def get_secret(self, secret_name: str) -> SecretString | None: """Get a named secret from the secret manager. 
This method should be implemented by subclasses to retrieve secrets from @@ -137,12 +150,12 @@ class EnvVarSecretManager(CustomSecretManager): name = SecretSourceEnum.ENV.value - def get_secret(self, secret_name: str) -> str | None: + def get_secret(self, secret_name: str) -> SecretString | None: """Get a named secret from the environment.""" if secret_name not in os.environ: return None - return os.environ[secret_name] + return SecretString(os.environ[secret_name]) class DotenvSecretManager(CustomSecretManager): @@ -150,7 +163,7 @@ class DotenvSecretManager(CustomSecretManager): name = SecretSourceEnum.DOTENV.value - def get_secret(self, secret_name: str) -> str | None: + def get_secret(self, secret_name: str) -> SecretString | None: """Get a named secret from the `.env` file.""" try: dotenv_vars: dict[str, str | None] = dotenv_values() @@ -162,7 +175,7 @@ def get_secret(self, secret_name: str) -> str | None: # Secret not found return None - return dotenv_vars[secret_name] + return SecretString(dotenv_vars[secret_name]) class ColabSecretManager(CustomSecretManager): @@ -170,14 +183,14 @@ class ColabSecretManager(CustomSecretManager): name = SecretSourceEnum.GOOGLE_COLAB.value - def get_secret(self, secret_name: str) -> str | None: + def get_secret(self, secret_name: str) -> SecretString | None: """Get a named secret from Google Colab user secrets.""" if colab_userdata is None: # The module doesn't exist. We probably aren't in Colab. return None try: - return colab_userdata.get(secret_name) + return SecretString(colab_userdata.get(secret_name)) except Exception: # Secret name not found. Continue. return None @@ -191,9 +204,9 @@ class SecretsPrompt(CustomSecretManager): def get_secret( self, secret_name: str, - ) -> str | None: + ) -> SecretString | None: with contextlib.suppress(Exception): - return getpass(f"Enter the value for secret '{secret_name}': ") + return SecretString(getpass(f"Enter the value for secret '{secret_name}': ")) return None @@ -249,7 +262,7 @@ def get_secret( sources: list[SecretManager | SecretSourceEnum] | None = None, allow_prompt: bool = True, **kwargs: dict[str, Any], -) -> str: +) -> SecretString: """Get a secret from the environment. 
The optional `sources` argument of enum type `SecretSourceEnum` or list of `SecretSourceEnum` @@ -308,7 +321,7 @@ def get_secret( for secret_mgr in secret_managers: val = secret_mgr.get_secret(secret_name) if val: - return val + return SecretString(val) raise exc.PyAirbyteSecretNotFoundError( secret_name=secret_name, From 42ca50982804062c2ca726c45e29cc00f355044f Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 17:43:48 -0700 Subject: [PATCH 070/118] apply SecretString to cache config --- airbyte/caches/motherduck.py | 7 ++++--- airbyte/caches/postgres.py | 9 ++++++--- airbyte/caches/snowflake.py | 7 ++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/airbyte/caches/motherduck.py b/airbyte/caches/motherduck.py index 99b599cc..4f538334 100644 --- a/airbyte/caches/motherduck.py +++ b/airbyte/caches/motherduck.py @@ -20,6 +20,7 @@ from airbyte._processors.sql.motherduck import MotherDuckSqlProcessor from airbyte.caches.duckdb import DuckDBCache +from airbyte.secrets import SecretString class MotherDuckCache(DuckDBCache): @@ -27,14 +28,14 @@ class MotherDuckCache(DuckDBCache): db_path: str = Field(default="md:") database: str - api_key: str + api_key: SecretString _sql_processor_class = MotherDuckSqlProcessor @overrides - def get_sql_alchemy_url(self) -> str: + def get_sql_alchemy_url(self) -> SecretString: """Return the SQLAlchemy URL to use.""" - return ( + return SecretString( f"duckdb:///md:{self.database}?motherduck_token={self.api_key}" # f"&schema={self.schema_name}" # TODO: Debug why this doesn't work ) diff --git a/airbyte/caches/postgres.py b/airbyte/caches/postgres.py index 5d4c33e2..c82869ba 100644 --- a/airbyte/caches/postgres.py +++ b/airbyte/caches/postgres.py @@ -23,6 +23,7 @@ from airbyte._processors.sql.postgres import PostgresSqlProcessor from airbyte.caches.base import CacheBase +from airbyte.secrets import SecretString class PostgresCache(CacheBase): @@ -34,15 +35,17 @@ class PostgresCache(CacheBase): host: str port: int username: str - password: str + password: SecretString database: str _sql_processor_class = PostgresSqlProcessor @overrides - def get_sql_alchemy_url(self) -> str: + def get_sql_alchemy_url(self) -> SecretString: """Return the SQLAlchemy URL to use.""" - return f"postgresql+psycopg2://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}" + return SecretString( + f"postgresql+psycopg2://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}" + ) @overrides def get_database_name(self) -> str: diff --git a/airbyte/caches/snowflake.py b/airbyte/caches/snowflake.py index f0e55f3b..4819b919 100644 --- a/airbyte/caches/snowflake.py +++ b/airbyte/caches/snowflake.py @@ -27,6 +27,7 @@ from airbyte._processors.sql.base import RecordDedupeMode from airbyte._processors.sql.snowflake import SnowflakeSqlProcessor from airbyte.caches.base import CacheBase +from airbyte.secrets import SecretString class SnowflakeCache(CacheBase): @@ -34,7 +35,7 @@ class SnowflakeCache(CacheBase): account: str username: str - password: str + password: SecretString warehouse: str database: str role: str @@ -47,9 +48,9 @@ class SnowflakeCache(CacheBase): # schema_name: str @overrides - def get_sql_alchemy_url(self) -> str: + def get_sql_alchemy_url(self) -> SecretString: """Return the SQLAlchemy URL to use.""" - return str( + return SecretString( URL( account=self.account, user=self.username, From afdb236da4bed9a9b91564b2bbfd590b26403235 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 23:05:05 -0700 
Subject: [PATCH 071/118] refactor: add CloudConnection --- airbyte/cloud/_connections.py | 206 ++++++++++++++++++ airbyte/cloud/_sync_results.py | 14 +- airbyte/cloud/_workspaces.py | 68 +++--- airbyte/datasets/_sql.py | 5 +- airbyte/exceptions.py | 2 +- .../cloud/test_cloud_sql_reads.py | 9 + 6 files changed, 257 insertions(+), 47 deletions(-) create mode 100644 airbyte/cloud/_connections.py diff --git a/airbyte/cloud/_connections.py b/airbyte/cloud/_connections.py new file mode 100644 index 00000000..f72fffca --- /dev/null +++ b/airbyte/cloud/_connections.py @@ -0,0 +1,206 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Cloud Connections.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +from airbyte._util import api_util +from airbyte.cloud._sync_results import SyncResult + + +if TYPE_CHECKING: + from airbyte_api.models.shared.connectionresponse import ConnectionResponse + from airbyte_api.models.shared.jobresponse import JobResponse + + from airbyte.cloud._workspaces import CloudWorkspace + + +class CloudConnection: + """A connection is a link between a source and a destination. + + Do not instantiate this class directly. Instead, use the `CloudWorkspace.create_connection` + or `CloudWorkspace.get_connection` methods. + """ + + def __init__( + self, + workspace: CloudWorkspace, + connection_id: str, + source: str | None = None, + destination: str | None = None, + ) -> None: + self.connection_id = connection_id + """The ID of the connection.""" + + self.workspace = workspace + """The workspace that the connection belongs to.""" + + self._source_id = source + """The ID of the source.""" + + self._destination_id = destination + """The ID of the destination.""" + + self._connection_info: ConnectionResponse | None = None + + def _fetch_connection_info(self) -> ConnectionResponse: + """Populate the connection with data from the API.""" + return api_util.get_connection( + workspace_id=self.workspace.workspace_id, + connection_id=self.connection_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + ) + + # Properties + + @property + def source_id(self) -> str: + """The ID of the source.""" + if not self._source_id: + if not self._connection_info: + self._connection_info = self._fetch_connection_info() + + self._source_id = self._connection_info.source_id + + return cast(str, self._source_id) + + @property + def destination_id(self) -> str: + """The ID of the destination.""" + if not self._destination_id: + if not self._connection_info: + self._connection_info = self._fetch_connection_info() + + self._destination_id = self._connection_info.source_id + + return cast(str, self._destination_id) + + @property + def stream_names(self) -> list[str]: + """The stream names.""" + if not self._connection_info: + self._connection_info = self._fetch_connection_info() + + return [stream.name for stream in self._connection_info.configurations.streams] + + @property + def table_prefix(self) -> str: + """The table prefix.""" + if not self._connection_info: + self._connection_info = self._fetch_connection_info() + + return self._connection_info.configurations.prefix + + @property + def connection_url(self) -> str | None: + return f"{self.workspace.workspace_url}/connections/{self.connection_id}" + + @property + def job_history_url(self) -> str | None: + return f"{self.connection_url}/job-history" + + # Run Sync + + def run_sync( + self, + *, + wait: bool = True, + wait_timeout: int = 300, + ) -> SyncResult: + """Run a sync.""" + 
connection_response = api_util.run_connection( + connection_id=self.connection_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + workspace_id=self.workspace.workspace_id, + ) + sync_result = SyncResult( + workspace=self.workspace, + connection=self, + job_id=connection_response.job_id, + ) + + if wait: + sync_result.wait_for_completion( + wait_timeout=wait_timeout, + raise_failure=True, + raise_timeout=True, + ) + + return sync_result + + # Logs + + def get_previous_sync_logs( + self, + *, + limit: int = 10, + ) -> list[SyncResult]: + """Get the previous sync logs for a connection.""" + sync_logs: list[JobResponse] = api_util.get_job_logs( + connection_id=self.connection_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + workspace_id=self.workspace.workspace_id, + limit=limit, + ) + return [ + SyncResult( + workspace=self.workspace, + connection=self, + job_id=sync_log.job_id, + _latest_status=sync_log.status, + ) + for sync_log in sync_logs + ] + + def get_sync_result( + self, + job_id: str | None = None, + ) -> SyncResult | None: + """Get the sync result for the connection. + + If `job_id` is not provided, the most recent sync job will be used. + + Returns `None` if job_id is omitted and no previous jobs are found. + """ + if job_id is None: + # Get the most recent sync job + results = self.get_previous_sync_logs( + limit=1, + ) + if results: + return results[0] + + return None + + # Get the sync job by ID (lazy loaded) + return SyncResult( + workspace=self.workspace, + connection=self, + job_id=job_id, + ) + + # Deletions + + def delete( + self, + *, + delete_source: bool = False, + delete_destination: bool = False, + ) -> None: + """Delete the connection. + + Args: + delete_source: Whether to also delete the source. + delete_destination: Whether to also delete the destination. 
+ """ + self.workspace.delete_connection(connection_id=self.connection_id) + + if delete_source: + self.workspace.delete_source(source=self.source_id) + + if delete_destination: + self.workspace.delete_destination(destination=self.destination_id) diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/_sync_results.py index 5b3b76cb..0d9c2b47 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/_sync_results.py @@ -23,6 +23,7 @@ import sqlalchemy from airbyte.caches.base import CacheBase + from airbyte.cloud._connections import CloudConnection from airbyte.cloud._workspaces import CloudWorkspace @@ -42,7 +43,7 @@ class SyncResult: """The result of a sync operation.""" workspace: CloudWorkspace - connection_id: str + connection: CloudConnection job_id: str table_name_prefix: str = "" table_name_suffix: str = "" @@ -50,6 +51,11 @@ class SyncResult: _connection_response: ConnectionResponse | None = None _cache: CacheBase | None = None + @property + def job_url(self) -> str: + """Return the URL of the sync job.""" + return f"{self.connection.job_history_url}/{self.job_id}" + def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResponse: """Return connection info for the sync job.""" if self._connection_response and not force_refresh: @@ -59,7 +65,7 @@ def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResp workspace_id=self.workspace.workspace_id, api_root=self.workspace.api_root, api_key=self.workspace.api_key, - connection_id=self.connection_id, + connection_id=self.connection.connection_id, ) return self._connection_response @@ -111,7 +117,7 @@ def raise_failure_status( if latest_status in FAILED_STATUSES: raise AirbyteConnectionSyncError( workspace=self.workspace, - connection_id=self.connection_id, + connection_id=self.connection.connection_id, job_id=self.job_id, job_status=self._latest_status, ) @@ -138,7 +144,7 @@ def wait_for_completion( if raise_timeout: raise AirbyteConnectionSyncTimeoutError( workspace=self.workspace, - connection_id=self.connection_id, + connection_id=self.connection.connection_id, job_id=self.job_id, job_status=latest_status, timeout=wait_timeout, diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index ec06961b..7e1720f3 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -23,6 +23,7 @@ get_connection, get_workspace, ) +from airbyte.cloud._connections import CloudConnection from airbyte.cloud._destination_util import get_destination_config_from_cache from airbyte.cloud._sync_results import SyncResult from airbyte.sources.base import Source @@ -48,6 +49,10 @@ class CloudWorkspace: api_key: str api_root: str = CLOUD_API_ROOT + @property + def workspace_url(self) -> str | None: + return f"{self.api_root}/workspaces/{self.workspace_id}" + # Test connection and creds def connect(self) -> None: @@ -62,6 +67,7 @@ def connect(self) -> None: api_key=self.api_key, workspace_id=self.workspace_id, ) + print(f"Successfully connected to workspace: {self.workspace_url}") # Deploy and delete sources @@ -147,16 +153,16 @@ def deploy_cache_as_destination( def delete_destination( self, *, - destination_id: str | None = None, + destination: str | None = None, cache: CacheBase | None = None, ) -> None: """Delete a deployed destination from the workspace. You can pass either the `Cache` class or the deployed destination ID as a `str`. 
""" - if destination_id is None and cache is None: + if destination is None and cache is None: raise ValueError("You must provide either a destination ID or a cache object.") # noqa: TRY003 - if destination_id is not None and cache is not None: + if destination is not None and cache is not None: raise ValueError( # noqa: TRY003 "You must provide either a destination ID or a cache object, not both." ) @@ -165,13 +171,13 @@ def delete_destination( if not cache._deployed_destination_id: # noqa: SLF001 raise ValueError("Cache has not been deployed.") # noqa: TRY003 - destination_id = cache._deployed_destination_id # noqa: SLF001 + destination = cache._deployed_destination_id # noqa: SLF001 - if destination_id is None: + if destination is None: raise ValueError("No destination ID provided.") # noqa: TRY003 delete_destination( - destination_id=destination_id, + destination_id=destination, api_root=self.api_root, api_key=self.api_key, ) @@ -277,7 +283,7 @@ def delete_connection( self.delete_source(source=connection.source_id) if delete_destination: - self.delete_destination(destination_id=connection.destination_id) + self.delete_destination(destination=connection.destination_id) # Run syncs @@ -289,25 +295,11 @@ def run_sync( wait_timeout: int = 300, ) -> SyncResult: """Run a sync on a deployed connection.""" - connection_response = api_util.run_connection( - connection_id=connection_id, - api_root=self.api_root, - api_key=self.api_key, - workspace_id=self.workspace_id, - ) - sync_result = SyncResult( + connection = CloudConnection( workspace=self, - connection_id=connection_response.connection_id, - job_id=connection_response.job_id, + connection_id=connection_id, ) - if wait: - sync_result.wait_for_completion( - wait_timeout=wait_timeout, - raise_failure=True, - raise_timeout=True, - ) - - return sync_result + return connection.run_sync(wait=wait, wait_timeout=wait_timeout) # Get sync results and previous sync logs @@ -322,6 +314,10 @@ def get_sync_result( Returns `None` if job_id is omitted and no previous jobs are found. 
""" + connection = CloudConnection( + workspace=self, + connection_id=connection_id, + ) if job_id is None: results = self.get_previous_sync_logs( connection_id=connection_id, @@ -331,10 +327,13 @@ def get_sync_result( return results[0] return None - - return SyncResult( + connection = CloudConnection( workspace=self, connection_id=connection_id, + ) + return SyncResult( + workspace=self, + connection=connection, job_id=job_id, ) @@ -345,19 +344,10 @@ def get_previous_sync_logs( limit: int = 10, ) -> list[SyncResult]: """Get the previous sync logs for a connection.""" - sync_logs: list[JobResponse] = api_util.get_job_logs( + connection = CloudConnection( + workspace=self, connection_id=connection_id, - api_root=self.api_root, - api_key=self.api_key, - workspace_id=self.workspace_id, + ) + return connection.get_previous_sync_logs( limit=limit, ) - return [ - SyncResult( - workspace=self, - connection_id=sync_log.connection_id, - job_id=sync_log.job_id, - _latest_status=sync_log.status, - ) - for sync_log in sync_logs - ] diff --git a/airbyte/datasets/_sql.py b/airbyte/datasets/_sql.py index 311d6e7d..e9150c69 100644 --- a/airbyte/datasets/_sql.py +++ b/airbyte/datasets/_sql.py @@ -63,9 +63,8 @@ def __init__( except Exception as ex: Warning(f"Failed to get stream configuration for {stream_name}: {ex}") - stream_configuration: ConfiguredAirbyteStream | None = ( - stream_configuration or None # Coalesce False to None - ) + # Coalesce False to None + stream_configuration = stream_configuration or None super().__init__(stream_metadata=stream_configuration) diff --git a/airbyte/exceptions.py b/airbyte/exceptions.py index a87410d0..a3b11bca 100644 --- a/airbyte/exceptions.py +++ b/airbyte/exceptions.py @@ -326,7 +326,7 @@ class AirbyteError(PyAirbyteError): @property def workspace_url(self) -> str | None: if self.workspace: - return f"{self.workspace.api_root}/workspaces/{self.workspace.workspace_id}" + return self.workspace.workspace_url return None diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 4aa369c6..4994bbdf 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -116,6 +116,15 @@ def test_read_from_deployed_connection( assert pandas_df[col].notnull().all() +@pytest.mark.parametrize( + "deployed_connection_id", + [ + pytest.param("c7b4d838-a612-495a-9d91-a14e477add51", id="Faker->Snowflake"), + pytest.param("0e1d6b32-b8e3-4b68-91a3-3a314599c782", id="Faker->BigQuery"), + pytest.param("", id="Faker->Postgres", marks=pytest.mark.skip(reason="Not yet supported")), + pytest.param("", id="Faker->MotherDuck", marks=pytest.mark.skip(reason="Not yet supported")), + ], +) def test_read_from_previous_job( cloud_workspace: cloud.CloudWorkspace, deployed_connection_id: str, From b5a44deeefc464491db3df23d48adb874266d4f9 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sun, 7 Apr 2024 23:12:47 -0700 Subject: [PATCH 072/118] avoid to_pandas on bigquery --- tests/integration_tests/cloud/test_cloud_sql_reads.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 4994bbdf..a695328a 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -105,8 +105,9 @@ def test_read_from_deployed_connection( data_as_list = list(dataset) assert len(data_as_list) 
== 100 - # TODO: Debug why this is super slow: + # TODO: Fails on BigQuery: https://github.com/airbytehq/PyAirbyte/issues/165 # pandas_df = dataset.to_pandas() + pandas_df = pd.DataFrame(data_as_list) assert pandas_df.shape == (100, 20) @@ -150,7 +151,11 @@ def test_read_from_previous_job( data_as_list = list(dataset) assert len(data_as_list) == 100 - pandas_df = dataset.to_pandas() + # TODO: Fails on BigQuery: https://github.com/airbytehq/PyAirbyte/issues/165 + # pandas_df = dataset.to_pandas() + + pandas_df = pd.DataFrame(data_as_list) + assert pandas_df.shape == (100, 20) for col in pandas_df.columns: # Check that no values are null From 700ea828c8933d9c36118e74a39ee9ca419698eb Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 8 Apr 2024 11:47:04 -0700 Subject: [PATCH 073/118] remove unnecessary tests --- tests/integration_tests/test_duckdb_cache.py | 6 ------ tests/integration_tests/test_source_faker_integration.py | 6 ------ 2 files changed, 12 deletions(-) diff --git a/tests/integration_tests/test_duckdb_cache.py b/tests/integration_tests/test_duckdb_cache.py index 203fb24c..b13ceab9 100644 --- a/tests/integration_tests/test_duckdb_cache.py +++ b/tests/integration_tests/test_duckdb_cache.py @@ -75,9 +75,3 @@ def duckdb_cache() -> Generator[DuckDBCache, None, None]: yield cache # TODO: Delete cache DB file after test is complete. return - - -def test_duckdb_cache(duckdb_cache: DuckDBCache) -> None: - """Test that the duckdb cache is available.""" - assert duckdb_cache - assert isinstance(duckdb_cache, DuckDBCache) diff --git a/tests/integration_tests/test_source_faker_integration.py b/tests/integration_tests/test_source_faker_integration.py index 1a2318c6..dde24bbf 100644 --- a/tests/integration_tests/test_source_faker_integration.py +++ b/tests/integration_tests/test_source_faker_integration.py @@ -128,12 +128,6 @@ def test_which_source_faker() -> None: f"Can't find source-faker on PATH: {os.environ['PATH']}" -def test_duckdb_cache(duckdb_cache: DuckDBCache) -> None: - """Test that the duckdb cache is available.""" - assert duckdb_cache - assert isinstance(duckdb_cache, DuckDBCache) - - def test_faker_pks( source_faker_seed_a: ab.Source, duckdb_cache: DuckDBCache, From b55c4677d321362d1124ab306ed11850c52f8697 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 8 Apr 2024 17:02:32 -0700 Subject: [PATCH 074/118] change parent class of built-in secretmanagers --- airbyte/secrets.py | 94 +++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/airbyte/secrets.py b/airbyte/secrets.py index 64268d65..bb088852 100644 --- a/airbyte/secrets.py +++ b/airbyte/secrets.py @@ -102,50 +102,7 @@ def __eq__(self, value: object) -> bool: return super().__eq__(value) -class CustomSecretManager(SecretManager, ABC): - """Custom secret manager that retrieves secrets from a custom source. - - This class is a convenience class that can be used to create custom secret - managers. By default, custom secrets managers are auto-registered during - creation. - """ - - auto_register = True - replace_existing = False - as_backup = False - - def __init__(self) -> None: - super().__init__() - if self.auto_register: - self.register() - - def register(self, *, replace_existing: bool | None = None) -> None: - """Register the secret manager as global secret source. - - This makes the secret manager available to the `get_secret` function and - allows it to be used automatically as a source for secrets. 
- - If `replace_existing` is `True`, the secret manager will replace all existing - secrets sources, including the default secret managers such as environment - variables, dotenv files, and Google Colab secrets. If `replace_existing` is - None or not provided, the default behavior will be used from the `replace_existing` - of the class (`False` unless overridden by the subclass). - """ - if replace_existing is None: - replace_existing = self.replace_existing - - if replace_existing: - _SECRETS_SOURCES.clear() - - if self.as_backup: - # Add to end of list - _SECRETS_SOURCES.append(self) - else: - # Add to beginning of list - _SECRETS_SOURCES.insert(0, self) - - -class EnvVarSecretManager(CustomSecretManager): +class EnvVarSecretManager(SecretManager): """Secret manager that retrieves secrets from environment variables.""" name = SecretSourceEnum.ENV.value @@ -158,7 +115,7 @@ def get_secret(self, secret_name: str) -> SecretString | None: return SecretString(os.environ[secret_name]) -class DotenvSecretManager(CustomSecretManager): +class DotenvSecretManager(SecretManager): """Secret manager that retrieves secrets from a `.env` file.""" name = SecretSourceEnum.DOTENV.value @@ -178,7 +135,7 @@ def get_secret(self, secret_name: str) -> SecretString | None: return SecretString(dotenv_vars[secret_name]) -class ColabSecretManager(CustomSecretManager): +class ColabSecretManager(SecretManager): """Secret manager that retrieves secrets from Google Colab user secrets.""" name = SecretSourceEnum.GOOGLE_COLAB.value @@ -196,7 +153,7 @@ def get_secret(self, secret_name: str) -> SecretString | None: return None -class SecretsPrompt(CustomSecretManager): +class SecretsPrompt(SecretManager): """Secret manager that prompts the user to enter a secret.""" name = SecretSourceEnum.PROMPT.value @@ -211,6 +168,49 @@ def get_secret( return None +class CustomSecretManager(SecretManager, ABC): + """Custom secret manager that retrieves secrets from a custom source. + + This class is a convenience class that can be used to create custom secret + managers. By default, custom secrets managers are auto-registered during + creation. + """ + + auto_register = True + replace_existing = False + as_backup = False + + def __init__(self) -> None: + super().__init__() + if self.auto_register: + self.register() + + def register(self, *, replace_existing: bool | None = None) -> None: + """Register the secret manager as global secret source. + + This makes the secret manager available to the `get_secret` function and + allows it to be used automatically as a source for secrets. + + If `replace_existing` is `True`, the secret manager will replace all existing + secrets sources, including the default secret managers such as environment + variables, dotenv files, and Google Colab secrets. If `replace_existing` is + None or not provided, the default behavior will be used from the `replace_existing` + of the class (`False` unless overridden by the subclass). 
+ """ + if replace_existing is None: + replace_existing = self.replace_existing + + if replace_existing: + _SECRETS_SOURCES.clear() + + if self.as_backup: + # Add to end of list + _SECRETS_SOURCES.append(self) + else: + # Add to beginning of list + _SECRETS_SOURCES.insert(0, self) + + def _get_secret_sources() -> list[SecretManager]: """Initialize the default secret sources.""" if len(_SECRETS_SOURCES) == 0: From 2fc8ce82638b75312750aeed21993e4c2877bba8 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 09:03:00 -0700 Subject: [PATCH 075/118] refactor: secrets module --- .gitignore | 4 +- airbyte/secrets.py | 337 -------------------------------- airbyte/secrets/__init__.py | 33 ++++ airbyte/secrets/base.py | 187 ++++++++++++++++++ airbyte/secrets/config.py | 75 +++++++ airbyte/secrets/env_vars.py | 43 ++++ airbyte/secrets/google_colab.py | 36 ++++ airbyte/secrets/google_gsm.py | 168 ++++++++++++++++ airbyte/secrets/prompt.py | 24 +++ airbyte/secrets/util.py | 85 ++++++++ 10 files changed, 653 insertions(+), 339 deletions(-) delete mode 100644 airbyte/secrets.py create mode 100644 airbyte/secrets/__init__.py create mode 100644 airbyte/secrets/base.py create mode 100644 airbyte/secrets/config.py create mode 100644 airbyte/secrets/env_vars.py create mode 100644 airbyte/secrets/google_colab.py create mode 100644 airbyte/secrets/google_gsm.py create mode 100644 airbyte/secrets/prompt.py create mode 100644 airbyte/secrets/util.py diff --git a/.gitignore b/.gitignore index e15da207..5a5136a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ -# Directories and subdirectories called 'secrets' or '.secrets' +# Directories and subdirectories called '.secrets' and the top-level '/secrets' directory .secrets -secrets +/secrets # Virtual Environments .venv diff --git a/airbyte/secrets.py b/airbyte/secrets.py deleted file mode 100644 index bb088852..00000000 --- a/airbyte/secrets.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -"""Secrets management for PyAirbyte.""" -from __future__ import annotations - -import contextlib -import os -import warnings -from abc import ABC, abstractmethod -from enum import Enum -from getpass import getpass -from typing import Any, cast - -from dotenv import dotenv_values - -from airbyte import exceptions as exc -from airbyte._util import meta - - -try: - from google.colab import userdata as colab_userdata -except ImportError: - colab_userdata = None - - -class SecretSourceEnum(str, Enum): - ENV = "env" - DOTENV = "dotenv" - GOOGLE_COLAB = "google_colab" - - PROMPT = "prompt" - - -_SECRETS_SOURCES: list[SecretManager] = [] - - -class SecretString(str): - """A string that represents a secret. - - This class is used to mark a string as a secret. When a secret is printed, it - will be masked to prevent accidental exposure of sensitive information. - """ - - __slots__ = () - - def __repr__(self) -> str: - return "" - - -class SecretManager(ABC): - """Abstract base class for secret managers. - - Secret managers are used to retrieve secrets from a secret store. - - By registering a secret manager, PyAirbyte can automatically locate and - retrieve secrets from the secret store when needed. This allows you to - securely store and access sensitive information such as API keys, passwords, - and other credentials without hardcoding them in your code. - - To create a custom secret manager, subclass this class and implement the - `get_secret` method. 
By default, the secret manager will be automatically - registered as a global secret source, but will not replace any existing - secret sources. To customize this behavior, override the `auto_register` and - `replace_existing` attributes in your subclass as needed. - - Note: Registered secrets managers always have priority over the default - secret sources such as environment variables, dotenv files, and Google Colab - secrets. If multiple secret managers are registered, the last one registered - will take priority. - """ - - replace_existing = False - as_backup = False - - def __init__(self) -> None: - """Instantiate the new secret manager.""" - if not hasattr(self, "name"): - # Default to the class name if no name is provided - self.name: str = self.__class__.__name__ - - @abstractmethod - def get_secret(self, secret_name: str) -> SecretString | None: - """Get a named secret from the secret manager. - - This method should be implemented by subclasses to retrieve secrets from - the secret store. If the secret is not found, the method should return `None`. - """ - ... - - def __str__(self) -> str: - return self.name - - def __eq__(self, value: object) -> bool: - if isinstance(value, SecretManager): - return self.name == value.name - - if isinstance(value, str): - return self.name == value - - if isinstance(value, SecretSourceEnum): - return self.name == str(value) - - return super().__eq__(value) - - -class EnvVarSecretManager(SecretManager): - """Secret manager that retrieves secrets from environment variables.""" - - name = SecretSourceEnum.ENV.value - - def get_secret(self, secret_name: str) -> SecretString | None: - """Get a named secret from the environment.""" - if secret_name not in os.environ: - return None - - return SecretString(os.environ[secret_name]) - - -class DotenvSecretManager(SecretManager): - """Secret manager that retrieves secrets from a `.env` file.""" - - name = SecretSourceEnum.DOTENV.value - - def get_secret(self, secret_name: str) -> SecretString | None: - """Get a named secret from the `.env` file.""" - try: - dotenv_vars: dict[str, str | None] = dotenv_values() - except Exception: - # Can't locate or parse a .env file - return None - - if secret_name not in dotenv_vars: - # Secret not found - return None - - return SecretString(dotenv_vars[secret_name]) - - -class ColabSecretManager(SecretManager): - """Secret manager that retrieves secrets from Google Colab user secrets.""" - - name = SecretSourceEnum.GOOGLE_COLAB.value - - def get_secret(self, secret_name: str) -> SecretString | None: - """Get a named secret from Google Colab user secrets.""" - if colab_userdata is None: - # The module doesn't exist. We probably aren't in Colab. - return None - - try: - return SecretString(colab_userdata.get(secret_name)) - except Exception: - # Secret name not found. Continue. - return None - - -class SecretsPrompt(SecretManager): - """Secret manager that prompts the user to enter a secret.""" - - name = SecretSourceEnum.PROMPT.value - - def get_secret( - self, - secret_name: str, - ) -> SecretString | None: - with contextlib.suppress(Exception): - return SecretString(getpass(f"Enter the value for secret '{secret_name}': ")) - - return None - - -class CustomSecretManager(SecretManager, ABC): - """Custom secret manager that retrieves secrets from a custom source. - - This class is a convenience class that can be used to create custom secret - managers. By default, custom secrets managers are auto-registered during - creation. 
- """ - - auto_register = True - replace_existing = False - as_backup = False - - def __init__(self) -> None: - super().__init__() - if self.auto_register: - self.register() - - def register(self, *, replace_existing: bool | None = None) -> None: - """Register the secret manager as global secret source. - - This makes the secret manager available to the `get_secret` function and - allows it to be used automatically as a source for secrets. - - If `replace_existing` is `True`, the secret manager will replace all existing - secrets sources, including the default secret managers such as environment - variables, dotenv files, and Google Colab secrets. If `replace_existing` is - None or not provided, the default behavior will be used from the `replace_existing` - of the class (`False` unless overridden by the subclass). - """ - if replace_existing is None: - replace_existing = self.replace_existing - - if replace_existing: - _SECRETS_SOURCES.clear() - - if self.as_backup: - # Add to end of list - _SECRETS_SOURCES.append(self) - else: - # Add to beginning of list - _SECRETS_SOURCES.insert(0, self) - - -def _get_secret_sources() -> list[SecretManager]: - """Initialize the default secret sources.""" - if len(_SECRETS_SOURCES) == 0: - # Initialize the default secret sources - _SECRETS_SOURCES.extend( - [ - EnvVarSecretManager(), - DotenvSecretManager(), - ] - ) - if meta.is_colab(): - _SECRETS_SOURCES.append(ColabSecretManager()) - - if meta.is_interactive(): - _SECRETS_SOURCES.append(SecretsPrompt()) - - return _SECRETS_SOURCES.copy() - - -# Ensure the default secret sources are initialized -_ = _get_secret_sources() - - -def register_secret_manager(secret_manager: CustomSecretManager) -> None: - """Register a custom secret manager.""" - secret_manager.register() - - -def disable_secret_source(source: SecretManager | SecretSourceEnum) -> None: - """Disable one of the default secrets sources. - - This function can accept either a `SecretManager` instance, a `SecretSourceEnum` enum value, or - a string representing the name of the source to disable. - """ - if isinstance(source, SecretManager) and source in _SECRETS_SOURCES: - _SECRETS_SOURCES.remove(source) - return - - # Else, remove by name - for s in _SECRETS_SOURCES: - if s.name == str(source): - _SECRETS_SOURCES.remove(s) - - -def get_secret( - secret_name: str, - /, - *, - sources: list[SecretManager | SecretSourceEnum] | None = None, - allow_prompt: bool = True, - **kwargs: dict[str, Any], -) -> SecretString: - """Get a secret from the environment. - - The optional `sources` argument of enum type `SecretSourceEnum` or list of `SecretSourceEnum` - options. If left blank, all available sources will be checked. If a list of `SecretSourceEnum` - entries is passed, then the sources will be checked using the provided ordering. - - If `allow_prompt` is `True` or if SecretSourceEnum.PROMPT is declared in the `source` arg, then - the user will be prompted to enter the secret if it is not found in any of the other sources. - """ - if "source" in kwargs: - warnings.warn( - message="The `source` argument is deprecated. Use the `sources` argument instead.", - category=DeprecationWarning, - stacklevel=2, - ) - sources = kwargs.pop("source") - - available_sources: dict[str, SecretManager] = {} - for available_source in _get_secret_sources(): - # Add available sources to the dict. Order matters. - available_sources[available_source.name] = available_source - - if sources is None: - # If ANY is in the list, then we don't need to check any other sources. 
- # This is the default behavior. - sources = list(available_sources.values()) - - elif not isinstance(sources, list): - sources = [sources] - - # Replace any SecretSourceEnum strings with the matching SecretManager object - for source in sources: - if isinstance(source, SecretSourceEnum): - if source not in available_sources: - raise exc.PyAirbyteInputError( - guidance="Invalid secret source name.", - input_value=source, - context={ - "Available Sources": list(available_sources.keys()), - }, - ) - - sources[sources.index(source)] = available_sources[source] - - secret_managers = cast(list[SecretManager], sources) - - if SecretSourceEnum.PROMPT in secret_managers: - prompt_source = secret_managers.pop( - secret_managers.index(SecretSourceEnum.PROMPT), - ) - - if allow_prompt: - # Always check prompt last. Add it to the end of the list. - secret_managers.append(prompt_source) - - for secret_mgr in secret_managers: - val = secret_mgr.get_secret(secret_name) - if val: - return SecretString(val) - - raise exc.PyAirbyteSecretNotFoundError( - secret_name=secret_name, - sources=[str(s) for s in available_sources], - ) - - -__all__ = [ - "get_secret", - "SecretSourceEnum", - "SecretManager", - "CustomSecretManager", -] diff --git a/airbyte/secrets/__init__.py b/airbyte/secrets/__init__.py new file mode 100644 index 00000000..b3861e4e --- /dev/null +++ b/airbyte/secrets/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Secrets management for PyAirbyte.""" + +from __future__ import annotations + +from airbyte.secrets.base import SecretHandle, SecretSourceEnum, SecretString +from airbyte.secrets.config import disable_secret_source, register_secret_manager +from airbyte.secrets.env_vars import DotenvSecretManager, EnvVarSecretManager +from airbyte.secrets.google_colab import ColabSecretManager +from airbyte.secrets.google_gsm import GoogleGSMSecretManager +from airbyte.secrets.prompt import SecretsPrompt +from airbyte.secrets.util import get_secret + + +__all__ = [ + # Secret Access + "get_secret", + # Secret Classes + "SecretHandle", + "SecretString", + "SecretSourceEnum", + # Secret Managers + "ColabSecretManager", + "CustomSecretManager", + "DotenvSecretManager", + "EnvVarSecretManager", + "GoogleGSMSecretManager", + "SecretManager", + "SecretsPrompt", + # Registration Functions` + "register_secret_manager", + "disable_secret_source", +] diff --git a/airbyte/secrets/base.py b/airbyte/secrets/base.py new file mode 100644 index 00000000..54a1f839 --- /dev/null +++ b/airbyte/secrets/base.py @@ -0,0 +1,187 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""___""" + +from __future__ import annotations + +import json +from abc import ABC, abstractmethod +from enum import Enum +from typing import cast + +from airbyte import exceptions as exc +from airbyte.secrets.config import clear_secret_sources, register_secret_manager + + +class SecretSourceEnum(str, Enum): + ENV = "env" + DOTENV = "dotenv" + GOOGLE_COLAB = "google_colab" + GOOGLE_GSM = "google_gsm" # Not enabled by default + + PROMPT = "prompt" + + +class SecretString(str): + """A string that represents a secret. + + This class is used to mark a string as a secret. When a secret is printed, it + will be masked to prevent accidental exposure of sensitive information. 
+ """ + + __slots__ = () + + def __repr__(self) -> str: + return "" + + def is_json(self) -> bool: + """Check if the secret string is a valid JSON string.""" + try: + json.loads(self) + except (json.JSONDecodeError, Exception): + return False + + return True + + def parse_json(self) -> dict: + """Parse the secret string as JSON.""" + try: + return json.loads(self) + except json.JSONDecodeError as ex: + raise exc.PyAirbyteInputError( + message="Failed to parse secret as JSON.", + context={ + "Message": ex.msg, + "Position": ex.pos, + "SecretString_Length": len(self), # Debug secret blank or an unexpected format. + }, + ) from None + + +class SecretManager(ABC): + """Abstract base class for secret managers. + + Secret managers are used to retrieve secrets from a secret store. + + By registering a secret manager, PyAirbyte can automatically locate and + retrieve secrets from the secret store when needed. This allows you to + securely store and access sensitive information such as API keys, passwords, + and other credentials without hardcoding them in your code. + + To create a custom secret manager, subclass this class and implement the + `get_secret` method. By default, the secret manager will be automatically + registered as a global secret source, but will not replace any existing + secret sources. To customize this behavior, override the `auto_register` and + `replace_existing` attributes in your subclass as needed. + + Note: Registered secrets managers always have priority over the default + secret sources such as environment variables, dotenv files, and Google Colab + secrets. If multiple secret managers are registered, the last one registered + will take priority. + """ + + replace_existing = False + as_backup = False + + def __init__(self) -> None: + """Instantiate the new secret manager.""" + if not hasattr(self, "name"): + # Default to the class name if no name is provided + self.name: str = self.__class__.__name__ + + @abstractmethod + def get_secret(self, secret_name: str) -> SecretString | None: + """Get a named secret from the secret manager. + + This method should be implemented by subclasses to retrieve secrets from + the secret store. If the secret is not found, the method should return `None`. + """ + ... + + def __str__(self) -> str: + return self.name + + def __eq__(self, value: object) -> bool: + if isinstance(value, SecretManager): + return self.name == value.name + + if isinstance(value, str): + return self.name == value + + if isinstance(value, SecretSourceEnum): + return self.name == str(value) + + return super().__eq__(value) + + +class SecretHandle: + """A handle for a secret in a secret manager. + + This class is used to store a reference to a secret in a secret manager. + The secret is not retrieved until the `get` method is called on the handle. + """ + + def __init__( + self, + parent: SecretManager, + secret_name: str, + ) -> None: + """Instantiate a new secret handle.""" + self.parent = parent + self.secret_name = secret_name + + def get_value(self) -> SecretString: + """Get the secret from the secret manager. + + Subclasses can optionally override this method to provide a more optimized code path. + """ + return cast(SecretString, self.parent.get_secret(self.secret_name)) + + +class CustomSecretManager(SecretManager, ABC): + """Custom secret manager that retrieves secrets from a custom source. + + This class is a convenience class that can be used to create custom secret + managers. By default, custom secrets managers are auto-registered during + creation. 
+ """ + + auto_register = True + replace_existing = False + as_backup = False + + def __init__(self) -> None: + super().__init__() + if self.auto_register: + self.register() + + def register( + self, + *, + replace_existing: bool | None = None, + as_backup: bool | None = None, + ) -> None: + """Register the secret manager as global secret source. + + This makes the secret manager available to the `get_secret` function and + allows it to be used automatically as a source for secrets. + + If `replace_existing` is `True`, the secret manager will replace all existing + secrets sources, including the default secret managers such as environment + variables, dotenv files, and Google Colab secrets. If `replace_existing` is + None or not provided, the default behavior will be used from the `replace_existing` + of the class (`False` unless overridden by the subclass). + """ + if replace_existing is None: + replace_existing = self.replace_existing + + if as_backup is None: + as_backup = self.as_backup + + if replace_existing: + clear_secret_sources() + + register_secret_manager( + self, + as_backup=as_backup, + replace_existing=replace_existing, + ) diff --git a/airbyte/secrets/config.py b/airbyte/secrets/config.py new file mode 100644 index 00000000..078adc5a --- /dev/null +++ b/airbyte/secrets/config.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""___""" + +from __future__ import annotations + +from airbyte._util import meta +from airbyte.secrets.base import CustomSecretManager, SecretManager, SecretSourceEnum +from airbyte.secrets.env_vars import DotenvSecretManager, EnvVarSecretManager +from airbyte.secrets.google_colab import ColabSecretManager +from airbyte.secrets.prompt import SecretsPrompt + + +_SECRETS_SOURCES: list[SecretManager] = [] + + +def _get_secret_sources() -> list[SecretManager]: + """Initialize the default secret sources.""" + if len(_SECRETS_SOURCES) == 0: + # Initialize the default secret sources + _SECRETS_SOURCES.extend( + [ + EnvVarSecretManager(), + DotenvSecretManager(), + ] + ) + if meta.is_colab(): + _SECRETS_SOURCES.append(ColabSecretManager()) + + if meta.is_interactive(): + _SECRETS_SOURCES.append(SecretsPrompt()) + + return _SECRETS_SOURCES.copy() + + +# Ensure the default secret sources are initialized +_ = _get_secret_sources() + + +def register_secret_manager( + secret_manager: CustomSecretManager, + *, + as_backup: bool = False, + replace_existing: bool = False, +) -> None: + """Register a custom secret manager.""" + if replace_existing: + clear_secret_sources() + + if as_backup: + # Add to end of list + _SECRETS_SOURCES.append(secret_manager) + else: + # Add to beginning of list + _SECRETS_SOURCES.insert(0, secret_manager) + + +def clear_secret_sources() -> None: + """Clear all secret sources.""" + _SECRETS_SOURCES.clear() + + +def disable_secret_source(source: SecretManager | SecretSourceEnum) -> None: + """Disable one of the default secrets sources. + + This function can accept either a `SecretManager` instance, a `SecretSourceEnum` enum value, or + a string representing the name of the source to disable. 
+ """ + if isinstance(source, SecretManager) and source in _SECRETS_SOURCES: + _SECRETS_SOURCES.remove(source) + return + + # Else, remove by name + for s in _SECRETS_SOURCES: + if s.name == str(source): + _SECRETS_SOURCES.remove(s) diff --git a/airbyte/secrets/env_vars.py b/airbyte/secrets/env_vars.py new file mode 100644 index 00000000..5a12135b --- /dev/null +++ b/airbyte/secrets/env_vars.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""___""" + +from __future__ import annotations + +import os + +from dotenv import dotenv_values + +from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString + + +class EnvVarSecretManager(SecretManager): + """Secret manager that retrieves secrets from environment variables.""" + + name = SecretSourceEnum.ENV.value + + def get_secret(self, secret_name: str) -> SecretString | None: + """Get a named secret from the environment.""" + if secret_name not in os.environ: + return None + + return SecretString(os.environ[secret_name]) + + +class DotenvSecretManager(SecretManager): + """Secret manager that retrieves secrets from a `.env` file.""" + + name = SecretSourceEnum.DOTENV.value + + def get_secret(self, secret_name: str) -> SecretString | None: + """Get a named secret from the `.env` file.""" + try: + dotenv_vars: dict[str, str | None] = dotenv_values() + except Exception: + # Can't locate or parse a .env file + return None + + if secret_name not in dotenv_vars: + # Secret not found + return None + + return SecretString(dotenv_vars[secret_name]) diff --git a/airbyte/secrets/google_colab.py b/airbyte/secrets/google_colab.py new file mode 100644 index 00000000..49b46097 --- /dev/null +++ b/airbyte/secrets/google_colab.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Secrets manager for Google Colab user secrets.""" + +from __future__ import annotations + +from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString + + +class ColabSecretManager(SecretManager): + """Secret manager that retrieves secrets from Google Colab user secrets.""" + + name = SecretSourceEnum.GOOGLE_COLAB.value + + def __init__(self) -> None: + try: + from google.colab import ( # pyright: ignore[reportMissingImports] + userdata as colab_userdata, + ) + + self.colab_userdata = colab_userdata + except ImportError: + self.colab_userdata = None + + super().__init__() + + def get_secret(self, secret_name: str) -> SecretString | None: + """Get a named secret from Google Colab user secrets.""" + if self.colab_userdata is None: + # The module doesn't exist. We probably aren't in Colab. + return None + + try: + return SecretString(self.colab_userdata.get(secret_name)) + except Exception: + # Secret name not found. Continue. + return None diff --git a/airbyte/secrets/google_gsm.py b/airbyte/secrets/google_gsm.py new file mode 100644 index 00000000..c155285d --- /dev/null +++ b/airbyte/secrets/google_gsm.py @@ -0,0 +1,168 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+"""Secret manager that retrieves secrets from Google Secrets Manager (GSM)."""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from google.cloud import secretmanager_v1 as secretmanager
+
+from airbyte import exceptions as exc
+from airbyte.secrets.base import CustomSecretManager, SecretHandle, SecretSourceEnum, SecretString
+
+
+if TYPE_CHECKING:
+ from collections.abc import Iterable
+
+ from google.cloud.secretmanager_v1.services.secret_manager_service.pagers import (
+ ListSecretsPager,
+ )
+
+
+class GoogleGSMSecretManager(CustomSecretManager):
+ """Secret manager that retrieves secrets from Google Secrets Manager (GSM).
+
+ This secret manager is not enabled by default. To use it, you must provide the project ID and
+ the credentials for a service account with the necessary permissions to access the secrets.
+ """
+
+ name = SecretSourceEnum.GOOGLE_GSM.value
+ auto_register = False
+ as_backup = False
+ replace_existing = False
+
+ CONNECTOR_LABEL = "connector"
+ """The label key used to filter secrets by connector name."""
+
+ def __init__(
+ self,
+ project: str,
+ credentials_path: str | None,
+ credentials_json: str | SecretString | None,
+ *,
+ auto_register: bool = False,
+ as_backup: bool = False,
+ ) -> None:
+ """Instantiate a new Google GSM secret manager instance.
+
+ You can provide either the path to the credentials file or the JSON contents of the
+ credentials file. If both are provided, a `PyAirbyteInputError` will be raised.
+ """
+ if credentials_path and credentials_json:
+ raise exc.PyAirbyteInputError(
+ guidance=("You can provide `credentials_path` or `credentials_json` but not both."),
+ )
+
+ self.project = project
+
+ if credentials_json is not None and not isinstance(credentials_json, SecretString):
+ credentials_json = SecretString(credentials_json)
+
+ if not credentials_json and not credentials_path:
+ if "GOOGLE_APPLICATION_CREDENTIALS" in os.environ:
+ credentials_path = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
+
+ elif "GCP_GSM_CREDENTIALS" in os.environ:
+ credentials_json = SecretString(os.environ["GCP_GSM_CREDENTIALS"])
+
+ if credentials_path:
+ credentials_json = SecretString(Path(credentials_path).read_text())
+
+ if not credentials_json:
+ raise exc.PyAirbyteInputError(
+ guidance=(
+ "No Google Cloud credentials found. You can provide the path to the credentials "
+ "file using the `credentials_path` argument, or provide the JSON contents of the "
+ "credentials file using the `credentials_json` argument."
+ ),
+ )
+
+ self.secret_client = secretmanager.SecretManagerServiceClient.from_service_account_info(
+ json.loads(credentials_json)
+ )
+
+ if auto_register:
+ self.auto_register = auto_register
+
+ if as_backup:
+ self.as_backup = as_backup
+
+ super().__init__() # Handles the registration if needed
+
+ def get_secret(self, secret_name: str) -> SecretString | None:
+ """Get a named secret from Google Colab user secrets."""
+ return self.secret_client.access_secret_version(
+ name=f"projects/{self.project}/secrets/{secret_name}/versions/latest"
+ ).payload.data.decode("UTF-8")
+
+ def fetch_secrets(
+ self,
+ filter_string: str,
+ ) -> Iterable[SecretHandle]:
+ """List all available secrets in the secret manager.
+
+ Example filter strings:
+ - `labels.connector=source-bigquery`: Filter for secrets with the label 'source-bigquery'. 
+
+ Args:
+ filter_string (str): A filter string to apply to the list of secrets, following the
+ format described in the Google Secret Manager documentation:
+ https://cloud.google.com/secret-manager/docs/filtering
+
+ Returns:
+ Iterable[SecretHandle]: An iterable of `SecretHandle` objects for the matching secrets.
+ """
+ gsm_secrets: ListSecretsPager = self.secret_client.list_secrets(
+ secretmanager.ListSecretsRequest(
+ request={
+ "filter": filter_string,
+ }
+ )
+ )
+
+ return [
+ SecretHandle(
+ parent=self,
+ secret_name=secret.name,
+ )
+ for secret in gsm_secrets
+ ]
+
+ def fetch_secrets_by_label(
+ self,
+ label_key: str,
+ label_value: str,
+ ) -> Iterable[SecretHandle]:
+ """List all available secrets in the secret manager.
+
+ Args:
+ label_key (str): The key of the label to filter by.
+ label_value (str): The value of the label to filter by.
+
+ Returns:
+ Iterable[SecretHandle]: An iterable of `SecretHandle` objects for the matching secrets.
+ """
+ return self.fetch_secrets(f"labels.{label_key}={label_value}")
+
+ def fetch_connector_secrets(
+ self,
+ connector_name: str,
+ ) -> Iterable[SecretHandle]:
+ """Fetch secrets in the secret manager, using the connector name as a filter for the label.
+
+ The label key used to filter the secrets is defined by the `CONNECTOR_LABEL` attribute,
+ which defaults to 'connector'.
+
+ Args:
+ connector_name (str): The name of the connector to filter by.
+
+ Returns:
+ Iterable[SecretHandle]: An iterable of `SecretHandle` objects for the matching secrets.
+ """
+ return self.fetch_secrets_by_label(
+ label_key=self.CONNECTOR_LABEL,
+ label_value=connector_name,
+ )
diff --git a/airbyte/secrets/prompt.py b/airbyte/secrets/prompt.py
new file mode 100644
index 00000000..01a0da9d
--- /dev/null
+++ b/airbyte/secrets/prompt.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+"""___"""
+
+from __future__ import annotations
+
+import contextlib
+from getpass import getpass
+
+from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString
+
+
+class SecretsPrompt(SecretManager):
+ """Secret manager that prompts the user to enter a secret."""
+
+ name = SecretSourceEnum.PROMPT.value
+
+ def get_secret(
+ self,
+ secret_name: str,
+ ) -> SecretString | None:
+ with contextlib.suppress(Exception):
+ return SecretString(getpass(f"Enter the value for secret '{secret_name}': "))
+
+ return None
diff --git a/airbyte/secrets/util.py b/airbyte/secrets/util.py
new file mode 100644
index 00000000..4637abec
--- /dev/null
+++ b/airbyte/secrets/util.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+"""___"""
+
+from __future__ import annotations
+
+import warnings
+from typing import Any, cast
+
+from airbyte import exceptions as exc
+from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString
+from airbyte.secrets.config import _get_secret_sources
+
+
+def get_secret(
+ secret_name: str,
+ /,
+ *,
+ sources: list[SecretManager | SecretSourceEnum] | None = None,
+ allow_prompt: bool = True,
+ **kwargs: dict[str, Any],
+) -> SecretString:
+ """Get a secret from the environment.
+
+ The optional `sources` argument accepts a `SecretSourceEnum` or a list of `SecretSourceEnum`
+ options. If left blank, all available sources will be checked. If a list of `SecretSourceEnum`
+ entries is passed, then the sources will be checked using the provided ordering. 
+ + If `allow_prompt` is `True` or if SecretSourceEnum.PROMPT is declared in the `source` arg, then + the user will be prompted to enter the secret if it is not found in any of the other sources. + """ + if "source" in kwargs: + warnings.warn( + message="The `source` argument is deprecated. Use the `sources` argument instead.", + category=DeprecationWarning, + stacklevel=2, + ) + sources = kwargs.pop("source") + + available_sources: dict[str, SecretManager] = {} + for available_source in _get_secret_sources(): + # Add available sources to the dict. Order matters. + available_sources[available_source.name] = available_source + + if sources is None: + # If ANY is in the list, then we don't need to check any other sources. + # This is the default behavior. + sources = list(available_sources.values()) + + elif not isinstance(sources, list): + sources = [sources] + + # Replace any SecretSourceEnum strings with the matching SecretManager object + for source in sources: + if isinstance(source, SecretSourceEnum): + if source not in available_sources: + raise exc.PyAirbyteInputError( + guidance="Invalid secret source name.", + input_value=source, + context={ + "Available Sources": list(available_sources.keys()), + }, + ) + + sources[sources.index(source)] = available_sources[source] + + secret_managers = cast(list[SecretManager], sources) + + if SecretSourceEnum.PROMPT in secret_managers: + prompt_source = secret_managers.pop( + secret_managers.index(SecretSourceEnum.PROMPT), + ) + + if allow_prompt: + # Always check prompt last. Add it to the end of the list. + secret_managers.append(prompt_source) + + for secret_mgr in secret_managers: + val = secret_mgr.get_secret(secret_name) + if val: + return SecretString(val) + + raise exc.PyAirbyteSecretNotFoundError( + secret_name=secret_name, + sources=[str(s) for s in available_sources], + ) From 7c982928d023bf0ba4bc54080c12e921d36ea2fe Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 09:11:12 -0700 Subject: [PATCH 076/118] refactor: integration tests, remove get_ci_secret() functions --- airbyte/_util/google_secrets.py | 14 ----- tests/integration_tests/conftest.py | 80 +++++++++++------------------ 2 files changed, 31 insertions(+), 63 deletions(-) diff --git a/airbyte/_util/google_secrets.py b/airbyte/_util/google_secrets.py index 7ff426dc..184e48cc 100644 --- a/airbyte/_util/google_secrets.py +++ b/airbyte/_util/google_secrets.py @@ -12,20 +12,6 @@ def get_gcp_secret( project_name: str, secret_name: str, ) -> str: - """Try to get a GCP secret from the environment, or raise an error. - - We assume that the Google service account credentials file contents are stored in the - environment variable GCP_GSM_CREDENTIALS. If this environment variable is not set, we raise an - error. Otherwise, we use the Google Secret Manager API to fetch the secret with the given name. - """ - if "GCP_GSM_CREDENTIALS" not in os.environ: - raise EnvironmentError( # noqa: TRY003, UP024 - "GCP_GSM_CREDENTIALS env variable not set, can't fetch secrets. 
Make sure they are set " - "up as described: " - "https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/ci_credentials/" - "README.md#get-gsm-access" - ) - # load secrets from GSM using the GCP_GSM_CREDENTIALS env variable secret_client = secretmanager.SecretManagerServiceClient.from_service_account_info( json.loads(os.environ["GCP_GSM_CREDENTIALS"]) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 55853abc..2398b650 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -3,7 +3,6 @@ from __future__ import annotations from contextlib import suppress -import json import os import pytest @@ -11,12 +10,11 @@ from sqlalchemy import create_engine from airbyte._util import meta -from airbyte._util.google_secrets import get_gcp_secret from airbyte.caches.base import CacheBase from airbyte.caches.bigquery import BigQueryCache from airbyte.caches.motherduck import MotherDuckCache from airbyte.caches.snowflake import SnowflakeCache -from airbyte.secrets import CustomSecretManager +from airbyte.secrets import CustomSecretManager, GoogleGSMSecretManager, SecretHandle from airbyte._util.temp_files import as_temp_files import airbyte as ab @@ -24,48 +22,31 @@ AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" -def get_ci_secret( - secret_name, - project_name: str = AIRBYTE_INTERNAL_GCP_PROJECT, -) -> str: - return get_gcp_secret(project_name=project_name, secret_name=secret_name) - - -def get_ci_secret_json( - secret_name, - project_name: str = AIRBYTE_INTERNAL_GCP_PROJECT, -) -> dict: - return json.loads(get_ci_secret(secret_name=secret_name, project_name=project_name)) +@pytest.mark.requires_creds +@pytest.fixture +def ci_secret_manager() -> GoogleGSMSecretManager: + return GoogleGSMSecretManager( + project_name=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), + ) def get_connector_config(self, connector_name: str, index: int = 0) -> dict | None: - # Import here because `airbyte_ci` may not be available in all environments: - from ci_credentials import RemoteSecret, get_connector_secrets - - assert connector_name is not None and connector_name != "all", \ - "We can only retrieve one connector config at a time." - + """Retrieve the connector configuration from GSM.""" gcp_gsm_credentials = ab.get_secret("GCP_GSM_CREDENTIALS") - secrets: list[RemoteSecret] = [] - secrets, _ = get_connector_secrets( - connector_name=connector_name, - gcp_gsm_credentials=gcp_gsm_credentials, - disable_masking=True, + gsm_secrets_manager = GoogleGSMSecretManager( + project_name=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), ) + first_secret: SecretHandle = next(gsm_secrets_manager.fetch_secrets( + # https://cloud.google.com/secret-manager/docs/filtering + filter_string=f"labels.connector={connector_name}" + ), None) - if len(secrets) > 1: - print( - f"Found {len(secrets)} secrets for connector '{connector_name}'." - ) - else: - print( - f"Found '{connector_name}' credentials." - ) - - if index >= len(secrets): - raise IndexError(f"Index {index} is out of range for connector '{connector_name}'.") - - return secrets[index].value_dict + print( + f"Found '{connector_name}' credential secret ${first_secret.secret_name}." 
+ ) + return first_secret.get_value().parse_json() class AirbyteIntegrationTestSecretManager(CustomSecretManager): @@ -141,10 +122,11 @@ def new_motherduck_cache( @pytest.fixture -def new_snowflake_cache(): - secret = get_ci_secret_json( +def new_snowflake_cache(ci_secret_manager: GoogleGSMSecretManager): + secret = ci_secret_manager.get_secret( "AIRBYTE_LIB_SNOWFLAKE_CREDS", - ) + ).parse_json() + config = SnowflakeCache( account=secret["account"], username=secret["username"], @@ -165,10 +147,10 @@ def new_snowflake_cache(): @pytest.fixture @pytest.mark.requires_creds -def new_bigquery_cache(): - dest_bigquery_config = get_ci_secret_json( +def new_bigquery_cache(ci_secret_manager: GoogleGSMSecretManager): + dest_bigquery_config = ci_secret_manager.get_secret( "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" - ) + ).parse_json() dataset_name = f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}" credentials_json = dest_bigquery_config["credentials_json"] @@ -189,12 +171,12 @@ def new_bigquery_cache(): @pytest.mark.requires_creds @pytest.fixture(autouse=True, scope="session") -def bigquery_credentials_file(): - dest_bigquery_config = get_ci_secret_json( +def bigquery_credentials_file(ci_secret_manager: GoogleGSMSecretManager): + dest_bigquery_config = ci_secret_manager.get_secret( secret_name="SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" - ) - credentials_json = dest_bigquery_config["credentials_json"] + ).parse_json() + credentials_json = dest_bigquery_config["credentials_json"] with as_temp_files([credentials_json]) as (credentials_path,): os.environ["BIGQUERY_CREDENTIALS_PATH"] = credentials_path From 35d7bb11af0bb108abbc270193a442a342e53ecb Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 09:26:18 -0700 Subject: [PATCH 077/118] refactor: remove legacy methods --- airbyte/_util/google_secrets.py | 34 ------------------------------- airbyte/secrets/google_gsm.py | 4 ++-- examples/run_bigquery_faker.py | 18 +++++++++++----- examples/run_integ_test_source.py | 15 ++++++++++---- examples/run_snowflake_faker.py | 33 ++++++++++++++++++------------ tests/conftest.py | 1 - 6 files changed, 46 insertions(+), 59 deletions(-) delete mode 100644 airbyte/_util/google_secrets.py diff --git a/airbyte/_util/google_secrets.py b/airbyte/_util/google_secrets.py deleted file mode 100644 index 184e48cc..00000000 --- a/airbyte/_util/google_secrets.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Helpers for accessing Google secrets.""" - -from __future__ import annotations - -import json -import os - -from google.cloud import secretmanager - - -def get_gcp_secret( - project_name: str, - secret_name: str, -) -> str: - # load secrets from GSM using the GCP_GSM_CREDENTIALS env variable - secret_client = secretmanager.SecretManagerServiceClient.from_service_account_info( - json.loads(os.environ["GCP_GSM_CREDENTIALS"]) - ) - return secret_client.access_secret_version( - name=f"projects/{project_name}/secrets/{secret_name}/versions/latest" - ).payload.data.decode("UTF-8") - - -def get_gcp_secret_json( - project_name: str, - secret_name: str, -) -> dict: - """Get a JSON GCP secret and return as a dict. - - We assume that the Google service account credentials file contents are stored in the - environment variable GCP_GSM_CREDENTIALS. If this environment variable is not set, we raise an - error. Otherwise, we use the Google Secret Manager API to fetch the secret with the given name. 
- """ - return json.loads(get_gcp_secret(secret_name, project_name)) diff --git a/airbyte/secrets/google_gsm.py b/airbyte/secrets/google_gsm.py index c155285d..adf21987 100644 --- a/airbyte/secrets/google_gsm.py +++ b/airbyte/secrets/google_gsm.py @@ -40,9 +40,9 @@ class GoogleGSMSecretManager(CustomSecretManager): def __init__( self, project: str, - credentials_path: str | None, - credentials_json: str | SecretString | None, *, + credentials_path: str | None = None, + credentials_json: str | SecretString | None = None, auto_register: bool = False, as_backup: bool = False, ) -> None: diff --git a/examples/run_bigquery_faker.py b/examples/run_bigquery_faker.py index eb1f7139..a763d983 100644 --- a/examples/run_bigquery_faker.py +++ b/examples/run_bigquery_faker.py @@ -9,19 +9,27 @@ import tempfile import warnings +from typing import cast import airbyte as ab -from airbyte._util.google_secrets import get_gcp_secret_json from airbyte.caches.bigquery import BigQueryCache +from airbyte.secrets.base import SecretString +from airbyte.secrets.google_gsm import GoogleGSMSecretManager warnings.filterwarnings("ignore", message="Cannot create BigQuery Storage client") -bigquery_destination_secret = get_gcp_secret_json( - project_name="dataline-integration-testing", - secret_name="SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS", -) +AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" +SECRET_NAME = "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" + +bigquery_destination_secret: dict = cast( + SecretString, + GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), + ).get_secret(SECRET_NAME), +).parse_json() def main() -> None: diff --git a/examples/run_integ_test_source.py b/examples/run_integ_test_source.py index 51fa1de7..3b50f68f 100644 --- a/examples/run_integ_test_source.py +++ b/examples/run_integ_test_source.py @@ -14,10 +14,16 @@ import sys import airbyte as ab -from airbyte._util.google_secrets import get_gcp_secret_json +from airbyte.secrets.google_gsm import GoogleGSMSecretManager -GCP_SECRETS_PROJECT_NAME = "dataline-integration-testing" +AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" +SECRET_NAME = "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" + +secret_mgr = GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), +) def get_secret_name(connector_name: str) -> str: @@ -39,10 +45,11 @@ def main( secret_name: str | None, streams: list[str] | None, ) -> None: - config = get_gcp_secret_json( + secret = secret_mgr.get_secret( secret_name=secret_name, - project_name=GCP_SECRETS_PROJECT_NAME, ) + assert secret is not None, f"Secret {secret_name} not found." 
+ config = secret.parse_json() source = ab.get_source( connector_name, config=config, diff --git a/examples/run_snowflake_faker.py b/examples/run_snowflake_faker.py index 3e5f7b8f..b4047743 100644 --- a/examples/run_snowflake_faker.py +++ b/examples/run_snowflake_faker.py @@ -8,30 +8,37 @@ from __future__ import annotations import airbyte as ab -from airbyte._util.google_secrets import get_gcp_secret_json from airbyte.caches import SnowflakeCache +from airbyte.secrets.google_gsm import GoogleGSMSecretManager -source = ab.get_source( - "source-faker", - config={"count": 10000, "seed": 0, "parallelism": 1, "always_updated": False}, - install_if_missing=True, +AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" +secret_mgr = GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), ) -secret = get_gcp_secret_json( - project_name="dataline-integration-testing", +secret = secret_mgr.get_secret( secret_name="AIRBYTE_LIB_SNOWFLAKE_CREDS", ) +assert secret is not None, "Secret not found." +secret_config = secret.parse_json() + cache = SnowflakeCache( - account=secret["account"], - username=secret["username"], - password=secret["password"], - database=secret["database"], - warehouse=secret["warehouse"], - role=secret["role"], + account=secret_config["account"], + username=secret_config["username"], + password=secret_config["password"], + database=secret_config["database"], + warehouse=secret_config["warehouse"], + role=secret_config["role"], ) +source = ab.get_source( + "source-faker", + config={"count": 10000, "seed": 0, "parallelism": 1, "always_updated": False}, + install_if_missing=True, +) source.check() source.select_streams(["products"]) diff --git a/tests/conftest.py b/tests/conftest.py index 8fcf5702..0d9e2f8e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,7 +14,6 @@ from requests.exceptions import HTTPError import ulid -from airbyte._util.google_secrets import get_gcp_secret from airbyte._util.meta import is_windows from airbyte.caches.base import CacheBase from airbyte.caches.bigquery import BigQueryCache From 9269627e55d83de6402386e7dc61eb92ff070b63 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 09:33:00 -0700 Subject: [PATCH 078/118] refactor(secrets): fix circular refs --- airbyte/secrets/__init__.py | 1 + airbyte/secrets/base.py | 51 ------------------------------ airbyte/secrets/config.py | 9 +++++- airbyte/secrets/custom.py | 59 +++++++++++++++++++++++++++++++++++ airbyte/secrets/google_gsm.py | 3 +- 5 files changed, 70 insertions(+), 53 deletions(-) create mode 100644 airbyte/secrets/custom.py diff --git a/airbyte/secrets/__init__.py b/airbyte/secrets/__init__.py index b3861e4e..a910a72b 100644 --- a/airbyte/secrets/__init__.py +++ b/airbyte/secrets/__init__.py @@ -5,6 +5,7 @@ from airbyte.secrets.base import SecretHandle, SecretSourceEnum, SecretString from airbyte.secrets.config import disable_secret_source, register_secret_manager +from airbyte.secrets.custom import CustomSecretManager from airbyte.secrets.env_vars import DotenvSecretManager, EnvVarSecretManager from airbyte.secrets.google_colab import ColabSecretManager from airbyte.secrets.google_gsm import GoogleGSMSecretManager diff --git a/airbyte/secrets/base.py b/airbyte/secrets/base.py index 54a1f839..485156eb 100644 --- a/airbyte/secrets/base.py +++ b/airbyte/secrets/base.py @@ -9,7 +9,6 @@ from typing import cast from airbyte import exceptions as exc -from airbyte.secrets.config import clear_secret_sources, 
register_secret_manager class SecretSourceEnum(str, Enum): @@ -135,53 +134,3 @@ def get_value(self) -> SecretString: Subclasses can optionally override this method to provide a more optimized code path. """ return cast(SecretString, self.parent.get_secret(self.secret_name)) - - -class CustomSecretManager(SecretManager, ABC): - """Custom secret manager that retrieves secrets from a custom source. - - This class is a convenience class that can be used to create custom secret - managers. By default, custom secrets managers are auto-registered during - creation. - """ - - auto_register = True - replace_existing = False - as_backup = False - - def __init__(self) -> None: - super().__init__() - if self.auto_register: - self.register() - - def register( - self, - *, - replace_existing: bool | None = None, - as_backup: bool | None = None, - ) -> None: - """Register the secret manager as global secret source. - - This makes the secret manager available to the `get_secret` function and - allows it to be used automatically as a source for secrets. - - If `replace_existing` is `True`, the secret manager will replace all existing - secrets sources, including the default secret managers such as environment - variables, dotenv files, and Google Colab secrets. If `replace_existing` is - None or not provided, the default behavior will be used from the `replace_existing` - of the class (`False` unless overridden by the subclass). - """ - if replace_existing is None: - replace_existing = self.replace_existing - - if as_backup is None: - as_backup = self.as_backup - - if replace_existing: - clear_secret_sources() - - register_secret_manager( - self, - as_backup=as_backup, - replace_existing=replace_existing, - ) diff --git a/airbyte/secrets/config.py b/airbyte/secrets/config.py index 078adc5a..dc31a65e 100644 --- a/airbyte/secrets/config.py +++ b/airbyte/secrets/config.py @@ -3,13 +3,20 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from airbyte._util import meta -from airbyte.secrets.base import CustomSecretManager, SecretManager, SecretSourceEnum +from airbyte.secrets.base import SecretManager from airbyte.secrets.env_vars import DotenvSecretManager, EnvVarSecretManager from airbyte.secrets.google_colab import ColabSecretManager from airbyte.secrets.prompt import SecretsPrompt +if TYPE_CHECKING: + from airbyte.secrets.base import SecretSourceEnum + from airbyte.secrets.custom import CustomSecretManager + + _SECRETS_SOURCES: list[SecretManager] = [] diff --git a/airbyte/secrets/custom.py b/airbyte/secrets/custom.py new file mode 100644 index 00000000..1cee062c --- /dev/null +++ b/airbyte/secrets/custom.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""___""" + +from __future__ import annotations + +from abc import ABC + +from airbyte.secrets.base import SecretManager +from airbyte.secrets.config import clear_secret_sources, register_secret_manager + + +class CustomSecretManager(SecretManager, ABC): + """Custom secret manager that retrieves secrets from a custom source. + + This class is a convenience class that can be used to create custom secret + managers. By default, custom secrets managers are auto-registered during + creation. 
+ """ + + auto_register = True + replace_existing = False + as_backup = False + + def __init__(self) -> None: + super().__init__() + if self.auto_register: + self.register() + + def register( + self, + *, + replace_existing: bool | None = None, + as_backup: bool | None = None, + ) -> None: + """Register the secret manager as global secret source. + + This makes the secret manager available to the `get_secret` function and + allows it to be used automatically as a source for secrets. + + If `replace_existing` is `True`, the secret manager will replace all existing + secrets sources, including the default secret managers such as environment + variables, dotenv files, and Google Colab secrets. If `replace_existing` is + None or not provided, the default behavior will be used from the `replace_existing` + of the class (`False` unless overridden by the subclass). + """ + if replace_existing is None: + replace_existing = self.replace_existing + + if as_backup is None: + as_backup = self.as_backup + + if replace_existing: + clear_secret_sources() + + register_secret_manager( + self, + as_backup=as_backup, + replace_existing=replace_existing, + ) diff --git a/airbyte/secrets/google_gsm.py b/airbyte/secrets/google_gsm.py index adf21987..87680384 100644 --- a/airbyte/secrets/google_gsm.py +++ b/airbyte/secrets/google_gsm.py @@ -11,7 +11,8 @@ from google.cloud import secretmanager_v1 as secretmanager from airbyte import exceptions as exc -from airbyte.secrets.base import CustomSecretManager, SecretHandle, SecretSourceEnum, SecretString +from airbyte.secrets.base import SecretHandle, SecretSourceEnum, SecretString +from airbyte.secrets.custom import CustomSecretManager if TYPE_CHECKING: From a01d31c201e1ef8a58d2e94b2ef3eadf568dbcb6 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 09:35:46 -0700 Subject: [PATCH 079/118] fix tests --- airbyte/secrets/google_gsm.py | 8 +++++--- tests/integration_tests/conftest.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/airbyte/secrets/google_gsm.py b/airbyte/secrets/google_gsm.py index 87680384..67dd11a8 100644 --- a/airbyte/secrets/google_gsm.py +++ b/airbyte/secrets/google_gsm.py @@ -95,9 +95,11 @@ def __init__( def get_secret(self, secret_name: str) -> SecretString | None: """Get a named secret from Google Colab user secrets.""" - return self.secret_client.access_secret_version( - name=f"projects/{self.project}/secrets/{secret_name}/versions/latest" - ).payload.data.decode("UTF-8") + return SecretString( + self.secret_client.access_secret_version( + name=f"projects/{self.project}/secrets/{secret_name}/versions/latest" + ).payload.data.decode("UTF-8") + ) def fetch_secrets( self, diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 2398b650..1f95ea18 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -23,10 +23,10 @@ @pytest.mark.requires_creds -@pytest.fixture +@pytest.fixture(scope="session") def ci_secret_manager() -> GoogleGSMSecretManager: return GoogleGSMSecretManager( - project_name=AIRBYTE_INTERNAL_GCP_PROJECT, + project=AIRBYTE_INTERNAL_GCP_PROJECT, credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), ) @@ -35,7 +35,7 @@ def get_connector_config(self, connector_name: str, index: int = 0) -> dict | No """Retrieve the connector configuration from GSM.""" gcp_gsm_credentials = ab.get_secret("GCP_GSM_CREDENTIALS") gsm_secrets_manager = GoogleGSMSecretManager( - project_name=AIRBYTE_INTERNAL_GCP_PROJECT, + 
project=AIRBYTE_INTERNAL_GCP_PROJECT, credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), ) first_secret: SecretHandle = next(gsm_secrets_manager.fetch_secrets( From e57de8cdbb7509fab55e90700168ffb5fda31280 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 09:48:35 -0700 Subject: [PATCH 080/118] fix more tests, add docstring with usage --- airbyte/secrets/google_gsm.py | 40 ++++++++++++++++++- .../cloud/test_cloud_workspaces.py | 2 +- tests/integration_tests/conftest.py | 10 ++--- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/airbyte/secrets/google_gsm.py b/airbyte/secrets/google_gsm.py index 67dd11a8..97b936bc 100644 --- a/airbyte/secrets/google_gsm.py +++ b/airbyte/secrets/google_gsm.py @@ -1,5 +1,43 @@ # Copyright (c) 2024 Airbyte, Inc., all rights reserved. -"""Secret manager that retrieves secrets from Google Secrets Manager (GSM).""" +"""Secret manager that retrieves secrets from Google Secrets Manager (GSM). + +Usage Example: + +```python +gsm_secrets_manager = GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), +) +first_secret: SecretHandle = next( + gsm_secrets_manager.fetch_connector_secrets( + connector_name=connector_name, + ), + None, +) + +print(f"Found '{connector_name}' credential secret '${first_secret.secret_name}'.") +return first_secret.get_value().parse_json() +``` + +More compact example: + +```python +gsm_secrets_manager = GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), +) +connector_config: dict = ( + next( + gsm_secrets_manager.fetch_connector_secrets( + connector_name=connector_name, + ), + None, + ) + .get_value() + .parse_json() +) +``` +""" from __future__ import annotations diff --git a/tests/integration_tests/cloud/test_cloud_workspaces.py b/tests/integration_tests/cloud/test_cloud_workspaces.py index e1a1a697..b7296c45 100644 --- a/tests/integration_tests/cloud/test_cloud_workspaces.py +++ b/tests/integration_tests/cloud/test_cloud_workspaces.py @@ -37,7 +37,7 @@ def test_deploy_cache_as_destination( schema_name="public", ) destination_id: str = cloud_workspace.deploy_cache_as_destination(cache=cache) - cloud_workspace.delete_destination(destination_id=destination_id) + cloud_workspace.delete_destination(destination=destination_id) def test_deploy_connection( diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 1f95ea18..2289b40d 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -33,19 +33,15 @@ def ci_secret_manager() -> GoogleGSMSecretManager: def get_connector_config(self, connector_name: str, index: int = 0) -> dict | None: """Retrieve the connector configuration from GSM.""" - gcp_gsm_credentials = ab.get_secret("GCP_GSM_CREDENTIALS") gsm_secrets_manager = GoogleGSMSecretManager( project=AIRBYTE_INTERNAL_GCP_PROJECT, credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), ) - first_secret: SecretHandle = next(gsm_secrets_manager.fetch_secrets( - # https://cloud.google.com/secret-manager/docs/filtering - filter_string=f"labels.connector={connector_name}" + first_secret: SecretHandle = next(gsm_secrets_manager.fetch_connector_secrets( + connector_name=connector_name, ), None) - print( - f"Found '{connector_name}' credential secret ${first_secret.secret_name}." 
- ) + print(f"Found '{connector_name}' credential secret '${first_secret.secret_name}'.") return first_secret.get_value().parse_json() From aa049b181ff86d4d71887e7eb43d0346acc08383 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 09:49:53 -0700 Subject: [PATCH 081/118] remove `ci_credentials` library and `poetry lock` --- poetry.lock | 61 +------------------------------------------------- pyproject.toml | 6 ----- 2 files changed, 1 insertion(+), 66 deletions(-) diff --git a/poetry.lock b/poetry.lock index c15f1aa1..27473b38 100644 --- a/poetry.lock +++ b/poetry.lock @@ -382,42 +382,6 @@ files = [ {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] -[[package]] -name = "ci-credentials" -version = "1.1.0" -description = "CLI tooling to read and manage GSM secrets" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -click = "^8.1.3" -common_utils = {git = "https://github.com/airbytehq/airbyte.git", subdirectory = "airbyte-ci/connectors/common_utils"} -pyyaml = "^6.0" -requests = "^2.28.2" - -[package.source] -type = "git" -url = "https://github.com/airbytehq/airbyte.git" -reference = "aj/ci_credentials/make-portable-as-library" -resolved_reference = "13ba054ccf14df74d2bb7a07f8ff81f7ee4d2992" -subdirectory = "airbyte-ci/connectors/ci_credentials" - -[[package]] -name = "click" -version = "8.1.7" -description = "Composable command line interface toolkit" -optional = false -python-versions = ">=3.7" -files = [ - {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, - {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - [[package]] name = "colorama" version = "0.4.6" @@ -429,26 +393,6 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -[[package]] -name = "common-utils" -version = "0.0.0" -description = "Suite of all often used classes and common functions" -optional = false -python-versions = "^3.10" -files = [] -develop = false - -[package.dependencies] -pyjwt = "^2.8.0" -requests = "^2.31.0" - -[package.source] -type = "git" -url = "https://github.com/airbytehq/airbyte.git" -reference = "HEAD" -resolved_reference = "27e851c5caf9bfc8d9e555370a0aeedef959c67d" -subdirectory = "airbyte-ci/connectors/common_utils" - [[package]] name = "cryptography" version = "41.0.7" @@ -2923,10 +2867,7 @@ files = [ {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, ] -[extras] -integ-testing = [] - [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "81ff8fcf1213593612fbd388bfc7fab9d3de77e3bc6988df773ab62d8f9b1203" +content-hash = "9a031cf5b629604d4b79ad9aa2e93f86a0cc6eb1a07d3ffaf3c6aff29acc7d21" diff --git a/pyproject.toml b/pyproject.toml index 439b1447..a81c7f70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,12 +63,6 @@ responses = "^0.25.0" airbyte-source-pokeapi = "^0.2.0" pytest-mock = "^3.14.0" -# TODO: Move to 'main' branch dependencies once merged: https://github.com/airbytehq/airbyte/pull/35938 -ci_credentials = { python = "^3.10", git = "https://github.com/airbytehq/airbyte.git", branch = "aj/ci_credentials/make-portable-as-library", 
subdirectory="airbyte-ci/connectors/ci_credentials" } - -[tool.poetry.extras] -integ-testing = ["ci_credentials"] - [build-system] requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"] build-backend = "poetry_dynamic_versioning.backend" From 8779198019db2ebd5c8a882aefbf3e34a2ddbe86 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 11:47:06 -0700 Subject: [PATCH 082/118] lint auto-fixes --- airbyte/cloud/_workspaces.py | 2 -- tests/integration_tests/conftest.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/_workspaces.py index 7e1720f3..6c4ca7dd 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/_workspaces.py @@ -11,7 +11,6 @@ from typing import TYPE_CHECKING from airbyte import exceptions as exc -from airbyte._util import api_util from airbyte._util.api_util import ( CLOUD_API_ROOT, create_connection, @@ -32,7 +31,6 @@ if TYPE_CHECKING: from airbyte_api.models.shared.connectionresponse import ConnectionResponse from airbyte_api.models.shared.destinationresponse import DestinationResponse - from airbyte_api.models.shared.jobresponse import JobResponse from airbyte.caches.base import CacheBase diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 2289b40d..b5fabdda 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -19,6 +19,7 @@ import airbyte as ab + AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" From 38c8a17eed4f0c1abdf5e47d31fb1e13f111b895 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 11:47:46 -0700 Subject: [PATCH 083/118] fix format --- airbyte/secrets/google_gsm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/airbyte/secrets/google_gsm.py b/airbyte/secrets/google_gsm.py index 97b936bc..746fa48f 100644 --- a/airbyte/secrets/google_gsm.py +++ b/airbyte/secrets/google_gsm.py @@ -113,9 +113,9 @@ def __init__( if not credentials_json: raise exc.PyAirbyteInputError( guidance=( - "No Google Cloud credentials found. You can provide the path to the credentials " - "file using the `credentials_path` argument, or provide the JSON contents of the " - "credentials file using the `credentials_json` argument." + "No Google Cloud credentials found. You can provide the path to the " + "credentials file using the `credentials_path` argument, or provide the JSON " + "contents of the credentials file using the `credentials_json` argument." ), ) From 96975c523c2c50b627d0a28532f3a440df016f83 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 11:52:46 -0700 Subject: [PATCH 084/118] fix mypy --- airbyte/secrets/util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/airbyte/secrets/util.py b/airbyte/secrets/util.py index 4637abec..028f3156 100644 --- a/airbyte/secrets/util.py +++ b/airbyte/secrets/util.py @@ -34,7 +34,7 @@ def get_secret( category=DeprecationWarning, stacklevel=2, ) - sources = kwargs.pop("source") + sources = kwargs.pop("source") # type: ignore [assignment] available_sources: dict[str, SecretManager] = {} for available_source in _get_secret_sources(): @@ -47,7 +47,7 @@ def get_secret( sources = list(available_sources.values()) elif not isinstance(sources, list): - sources = [sources] + sources = [sources] # type: ignore [unreachable] # This is a 'just in case' catch. 
# Replace any SecretSourceEnum strings with the matching SecretManager object for source in sources: @@ -67,7 +67,8 @@ def get_secret( if SecretSourceEnum.PROMPT in secret_managers: prompt_source = secret_managers.pop( - secret_managers.index(SecretSourceEnum.PROMPT), + # Mis-typed, but okay here since we have equality logic for the enum comparison: + secret_managers.index(SecretSourceEnum.PROMPT), # type: ignore [arg-type] ) if allow_prompt: From d88f1ee075b6fce5d7f1dcdb2485529c130b1bf8 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 12:59:15 -0700 Subject: [PATCH 085/118] get all integtest secrets from GSM --- airbyte/_util/api_util.py | 8 --- airbyte/secrets/base.py | 6 ++ tests/integration_tests/cloud/conftest.py | 67 ++++++----------------- tests/integration_tests/conftest.py | 52 ++++++++++-------- 4 files changed, 54 insertions(+), 79 deletions(-) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index 1997d66e..d756c12d 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -10,7 +10,6 @@ from __future__ import annotations import json -import os from typing import Any import airbyte_api @@ -30,8 +29,6 @@ JOB_WAIT_TIMEOUT_SECS_DEFAULT = 60 * 60 # 1 hour CLOUD_API_ROOT = "https://api.airbyte.com/v1" -AIRBYTE_API_KEY_SECRET_NAME = "AIRBYTE_CLOUD_API_KEY" - # Helper functions @@ -40,11 +37,6 @@ def status_ok(status_code: int) -> bool: return status_code >= 200 and status_code < 300 # noqa: PLR2004 # allow inline magic numbers -def get_default_bearer_token() -> str | None: - """Get the default bearer token from env variables.""" - return os.environ.get(AIRBYTE_API_KEY_SECRET_NAME, None) - - def get_airbyte_server_instance( *, api_key: str, diff --git a/airbyte/secrets/base.py b/airbyte/secrets/base.py index 485156eb..a399e5ea 100644 --- a/airbyte/secrets/base.py +++ b/airbyte/secrets/base.py @@ -41,6 +41,12 @@ def is_json(self) -> bool: return True + def __bool__(self) -> bool: + """Override the boolean value of the secret string. 
+ + Always returns `True` without inspecting contents.""" + return True + def parse_json(self) -> dict: """Parse the secret string as JSON.""" try: diff --git a/tests/integration_tests/cloud/conftest.py b/tests/integration_tests/cloud/conftest.py index 23b5ba1f..d00915f3 100644 --- a/tests/integration_tests/cloud/conftest.py +++ b/tests/integration_tests/cloud/conftest.py @@ -13,11 +13,14 @@ from airbyte.caches.base import CacheBase from airbyte.cloud import CloudWorkspace from airbyte._util.temp_files import as_temp_files +from airbyte.secrets.base import SecretString +from airbyte.secrets.google_gsm import GoogleGSMSecretManager -ENV_AIRBYTE_API_KEY = "AIRBYTE_CLOUD_API_KEY" -ENV_AIRBYTE_API_WORKSPACE_ID = "AIRBYTE_CLOUD_API_WORKSPACE_ID" -ENV_MOTHERDUCK_API_KEY = "MOTHERDUCK_API_KEY" +AIRBYTE_CLOUD_WORKSPACE_ID = "19d7a891-8e0e-40ac-8a8c-5faf8d11e47c" + +ENV_MOTHERDUCK_API_KEY = "PYAIRBYTE_MOTHERDUCK_API_KEY" +AIRBYTE_CLOUD_API_KEY_SECRET_NAME = "PYAIRBYTE_CLOUD_INTEROP_API_KEY" @pytest.fixture(autouse=True) @@ -33,73 +36,39 @@ def add_venv_bin_to_path(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.fixture def workspace_id() -> str: - return os.environ[ENV_AIRBYTE_API_WORKSPACE_ID] + return AIRBYTE_CLOUD_WORKSPACE_ID @pytest.fixture -def api_root() -> str: +def airbyte_cloud_api_root() -> str: return CLOUD_API_ROOT @pytest.fixture -def api_key() -> str: - dotenv_vars: dict[str, str | None] = dotenv_values() - if ENV_AIRBYTE_API_KEY in dotenv_vars: - return dotenv_vars[ENV_AIRBYTE_API_KEY] - - if ENV_AIRBYTE_API_KEY not in os.environ: - raise ValueError(f"Please set the '{ENV_AIRBYTE_API_KEY}' environment variable.") - - return os.environ[ENV_AIRBYTE_API_KEY] +def airbyte_cloud_api_key(ci_secret_manager: GoogleGSMSecretManager) -> SecretString: + secret: SecretString | None = ci_secret_manager.get_secret(AIRBYTE_CLOUD_API_KEY_SECRET_NAME) + assert secret, f"Secret '{AIRBYTE_CLOUD_API_KEY_SECRET_NAME}' not found." 
+ return secret @pytest.fixture -def motherduck_api_key() -> str: - dotenv_vars: dict[str, str | None] = dotenv_values() - if ENV_MOTHERDUCK_API_KEY in dotenv_vars: - return dotenv_vars[ENV_MOTHERDUCK_API_KEY] - - if ENV_MOTHERDUCK_API_KEY not in os.environ: - raise ValueError(f"Please set the '{ENV_MOTHERDUCK_API_KEY}' environment variable.") - - return os.environ[ENV_MOTHERDUCK_API_KEY] +def motherduck_api_key(motherduck_secrets: dict) -> SecretString: + return SecretString(motherduck_secrets["motherduck_api_key"]) @pytest.fixture def cloud_workspace( workspace_id: str, - api_key: str, - api_root: str, + airbyte_cloud_api_key: SecretString, + airbyte_cloud_api_root: str, ) -> CloudWorkspace: return CloudWorkspace( workspace_id=workspace_id, - api_key=api_key, - api_root=api_root, + api_key=airbyte_cloud_api_key, + api_root=airbyte_cloud_api_root, ) -@pytest.fixture -def workspace_id() -> str: - return os.environ[ENV_AIRBYTE_API_WORKSPACE_ID] - - -@pytest.fixture -def api_root() -> str: - return CLOUD_API_ROOT - - -@pytest.fixture -def api_key() -> str: - dotenv_vars: dict[str, str | None] = dotenv_values() - if ENV_AIRBYTE_API_KEY in dotenv_vars: - return dotenv_vars[ENV_AIRBYTE_API_KEY] - - if ENV_AIRBYTE_API_KEY not in os.environ: - raise ValueError(f"Please set the {ENV_AIRBYTE_API_KEY} environment variable.") - - return os.environ[ENV_AIRBYTE_API_KEY] - - @pytest.fixture(scope="function") def new_deployable_cache(request) -> CacheBase: """This is a placeholder fixture that will be overridden by pytest_generate_tests().""" diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index b5fabdda..ed37991a 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -104,33 +104,40 @@ def airbyte_integration_test_secrets_manager() -> AirbyteIntegrationTestSecretMa return AirbyteIntegrationTestSecretManager() +@pytest.fixture(scope="session") +def motherduck_secrets(ci_secret_manager: GoogleGSMSecretManager) -> dict: + return ci_secret_manager.get_secret( + "SECRET_DESTINATION_DUCKDB__MOTHERDUCK__CREDS", + ).parse_json() + + @pytest.fixture def new_motherduck_cache( - airbyte_integration_test_secrets_manager: AirbyteIntegrationTestSecretManager, + motherduck_secrets, ) -> MotherDuckCache: - config = airbyte_integration_test_secrets_manager.get_connector_config( - connector_name="destination-duckdb", - ) return MotherDuckCache( database="integration_tests_deleteany", schema_name=f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}", - api_key=config["motherduck_api_key"], + api_key=motherduck_secrets["motherduck_api_key"], ) -@pytest.fixture -def new_snowflake_cache(ci_secret_manager: GoogleGSMSecretManager): - secret = ci_secret_manager.get_secret( +@pytest.fixture(scope="session") +def snowflake_creds(ci_secret_manager: GoogleGSMSecretManager) -> dict: + return ci_secret_manager.get_secret( "AIRBYTE_LIB_SNOWFLAKE_CREDS", ).parse_json() + +@pytest.fixture +def new_snowflake_cache(snowflake_creds: dict): config = SnowflakeCache( - account=secret["account"], - username=secret["username"], - password=secret["password"], - database=secret["database"], - warehouse=secret["warehouse"], - role=secret["role"], + account=snowflake_creds["account"], + username=snowflake_creds["username"], + password=snowflake_creds["password"], + database=snowflake_creds["database"], + warehouse=snowflake_creds["warehouse"], + role=snowflake_creds["role"], schema_name=f"test{str(ulid.ULID()).lower()[-6:]}", ) sqlalchemy_url = config.get_sql_alchemy_url() @@ 
-174,7 +181,7 @@ def bigquery_credentials_file(ci_secret_manager: GoogleGSMSecretManager): ).parse_json() credentials_json = dest_bigquery_config["credentials_json"] - with as_temp_files([credentials_json]) as (credentials_path,): + with as_temp_files(files_contents=[credentials_json]) as (credentials_path,): os.environ["BIGQUERY_CREDENTIALS_PATH"] = credentials_path yield @@ -182,13 +189,14 @@ def bigquery_credentials_file(ci_secret_manager: GoogleGSMSecretManager): return -@pytest.fixture(scope="function") -def new_motherduck_cache() -> MotherDuckCache: - return MotherDuckCache( - api_key=ab.get_secret("MOTHERDUCK_API_KEY"), - schema_name=f"test{str(ulid.ULID()).lower()[-6:]}", - database="integration_tests_deleteany", - ) +@pytest.mark.requires_creds +@pytest.fixture(autouse=True, scope="session") +def with_snowflake_password_env_var(snowflake_creds: dict): + os.environ["SNOWFLAKE_PASSWORD"] = snowflake_creds["password"] + + yield + + return @pytest.fixture(scope="function") From 9b373acd7da7a9c523965401fa03df977d4a5384 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 13:04:33 -0700 Subject: [PATCH 086/118] remove bespoke ci secret manager class --- tests/integration_tests/conftest.py | 58 ----------------------------- 1 file changed, 58 deletions(-) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index ed37991a..9684c679 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -46,64 +46,6 @@ def get_connector_config(self, connector_name: str, index: int = 0) -> dict | No return first_secret.get_value().parse_json() -class AirbyteIntegrationTestSecretManager(CustomSecretManager): - """Custom secret manager for Airbyte integration tests. - - This class is used to auto-retrieve needed secrets from GSM. - """ - auto_register = True - replace_existing = False - as_backup = True - - def get_secret( - self, - secret_name: str, - *, - required: bool = False, - ) -> str | None: - """This method attempts to find matching properties within the integration test config. - - If `required` is `True`, this method will raise an exception if the secret is not found. - Otherwise, it will return None. - """ - system_name = secret_name.split("_")[0].lower() - property_name = "_".join(secret_name.split("_")[1:]).lower() - - mapping = { - "snowflake": "destination-snowflake", - "bigquery": "destination-bigquery", - "postgres": "destination-postgres", - "duckdb": "destination-duckdb", - } - if system_name not in mapping: - return None - - connector_name = mapping[system_name] - connector_config = self.get_connector_config(connector_name) - if "credentials" in connector_config: - if property_name in connector_config["credentials"]: - return connector_config["credentials"][property_name] - - if property_name in connector_config: - return connector_config[property_name] - - if not required: - return None - - raise KeyError( - f"Property '{property_name}' not found in '{connector_name}' connector config. 
" - f"\nAvailable config keys: {', '.join(connector_config.keys())} " - f"\nAvailable 'credential' keys: {', '.join(connector_config.get('credentials', {}).keys())} " - ) - - -@pytest.fixture(autouse=True, scope="session") -def airbyte_integration_test_secrets_manager() -> AirbyteIntegrationTestSecretManager: - """Create a new instance of the custom secret manager.""" - - return AirbyteIntegrationTestSecretManager() - - @pytest.fixture(scope="session") def motherduck_secrets(ci_secret_manager: GoogleGSMSecretManager) -> dict: return ci_secret_manager.get_secret( From 56abe462b18eec621e6d1d6a842b6d09eb8996de Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 13:06:13 -0700 Subject: [PATCH 087/118] re-allow ci tests that need creds on 3.9 --- .github/workflows/python_pytest.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index 0f4e1ced..87b0994d 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -113,11 +113,8 @@ jobs: run: poetry install --all-extras # Job-specific step(s): - - name: "Run Pytest ${{ matrix.python-version == '3.9' && '(No Creds)' || '' }}" + - name: "Run Pytest" env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} - # We have to exclude `requires_creds` tests on 3.9, because the `airbyte_ci` package - # is not available for 3.9, and the tests that require creds will fail. run: > poetry run pytest -m "not linting and not super_slow - ${{ matrix.python-version == '3.9' && 'and not requires_creds' || '' }}" From 3decb327093731af8fdc2f20b5e141230557a3b6 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 13:07:20 -0700 Subject: [PATCH 088/118] fix trailing quote --- .github/workflows/python_pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index 87b0994d..ec5b0ef6 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -117,4 +117,4 @@ jobs: env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} run: > - poetry run pytest -m "not linting and not super_slow + poetry run pytest -m "not linting and not super_slow" From 2a86d7ff9d85f8fb145f4929d58442468fb7dda9 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 13:09:17 -0700 Subject: [PATCH 089/118] revert: remove ' --all-extras' flag --- .github/workflows/autofix.yml | 2 +- .github/workflows/fix-pr-command.yml | 2 +- .github/workflows/pydoc_preview.yml | 2 +- .github/workflows/pydoc_publish.yml | 2 +- .github/workflows/python_pytest.yml | 6 +++--- .github/workflows/test-pr-command.yml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/autofix.yml b/.github/workflows/autofix.yml index 0b4976b3..4e9ecd51 100644 --- a/.github/workflows/autofix.yml +++ b/.github/workflows/autofix.yml @@ -25,7 +25,7 @@ jobs: cache: 'poetry' - name: Install dependencies - run: poetry install --all-extras + run: poetry install - name: Format code run: poetry run ruff format . 
diff --git a/.github/workflows/fix-pr-command.yml b/.github/workflows/fix-pr-command.yml index 00206423..712fd852 100644 --- a/.github/workflows/fix-pr-command.yml +++ b/.github/workflows/fix-pr-command.yml @@ -79,7 +79,7 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'poetry' - name: Install dependencies - run: poetry install --all-extras + run: poetry install # Fix any lint or format issues diff --git a/.github/workflows/pydoc_preview.yml b/.github/workflows/pydoc_preview.yml index ee052ecb..8284dfde 100644 --- a/.github/workflows/pydoc_preview.yml +++ b/.github/workflows/pydoc_preview.yml @@ -27,7 +27,7 @@ jobs: cache: 'poetry' - name: Install dependencies - run: poetry install --all-extras + run: poetry install - name: Generate documentation run: | diff --git a/.github/workflows/pydoc_publish.yml b/.github/workflows/pydoc_publish.yml index 9f2df8ec..0d719dbb 100644 --- a/.github/workflows/pydoc_publish.yml +++ b/.github/workflows/pydoc_publish.yml @@ -46,7 +46,7 @@ jobs: uses: actions/configure-pages@v4 - name: Install dependencies - run: poetry install --all-extras + run: poetry install - name: Generate documentation run: | diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index ec5b0ef6..afda9a4f 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -37,7 +37,7 @@ jobs: python-version: '3.10' cache: 'poetry' - name: Install dependencies - run: poetry install --all-extras + run: poetry install # Job-specific step(s): - name: Run Pytest (Fast Tests Only) @@ -65,7 +65,7 @@ jobs: python-version: '3.10' cache: 'poetry' - name: Install dependencies - run: poetry install --all-extras + run: poetry install # Job-specific step(s): - name: Run Pytest (No-Creds) @@ -110,7 +110,7 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'poetry' - name: Install dependencies - run: poetry install --all-extras + run: poetry install # Job-specific step(s): - name: "Run Pytest" diff --git a/.github/workflows/test-pr-command.yml b/.github/workflows/test-pr-command.yml index d1bf2fce..7a71a9e4 100644 --- a/.github/workflows/test-pr-command.yml +++ b/.github/workflows/test-pr-command.yml @@ -77,7 +77,7 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'poetry' - name: Install dependencies - run: poetry install --all-extras + run: poetry install - name: Run Pytest env: From 6af01bbaeaafc9a6c64c671f1726ec4d2e2dd2e8 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 13:09:52 -0700 Subject: [PATCH 090/118] revert quotes --- .github/workflows/python_pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index afda9a4f..23ed8144 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -113,7 +113,7 @@ jobs: run: poetry install # Job-specific step(s): - - name: "Run Pytest" + - name: Run Pytest env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} run: > From b5f58567652a2ddc3c400fe31b5061f80d3e9da8 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Tue, 9 Apr 2024 15:15:50 -0700 Subject: [PATCH 091/118] Apply suggestions from code review --- airbyte/secrets/google_gsm.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/airbyte/secrets/google_gsm.py b/airbyte/secrets/google_gsm.py index 746fa48f..417339e8 100644 --- a/airbyte/secrets/google_gsm.py +++ b/airbyte/secrets/google_gsm.py @@ -64,8 +64,15 @@ class 
GoogleGSMSecretManager(CustomSecretManager): """Secret manager that retrieves secrets from Google Secrets Manager (GSM). + This class inherits from `CustomSecretManager` and also adds methods + that are specific to this implementation: `fetch_secrets()`, + `fetch_secrets_by_label()` and `fetch_connector_secrets()`. + This secret manager is not enabled by default. To use it, you must provide the project ID and the credentials for a service account with the necessary permissions to access the secrets. + + The `fetch_connector_secret()` method assumes a label name of `connector` + matches the name of the connector (`source-github`, `destination-snowflake`, etc.) """ name = SecretSourceEnum.GOOGLE_GSM.value @@ -141,6 +148,7 @@ def get_secret(self, secret_name: str) -> SecretString | None: def fetch_secrets( self, + *, filter_string: str, ) -> Iterable[SecretHandle]: """List all available secrets in the secret manager. From 5d4d876345affa473fd777332cfdee5a164b86a3 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Tue, 9 Apr 2024 15:16:23 -0700 Subject: [PATCH 092/118] apply suggestion --- airbyte/secrets/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte/secrets/base.py b/airbyte/secrets/base.py index a399e5ea..e748181b 100644 --- a/airbyte/secrets/base.py +++ b/airbyte/secrets/base.py @@ -122,7 +122,7 @@ class SecretHandle: """A handle for a secret in a secret manager. This class is used to store a reference to a secret in a secret manager. - The secret is not retrieved until the `get` method is called on the handle. + The secret is not retrieved until the `get_value()` method is called on the handle. """ def __init__( From def5e2c2fae95e1f203c99c8c4286edb4571388b Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Tue, 9 Apr 2024 15:16:44 -0700 Subject: [PATCH 093/118] apply suggestion --- airbyte/datasets/_sql.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/airbyte/datasets/_sql.py b/airbyte/datasets/_sql.py index e9150c69..4a526405 100644 --- a/airbyte/datasets/_sql.py +++ b/airbyte/datasets/_sql.py @@ -42,6 +42,9 @@ def __init__( ) -> None: """Initialize the dataset with a cache, stream name, and query statement. + This class is not intended to be created directly. Instead, you can retrieve + datasets from caches or Cloud connection objects, etc. + The query statement should be a SQLAlchemy Selectable object that can be executed to retrieve records from the dataset. From 3dad193fe219b40ec09d64a8e90dc931f71fd9b8 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Tue, 9 Apr 2024 15:28:20 -0700 Subject: [PATCH 094/118] doc: add comment about `api_util` module --- airbyte/_util/api_util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py index d756c12d..7b0edd30 100644 --- a/airbyte/_util/api_util.py +++ b/airbyte/_util/api_util.py @@ -5,6 +5,10 @@ and design inconsistencies, we do not expose these functions or other Airbyte API classes within PyAirbyte. Classes and functions from the Airbyte API external library should always be wrapped in PyAirbyte classes - unless there's a very compelling reason to surface these models intentionally. + +Similarly, modules outside of this file should try to avoid interfacing with `airbyte_api` library +directly. This will ensure a single source of truth when mapping between the `airbyte` and +`airbyte_api` libraries. 
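
To make the wrapping pattern described above concrete, here is a hedged sketch of a caller that goes through `api_util` instead of importing `airbyte_api` directly. The argument names mirror the calls that appear in the integration tests later in this series; the `SourceFaker` import path and the placeholder values are assumptions.

```python
from airbyte_api.models.shared import SourceFaker  # assumed import path for the config model

from airbyte._util import api_util

api_root = "..."      # Airbyte API root for the target environment (placeholder)
api_key = "..."       # Airbyte API key (placeholder)
workspace_id = "..."  # target workspace ID (placeholder)

# All HTTP access and model mapping stays behind api_util, the single source of truth.
source = api_util.create_source(
    name="deleteme-source-faker",
    api_root=api_root,
    api_key=api_key,
    workspace_id=workspace_id,
    config=SourceFaker(),  # an `airbyte_api` model, wrapped here rather than exposed to callers
)

# Callers rely only on stable fields of the response, such as `source_id`.
api_util.delete_source(
    source_id=source.source_id,
    api_root=api_root,
    api_key=api_key,
    workspace_id=workspace_id,
)
```
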
""" from __future__ import annotations From 3b494e8b301a61b908855fbc9a2d491ac8259613 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 15:42:12 -0700 Subject: [PATCH 095/118] add skip condition on missing `GCP_GSM_CREDENTIALS` secret in integration tests --- tests/integration_tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 9684c679..b9b72920 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -26,6 +26,10 @@ @pytest.mark.requires_creds @pytest.fixture(scope="session") def ci_secret_manager() -> GoogleGSMSecretManager: + secret = ab.get_secret("GCP_GSM_CREDENTIALS") + if not secret: + pytest.skip("GCP_GSM_CREDENTIALS secret not found.") + return GoogleGSMSecretManager( project=AIRBYTE_INTERNAL_GCP_PROJECT, credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), From 994219902fd88e346bf4c8e128a536c58162b778 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 16:02:14 -0700 Subject: [PATCH 096/118] Fix empty value for GCP_GSM_CREDENTIALS in pytest workflow --- .github/workflows/python_pytest.yml | 4 ++-- airbyte/secrets/base.py | 4 ++++ tests/integration_tests/conftest.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index 23ed8144..430b8641 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -70,8 +70,8 @@ jobs: # Job-specific step(s): - name: Run Pytest (No-Creds) env: - # Force this to an invalid value to ensure tests that no creds are required are run. - GCP_GSM_CREDENTIALS: "no-creds" + # Force this to a blank value. + GCP_GSM_CREDENTIALS: "" run: > poetry run pytest -m "not requires_creds and not linting and not super_slow" diff --git a/airbyte/secrets/base.py b/airbyte/secrets/base.py index e748181b..d77ac633 100644 --- a/airbyte/secrets/base.py +++ b/airbyte/secrets/base.py @@ -32,6 +32,10 @@ class SecretString(str): def __repr__(self) -> str: return "" + def is_empty(self) -> bool: + """Check if the secret is an empty string.""" + return len(self) == 0 + def is_json(self) -> bool: """Check if the secret string is a valid JSON string.""" try: diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index b9b72920..459c81b0 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -27,7 +27,7 @@ @pytest.fixture(scope="session") def ci_secret_manager() -> GoogleGSMSecretManager: secret = ab.get_secret("GCP_GSM_CREDENTIALS") - if not secret: + if not secret or secret.is_empty(): pytest.skip("GCP_GSM_CREDENTIALS secret not found.") return GoogleGSMSecretManager( From 7ce967e87a8d622d071de3553253f421f68df949 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 16:06:06 -0700 Subject: [PATCH 097/118] ci: fix test_cloud_api_util.py to use airbyte_cloud_api_root and airbyte_cloud_api_key variables --- .../cloud/test_cloud_api_util.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/integration_tests/cloud/test_cloud_api_util.py b/tests/integration_tests/cloud/test_cloud_api_util.py index dfe90d2c..83dea4f6 100644 --- a/tests/integration_tests/cloud/test_cloud_api_util.py +++ b/tests/integration_tests/cloud/test_cloud_api_util.py @@ -14,15 +14,15 @@ def test_create_and_delete_source( workspace_id: str, - api_root: str, - api_key: str, + airbyte_cloud_api_root: 
str, + airbyte_cloud_api_key: str, ) -> None: new_resource_name = "deleteme-source-faker" + str(ulid.ULID()).lower()[-6:] source_config = SourceFaker() source = api_util.create_source( name=new_resource_name, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, config=source_config, ) @@ -32,16 +32,16 @@ def test_create_and_delete_source( api_util.delete_source( source_id=source.source_id, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, ) def test_create_and_delete_destination( workspace_id: str, - api_root: str, - api_key: str, + airbyte_cloud_api_root: str, + airbyte_cloud_api_key: str, motherduck_api_key: str, ) -> None: new_resource_name = "deleteme-destination-faker" + str(ulid.ULID()).lower()[-6:] @@ -52,8 +52,8 @@ def test_create_and_delete_destination( destination = api_util.create_destination( name=new_resource_name, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, config=destination_config, ) @@ -63,16 +63,16 @@ def test_create_and_delete_destination( api_util.delete_destination( destination_id=destination.destination_id, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, ) def test_create_and_delete_connection( workspace_id: str, - api_root: str, - api_key: str, + airbyte_cloud_api_root: str, + airbyte_cloud_api_key: str, motherduck_api_key: str, ) -> None: new_source_name = "deleteme-source-faker" + str(ulid.ULID()).lower()[-6:] @@ -80,8 +80,8 @@ def test_create_and_delete_connection( new_connection_name = "deleteme-connection-dummy" + str(ulid.ULID()).lower()[-6:] source = api_util.create_source( name=new_source_name, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, config=SourceFaker(), ) @@ -91,8 +91,8 @@ def test_create_and_delete_connection( destination = api_util.create_destination( name=new_destination_name, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, config=DestinationDuckdb( destination_path="temp_db", @@ -105,8 +105,8 @@ def test_create_and_delete_connection( connection = api_util.create_connection( name=new_connection_name, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, source_id=source.source_id, destination_id=destination.destination_id, @@ -119,19 +119,19 @@ def test_create_and_delete_connection( api_util.delete_connection( connection_id=connection.connection_id, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, ) api_util.delete_source( source_id=source.source_id, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, ) api_util.delete_destination( destination_id=destination.destination_id, - api_root=api_root, - api_key=api_key, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, workspace_id=workspace_id, ) From ea1b5f79bc93e8582b81428f3a26a7e46b693267 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 16:13:35 -0700 Subject: [PATCH 098/118] fix: remove marks since they don't 
work on fixtures --- tests/integration_tests/conftest.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py index 459c81b0..e2976348 100644 --- a/tests/integration_tests/conftest.py +++ b/tests/integration_tests/conftest.py @@ -23,7 +23,6 @@ AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" -@pytest.mark.requires_creds @pytest.fixture(scope="session") def ci_secret_manager() -> GoogleGSMSecretManager: secret = ab.get_secret("GCP_GSM_CREDENTIALS") @@ -96,7 +95,6 @@ def new_snowflake_cache(snowflake_creds: dict): @pytest.fixture -@pytest.mark.requires_creds def new_bigquery_cache(ci_secret_manager: GoogleGSMSecretManager): dest_bigquery_config = ci_secret_manager.get_secret( "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" @@ -119,7 +117,6 @@ def new_bigquery_cache(ci_secret_manager: GoogleGSMSecretManager): connection.execute(f"DROP SCHEMA IF EXISTS {cache.schema_name}") -@pytest.mark.requires_creds @pytest.fixture(autouse=True, scope="session") def bigquery_credentials_file(ci_secret_manager: GoogleGSMSecretManager): dest_bigquery_config = ci_secret_manager.get_secret( @@ -135,7 +132,6 @@ def bigquery_credentials_file(ci_secret_manager: GoogleGSMSecretManager): return -@pytest.mark.requires_creds @pytest.fixture(autouse=True, scope="session") def with_snowflake_password_env_var(snowflake_creds: dict): os.environ["SNOWFLAKE_PASSWORD"] = snowflake_creds["password"] From 681e5d09e2afee742ce95bcdb0c250cc0009a24e Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 16:21:35 -0700 Subject: [PATCH 099/118] re-order secrets submodules --- airbyte/secrets/__init__.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/airbyte/secrets/__init__.py b/airbyte/secrets/__init__.py index a910a72b..54398e66 100644 --- a/airbyte/secrets/__init__.py +++ b/airbyte/secrets/__init__.py @@ -14,20 +14,29 @@ __all__ = [ + # Submodules + "base", + "config", + "custom", + "env_vars", + "google_colab", + "google_gsm", + "prompt", + "util", # Secret Access "get_secret", # Secret Classes - "SecretHandle", - "SecretString", "SecretSourceEnum", + "SecretString", + "SecretHandle", # Secret Managers + "SecretManager", + "EnvVarSecretManager", + "DotenvSecretManager", "ColabSecretManager", + "SecretsPrompt", "CustomSecretManager", - "DotenvSecretManager", - "EnvVarSecretManager", "GoogleGSMSecretManager", - "SecretManager", - "SecretsPrompt", # Registration Functions` "register_secret_manager", "disable_secret_source", From f6ed93d0fbb4a382be56e96c33b27bf81c322298 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 16:34:19 -0700 Subject: [PATCH 100/118] update docs and import submodules --- .DS_Store | Bin 0 -> 6148 bytes airbyte/__init__.py | 3 +- airbyte/cloud/__init__.py | 12 +++- airbyte/cloud/_destinations.py | 66 ------------------ .../cloud/{_connections.py => connections.py} | 4 +- .../{_sync_results.py => sync_results.py} | 4 +- .../cloud/{_workspaces.py => workspaces.py} | 4 +- airbyte/exceptions.py | 2 +- airbyte/secrets/__init__.py | 10 +++ airbyte/secrets/google_gsm.py | 6 +- docs/.DS_Store | Bin 0 -> 6148 bytes .../cloud/test_cloud_sql_reads.py | 2 +- 12 files changed, 34 insertions(+), 79 deletions(-) create mode 100644 .DS_Store delete mode 100644 airbyte/cloud/_destinations.py rename airbyte/cloud/{_connections.py => connections.py} (98%) rename airbyte/cloud/{_sync_results.py => sync_results.py} (98%) rename airbyte/cloud/{_workspaces.py => workspaces.py} 
(99%) create mode 100644 docs/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..178cca89c34a965879e69264f420afe87f697473 GIT binary patch literal 6148 zcmeHK%}T>S5Z<-XrW7Fug&r5Y7OYk)#Y>3w1&ruHr6#6mFlI}WnnNk%tS{t~_&m<+ zZp31}ir5+0{pNQ!`$6`HF~+@Vw8xmk7_*@va#WTGx|fF5Ofn+JF~TAnhlvcqelxMZ z4*2aBit1M5cKVr=y7~h@%mtTwlg%BnwYY(#lc!X^XhJ#gtz^?+1+SoduIJgGq=`)L!B^#1d4$9OF+dCu1FOq` zITP&W>QX=}CkBXt9~i*>K|n)v4VD_!)&U(}pD}JBqJWNX2}EJgHCSqd2ng4ufVz~M zCkEH$;1?#(HCSrY<&3MDVH`7a^?2cGcJK?8&bX_QdSZYWSY)84O$X2aQ}|_;KJph! z$RY-afq%vTw+8;egGHIM_1p6BtQF91p`l=2i3$kl3zq;ea35)@ppFaFA DestinationResponse: - """Get the destination response.""" - if self._destination_response is None or force_refresh: - self._destination_response = api_util.get_destination( - destination_id=self.destination_id, - api_root=self.workspace.api_root, - api_key=self.workspace.api_key, - ) - - return self._destination_response - - def get_destination_config( - self, - ) -> DestinationBigquery | DestinationDuckdb | DestinationPostgres | DestinationSnowflake | Any: # noqa: ANN401 - """Get the destination configuration.""" - return self._get_destination_response().configuration - - def as_cache(self) -> CacheBase: - """Get the cache for the destination.""" - if self._as_cache is None: - self._as_cache = dest_util.create_cache_from_destination_config( - destination_configuration=self.get_destination_config(), - ) - - return self._as_cache - - def get_sql_engine(self) -> Engine: - """Get the SQL engine for the destination.""" - return self.as_cache().get_sql_engine() diff --git a/airbyte/cloud/_connections.py b/airbyte/cloud/connections.py similarity index 98% rename from airbyte/cloud/_connections.py rename to airbyte/cloud/connections.py index f72fffca..9f4f1eea 100644 --- a/airbyte/cloud/_connections.py +++ b/airbyte/cloud/connections.py @@ -6,14 +6,14 @@ from typing import TYPE_CHECKING, cast from airbyte._util import api_util -from airbyte.cloud._sync_results import SyncResult +from airbyte.cloud.sync_results import SyncResult if TYPE_CHECKING: from airbyte_api.models.shared.connectionresponse import ConnectionResponse from airbyte_api.models.shared.jobresponse import JobResponse - from airbyte.cloud._workspaces import CloudWorkspace + from airbyte.cloud.workspaces import CloudWorkspace class CloudConnection: diff --git a/airbyte/cloud/_sync_results.py b/airbyte/cloud/sync_results.py similarity index 98% rename from airbyte/cloud/_sync_results.py rename to airbyte/cloud/sync_results.py index 0d9c2b47..1dc0c9cb 100644 --- a/airbyte/cloud/_sync_results.py +++ b/airbyte/cloud/sync_results.py @@ -23,8 +23,8 @@ import sqlalchemy from airbyte.caches.base import CacheBase - from airbyte.cloud._connections import CloudConnection - from airbyte.cloud._workspaces import CloudWorkspace + from airbyte.cloud.connections import CloudConnection + from airbyte.cloud.workspaces import CloudWorkspace FINAL_STATUSES = { diff --git a/airbyte/cloud/_workspaces.py b/airbyte/cloud/workspaces.py similarity index 99% rename from airbyte/cloud/_workspaces.py rename to airbyte/cloud/workspaces.py index 6c4ca7dd..edf446f3 100644 --- a/airbyte/cloud/_workspaces.py +++ b/airbyte/cloud/workspaces.py @@ -22,9 +22,9 @@ get_connection, get_workspace, ) -from airbyte.cloud._connections import CloudConnection from airbyte.cloud._destination_util import get_destination_config_from_cache -from airbyte.cloud._sync_results import SyncResult +from airbyte.cloud.connections import CloudConnection +from 
airbyte.cloud.sync_results import SyncResult from airbyte.sources.base import Source diff --git a/airbyte/exceptions.py b/airbyte/exceptions.py index a3b11bca..32280c8b 100644 --- a/airbyte/exceptions.py +++ b/airbyte/exceptions.py @@ -44,7 +44,7 @@ if TYPE_CHECKING: from airbyte._util.api_duck_types import AirbyteApiResponseDuckType - from airbyte.cloud._workspaces import CloudWorkspace + from airbyte.cloud.workspaces import CloudWorkspace NEW_ISSUE_URL = "https://github.com/airbytehq/airbyte/issues/new/choose" diff --git a/airbyte/secrets/__init__.py b/airbyte/secrets/__init__.py index 54398e66..d38c84ad 100644 --- a/airbyte/secrets/__init__.py +++ b/airbyte/secrets/__init__.py @@ -3,6 +3,16 @@ from __future__ import annotations +from airbyte.secrets import ( + base, + config, + custom, + env_vars, + google_colab, + google_gsm, + prompt, + util, +) from airbyte.secrets.base import SecretHandle, SecretSourceEnum, SecretString from airbyte.secrets.config import disable_secret_source, register_secret_manager from airbyte.secrets.custom import CustomSecretManager diff --git a/airbyte/secrets/google_gsm.py b/airbyte/secrets/google_gsm.py index 417339e8..c23c19e3 100644 --- a/airbyte/secrets/google_gsm.py +++ b/airbyte/secrets/google_gsm.py @@ -64,13 +64,13 @@ class GoogleGSMSecretManager(CustomSecretManager): """Secret manager that retrieves secrets from Google Secrets Manager (GSM). - This class inherits from `CustomSecretManager` and also adds methods + This class inherits from `CustomSecretManager` and also adds methods that are specific to this implementation: `fetch_secrets()`, `fetch_secrets_by_label()` and `fetch_connector_secrets()`. This secret manager is not enabled by default. To use it, you must provide the project ID and the credentials for a service account with the necessary permissions to access the secrets. - + The `fetch_connector_secret()` method assumes a label name of `connector` matches the name of the connector (`source-github`, `destination-snowflake`, etc.) """ @@ -194,7 +194,7 @@ def fetch_secrets_by_label( Returns: Iterable[SecretHandle]: An iterable of `SecretHandle` objects for the matching secrets. """ - return self.fetch_secrets(f"labels.{label_key}={label_value}") + return self.fetch_secrets(filter_string=f"labels.{label_key}={label_value}") def fetch_connector_secrets( self, diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f26b72cb05ac64f566549c80c86e55515a506922 GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8O(;SR3OxqA7OYk);w8lT0!H+pQWFwtFlI}dnnNk%tS{t~_&m<+ zZp6|GoY3%Ioon8zdlc!X?C^|X(c~Z7CR`3eOikUroS(=D+1ktLp zsvsl=hyh|?T^P{&pV3;E3DZ1@0b=0C4B-A?gCaT_ONDalfCjIR7;hk=fQ@enL|dby zu~Y~V5UxrARVmj`46e$-Z|giqW2sP;GcHGl@92@aexYzVI{0mw&bXtHT4I10SY)8C zx)z@Q=ik5o7mKJz3=jkViUHmjcmo%vq|er+$>CXRgWiLpU|y;4vjhxz6hkf^#cfb2 Z;J48LbTpO ab.Source: From d1c535ec2471b8cc17df67f633ce6b516589c834 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 16:46:47 -0700 Subject: [PATCH 101/118] fix links in docs, add missing get_connection() implementation --- airbyte/cloud/connections.py | 4 ++-- airbyte/cloud/workspaces.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/airbyte/cloud/connections.py b/airbyte/cloud/connections.py index 9f4f1eea..894ef526 100644 --- a/airbyte/cloud/connections.py +++ b/airbyte/cloud/connections.py @@ -19,8 +19,8 @@ class CloudConnection: """A connection is a link between a source and a destination. - Do not instantiate this class directly. 
Instead, use the `CloudWorkspace.create_connection` - or `CloudWorkspace.get_connection` methods. + Do not instantiate this class directly. Instead, use + `.CloudWorkspace.deploy_connection` or `.CloudWorkspace.get_connection` methods. """ def __init__( diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py index edf446f3..3280e691 100644 --- a/airbyte/cloud/workspaces.py +++ b/airbyte/cloud/workspaces.py @@ -254,6 +254,20 @@ def deploy_connection( return deployed_connection.connection_id + def get_connection( + self, + connection_id: str, + ) -> CloudConnection: + """Get a connection by ID. + + This method does not fetch data from the API. It returns a `CloudConnection` object, + which will be loaded lazily as needed. + """ + return CloudConnection( + workspace=self, + connection_id=connection_id, + ) + def delete_connection( self, connection_id: str | None, From 65d37e59452b36b74317fb2ec2dec8605343189b Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 21:01:12 -0700 Subject: [PATCH 102/118] chore: don't commit docs zip --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 5a5136a8..bcd1cea1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Packaged docs +docs/*.zip + # Directories and subdirectories called '.secrets' and the top-level '/secrets' directory .secrets /secrets From bcb598d3df48fa1d109816e0e433c41ebfe7a21b Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 21:02:34 -0700 Subject: [PATCH 103/118] un-feat: Refactor delete methods in CloudConnection and CloudWorkspace to use private _permanently_delete prefix --- airbyte/cloud/connections.py | 14 ++++++++++---- airbyte/cloud/workspaces.py | 10 +++++----- .../cloud/test_cloud_sql_reads.py | 6 +++--- tests/integration_tests/cloud/test_cloud_sync.py | 2 +- .../cloud/test_cloud_workspaces.py | 6 +++--- 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/airbyte/cloud/connections.py b/airbyte/cloud/connections.py index 894ef526..2e38398a 100644 --- a/airbyte/cloud/connections.py +++ b/airbyte/cloud/connections.py @@ -185,7 +185,7 @@ def get_sync_result( # Deletions - def delete( + def _permanently_delete( self, *, delete_source: bool = False, @@ -197,10 +197,16 @@ def delete( delete_source: Whether to also delete the source. delete_destination: Whether to also delete the destination. 
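
As a usage-level illustration of the lazy `get_connection()` accessor and the deletion flags documented above, here is a hedged sketch of the caller-facing workflow. The workspace ID, connection ID, and secret name are placeholders, and only methods shown elsewhere in this series are used.

```python
import airbyte as ab
from airbyte.cloud import CloudWorkspace

workspace = CloudWorkspace(
    workspace_id="...",                              # placeholder workspace ID
    api_key=ab.get_secret("AIRBYTE_CLOUD_API_KEY"),  # resolved via PyAirbyte's secret managers
)

# No API call is made here; connection details are fetched lazily on first access.
connection = workspace.get_connection(connection_id="...")
print(connection.stream_names)
print(connection.table_prefix)

# Trigger a sync through the workspace and inspect the result.
sync_result = workspace.run_sync(connection_id=connection.connection_id)
if sync_result.is_job_complete():
    users_dataset = sync_result.get_dataset(stream_name="users")
```
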
""" - self.workspace.delete_connection(connection_id=self.connection_id) + self.workspace._permanently_delete_connection( # noqa: SLF001 # Non-public API (for now) + connection_id=self.connection_id + ) if delete_source: - self.workspace.delete_source(source=self.source_id) + self.workspace._permanently_delete_source( # noqa: SLF001 # Non-public API (for now) + source=self.source_id + ) if delete_destination: - self.workspace.delete_destination(destination=self.destination_id) + self.workspace._permanently_delete_destination( # noqa: SLF001 # Non-public API + destination=self.destination_id, + ) diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py index 3280e691..ef2c2d4d 100644 --- a/airbyte/cloud/workspaces.py +++ b/airbyte/cloud/workspaces.py @@ -95,7 +95,7 @@ def deploy_source( return deployed_source.source_id - def delete_source( + def _permanently_delete_source( self, source: str | Source, ) -> None: @@ -148,7 +148,7 @@ def deploy_cache_as_destination( return deployed_destination.destination_id - def delete_destination( + def _permanently_delete_destination( self, *, destination: str | None = None, @@ -268,7 +268,7 @@ def get_connection( connection_id=connection_id, ) - def delete_connection( + def _permanently_delete_connection( self, connection_id: str | None, *, @@ -292,10 +292,10 @@ def delete_connection( workspace_id=self.workspace_id, ) if delete_source: - self.delete_source(source=connection.source_id) + self._permanently_delete_source(source=connection.source_id) if delete_destination: - self.delete_destination(destination=connection.destination_id) + self._permanently_delete_destination(destination=connection.destination_id) # Run syncs diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 8249e6fb..dd27ce3b 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -64,15 +64,15 @@ def test_deploy_and_run_and_read( # Cleanup with suppress(Exception): - cloud_workspace.delete_connection( + cloud_workspace._permanently_delete_connection( connection_id=connection_id, delete_source=True, delete_destination=True, ) with suppress(Exception): - cloud_workspace.delete_source(source_id=source_id) + cloud_workspace._permanently_delete_source(source_id=source_id) with suppress(Exception): - cloud_workspace.delete_destination(destination_id=destination_id) + cloud_workspace._permanently_delete_destination(destination_id=destination_id) @pytest.mark.parametrize( diff --git a/tests/integration_tests/cloud/test_cloud_sync.py b/tests/integration_tests/cloud/test_cloud_sync.py index 4e4bae95..39f6ee77 100644 --- a/tests/integration_tests/cloud/test_cloud_sync.py +++ b/tests/integration_tests/cloud/test_cloud_sync.py @@ -56,4 +56,4 @@ def test_deploy_and_run_connection( assert cache.stream_names assert cache.streams["users"].to_pandas() - cloud_workspace.delete_connection(connection_id=connection_id) + cloud_workspace._permanently_delete_connection(connection_id=connection_id) diff --git a/tests/integration_tests/cloud/test_cloud_workspaces.py b/tests/integration_tests/cloud/test_cloud_workspaces.py index b7296c45..1143e844 100644 --- a/tests/integration_tests/cloud/test_cloud_workspaces.py +++ b/tests/integration_tests/cloud/test_cloud_workspaces.py @@ -23,7 +23,7 @@ def test_deploy_source( source.check() source_id: str = cloud_workspace.deploy_source(source) - cloud_workspace.delete_source(source=source_id) + 
cloud_workspace._permanently_delete_source(source=source_id) def test_deploy_cache_as_destination( @@ -37,7 +37,7 @@ def test_deploy_cache_as_destination( schema_name="public", ) destination_id: str = cloud_workspace.deploy_cache_as_destination(cache=cache) - cloud_workspace.delete_destination(destination=destination_id) + cloud_workspace._permanently_delete_destination(destination=destination_id) def test_deploy_connection( @@ -63,4 +63,4 @@ def test_deploy_connection( source=source, cache=cache, ) - cloud_workspace.delete_connection(connection_id=connection_id) + cloud_workspace._permanently_delete_connection(connection_id=connection_id) From f01e911e329f0972919c3053328115dc85fe5bbb Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 23:22:53 -0700 Subject: [PATCH 104/118] fix imports and docstring --- airbyte/datasets/_sql.py | 3 ++- airbyte/records.py | 11 +++++++---- airbyte/secrets/__init__.py | 2 +- airbyte/sources/__init__.py | 1 + 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/airbyte/datasets/_sql.py b/airbyte/datasets/_sql.py index 4a526405..4dba4f39 100644 --- a/airbyte/datasets/_sql.py +++ b/airbyte/datasets/_sql.py @@ -18,8 +18,9 @@ from collections.abc import Iterator from pandas import DataFrame - from sqlalchemy import Selectable, Table + from sqlalchemy import Table from sqlalchemy.sql import ClauseElement + from sqlalchemy.sql.selectable import Selectable from airbyte_protocol.models import ConfiguredAirbyteStream diff --git a/airbyte/records.py b/airbyte/records.py index c3261b35..9533b976 100644 --- a/airbyte/records.py +++ b/airbyte/records.py @@ -168,10 +168,13 @@ def __init__( """Initialize the dictionary with the given data. Args: - - normalize_keys: If `True`, the keys will be normalized using the given normalizer. - - expected_keys: If provided, the dictionary will be initialized with these given keys. - - expected_keys: If provided and `prune_extra_fields` is True, then unexpected fields - will be removed. This option is ignored if `expected_keys` is not provided. + from_dict: The dictionary to initialize the StreamRecord with. + prune_extra_fields: If `True`, unexpected fields will be removed. + normalize_keys: If `True`, the keys will be normalized using the given normalizer. + normalizer: The normalizer to use when normalizing keys. If not provided, the + LowerCaseNormalizer will be used. + expected_keys: If provided and `prune_extra_fields` is True, then unexpected fields + will be removed. This option is ignored if `expected_keys` is not provided. """ # If no normalizer is provided, use LowerCaseNormalizer. 
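
For illustration, a hedged sketch of how these options combine, assuming the constructor accepts keyword arguments exactly as listed in the Args section above and that the default normalizer lower-cases keys:

```python
from airbyte.records import StreamRecord  # assumed import location for the class documented above

record = StreamRecord(
    from_dict={"ID": 1, "Name": "Ada", "Extra": "will be pruned"},
    prune_extra_fields=True,
    normalize_keys=True,
    expected_keys=["id", "name"],
)

assert record["id"] == 1      # "ID" is normalized by the default lower-case normalizer
assert "extra" not in record  # removed because it is not among the expected keys
```
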
self._normalize_keys = normalize_keys diff --git a/airbyte/secrets/__init__.py b/airbyte/secrets/__init__.py index d38c84ad..156772df 100644 --- a/airbyte/secrets/__init__.py +++ b/airbyte/secrets/__init__.py @@ -13,7 +13,7 @@ prompt, util, ) -from airbyte.secrets.base import SecretHandle, SecretSourceEnum, SecretString +from airbyte.secrets.base import SecretHandle, SecretManager, SecretSourceEnum, SecretString from airbyte.secrets.config import disable_secret_source, register_secret_manager from airbyte.secrets.custom import CustomSecretManager from airbyte.secrets.env_vars import DotenvSecretManager, EnvVarSecretManager diff --git a/airbyte/sources/__init__.py b/airbyte/sources/__init__.py index aff3b5ad..bd3938bb 100644 --- a/airbyte/sources/__init__.py +++ b/airbyte/sources/__init__.py @@ -2,6 +2,7 @@ from __future__ import annotations from airbyte.sources import base, util +from airbyte.sources.base import Source from airbyte.sources.registry import ( ConnectorMetadata, get_available_connectors, From b630f9637003f20b5e4597b74097c90e278ab66c Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 23:34:21 -0700 Subject: [PATCH 105/118] remove redundant submodule declarations in pdoc --- docs/generate.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/docs/generate.py b/docs/generate.py index 1b95663c..1e640171 100755 --- a/docs/generate.py +++ b/docs/generate.py @@ -26,19 +26,6 @@ def run() -> None: if pathlib.Path("docs/generated").exists(): shutil.rmtree("docs/generated") - # All files and folders that don't start with "_" are treated as public. - for submodule in os.listdir("airbyte"): - submodule_path = pathlib.Path(f"airbyte/{submodule}") - if not submodule.startswith("_"): - public_modules.append(submodule_path) - if submodule_path.is_file(): - continue - - for subsubmodule in os.listdir(submodule_path): - subsubmodule_path = submodule_path / subsubmodule - if not subsubmodule.startswith("_"): - public_modules.append(subsubmodule_path) - pdoc.render.configure( template_directory="docs", show_source=True, From 5755bc74944955b814c99fe266d77ae7fa67fc3a Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 23:34:35 -0700 Subject: [PATCH 106/118] fix pdoc rendering bug --- airbyte/caches/duckdb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/airbyte/caches/duckdb.py b/airbyte/caches/duckdb.py index 1bbaf550..d5514b3b 100644 --- a/airbyte/caches/duckdb.py +++ b/airbyte/caches/duckdb.py @@ -8,9 +8,10 @@ from airbyte.caches import DuckDBCache cache = DuckDBCache( - db_path="/path/to/my/database.duckdb", + db_path="/path/to/my/duckdb-file", schema_name="myschema", ) +``` """ from __future__ import annotations @@ -41,7 +42,7 @@ class DuckDBCache(CacheBase): """Normally db_path is a Path object. The database name will be inferred from the file name. For example, given a `db_path` of - `/path/to/my/my_db.duckdb`, the database name is `my_db`. + `/path/to/my/duckdb-file`, the database name is `my_db`. """ schema_name: str = "main" From 03c6c5acc572f77b5b0fd5b0ffdd4dcc6cfbf629 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 23:55:48 -0700 Subject: [PATCH 107/118] add badges --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 50e4b015..001623d7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,16 @@ # PyAirbyte -PyAirbyte brings the power of Airbyte to every Python developer. 
PyAirbyte provides a set of utilities to use Airbyte connectors in Python. It is meant to be used in situations where setting up an Airbyte server or cloud account is not possible or desirable. +PyAirbyte brings the power of Airbyte to every Python developer. PyAirbyte provides a set of utilities to use Airbyte connectors in Python. + +[![PyPI version](https://badge.fury.io/py/airbyte.svg)](https://badge.fury.io/py/airbyte) +[![PyPI - Downloads](https://img.shields.io/pypi/dm/airbyte)](https://pypi.org/project/airbyte/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/airbyte)](https://pypi.org/project/airbyte/) + +[![PyPI - Wheel](https://img.shields.io/pypi/wheel/airbyte)](https://pypi.org/project/airbyte/) + +[![PyPI - Implementation](https://img.shields.io/pypi/implementation/airbyte)](https://pypi.org/project/airbyte/) +[![PyPI - Format](https://img.shields.io/pypi/format/airbyte)](https://pypi.org/project/airbyte/) +[![Star on GitHub](https://img.shields.io/github/stars/airbytehq/pyairbyte.svg?style=social&label=★%20on%20GitHub)](https://github.com/airbytehq/pyairbyte) - [Getting Started](#getting-started) - [Secrets Management](#secrets-management) From ad3e8ac019566a1660d99108a5efbb8ff4968a0b Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 9 Apr 2024 23:59:53 -0700 Subject: [PATCH 108/118] update code sample --- airbyte/cloud/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/airbyte/cloud/__init__.py b/airbyte/cloud/__init__.py index 4812129a..6f36ea67 100644 --- a/airbyte/cloud/__init__.py +++ b/airbyte/cloud/__init__.py @@ -14,10 +14,10 @@ api_key=ab.get_secret("AIRBYTE_CLOUD_API_KEY"), ) -source = ab.get_source("source-faker", config={}) -source.check() - -workspace.deploy_source(source) +sync_result = workspace.run_sync( + connection_id="456", +) +print(sync_result.get_job_status()) ``` """ From e644ffba38f86065d850726e254e1556ef60479c Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 10 Apr 2024 00:03:09 -0700 Subject: [PATCH 109/118] update docstring --- airbyte/cloud/connections.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/airbyte/cloud/connections.py b/airbyte/cloud/connections.py index 2e38398a..898b135b 100644 --- a/airbyte/cloud/connections.py +++ b/airbyte/cloud/connections.py @@ -17,11 +17,7 @@ class CloudConnection: - """A connection is a link between a source and a destination. - - Do not instantiate this class directly. Instead, use - `.CloudWorkspace.deploy_connection` or `.CloudWorkspace.get_connection` methods. 
- """ + """A connection is an extract-load (EL) pairing of a source and destination.""" def __init__( self, From f2bac115de14244182004557b2e461a0ff78b95e Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 10 Apr 2024 00:03:56 -0700 Subject: [PATCH 110/118] un-feat: make `deploy*()` methods private for now --- airbyte/cloud/workspaces.py | 13 ++++++++----- .../integration_tests/cloud/test_cloud_sql_reads.py | 6 +++--- tests/integration_tests/cloud/test_cloud_sync.py | 2 +- .../cloud/test_cloud_workspaces.py | 6 +++--- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py index ef2c2d4d..7adca7fc 100644 --- a/airbyte/cloud/workspaces.py +++ b/airbyte/cloud/workspaces.py @@ -69,7 +69,8 @@ def connect(self) -> None: # Deploy and delete sources - def deploy_source( + # TODO: Make this a public API + def _deploy_source( self, source: Source, ) -> str: @@ -123,7 +124,8 @@ def _permanently_delete_source( # Deploy and delete destinations - def deploy_cache_as_destination( + # TODO: Make this a public API + def _deploy_cache_as_destination( self, cache: CacheBase, ) -> str: @@ -182,7 +184,8 @@ def _permanently_delete_destination( # Deploy and delete connections - def deploy_connection( + # TODO: Make this a public API + def _deploy_connection( self, source: Source | str, cache: CacheBase | None = None, @@ -210,7 +213,7 @@ def deploy_connection( if source._deployed_source_id: # noqa: SLF001 source_id = source._deployed_source_id # noqa: SLF001 else: - source_id = self.deploy_source(source) + source_id = self._deploy_source(source) else: source_id = source if not selected_streams: @@ -225,7 +228,7 @@ def deploy_connection( elif cache: table_prefix = table_prefix if table_prefix is not None else (cache.table_prefix or "") if not cache._deployed_destination_id: # noqa: SLF001 - destination_id = self.deploy_cache_as_destination(cache) + destination_id = self._deploy_cache_as_destination(cache) else: destination_id = cache._deployed_destination_id # noqa: SLF001 else: diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index dd27ce3b..e0ec2f18 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -35,9 +35,9 @@ def test_deploy_and_run_and_read( """Test reading from a cache.""" # Deploy source, destination, and connection: - source_id = cloud_workspace.deploy_source(source=deployable_source) - destination_id = cloud_workspace.deploy_cache_as_destination(cache=new_deployable_cache) - connection_id = cloud_workspace.deploy_connection( + source_id = cloud_workspace._deploy_source(source=deployable_source) + destination_id = cloud_workspace._deploy_cache_as_destination(cache=new_deployable_cache) + connection_id = cloud_workspace._deploy_connection( source=deployable_source, cache=new_deployable_cache, table_prefix=new_deployable_cache.table_prefix, diff --git a/tests/integration_tests/cloud/test_cloud_sync.py b/tests/integration_tests/cloud/test_cloud_sync.py index 39f6ee77..a9c12fc7 100644 --- a/tests/integration_tests/cloud/test_cloud_sync.py +++ b/tests/integration_tests/cloud/test_cloud_sync.py @@ -48,7 +48,7 @@ def test_deploy_and_run_connection( schema_name="public", ) - connection_id: str = cloud_workspace.deploy_connection(source=source, cache=cache) + connection_id: str = cloud_workspace._deploy_connection(source=source, cache=cache) sync_result = 
cloud_workspace.run_sync(connection_id=connection_id) _ = sync_result diff --git a/tests/integration_tests/cloud/test_cloud_workspaces.py b/tests/integration_tests/cloud/test_cloud_workspaces.py index 1143e844..f89c9ae6 100644 --- a/tests/integration_tests/cloud/test_cloud_workspaces.py +++ b/tests/integration_tests/cloud/test_cloud_workspaces.py @@ -21,7 +21,7 @@ def test_deploy_source( install_if_missing=False, ) source.check() - source_id: str = cloud_workspace.deploy_source(source) + source_id: str = cloud_workspace._deploy_source(source) cloud_workspace._permanently_delete_source(source=source_id) @@ -36,7 +36,7 @@ def test_deploy_cache_as_destination( database="temp", schema_name="public", ) - destination_id: str = cloud_workspace.deploy_cache_as_destination(cache=cache) + destination_id: str = cloud_workspace._deploy_cache_as_destination(cache=cache) cloud_workspace._permanently_delete_destination(destination=destination_id) @@ -59,7 +59,7 @@ def test_deploy_connection( schema_name="public", ) - connection_id: str = cloud_workspace.deploy_connection( + connection_id: str = cloud_workspace._deploy_connection( source=source, cache=cache, ) From 251d4ad1d7cc64eeda33212e8c8a434018b12d78 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 10 Apr 2024 00:13:24 -0700 Subject: [PATCH 111/118] chore: remove and ignore .DS_Store files --- .DS_Store | Bin 6148 -> 0 bytes .gitignore | 3 +++ docs/.DS_Store | Bin 6148 -> 0 bytes 3 files changed, 3 insertions(+) delete mode 100644 .DS_Store delete mode 100644 docs/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 178cca89c34a965879e69264f420afe87f697473..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5Z<-XrW7Fug&r5Y7OYk)#Y>3w1&ruHr6#6mFlI}WnnNk%tS{t~_&m<+ zZp31}ir5+0{pNQ!`$6`HF~+@Vw8xmk7_*@va#WTGx|fF5Ofn+JF~TAnhlvcqelxMZ z4*2aBit1M5cKVr=y7~h@%mtTwlg%BnwYY(#lc!X^XhJ#gtz^?+1+SoduIJgGq=`)L!B^#1d4$9OF+dCu1FOq` zITP&W>QX=}CkBXt9~i*>K|n)v4VD_!)&U(}pD}JBqJWNX2}EJgHCSqd2ng4ufVz~M zCkEH$;1?#(HCSrY<&3MDVH`7a^?2cGcJK?8&bX_QdSZYWSY)84O$X2aQ}|_;KJph! z$RY-afq%vTw+8;egGHIM_1p6BtQF91p`l=2i3$kl3zq;ea35)@ppFaFAS5Z-O8O(;SR3OxqA7OYk);w8lT0!H+pQWFwtFlI}dnnNk%tS{t~_&m<+ zZp6|GoY3%Ioon8zdlc!X?C^|X(c~Z7CR`3eOikUroS(=D+1ktLp zsvsl=hyh|?T^P{&pV3;E3DZ1@0b=0C4B-A?gCaT_ONDalfCjIR7;hk=fQ@enL|dby zu~Y~V5UxrARVmj`46e$-Z|giqW2sP;GcHGl@92@aexYzVI{0mw&bXtHT4I10SY)8C zx)z@Q=ik5o7mKJz3=jkViUHmjcmo%vq|er+$>CXRgWiLpU|y;4vjhxz6hkf^#cfb2 Z;J48LbTpO Date: Wed, 10 Apr 2024 12:26:55 -0700 Subject: [PATCH 112/118] feat: add `airbyte.cloud.experimental` module --- airbyte/cloud/__init__.py | 8 ++++- airbyte/cloud/experimental.py | 59 +++++++++++++++++++++++++++++++++++ docs/generate.py | 2 +- 3 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 airbyte/cloud/experimental.py diff --git a/airbyte/cloud/__init__.py b/airbyte/cloud/__init__.py index 6f36ea67..cd58e5de 100644 --- a/airbyte/cloud/__init__.py +++ b/airbyte/cloud/__init__.py @@ -19,7 +19,13 @@ ) print(sync_result.get_job_status()) ``` -""" + + +ℹ️ **Experimental Features** + +You can use the `airbyte.cloud.experimental` module to access experimental features. +These additional features are subject to change and may not be available in all environments. 
+""" # noqa: RUF002 # Allow emoji from __future__ import annotations diff --git a/airbyte/cloud/experimental.py b/airbyte/cloud/experimental.py new file mode 100644 index 00000000..fbc3ace4 --- /dev/null +++ b/airbyte/cloud/experimental.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Experimental features for interacting with the Airbyte Cloud API. + +You can use this module to access experimental features in Airbyte Cloud, OSS, and Enterprise. These +features are subject to change and may not be available in all environments. **Future versions of +PyAirbyte may remove or change these features without notice.** + +To use this module, replace an import like this: + +```python +from airbyte.cloud import CloudConnection, CloudWorkspace +``` + +with an import like this: + +```python +from airbyte.cloud.experimental import CloudConnection, CloudWorkspace +``` + +You can toggle between the stable and experimental versions of these classes by changing the import +path. This allows you to test new features without requiring substantial changes to your codebase. + +""" +# ruff: noqa: SLF001 # This file accesses private members of other classes. + +from __future__ import annotations + +import warnings + +from airbyte.cloud.connections import CloudConnection as Stable_CloudConnection +from airbyte.cloud.workspaces import CloudWorkspace as Stable_CloudWorkspace + + +# This module is not imported anywhere by default, so this warning should only print if the user +# explicitly imports it. +warnings.warn( + message="The `airbyte.cloud.experimental` module is experimental and may change in the future.", + category=FutureWarning, + stacklevel=2, +) + + +class CloudWorkspace(Stable_CloudWorkspace): + __doc__ = ( + f"Experimental implementation of `.CloudWorkspace`.\n\n{Stable_CloudConnection.__doc__}" + ) + deploy_connection = Stable_CloudWorkspace._deploy_connection + deploy_source = Stable_CloudWorkspace._deploy_source + deploy_cache_as_destination = Stable_CloudWorkspace._deploy_cache_as_destination + permanently_delete_connection = Stable_CloudWorkspace._permanently_delete_connection + permanently_delete_source = Stable_CloudWorkspace._permanently_delete_source + permanently_delete_destination = Stable_CloudWorkspace._permanently_delete_destination + + +class CloudConnection(Stable_CloudConnection): + __doc__ = ( + f"Experimental implementation of `.CloudConnection`.\n\n{Stable_CloudConnection.__doc__}" + ) + permanently_delete = Stable_CloudConnection._permanently_delete diff --git a/docs/generate.py b/docs/generate.py index 1e640171..c0b0fb45 100755 --- a/docs/generate.py +++ b/docs/generate.py @@ -20,7 +20,7 @@ def run() -> None: """Generate docs for all public modules in PyAirbyte and save them to docs/generated.""" - public_modules = ["airbyte"] + public_modules = ["airbyte", "airbyte/cloud/experimental.py"] # recursively delete the docs/generated folder if it exists if pathlib.Path("docs/generated").exists(): From 8fd29df252614065c64f4993ec0cf78ae4d6fc4a Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 10 Apr 2024 12:50:58 -0700 Subject: [PATCH 113/118] add and fix tests for stream names and prefixes --- airbyte/caches/base.py | 2 +- airbyte/cloud/connections.py | 4 +-- airbyte/cloud/sync_results.py | 4 +-- airbyte/cloud/workspaces.py | 31 ++++++++++++------- airbyte/sources/base.py | 3 +- .../cloud/test_cloud_sql_reads.py | 4 +-- .../cloud/test_cloud_workspaces.py | 14 +++++++-- 7 files changed, 39 insertions(+), 23 deletions(-) diff --git 
a/airbyte/caches/base.py b/airbyte/caches/base.py index a6f566f9..bb5664a6 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -46,8 +46,8 @@ class CacheBase(BaseModel): _deployed_api_root: Optional[str] = PrivateAttr(default=None) _deployed_workspace_id: Optional[str] = PrivateAttr(default=None) - _deployed_destination_id: Optional[str] = PrivateAttr(default=None) _deployed_connection_id: Optional[str] = PrivateAttr(default=None) + _deployed_destination_id: Optional[str] = PrivateAttr(default=None) _sql_processor_class: type[SqlProcessorBase] = PrivateAttr() _sql_processor: Optional[SqlProcessorBase] = PrivateAttr(default=None) diff --git a/airbyte/cloud/connections.py b/airbyte/cloud/connections.py index 898b135b..21fc5549 100644 --- a/airbyte/cloud/connections.py +++ b/airbyte/cloud/connections.py @@ -87,7 +87,7 @@ def table_prefix(self) -> str: if not self._connection_info: self._connection_info = self._fetch_connection_info() - return self._connection_info.configurations.prefix + return self._connection_info.prefix @property def connection_url(self) -> str | None: @@ -194,7 +194,7 @@ def _permanently_delete( delete_destination: Whether to also delete the destination. """ self.workspace._permanently_delete_connection( # noqa: SLF001 # Non-public API (for now) - connection_id=self.connection_id + connection=self ) if delete_source: diff --git a/airbyte/cloud/sync_results.py b/airbyte/cloud/sync_results.py index 1dc0c9cb..0f17809e 100644 --- a/airbyte/cloud/sync_results.py +++ b/airbyte/cloud/sync_results.py @@ -195,9 +195,9 @@ def get_sql_schema_name(self) -> str: return cache.schema_name @property - def stream_names(self) -> set[str]: + def stream_names(self) -> list[str]: """Return the set of stream names.""" - return self.get_sql_cache().processor.expected_streams + return self.connection.stream_names @final @property diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py index 7adca7fc..796a9a6f 100644 --- a/airbyte/cloud/workspaces.py +++ b/airbyte/cloud/workspaces.py @@ -192,7 +192,7 @@ def _deploy_connection( destination: str | None = None, table_prefix: str | None = None, selected_streams: list[str] | None = None, - ) -> str: + ) -> CloudConnection: """Deploy a source and cache to the workspace as a new connection. Returns the newly deployed connection ID as a `str`. 
@@ -251,11 +251,20 @@ def _deploy_connection( ) if isinstance(source, Source): + source._deployed_workspace_id = self.workspace_id # noqa: SLF001 source._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 + source._deployed_source_id = source_id # noqa: SLF001 if cache: + cache._deployed_workspace_id = self.workspace_id # noqa: SLF001 cache._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 + cache._deployed_destination_id = deployed_connection.destination_id # noqa: SLF001 - return deployed_connection.connection_id + return CloudConnection( + workspace=self, + connection_id=deployed_connection.connection_id, + source=deployed_connection.source_id, + destination=deployed_connection.destination_id, + ) def get_connection( self, @@ -273,23 +282,23 @@ def get_connection( def _permanently_delete_connection( self, - connection_id: str | None, + connection: str | CloudConnection, *, delete_source: bool = False, delete_destination: bool = False, ) -> None: """Delete a deployed connection from the workspace.""" - if connection_id is None: + if connection is None: raise ValueError("No connection ID provided.") # noqa: TRY003 - connection: ConnectionResponse = get_connection( - connection_id=connection_id, - api_root=self.api_root, - api_key=self.api_key, - workspace_id=self.workspace_id, - ) + if isinstance(connection, str): + connection = CloudConnection( + workspace=self, + connection_id=connection, + ) + delete_connection( - connection_id=connection_id, + connection_id=connection.connection_id, api_root=self.api_root, api_key=self.api_key, workspace_id=self.workspace_id, diff --git a/airbyte/sources/base.py b/airbyte/sources/base.py index 7e7041e0..02b1746b 100644 --- a/airbyte/sources/base.py +++ b/airbyte/sources/base.py @@ -84,10 +84,9 @@ def __init__( if streams is not None: self.select_streams(streams) - self._deployed_api_root: str | None = None self._deployed_workspace_id: str | None = None - self._deployed_source_id: str | None = None self._deployed_connection_id: str | None = None + self._deployed_source_id: str | None = None def set_streams(self, streams: list[str]) -> None: """Deprecated. 
See select_streams().""" diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index e0ec2f18..0c3f88a2 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -53,9 +53,7 @@ def test_deploy_and_run_and_read( # Check sync result: assert sync_result.is_job_complete() - - # TODO: Rebuild streams property from connection's configured streams API endpoint - # assert sync_result.stream_names == ["users", "products", "purchases"] + assert set(sync_result.stream_names) == set(["users", "products", "purchases"]) dataset: ab.CachedDataset = sync_result.get_dataset(stream_name="users") assert dataset.stream_name == "users" diff --git a/tests/integration_tests/cloud/test_cloud_workspaces.py b/tests/integration_tests/cloud/test_cloud_workspaces.py index f89c9ae6..fb6cd930 100644 --- a/tests/integration_tests/cloud/test_cloud_workspaces.py +++ b/tests/integration_tests/cloud/test_cloud_workspaces.py @@ -8,6 +8,7 @@ import airbyte as ab from airbyte.caches import MotherDuckCache from airbyte.cloud import CloudWorkspace +from airbyte.cloud.connections import CloudConnection def test_deploy_source( @@ -57,10 +58,19 @@ def test_deploy_connection( api_key=motherduck_api_key, database="temp", schema_name="public", + table_prefix="abc_deleteme_", + # table_suffix="", # Suffix not supported in CloudConnection ) - connection_id: str = cloud_workspace._deploy_connection( + connection: CloudConnection = cloud_workspace._deploy_connection( source=source, cache=cache, ) - cloud_workspace._permanently_delete_connection(connection_id=connection_id) + assert set(connection.stream_names) == set(["users", "products", "purchases"]) + assert connection.table_prefix == "abc_deleteme_" + # assert connection.table_suffix == "" # Suffix not supported in CloudConnection + cloud_workspace._permanently_delete_connection( + connection=connection, + delete_source=True, + delete_destination=True, + ) From 33fa49b61a4f57b65418010c884e6fc16886b367 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 10 Apr 2024 13:04:10 -0700 Subject: [PATCH 114/118] clean up and fix tests --- airbyte/cloud/workspaces.py | 1 - .../cloud/test_cloud_sql_reads.py | 8 +++---- .../cloud/test_cloud_sync.py | 24 ++++++++++++++++--- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py index 796a9a6f..348bb129 100644 --- a/airbyte/cloud/workspaces.py +++ b/airbyte/cloud/workspaces.py @@ -19,7 +19,6 @@ delete_connection, delete_destination, delete_source, - get_connection, get_workspace, ) from airbyte.cloud._destination_util import get_destination_config_from_cache diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 0c3f88a2..8de74bc1 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -37,7 +37,7 @@ def test_deploy_and_run_and_read( # Deploy source, destination, and connection: source_id = cloud_workspace._deploy_source(source=deployable_source) destination_id = cloud_workspace._deploy_cache_as_destination(cache=new_deployable_cache) - connection_id = cloud_workspace._deploy_connection( + connection: cloud.CloudConnection = cloud_workspace._deploy_connection( source=deployable_source, cache=new_deployable_cache, table_prefix=new_deployable_cache.table_prefix, @@ -45,11 +45,11 @@ def 
test_deploy_and_run_and_read( ) # Run sync and get result: - sync_result: SyncResult = cloud_workspace.run_sync(connection_id=connection_id) + sync_result: SyncResult = connection.run_sync() # TODO: Remove this second run after Destination bug is resolved: # https://github.com/airbytehq/airbyte/issues/36875 - sync_result: SyncResult = cloud_workspace.run_sync(connection_id=connection_id) + sync_result: SyncResult = connection.run_sync() # Check sync result: assert sync_result.is_job_complete() @@ -142,8 +142,8 @@ def test_read_from_previous_job( cache = sync_result.get_sql_cache() sqlalchemy_url = cache.get_sql_alchemy_url() engine: Engine = sync_result.get_sql_engine() - # assert sync_result.stream_names == ["users", "products", "purchases"] + assert set(sync_result.stream_names) == set(["users", "products", "purchases"]) dataset: ab.CachedDataset = sync_result.get_dataset(stream_name="users") assert dataset.stream_name == "users" data_as_list = list(dataset) diff --git a/tests/integration_tests/cloud/test_cloud_sync.py b/tests/integration_tests/cloud/test_cloud_sync.py index a9c12fc7..2c79cc24 100644 --- a/tests/integration_tests/cloud/test_cloud_sync.py +++ b/tests/integration_tests/cloud/test_cloud_sync.py @@ -10,6 +10,7 @@ import airbyte as ab from airbyte.caches import MotherDuckCache from airbyte.cloud import CloudWorkspace +from airbyte.cloud.sync_results import SyncResult @pytest.fixture @@ -21,10 +22,27 @@ def pre_created_connection_id() -> str: def test_run_connection( cloud_workspace: CloudWorkspace, pre_created_connection_id: str, -): +) -> None: """Test running a connection.""" - sync_result = cloud_workspace.run_sync(connection_id=pre_created_connection_id) - _ = sync_result + sync_result: SyncResult = cloud_workspace.run_sync(connection_id=pre_created_connection_id) + assert sync_result.is_job_complete() + assert sync_result.stream_names + + + +@pytest.mark.super_slow +def test_get_previous_sync_result( + cloud_workspace: CloudWorkspace, + pre_created_connection_id: str, +) -> None: + """Test running a connection.""" + sync_result: SyncResult = cloud_workspace.get_previous_sync_logs( + connection_id=pre_created_connection_id, + ) + assert sync_result.is_job_complete() + assert sync_result.get_job_status() + assert sync_result.stream_names + @pytest.mark.super_slow From 41dba0f4ec736fd20baaa24f5b7fc79f4b5e9b36 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 10 Apr 2024 13:07:29 -0700 Subject: [PATCH 115/118] lint fix --- airbyte/cloud/workspaces.py | 1 - 1 file changed, 1 deletion(-) diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py index 348bb129..a9ce314e 100644 --- a/airbyte/cloud/workspaces.py +++ b/airbyte/cloud/workspaces.py @@ -28,7 +28,6 @@ if TYPE_CHECKING: - from airbyte_api.models.shared.connectionresponse import ConnectionResponse from airbyte_api.models.shared.destinationresponse import DestinationResponse from airbyte.caches.base import CacheBase From c2f045a72c5ef76b87236b65b2cc2a71ae1f3d6e Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 10 Apr 2024 13:29:11 -0700 Subject: [PATCH 116/118] fix test --- tests/integration_tests/cloud/test_cloud_sql_reads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py index 8de74bc1..15e69064 100644 --- a/tests/integration_tests/cloud/test_cloud_sql_reads.py +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -143,7 +143,7 @@ def 
test_read_from_previous_job( sqlalchemy_url = cache.get_sql_alchemy_url() engine: Engine = sync_result.get_sql_engine() - assert set(sync_result.stream_names) == set(["users", "products", "purchases"]) + assert "users" in sync_result.stream_names dataset: ab.CachedDataset = sync_result.get_dataset(stream_name="users") assert dataset.stream_name == "users" data_as_list = list(dataset) From ba011db09f48334fae8d41b19038509059b10426 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 10 Apr 2024 13:40:51 -0700 Subject: [PATCH 117/118] improve sync result properties, fix tests --- airbyte/caches/base.py | 1 - airbyte/cloud/connections.py | 2 +- airbyte/cloud/sync_results.py | 40 ++++++++++++++++++++++++++--------- airbyte/cloud/workspaces.py | 4 ++-- airbyte/sources/base.py | 2 +- 5 files changed, 34 insertions(+), 15 deletions(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index bb5664a6..4af67266 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -46,7 +46,6 @@ class CacheBase(BaseModel): _deployed_api_root: Optional[str] = PrivateAttr(default=None) _deployed_workspace_id: Optional[str] = PrivateAttr(default=None) - _deployed_connection_id: Optional[str] = PrivateAttr(default=None) _deployed_destination_id: Optional[str] = PrivateAttr(default=None) _sql_processor_class: type[SqlProcessorBase] = PrivateAttr() diff --git a/airbyte/cloud/connections.py b/airbyte/cloud/connections.py index 21fc5549..52003264 100644 --- a/airbyte/cloud/connections.py +++ b/airbyte/cloud/connections.py @@ -147,7 +147,7 @@ def get_previous_sync_logs( workspace=self.workspace, connection=self, job_id=sync_log.job_id, - _latest_status=sync_log.status, + _latest_job_info=sync_log, ) for sync_log in sync_logs ] diff --git a/airbyte/cloud/sync_results.py b/airbyte/cloud/sync_results.py index 0f17809e..72ba7547 100644 --- a/airbyte/cloud/sync_results.py +++ b/airbyte/cloud/sync_results.py @@ -6,9 +6,10 @@ import time from collections.abc import Iterator, Mapping from dataclasses import dataclass +from datetime import datetime from typing import TYPE_CHECKING, Any, final -from airbyte_api.models.shared import ConnectionResponse, JobStatusEnum +from airbyte_api.models.shared import ConnectionResponse, JobResponse, JobStatusEnum from airbyte._util import api_util from airbyte.cloud._destination_util import create_cache_from_destination_config @@ -47,7 +48,7 @@ class SyncResult: job_id: str table_name_prefix: str = "" table_name_suffix: str = "" - _latest_status: JobStatusEnum | None = None + _latest_job_info: JobResponse | None = None _connection_response: ConnectionResponse | None = None _cache: CacheBase | None = None @@ -85,17 +86,35 @@ def is_job_complete(self) -> bool: def get_job_status(self) -> JobStatusEnum: """Check if the sync job is still running.""" - if self._latest_status and self._latest_status in FINAL_STATUSES: - return self._latest_status + return self._fetch_latest_job_info().status - job_info = api_util.get_job_info( + def _fetch_latest_job_info(self) -> JobResponse: + """Return the job info for the sync job.""" + if self._latest_job_info and self._latest_job_info.status in FINAL_STATUSES: + return self._latest_job_info + + self._latest_job_info = api_util.get_job_info( job_id=self.job_id, api_root=self.workspace.api_root, api_key=self.workspace.api_key, ) - self._latest_status = job_info.status + return self._latest_job_info + + @property + def bytes_synced(self) -> int: + """Return the number of records processed.""" + return 
self._fetch_latest_job_info().bytes_synced - return job_info.status + @property + def records_synced(self) -> int: + """Return the number of records processed.""" + return self._fetch_latest_job_info().rows_synced + + @property + def start_time(self) -> datetime: + """Return the start time of the sync job in UTC.""" + # Parse from ISO 8601 format: + return datetime.fromisoformat(self._fetch_latest_job_info().start_time) def raise_failure_status( self, @@ -110,8 +129,9 @@ def raise_failure_status( Otherwise, do nothing. """ - latest_status = self._latest_status - if refresh_status: + if not refresh_status and self._latest_job_info: + latest_status = self._latest_job_info.status + else: latest_status = self.get_job_status() if latest_status in FAILED_STATUSES: @@ -119,7 +139,7 @@ def raise_failure_status( workspace=self.workspace, connection_id=self.connection.connection_id, job_id=self.job_id, - job_status=self._latest_status, + job_status=self.get_job_status(), ) def wait_for_completion( diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py index a9ce314e..d9c556c2 100644 --- a/airbyte/cloud/workspaces.py +++ b/airbyte/cloud/workspaces.py @@ -249,12 +249,12 @@ def _deploy_connection( ) if isinstance(source, Source): + source._deployed_api_root = self.api_root # noqa: SLF001 source._deployed_workspace_id = self.workspace_id # noqa: SLF001 - source._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 source._deployed_source_id = source_id # noqa: SLF001 if cache: + cache._deployed_api_root = self.api_root # noqa: SLF001 cache._deployed_workspace_id = self.workspace_id # noqa: SLF001 - cache._deployed_connection_id = deployed_connection.connection_id # noqa: SLF001 cache._deployed_destination_id = deployed_connection.destination_id # noqa: SLF001 return CloudConnection( diff --git a/airbyte/sources/base.py b/airbyte/sources/base.py index 02b1746b..bb2c99c8 100644 --- a/airbyte/sources/base.py +++ b/airbyte/sources/base.py @@ -84,8 +84,8 @@ def __init__( if streams is not None: self.select_streams(streams) + self._deployed_api_root: str | None = None self._deployed_workspace_id: str | None = None - self._deployed_connection_id: str | None = None self._deployed_source_id: str | None = None def set_streams(self, streams: list[str]) -> None: From ae261ffc7ca95b3d0142394e901241172d2f1537 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Wed, 10 Apr 2024 13:50:29 -0700 Subject: [PATCH 118/118] pin `airbyte-api` to a specific commit --- poetry.lock | 10 +++++----- pyproject.toml | 9 ++++++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 27473b38..9c67f0d9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -31,7 +31,7 @@ dev = ["pylint (==2.16.2)"] [package.source] type = "git" url = "https://github.com/airbytehq/airbyte-api-python-sdk.git" -reference = "aj/manual_rename_dir" +reference = "856599a4861ee1f0ee4e994feff22e44ffb4cbd4" resolved_reference = "856599a4861ee1f0ee4e994feff22e44ffb4cbd4" [[package]] @@ -595,13 +595,13 @@ python-dateutil = ">=2.4" [[package]] name = "filelock" -version = "3.13.3" +version = "3.13.4" description = "A platform independent file lock." 
optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.13.3-py3-none-any.whl", hash = "sha256:5ffa845303983e7a0b7ae17636509bc97997d58afeafa72fb141a17b152284cb"}, - {file = "filelock-3.13.3.tar.gz", hash = "sha256:a79895a25bbefdf55d1a2a0a80968f7dbb28edcd6d4234a0afb3f37ecde4b546"}, + {file = "filelock-3.13.4-py3-none-any.whl", hash = "sha256:404e5e9253aa60ad457cae1be07c0f0ca90a63931200a47d9b6a6af84fd7b45f"}, + {file = "filelock-3.13.4.tar.gz", hash = "sha256:d13f466618bfde72bd2c18255e269f72542c6e70e7bac83a0232d6b1cc5c8cf4"}, ] [package.extras] @@ -2870,4 +2870,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "9a031cf5b629604d4b79ad9aa2e93f86a0cc6eb1a07d3ffaf3c6aff29acc7d21" +content-hash = "13b6f429df688ba505ffc513a714af167da17dc2acb34cd0749cda8d54183a73" diff --git a/pyproject.toml b/pyproject.toml index a81c7f70..bb48efa4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,14 @@ ulid = "^1.1" # TODO: Remove this arbitrary python constraint once `sqlalchemy-bigquery` has done so. sqlalchemy-bigquery = { version = "1.9.0", python = "<3.13" } -airbyte-api = {git = "https://github.com/airbytehq/airbyte-api-python-sdk.git", rev = "aj/manual_rename_dir"} + +[tool.poetry.dependencies.airbyte-api] +git = "https://github.com/airbytehq/airbyte-api-python-sdk.git" +# Pinned to a specific commit to avoid breaking changes. +# TODO: Use a PyPi version of this after this resolves: +# https://github.com/airbytehq/airbyte-api-python-sdk/issues/67 +# rev = "aj/manual_rename_dir" This is the branch, but the commit is: +rev = "856599a4861ee1f0ee4e994feff22e44ffb4cbd4" [tool.poetry.group.dev.dependencies] docker = "^7.0.0"
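To confirm locally what the pin above actually resolved to, a standard-library-only sketch like the following can report the installed SDK version and the recorded commit. The `direct_url.json` check relies on the installer writing PEP 610 metadata, which may not always be present:

```python
from importlib.metadata import distribution

# Report which build of the pinned SDK is installed in the current environment.
dist = distribution("airbyte-api")
print(dist.version)

# For VCS-pinned installs, PEP 610 metadata records the resolved URL and commit.
# This returns None if the installer did not write direct_url.json.
print(dist.read_text("direct_url.json"))
```

Looking back at the `SyncResult` changes in the patch before this one, a minimal sketch of how the new job-info properties might be read after a sync. The workspace credentials and connection ID are placeholders, not values from these patches:

```python
from airbyte.cloud import CloudWorkspace

workspace = CloudWorkspace(workspace_id="...", api_key="...")  # placeholder credentials

# run_sync() returns a SyncResult for the triggered job.
sync_result = workspace.run_sync(connection_id="...")

# These properties fetch the latest job info from the API as needed:
print(sync_result.get_job_status())
print(sync_result.bytes_synced, sync_result.records_synced)
print(sync_result.start_time)     # datetime parsed from the job's ISO 8601 start time
print(sync_result.stream_names)   # stream names now come from the connection

# Raises if the latest job status is a failure state; otherwise a no-op.
sync_result.raise_failure_status()
```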