From 79c5f4068536df3e07870125d386efb57c9182e6 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 23 Jan 2025 16:28:57 -0600 Subject: [PATCH 01/49] file-based: initial implementation to sync metadarecords --- .../config/abstract_file_based_spec.py | 7 ++++ .../sources/file_based/file_based_source.py | 19 +++++++++ .../file_based/file_based_stream_reader.py | 42 +++++++++++++++++++ .../stream/default_file_based_stream.py | 14 +++++++ .../file_based/scenarios/csv_scenarios.py | 9 +++- 5 files changed, 90 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py index 626d50fef..c08a46536 100644 --- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py @@ -22,6 +22,13 @@ class Config(OneOfOptionConfig): delivery_type: Literal["use_records_transfer"] = Field("use_records_transfer", const=True) + sync_metadata: bool = Field( + title="Make stream sync files metadata", + description="If enabled, streams will sync files metadata instead of files data.", + default=False, + airbyte_hidden=True, + ) + class DeliverRawFiles(BaseModel): class Config(OneOfOptionConfig): diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index 0eb90ac24..8baae8ee8 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -312,6 +312,7 @@ def _make_default_stream( cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config), preserve_directory_structure=self._preserve_directory_structure(parsed_config), + sync_metadata=self._sync_metadata(parsed_config), ) def _get_stream_from_catalog( @@ -387,6 +388,14 @@ def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool: ) return use_file_transfer + @staticmethod + def 
_use_records_transfer(parsed_config: AbstractFileBasedSpec) -> bool: + use_records_transfer = ( + hasattr(parsed_config.delivery_method, "delivery_type") + and parsed_config.delivery_method.delivery_type == "use_records_transfer" + ) + return use_records_transfer + @staticmethod def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool: """ @@ -408,3 +417,13 @@ def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool: ): return parsed_config.delivery_method.preserve_directory_structure return True + + @staticmethod + def _sync_metadata(parsed_config: AbstractFileBasedSpec) -> bool: + if ( + FileBasedSource._use_records_transfer(parsed_config) + and hasattr(parsed_config.delivery_method, "sync_metadata") + and parsed_config.delivery_method.sync_metadata is not None + ): + return parsed_config.delivery_method.sync_metadata + return False diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 065125621..7fd5fff5c 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -135,6 +135,15 @@ def use_file_transfer(self) -> bool: return use_file_transfer return False + def use_records_transfer(self) -> bool: + if self.config: + use_records_transfer = ( + hasattr(self.config.delivery_method, "delivery_type") + and self.config.delivery_method.delivery_type == "use_records_transfer" + ) + return use_records_transfer + return False + def preserve_directory_structure(self) -> bool: # fall back to preserve subdirectories if config is not present or incomplete if ( @@ -146,6 +155,16 @@ def preserve_directory_structure(self) -> bool: return self.config.delivery_method.preserve_directory_structure return True + def sync_metadata(self) -> bool: + if ( + self.config + and self.use_records_transfer() + and hasattr(self.config.delivery_method, "sync_metadata") + and 
self.config.delivery_method.sync_metadata is not None + ): + return self.config.delivery_method.sync_metadata + return False + @abstractmethod def get_file( self, file: RemoteFile, local_directory: str, logger: logging.Logger @@ -183,3 +202,26 @@ def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> Li makedirs(path.dirname(local_file_path), exist_ok=True) absolute_file_path = path.abspath(local_file_path) return [file_relative_path, local_file_path, absolute_file_path] + + def get_file_metadata(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]: + """ + This is required for connectors that will support syncing + metadata from files. + """ + ... + + def get_metadata_schema(self) -> Dict[str, Any]: + """ " + Base schema to emit metadata records for a file, + override in stream reader implementation if the requirements + are different. + """ + return { + "type": "object", + "properties": { + "id": {"type": "string"}, + "file_path": {"type": "string"}, + "allowed_identity_remote_ids": {"type": "array", "items": "string"}, + "is_public": {"type": "boolean"}, + }, + } diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 604322549..062362dc3 100644 --- a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -47,6 +47,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): FILE_TRANSFER_KW = "use_file_transfer" PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure" + SYNC_METADATA_KW = "sync_metadata" FILES_KEY = "files" DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" ab_last_mod_col = "_ab_source_file_last_modified" @@ -56,6 +57,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): airbyte_columns = [ab_last_mod_col, ab_file_name_col] use_file_transfer = False preserve_directory_structure = True 
+ sync_metadata = False def __init__(self, **kwargs: Any): if self.FILE_TRANSFER_KW in kwargs: @@ -64,6 +66,8 @@ def __init__(self, **kwargs: Any): self.preserve_directory_structure = kwargs.pop( self.PRESERVE_DIRECTORY_STRUCTURE_KW, True ) + if self.SYNC_METADATA_KW in kwargs: + self.sync_metadata = kwargs.pop(self.SYNC_METADATA_KW, False) super().__init__(**kwargs) @property @@ -105,6 +109,8 @@ def _filter_schema_invalid_properties( self.ab_file_name_col: {"type": "string"}, }, } + elif self.sync_metadata: + return self.stream_reader.get_metadata_schema() else: return super()._filter_schema_invalid_properties(configured_catalog_json_schema) @@ -187,6 +193,12 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Airbyte yield stream_data_to_airbyte_message( self.name, record, is_file_transfer_message=True ) + elif self.sync_metadata: + metadata_record = self.stream_reader.get_file_metadata(file, logger=self.logger) + yield stream_data_to_airbyte_message( + self.name, metadata_record, is_file_transfer_message=False + ) + else: for record in parser.parse_records( self.config, file, self.stream_reader, self.logger, schema @@ -284,6 +296,8 @@ def get_json_schema(self) -> JsonSchema: def _get_raw_json_schema(self) -> JsonSchema: if self.use_file_transfer: return file_transfer_schema + elif self.sync_metadata: + self.stream_reader.get_metadata_schema() elif self.config.input_schema: return self.config.get_input_schema() # type: ignore elif self.config.schemaless: diff --git a/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 9e919c911..c609b4096 100644 --- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -508,7 +508,14 @@ "const": "use_records_transfer", "enum": ["use_records_transfer"], "type": "string", - } + }, + "sync_metadata": { + "airbyte_hidden": True, + "default": False, + "description": "If 
enabled, streams will sync files metadata instead of files data.", + "title": "Make stream sync files metadata", + "type": "boolean", + }, }, "description": "Recommended - Extract and load structured records into your destination of choice. This is the classic method of moving data in Airbyte. It allows for blocking and hashing individual fields or files from a structured schema. Data can be flattened, typed and deduped depending on the destination.", "required": ["delivery_type"], From 4638f891c3f5935a4a105f40728d71200177b13f Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 23 Jan 2025 16:37:46 -0600 Subject: [PATCH 02/49] file-based: fix lint --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 7fd5fff5c..db22f27de 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -203,6 +203,7 @@ def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> Li absolute_file_path = path.abspath(local_file_path) return [file_relative_path, local_file_path, absolute_file_path] + @abstractmethod def get_file_metadata(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]: """ This is required for connectors that will support syncing From 266c0cdbe53220e23a6a41f6c023b354f935aa84 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 23 Jan 2025 16:45:27 -0600 Subject: [PATCH 03/49] file-based: fix errors --- .../stream/default_file_based_stream.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 062362dc3..a0dd67e10 100644 --- a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ 
b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -194,11 +194,21 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Airbyte self.name, record, is_file_transfer_message=True ) elif self.sync_metadata: - metadata_record = self.stream_reader.get_file_metadata(file, logger=self.logger) - yield stream_data_to_airbyte_message( - self.name, metadata_record, is_file_transfer_message=False - ) - + try: + metadata_record = self.stream_reader.get_file_metadata(file, logger=self.logger) + yield stream_data_to_airbyte_message( + self.name, metadata_record, is_file_transfer_message=False + ) + except Exception as e: + self.logger.error(f"Failed to retrieve metadata for file {file.uri}: {str(e)}") + yield AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.ERROR, + message = f"Error retrieving metadata: stream={self.name} file={file.uri}", + stack_trace = traceback.format_exc(), + ) + ) else: for record in parser.parse_records( self.config, file, self.stream_reader, self.logger, schema @@ -297,7 +307,7 @@ def _get_raw_json_schema(self) -> JsonSchema: if self.use_file_transfer: return file_transfer_schema elif self.sync_metadata: - self.stream_reader.get_metadata_schema() + return self.stream_reader.get_metadata_schema() elif self.config.input_schema: return self.config.get_input_schema() # type: ignore elif self.config.schemaless: @@ -428,3 +438,4 @@ async def _infer_file_schema(self, file: RemoteFile) -> SchemaType: format=str(self.config.format), stream=self.name, ) from exc + From 7bfb8c3a200aa30f738b7421dbf85cc6c163f7c1 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 23 Jan 2025 22:54:45 +0000 Subject: [PATCH 04/49] Auto-fix lint and format issues --- .../stream/default_file_based_stream.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py 
b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index a0dd67e10..6e8664c64 100644 --- a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -195,19 +195,23 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Airbyte ) elif self.sync_metadata: try: - metadata_record = self.stream_reader.get_file_metadata(file, logger=self.logger) + metadata_record = self.stream_reader.get_file_metadata( + file, logger=self.logger + ) yield stream_data_to_airbyte_message( self.name, metadata_record, is_file_transfer_message=False ) except Exception as e: - self.logger.error(f"Failed to retrieve metadata for file {file.uri}: {str(e)}") + self.logger.error( + f"Failed to retrieve metadata for file {file.uri}: {str(e)}" + ) yield AirbyteMessage( type=MessageType.LOG, log=AirbyteLogMessage( - level=Level.ERROR, - message = f"Error retrieving metadata: stream={self.name} file={file.uri}", - stack_trace = traceback.format_exc(), - ) + level=Level.ERROR, + message=f"Error retrieving metadata: stream={self.name} file={file.uri}", + stack_trace=traceback.format_exc(), + ), ) else: for record in parser.parse_records( @@ -438,4 +442,3 @@ async def _infer_file_schema(self, file: RemoteFile) -> SchemaType: format=str(self.config.format), stream=self.name, ) from exc - From 88af54364ae7c2b6ae0e1e4d801099a2432ea529 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 23 Jan 2025 16:57:44 -0600 Subject: [PATCH 05/49] file-based: remove abstract decorator --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index db22f27de..a9003a07c 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py 
@@ -203,13 +203,12 @@ def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> Li absolute_file_path = path.abspath(local_file_path) return [file_relative_path, local_file_path, absolute_file_path] - @abstractmethod def get_file_metadata(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]: """ This is required for connectors that will support syncing metadata from files. """ - ... + return {} def get_metadata_schema(self) -> Dict[str, Any]: """ " From edd6f69bd6871f8f9696804a8d277368a02f0d8c Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Fri, 24 Jan 2025 12:00:48 -0600 Subject: [PATCH 06/49] file-based: fix check --- airbyte_cdk/sources/file_based/file_based_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index 8baae8ee8..381797e90 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -163,7 +163,7 @@ def check_connection( parsed_config = self._get_parsed_config(config) availability_method = ( stream.availability_strategy.check_availability - if self._use_file_transfer(parsed_config) + if self._use_file_transfer(parsed_config) or self._sync_metadata(parsed_config) else stream.availability_strategy.check_availability_and_parsability ) ( From 35e0e684077aba7951e3c63dbfc417d2709efdac Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Sun, 26 Jan 2025 16:48:39 -0600 Subject: [PATCH 07/49] file-based: add identities stream and rename acl toggle --- .../config/abstract_file_based_spec.py | 14 ++- .../config/identities_based_stream_config.py | 7 ++ .../sources/file_based/config/permissions.py | 33 +++++++ .../sources/file_based/file_based_source.py | 51 ++++++++-- .../file_based/file_based_stream_reader.py | 33 +++---- .../sources/file_based/schema_helpers.py | 25 +++++ .../sources/file_based/stream/__init__.py | 3 +- 
.../stream/default_file_based_stream.py | 21 ++-- .../file_based/stream/identities_stream.py | 99 +++++++++++++++++++ .../file_based/scenarios/csv_scenarios.py | 33 ++++++- 10 files changed, 274 insertions(+), 45 deletions(-) create mode 100644 airbyte_cdk/sources/file_based/config/identities_based_stream_config.py create mode 100644 airbyte_cdk/sources/file_based/config/permissions.py create mode 100644 airbyte_cdk/sources/file_based/stream/identities_stream.py diff --git a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py index c08a46536..5bda45421 100644 --- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py @@ -11,6 +11,9 @@ from airbyte_cdk import OneOfOptionConfig from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.file_based.config.identities_based_stream_config import ( + IdentitiesStreamConfig, +) from airbyte_cdk.sources.utils import schema_helpers @@ -22,12 +25,17 @@ class Config(OneOfOptionConfig): delivery_type: Literal["use_records_transfer"] = Field("use_records_transfer", const=True) - sync_metadata: bool = Field( - title="Make stream sync files metadata", - description="If enabled, streams will sync files metadata instead of files data.", + sync_acl_permissions: bool = Field( + title="Include ACL Permissions", + description="Joins Document allowlists to each stream.", default=False, airbyte_hidden=True, ) + identities: Optional[IdentitiesStreamConfig] = Field( + title="Identities configuration", + description="Configuration for identities", + airbyte_hidden=True, + ) class DeliverRawFiles(BaseModel): diff --git a/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py b/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py new file mode 100644 index 000000000..6df27f492 --- 
/dev/null +++ b/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py @@ -0,0 +1,7 @@ +from pydantic.v1 import BaseModel, Field +from typing import Literal + + +class IdentitiesStreamConfig(BaseModel): + name: Literal["identities"] = Field("identities", const=True, airbyte_hidden=True) + domain: str = Field(title="Domain", description="The domain of the identities.") diff --git a/airbyte_cdk/sources/file_based/config/permissions.py b/airbyte_cdk/sources/file_based/config/permissions.py new file mode 100644 index 000000000..534ac56e4 --- /dev/null +++ b/airbyte_cdk/sources/file_based/config/permissions.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +import uuid +from datetime import datetime +from enum import Enum +from pydantic.v1 import BaseModel + + +class RemoteFileIdentityType(Enum): + USER = "user" + GROUP = "group" + + +class RemoteFileIdentity(BaseModel): + id: uuid.UUID + remote_id: str + parent_id: str | None = None + name: str | None = None + description: str | None = None + email_address: str | None = None + member_email_addresses: list[str] | None = None + type: RemoteFileIdentityType + modified_at: datetime + + +class RemoteFilePermissions(BaseModel): + id: str + file_path: str + allowed_identity_remote_ids: list[str] | None = None + denied_identity_remote_ids: list[str] | None = None + publicly_accessible: bool = False diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index 381797e90..3a889811a 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -33,6 +33,9 @@ FileBasedStreamConfig, ValidationPolicy, ) +from airbyte_cdk.sources.file_based.config.identities_based_stream_config import ( + IdentitiesStreamConfig, +) from airbyte_cdk.sources.file_based.discovery_policy import ( AbstractDiscoveryPolicy, DefaultDiscoveryPolicy, @@ -49,7 +52,11 @@ 
DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy, ) -from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream +from airbyte_cdk.sources.file_based.stream import ( + AbstractFileBasedStream, + DefaultFileBasedStream, + IdentitiesStream, +) from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade from airbyte_cdk.sources.file_based.stream.concurrent.cursor import ( AbstractConcurrentFileBasedCursor, @@ -157,13 +164,17 @@ def check_connection( errors = [] tracebacks = [] for stream in streams: + if isinstance(stream, IdentitiesStream): + # Probably need to check identities endpoint/api access but will skip for now. + continue if not isinstance(stream, AbstractFileBasedStream): raise ValueError(f"Stream {stream} is not a file-based stream.") try: parsed_config = self._get_parsed_config(config) availability_method = ( stream.availability_strategy.check_availability - if self._use_file_transfer(parsed_config) or self._sync_metadata(parsed_config) + if self._use_file_transfer(parsed_config) + or self._sync_acl_permissions(parsed_config) else stream.availability_strategy.check_availability_and_parsability ) ( @@ -289,6 +300,12 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: ) streams.append(stream) + + if self._add_identities_stream(parsed_config): + identities_stream = self._make_identities_stream( + stream_config=parsed_config.delivery_method.identities + ) + streams.append(identities_stream) return streams except ValidationError as exc: @@ -312,7 +329,19 @@ def _make_default_stream( cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config), preserve_directory_structure=self._preserve_directory_structure(parsed_config), - sync_metadata=self._sync_metadata(parsed_config), + sync_acl_permissions=self._sync_acl_permissions(parsed_config), + ) + + def _make_identities_stream( + self, + stream_config: IdentitiesStreamConfig, + ) -> Stream: + return 
IdentitiesStream( + config=stream_config, + catalog_schema=self.stream_schemas.get(stream_config.name), + stream_reader=self.stream_reader, + discovery_policy=self.discovery_policy, + errors_collector=self.errors_collector, ) def _get_stream_from_catalog( @@ -419,11 +448,19 @@ def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool: return True @staticmethod - def _sync_metadata(parsed_config: AbstractFileBasedSpec) -> bool: + def _sync_acl_permissions(parsed_config: AbstractFileBasedSpec) -> bool: if ( FileBasedSource._use_records_transfer(parsed_config) - and hasattr(parsed_config.delivery_method, "sync_metadata") - and parsed_config.delivery_method.sync_metadata is not None + and hasattr(parsed_config.delivery_method, "sync_acl_permissions") + and parsed_config.delivery_method.sync_acl_permissions is not None ): - return parsed_config.delivery_method.sync_metadata + return parsed_config.delivery_method.sync_acl_permissions return False + + @staticmethod + def _add_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool: + return ( + FileBasedSource._sync_acl_permissions(parsed_config) + and parsed_config.delivery_method.identities is not None + and parsed_config.delivery_method.identities.domain + ) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index a9003a07c..3f804f24a 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -155,14 +155,14 @@ def preserve_directory_structure(self) -> bool: return self.config.delivery_method.preserve_directory_structure return True - def sync_metadata(self) -> bool: + def sync_acl_permissions(self) -> bool: if ( self.config and self.use_records_transfer() - and hasattr(self.config.delivery_method, "sync_metadata") - and self.config.delivery_method.sync_metadata is not None + and hasattr(self.config.delivery_method, 
"sync_acl_permissions") + and self.config.delivery_method.sync_acl_permissions is not None ): - return self.config.delivery_method.sync_metadata + return self.config.delivery_method.sync_acl_permissions return False @abstractmethod @@ -203,25 +203,16 @@ def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> Li absolute_file_path = path.abspath(local_file_path) return [file_relative_path, local_file_path, absolute_file_path] - def get_file_metadata(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]: + def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]: """ This is required for connectors that will support syncing - metadata from files. + ACL Permissions from files. """ return {} - def get_metadata_schema(self) -> Dict[str, Any]: - """ " - Base schema to emit metadata records for a file, - override in stream reader implementation if the requirements - are different. - """ - return { - "type": "object", - "properties": { - "id": {"type": "string"}, - "file_path": {"type": "string"}, - "allowed_identity_remote_ids": {"type": "array", "items": "string"}, - "is_public": {"type": "boolean"}, - }, - } + def load_identity_groups(self) -> Iterable[Dict[str, Any]]: + """ + This is required for connectors that will support syncing + identities. 
+ """ + yield {} diff --git a/airbyte_cdk/sources/file_based/schema_helpers.py b/airbyte_cdk/sources/file_based/schema_helpers.py index 1b653db67..fb12efe5e 100644 --- a/airbyte_cdk/sources/file_based/schema_helpers.py +++ b/airbyte_cdk/sources/file_based/schema_helpers.py @@ -23,6 +23,31 @@ "properties": {"data": {"type": "object"}, "file": {"type": "object"}}, } +remote_file_permissions_schema = { + "type": "object", + "properties": { + "id": {"type": "string"}, + "file_path": {"type": "string"}, + "allowed_identity_remote_ids": {"type": "array", "items": "string"}, + "publicly_accessible": {"type": "boolean"}, + }, +} + +remote_file_identity_schema = { + "type": "object", + "properties": { + "id": {"type": "string"}, + "remote_id": {"type": "string"}, + "parent_id": {"type": ["null", "string"]}, + "name": {"type": ["null", "string"]}, + "description": {"type": ["null", "string"]}, + "email_address": {"type": ["null", "string"]}, + "member_email_addresses": {"type": ["null", "array"]}, + "type": {"type": "string"}, + "modified_at": {"type": "string"}, + }, +} + @total_ordering class ComparableType(Enum): diff --git a/airbyte_cdk/sources/file_based/stream/__init__.py b/airbyte_cdk/sources/file_based/stream/__init__.py index 4b5c4bc2e..78c2b1062 100644 --- a/airbyte_cdk/sources/file_based/stream/__init__.py +++ b/airbyte_cdk/sources/file_based/stream/__init__.py @@ -1,4 +1,5 @@ from airbyte_cdk.sources.file_based.stream.abstract_file_based_stream import AbstractFileBasedStream from airbyte_cdk.sources.file_based.stream.default_file_based_stream import DefaultFileBasedStream +from airbyte_cdk.sources.file_based.stream.identities_stream import IdentitiesStream -__all__ = ["AbstractFileBasedStream", "DefaultFileBasedStream"] +__all__ = ["AbstractFileBasedStream", "DefaultFileBasedStream", "IdentitiesStream"] diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 
6e8664c64..743b51e10 100644 --- a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -30,6 +30,7 @@ file_transfer_schema, merge_schemas, schemaless_schema, + remote_file_permissions_schema, ) from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor @@ -47,7 +48,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): FILE_TRANSFER_KW = "use_file_transfer" PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure" - SYNC_METADATA_KW = "sync_metadata" + SYNC_ACL_PERMISSIONS_KW = "sync_acl_permissions" FILES_KEY = "files" DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" ab_last_mod_col = "_ab_source_file_last_modified" @@ -57,7 +58,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): airbyte_columns = [ab_last_mod_col, ab_file_name_col] use_file_transfer = False preserve_directory_structure = True - sync_metadata = False + sync_acl_permissions = False def __init__(self, **kwargs: Any): if self.FILE_TRANSFER_KW in kwargs: @@ -66,8 +67,8 @@ def __init__(self, **kwargs: Any): self.preserve_directory_structure = kwargs.pop( self.PRESERVE_DIRECTORY_STRUCTURE_KW, True ) - if self.SYNC_METADATA_KW in kwargs: - self.sync_metadata = kwargs.pop(self.SYNC_METADATA_KW, False) + if self.SYNC_ACL_PERMISSIONS_KW in kwargs: + self.sync_acl_permissions = kwargs.pop(self.SYNC_ACL_PERMISSIONS_KW, False) super().__init__(**kwargs) @property @@ -109,8 +110,8 @@ def _filter_schema_invalid_properties( self.ab_file_name_col: {"type": "string"}, }, } - elif self.sync_metadata: - return self.stream_reader.get_metadata_schema() + elif self.sync_acl_permissions: + return remote_file_permissions_schema else: return super()._filter_schema_invalid_properties(configured_catalog_json_schema) @@ -193,9 +194,9 @@ def read_records_from_slice(self, stream_slice: 
StreamSlice) -> Iterable[Airbyte yield stream_data_to_airbyte_message( self.name, record, is_file_transfer_message=True ) - elif self.sync_metadata: + elif self.sync_acl_permissions: try: - metadata_record = self.stream_reader.get_file_metadata( + metadata_record = self.stream_reader.get_file_acl_permissions( file, logger=self.logger ) yield stream_data_to_airbyte_message( @@ -310,8 +311,8 @@ def get_json_schema(self) -> JsonSchema: def _get_raw_json_schema(self) -> JsonSchema: if self.use_file_transfer: return file_transfer_schema - elif self.sync_metadata: - return self.stream_reader.get_metadata_schema() + elif self.sync_acl_permissions: + return remote_file_permissions_schema elif self.config.input_schema: return self.config.get_input_schema() # type: ignore elif self.config.schemaless: diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py new file mode 100644 index 000000000..b70f20519 --- /dev/null +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -0,0 +1,99 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+# + +import traceback +from functools import cache +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional + +from airbyte_protocol_dataclasses.models import SyncMode + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType +from airbyte_cdk.sources.file_based.config.identities_based_stream_config import ( + IdentitiesStreamConfig, +) +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.types import StreamSlice +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.core import JsonSchema +from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message +from airbyte_cdk.utils.traced_exception import AirbyteTracedException +from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy +from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError +from airbyte_cdk.sources.file_based.schema_helpers import remote_file_identity_schema +from airbyte_cdk.sources.streams.checkpoint import Cursor + + +class IdentitiesStream(Stream): + """ + The identities stream. A full refresh stream to sync identities from a certain domain. + The stream reader manage the logic to get such data, which is implemented on connector side. 
+ """ + + is_resumable = False + + def __init__( + self, + config: IdentitiesStreamConfig, + catalog_schema: Optional[Mapping[str, Any]], + stream_reader: AbstractFileBasedStreamReader, + discovery_policy: AbstractDiscoveryPolicy, + errors_collector: FileBasedErrorsCollector, + ): + super().__init__() + self.config = config + self.catalog_schema = catalog_schema + self.stream_reader = stream_reader + self._discovery_policy = discovery_policy + self.errors_collector = errors_collector + self._cursor = {} + + @property + def state(self) -> MutableMapping[str, Any]: + return self._cursor + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + """State setter, accept state serialized by state getter.""" + self._cursor = value + + @property + def primary_key(self) -> PrimaryKeyType: + return None + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[StreamSlice] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any] | AirbyteMessage]: + try: + identity_groups = self.stream_reader.load_identity_groups() + for record in identity_groups: + yield stream_data_to_airbyte_message(self.name, record) + except AirbyteTracedException as exc: + # Re-raise the exception to stop the whole sync immediately as this is a fatal error + raise exc + except Exception: + yield AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.ERROR, + message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name}", + stack_trace=traceback.format_exc(), + ), + ) + + @cache + def get_json_schema(self) -> JsonSchema: + return remote_file_identity_schema + + @property + def name(self) -> str: + return self.config.name + + def get_cursor(self) -> Optional[Cursor]: + return None diff --git a/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/unit_tests/sources/file_based/scenarios/csv_scenarios.py index c609b4096..e618a6296 100644 
--- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -509,13 +509,40 @@ "enum": ["use_records_transfer"], "type": "string", }, - "sync_metadata": { + "sync_acl_permissions": { "airbyte_hidden": True, "default": False, - "description": "If enabled, streams will sync files metadata instead of files data.", - "title": "Make stream sync files metadata", + "description": "Joins Document allowlists to each stream.", + "title": "Include ACL Permissions", "type": "boolean", }, + "identities": { + "airbyte_hidden": True, + "allOf": [ + { + "properties": { + "domain": { + "description": "The domain of the identities.", + "title": "Domain", + "type": "string", + }, + "name": { + "airbyte_hidden": True, + "const": "identities", + "default": "identities", + "enum": ["identities"], + "title": "Name", + "type": "string", + }, + }, + "required": ["domain"], + "title": "IdentitiesStreamConfig", + "type": "object", + } + ], + "title": "Identities configuration", + "description": "Configuration for identities", + }, }, "description": "Recommended - Extract and load structured records into your destination of choice. This is the classic method of moving data in Airbyte. It allows for blocking and hashing individual fields or files from a structured schema. 
Data can be flattened, typed and deduped depending on the destination.", "required": ["delivery_type"], From 0ae4267bdc054231ae7aeee19b37138f08a8c62a Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Sun, 26 Jan 2025 22:51:37 +0000 Subject: [PATCH 08/49] Auto-fix lint and format issues --- .../file_based/config/identities_based_stream_config.py | 3 ++- airbyte_cdk/sources/file_based/config/permissions.py | 1 + .../file_based/stream/default_file_based_stream.py | 2 +- .../sources/file_based/stream/identities_stream.py | 8 ++++---- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py b/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py index 6df27f492..bf5955fa6 100644 --- a/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py +++ b/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py @@ -1,6 +1,7 @@ -from pydantic.v1 import BaseModel, Field from typing import Literal +from pydantic.v1 import BaseModel, Field + class IdentitiesStreamConfig(BaseModel): name: Literal["identities"] = Field("identities", const=True, airbyte_hidden=True) diff --git a/airbyte_cdk/sources/file_based/config/permissions.py b/airbyte_cdk/sources/file_based/config/permissions.py index 534ac56e4..d0aef044a 100644 --- a/airbyte_cdk/sources/file_based/config/permissions.py +++ b/airbyte_cdk/sources/file_based/config/permissions.py @@ -5,6 +5,7 @@ import uuid from datetime import datetime from enum import Enum + from pydantic.v1 import BaseModel diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 743b51e10..f2205dc07 100644 --- a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -29,8 +29,8 @@ SchemaType, file_transfer_schema, merge_schemas, - 
schemaless_schema, remote_file_permissions_schema, + schemaless_schema, ) from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index b70f20519..0a2cf22d1 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -14,16 +14,16 @@ from airbyte_cdk.sources.file_based.config.identities_based_stream_config import ( IdentitiesStreamConfig, ) +from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy +from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.schema_helpers import remote_file_identity_schema from airbyte_cdk.sources.file_based.types import StreamSlice from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.checkpoint import Cursor from airbyte_cdk.sources.streams.core import JsonSchema from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message from airbyte_cdk.utils.traced_exception import AirbyteTracedException -from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy -from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError -from airbyte_cdk.sources.file_based.schema_helpers import remote_file_identity_schema -from airbyte_cdk.sources.streams.checkpoint import Cursor class IdentitiesStream(Stream): From 43e3ea38deb6749506dff571149269731ef0e503 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Sun, 26 Jan 2025 17:15:21 -0600 Subject: [PATCH 09/49] file-based: fix annoying mypy issues --- 
.../sources/file_based/file_based_source.py | 18 +++++++++++++----- .../file_based/stream/identities_stream.py | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index 3a889811a..68907b942 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -301,9 +301,10 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: streams.append(stream) - if self._add_identities_stream(parsed_config): + identities_stream_config = self._get_identities_stream_config(parsed_config) + if identities_stream_config: identities_stream = self._make_identities_stream( - stream_config=parsed_config.delivery_method.identities + stream_config=identities_stream_config ) streams.append(identities_stream) return streams @@ -458,9 +459,16 @@ def _sync_acl_permissions(parsed_config: AbstractFileBasedSpec) -> bool: return False @staticmethod - def _add_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool: - return ( + def _get_identities_stream_config( + parsed_config: AbstractFileBasedSpec, + ) -> Optional[IdentitiesStreamConfig]: + identities_stream_config = None + if ( FileBasedSource._sync_acl_permissions(parsed_config) + and hasattr(parsed_config.delivery_method, "identities") and parsed_config.delivery_method.identities is not None + and isinstance(parsed_config.delivery_method.identities, IdentitiesStreamConfig) and parsed_config.delivery_method.identities.domain - ) + ): + identities_stream_config = parsed_config.delivery_method.identities + return identities_stream_config diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 0a2cf22d1..0b7ac8c5c 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -48,7 +48,7 @@ 
def __init__( self.stream_reader = stream_reader self._discovery_policy = discovery_policy self.errors_collector = errors_collector - self._cursor = {} + self._cursor: MutableMapping[str, Any] = {} @property def state(self) -> MutableMapping[str, Any]: From 4aee2c9ab772b52b6e975717a14b08318ef141bb Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Sun, 26 Jan 2025 17:34:45 -0600 Subject: [PATCH 10/49] file-based: minor fix to schema --- airbyte_cdk/sources/file_based/schema_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/schema_helpers.py b/airbyte_cdk/sources/file_based/schema_helpers.py index fb12efe5e..459e6de73 100644 --- a/airbyte_cdk/sources/file_based/schema_helpers.py +++ b/airbyte_cdk/sources/file_based/schema_helpers.py @@ -28,7 +28,7 @@ "properties": { "id": {"type": "string"}, "file_path": {"type": "string"}, - "allowed_identity_remote_ids": {"type": "array", "items": "string"}, + "allowed_identity_remote_ids": {"type": "array", "items": {"type": "string"}}, "publicly_accessible": {"type": "boolean"}, }, } From 7b5c245ce35a5777cafa4dc3c2a94035971cb0a3 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Sun, 26 Jan 2025 19:18:38 -0600 Subject: [PATCH 11/49] file-based: add logger to load_identity_groups method --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 2 +- airbyte_cdk/sources/file_based/stream/identities_stream.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 3f804f24a..69b110e67 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -210,7 +210,7 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> """ return {} - def load_identity_groups(self) -> Iterable[Dict[str, Any]]: + def load_identity_groups(self, logger: 
logging.Logger) -> Iterable[Dict[str, Any]]: """ This is required for connectors that will support syncing identities. diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 0b7ac8c5c..d00973d44 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -71,7 +71,7 @@ def read_records( stream_state: Optional[Mapping[str, Any]] = None, ) -> Iterable[Mapping[str, Any] | AirbyteMessage]: try: - identity_groups = self.stream_reader.load_identity_groups() + identity_groups = self.stream_reader.load_identity_groups(logger=self.logger) for record in identity_groups: yield stream_data_to_airbyte_message(self.name, record) except AirbyteTracedException as exc: From f022b4c085b81764e58ad63fbb579b0abfdc9aa5 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Sun, 26 Jan 2025 20:42:55 -0600 Subject: [PATCH 12/49] file-based: simplify sync permissions schema --- .../config/abstract_file_based_spec.py | 6 ++-- .../sources/file_based/file_based_source.py | 31 +++---------------- .../file_based/stream/identities_stream.py | 6 ++-- 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py index 5bda45421..f4892f20b 100644 --- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py @@ -31,10 +31,8 @@ class Config(OneOfOptionConfig): default=False, airbyte_hidden=True, ) - identities: Optional[IdentitiesStreamConfig] = Field( - title="Identities configuration", - description="Configuration for identities", - airbyte_hidden=True, + domain: Optional[str] = Field( + title="Domain", description="The domain of the identities.", airbyte_hidden=True ) diff --git 
a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index 68907b942..3d3a8b4d6 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -33,9 +33,6 @@ FileBasedStreamConfig, ValidationPolicy, ) -from airbyte_cdk.sources.file_based.config.identities_based_stream_config import ( - IdentitiesStreamConfig, -) from airbyte_cdk.sources.file_based.discovery_policy import ( AbstractDiscoveryPolicy, DefaultDiscoveryPolicy, @@ -64,6 +61,7 @@ FileBasedFinalStateCursor, ) from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.file_based.stream.identities_stream import IDENTITIES_STREAM_NAME from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.concurrent.cursor import CursorField @@ -73,6 +71,7 @@ DEFAULT_CONCURRENCY = 100 MAX_CONCURRENCY = 100 INITIAL_N_PARTITIONS = MAX_CONCURRENCY // 2 +IDENTITIES_STREAM = "identities" class FileBasedSource(ConcurrentSourceAdapter, ABC): @@ -301,11 +300,8 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: streams.append(stream) - identities_stream_config = self._get_identities_stream_config(parsed_config) - if identities_stream_config: - identities_stream = self._make_identities_stream( - stream_config=identities_stream_config - ) + if self._sync_acl_permissions(parsed_config): + identities_stream = self._make_identities_stream() streams.append(identities_stream) return streams @@ -335,11 +331,9 @@ def _make_default_stream( def _make_identities_stream( self, - stream_config: IdentitiesStreamConfig, ) -> Stream: return IdentitiesStream( - config=stream_config, - catalog_schema=self.stream_schemas.get(stream_config.name), + catalog_schema=self.stream_schemas.get(IDENTITIES_STREAM_NAME), stream_reader=self.stream_reader, 
discovery_policy=self.discovery_policy, errors_collector=self.errors_collector, @@ -457,18 +451,3 @@ def _sync_acl_permissions(parsed_config: AbstractFileBasedSpec) -> bool: ): return parsed_config.delivery_method.sync_acl_permissions return False - - @staticmethod - def _get_identities_stream_config( - parsed_config: AbstractFileBasedSpec, - ) -> Optional[IdentitiesStreamConfig]: - identities_stream_config = None - if ( - FileBasedSource._sync_acl_permissions(parsed_config) - and hasattr(parsed_config.delivery_method, "identities") - and parsed_config.delivery_method.identities is not None - and isinstance(parsed_config.delivery_method.identities, IdentitiesStreamConfig) - and parsed_config.delivery_method.identities.domain - ): - identities_stream_config = parsed_config.delivery_method.identities - return identities_stream_config diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index d00973d44..0d23a6dbf 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -25,6 +25,8 @@ from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message from airbyte_cdk.utils.traced_exception import AirbyteTracedException +IDENTITIES_STREAM_NAME = "identities" + class IdentitiesStream(Stream): """ @@ -36,14 +38,12 @@ class IdentitiesStream(Stream): def __init__( self, - config: IdentitiesStreamConfig, catalog_schema: Optional[Mapping[str, Any]], stream_reader: AbstractFileBasedStreamReader, discovery_policy: AbstractDiscoveryPolicy, errors_collector: FileBasedErrorsCollector, ): super().__init__() - self.config = config self.catalog_schema = catalog_schema self.stream_reader = stream_reader self._discovery_policy = discovery_policy @@ -93,7 +93,7 @@ def get_json_schema(self) -> JsonSchema: @property def name(self) -> str: - return self.config.name + return IDENTITIES_STREAM_NAME def 
get_cursor(self) -> Optional[Cursor]: return None From 597e458b7a1b7c67d1eddf0f149bc794bd9cd536 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Sun, 26 Jan 2025 20:53:38 -0600 Subject: [PATCH 13/49] file-based: remove unused config and fix unit tests --- .../config/abstract_file_based_spec.py | 3 -- .../config/identities_based_stream_config.py | 8 ----- .../file_based/stream/identities_stream.py | 3 -- .../file_based/scenarios/csv_scenarios.py | 29 +++---------------- 4 files changed, 4 insertions(+), 39 deletions(-) delete mode 100644 airbyte_cdk/sources/file_based/config/identities_based_stream_config.py diff --git a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py index f4892f20b..b28a41b46 100644 --- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py @@ -11,9 +11,6 @@ from airbyte_cdk import OneOfOptionConfig from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig -from airbyte_cdk.sources.file_based.config.identities_based_stream_config import ( - IdentitiesStreamConfig, -) from airbyte_cdk.sources.utils import schema_helpers diff --git a/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py b/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py deleted file mode 100644 index bf5955fa6..000000000 --- a/airbyte_cdk/sources/file_based/config/identities_based_stream_config.py +++ /dev/null @@ -1,8 +0,0 @@ -from typing import Literal - -from pydantic.v1 import BaseModel, Field - - -class IdentitiesStreamConfig(BaseModel): - name: Literal["identities"] = Field("identities", const=True, airbyte_hidden=True) - domain: str = Field(title="Domain", description="The domain of the identities.") diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py 
b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 0d23a6dbf..cbf461e91 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -11,9 +11,6 @@ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level from airbyte_cdk.models import Type as MessageType from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType -from airbyte_cdk.sources.file_based.config.identities_based_stream_config import ( - IdentitiesStreamConfig, -) from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader diff --git a/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/unit_tests/sources/file_based/scenarios/csv_scenarios.py index e618a6296..c1f2898f9 100644 --- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -516,32 +516,11 @@ "title": "Include ACL Permissions", "type": "boolean", }, - "identities": { + "domain": { "airbyte_hidden": True, - "allOf": [ - { - "properties": { - "domain": { - "description": "The domain of the identities.", - "title": "Domain", - "type": "string", - }, - "name": { - "airbyte_hidden": True, - "const": "identities", - "default": "identities", - "enum": ["identities"], - "title": "Name", - "type": "string", - }, - }, - "required": ["domain"], - "title": "IdentitiesStreamConfig", - "type": "object", - } - ], - "title": "Identities configuration", - "description": "Configuration for identities", + "description": "The domain of the identities.", + "title": "Domain", + "type": "string", }, }, "description": "Recommended - Extract and load structured records into your destination of choice. 
This is the classic method of moving data in Airbyte. It allows for blocking and hashing individual fields or files from a structured schema. Data can be flattened, typed and deduped depending on the destination.", From 2430eaea3c12b2787f4010535a9325df0bb0d064 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Wed, 29 Jan 2025 10:25:56 -0600 Subject: [PATCH 14/49] file-based: format record to have file last modified data --- .../file_based/stream/default_file_based_stream.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index f2205dc07..ddbc97abb 100644 --- a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -196,21 +196,24 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Airbyte ) elif self.sync_acl_permissions: try: - metadata_record = self.stream_reader.get_file_acl_permissions( + permissions_record = self.stream_reader.get_file_acl_permissions( file, logger=self.logger ) + permissions_record = self.transform_record( + permissions_record, file, file_datetime_string + ) yield stream_data_to_airbyte_message( - self.name, metadata_record, is_file_transfer_message=False + self.name, permissions_record, is_file_transfer_message=False ) except Exception as e: self.logger.error( - f"Failed to retrieve metadata for file {file.uri}: {str(e)}" + f"Failed to retrieve permissions for file {file.uri}: {str(e)}" ) yield AirbyteMessage( type=MessageType.LOG, log=AirbyteLogMessage( level=Level.ERROR, - message=f"Error retrieving metadata: stream={self.name} file={file.uri}", + message=f"Error retrieving files permissions: stream={self.name} file={file.uri}", stack_trace=traceback.format_exc(), ), ) From 24a93badbcc7fe2179f7f572587971feeef4aa71 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: 
Mon, 3 Feb 2025 15:34:06 -0600 Subject: [PATCH 15/49] file-based: create three toggle instead of option below transfer records --- .../config/abstract_file_based_spec.py | 29 +++++---- .../sources/file_based/config/permissions.py | 34 ---------- .../config/validate_config_transfer_modes.py | 51 +++++++++++++++ .../sources/file_based/file_based_source.py | 65 ++++--------------- .../file_based/file_based_stream_reader.py | 40 ++++-------- .../stream/default_file_based_stream.py | 14 ++-- .../file_based/scenarios/csv_scenarios.py | 37 +++++++---- 7 files changed, 121 insertions(+), 149 deletions(-) delete mode 100644 airbyte_cdk/sources/file_based/config/permissions.py create mode 100644 airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py diff --git a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py index b28a41b46..19d0a075b 100644 --- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py @@ -14,6 +14,23 @@ from airbyte_cdk.sources.utils import schema_helpers +class DeliverPermissions(BaseModel): + class Config(OneOfOptionConfig): + title = "Replicate Permissions ACL" + description = "Sends one identity stream and one for more permissions (ACL) streams to the destination. This data can be used in downstream systems to recreate permission restrictions mirroring the original source." 
+ discriminator = "delivery_type" + + delivery_type: Literal["use_permissions_transfer"] = Field( + "use_permissions_transfer", const=True + ) + + include_identities_stream: bool = Field( + title="Include Identity Stream", + description="This data can be used in downstream systems to recreate permission restrictions mirroring the original source", + default=True, + ) + + class DeliverRecords(BaseModel): class Config(OneOfOptionConfig): title = "Replicate Records" @@ -22,16 +39,6 @@ class Config(OneOfOptionConfig): delivery_type: Literal["use_records_transfer"] = Field("use_records_transfer", const=True) - sync_acl_permissions: bool = Field( - title="Include ACL Permissions", - description="Joins Document allowlists to each stream.", - default=False, - airbyte_hidden=True, - ) - domain: Optional[str] = Field( - title="Domain", description="The domain of the identities.", airbyte_hidden=True - ) - class DeliverRawFiles(BaseModel): class Config(OneOfOptionConfig): @@ -75,7 +82,7 @@ class AbstractFileBasedSpec(BaseModel): order=10, ) - delivery_method: Union[DeliverRecords, DeliverRawFiles] = Field( + delivery_method: Union[DeliverRecords, DeliverRawFiles, DeliverPermissions] = Field( title="Delivery Method", discriminator="delivery_type", type="object", diff --git a/airbyte_cdk/sources/file_based/config/permissions.py b/airbyte_cdk/sources/file_based/config/permissions.py deleted file mode 100644 index d0aef044a..000000000 --- a/airbyte_cdk/sources/file_based/config/permissions.py +++ /dev/null @@ -1,34 +0,0 @@ -# -# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
-# - -import uuid -from datetime import datetime -from enum import Enum - -from pydantic.v1 import BaseModel - - -class RemoteFileIdentityType(Enum): - USER = "user" - GROUP = "group" - - -class RemoteFileIdentity(BaseModel): - id: uuid.UUID - remote_id: str - parent_id: str | None = None - name: str | None = None - description: str | None = None - email_address: str | None = None - member_email_addresses: list[str] | None = None - type: RemoteFileIdentityType - modified_at: datetime - - -class RemoteFilePermissions(BaseModel): - id: str - file_path: str - allowed_identity_remote_ids: list[str] | None = None - denied_identity_remote_ids: list[str] | None = None - publicly_accessible: bool = False diff --git a/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py new file mode 100644 index 000000000..f14c36899 --- /dev/null +++ b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py @@ -0,0 +1,51 @@ +# +# Copyright (c) 2025 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec + + +def use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool: + return ( + hasattr(parsed_config.delivery_method, "delivery_type") + and parsed_config.delivery_method.delivery_type == "use_file_transfer" + ) + + +def preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool: + """ + Determines whether to preserve directory structure during file transfer. + + When enabled, files maintain their subdirectory paths in the destination. + When disabled, files are flattened to the root of the destination. 
+ + Args: + parsed_config: The parsed configuration containing delivery method settings + + Returns: + True if directory structure should be preserved (default), False otherwise + """ + if ( + use_file_transfer(parsed_config) + and hasattr(parsed_config.delivery_method, "preserve_directory_structure") + and parsed_config.delivery_method.preserve_directory_structure is not None + ): + return parsed_config.delivery_method.preserve_directory_structure + return True + + +def use_permissions_transfer(parsed_config: AbstractFileBasedSpec) -> bool: + return ( + hasattr(parsed_config.delivery_method, "delivery_type") + and parsed_config.delivery_method.delivery_type == "use_permissions_transfer" + ) + + +def include_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool: + if ( + use_permissions_transfer(parsed_config) + and hasattr(parsed_config.delivery_method, "include_identities_stream") + and parsed_config.delivery_method.include_identities_stream is not None + ): + return parsed_config.delivery_method.include_identities_stream + return False diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index 3d3a8b4d6..e5cc7445f 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -33,6 +33,12 @@ FileBasedStreamConfig, ValidationPolicy, ) +from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import ( + use_file_transfer, + preserve_directory_structure, + use_permissions_transfer, + include_identities_stream, +) from airbyte_cdk.sources.file_based.discovery_policy import ( AbstractDiscoveryPolicy, DefaultDiscoveryPolicy, @@ -172,8 +178,7 @@ def check_connection( parsed_config = self._get_parsed_config(config) availability_method = ( stream.availability_strategy.check_availability - if self._use_file_transfer(parsed_config) - or self._sync_acl_permissions(parsed_config) + if use_file_transfer(parsed_config) or 
use_permissions_transfer(parsed_config) else stream.availability_strategy.check_availability_and_parsability ) ( @@ -300,7 +305,7 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: streams.append(stream) - if self._sync_acl_permissions(parsed_config): + if include_identities_stream(parsed_config): identities_stream = self._make_identities_stream() streams.append(identities_stream) return streams @@ -324,9 +329,9 @@ def _make_default_stream( validation_policy=self._validate_and_get_validation_policy(stream_config), errors_collector=self.errors_collector, cursor=cursor, - use_file_transfer=self._use_file_transfer(parsed_config), - preserve_directory_structure=self._preserve_directory_structure(parsed_config), - sync_acl_permissions=self._sync_acl_permissions(parsed_config), + use_file_transfer=use_file_transfer(parsed_config), + preserve_directory_structure=preserve_directory_structure(parsed_config), + use_permissions_transfer=use_permissions_transfer(parsed_config), ) def _make_identities_stream( @@ -403,51 +408,3 @@ def _validate_input_schema(self, stream_config: FileBasedStreamConfig) -> None: "`input_schema` and `schemaless` options cannot both be set", model=FileBasedStreamConfig, ) - - @staticmethod - def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool: - use_file_transfer = ( - hasattr(parsed_config.delivery_method, "delivery_type") - and parsed_config.delivery_method.delivery_type == "use_file_transfer" - ) - return use_file_transfer - - @staticmethod - def _use_records_transfer(parsed_config: AbstractFileBasedSpec) -> bool: - use_records_transfer = ( - hasattr(parsed_config.delivery_method, "delivery_type") - and parsed_config.delivery_method.delivery_type == "use_records_transfer" - ) - return use_records_transfer - - @staticmethod - def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool: - """ - Determines whether to preserve directory structure during file transfer. 
- - When enabled, files maintain their subdirectory paths in the destination. - When disabled, files are flattened to the root of the destination. - - Args: - parsed_config: The parsed configuration containing delivery method settings - - Returns: - True if directory structure should be preserved (default), False otherwise - """ - if ( - FileBasedSource._use_file_transfer(parsed_config) - and hasattr(parsed_config.delivery_method, "preserve_directory_structure") - and parsed_config.delivery_method.preserve_directory_structure is not None - ): - return parsed_config.delivery_method.preserve_directory_structure - return True - - @staticmethod - def _sync_acl_permissions(parsed_config: AbstractFileBasedSpec) -> bool: - if ( - FileBasedSource._use_records_transfer(parsed_config) - and hasattr(parsed_config.delivery_method, "sync_acl_permissions") - and parsed_config.delivery_method.sync_acl_permissions is not None - ): - return parsed_config.delivery_method.sync_acl_permissions - return False diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 69b110e67..12faac751 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -13,6 +13,11 @@ from wcmatch.glob import GLOBSTAR, globmatch from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec +from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import ( + use_file_transfer, + preserve_directory_structure, + include_identities_stream, +) from airbyte_cdk.sources.file_based.remote_file import RemoteFile @@ -128,41 +133,18 @@ def get_prefixes_from_globs(globs: List[str]) -> Set[str]: def use_file_transfer(self) -> bool: if self.config: - use_file_transfer = ( - hasattr(self.config.delivery_method, "delivery_type") - and self.config.delivery_method.delivery_type == "use_file_transfer" - ) - return 
use_file_transfer - return False - - def use_records_transfer(self) -> bool: - if self.config: - use_records_transfer = ( - hasattr(self.config.delivery_method, "delivery_type") - and self.config.delivery_method.delivery_type == "use_records_transfer" - ) - return use_records_transfer + return use_file_transfer(self.config) return False def preserve_directory_structure(self) -> bool: # fall back to preserve subdirectories if config is not present or incomplete - if ( - self.use_file_transfer() - and self.config - and hasattr(self.config.delivery_method, "preserve_directory_structure") - and self.config.delivery_method.preserve_directory_structure is not None - ): - return self.config.delivery_method.preserve_directory_structure + if self.config: + return preserve_directory_structure(self.config) return True - def sync_acl_permissions(self) -> bool: - if ( - self.config - and self.use_records_transfer() - and hasattr(self.config.delivery_method, "sync_acl_permissions") - and self.config.delivery_method.sync_acl_permissions is not None - ): - return self.config.delivery_method.sync_acl_permissions + def include_identities_stream(self) -> bool: + if self.config: + return include_identities_stream(self.config) return False @abstractmethod diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index ddbc97abb..29bb24e19 100644 --- a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -48,7 +48,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): FILE_TRANSFER_KW = "use_file_transfer" PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure" - SYNC_ACL_PERMISSIONS_KW = "sync_acl_permissions" + PERMISSIONS_TRANSFER_KW = "use_permissions_transfer" FILES_KEY = "files" DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" ab_last_mod_col = "_ab_source_file_last_modified" 
@@ -58,7 +58,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): airbyte_columns = [ab_last_mod_col, ab_file_name_col] use_file_transfer = False preserve_directory_structure = True - sync_acl_permissions = False + use_permissions_transfer = False def __init__(self, **kwargs: Any): if self.FILE_TRANSFER_KW in kwargs: @@ -67,8 +67,8 @@ def __init__(self, **kwargs: Any): self.preserve_directory_structure = kwargs.pop( self.PRESERVE_DIRECTORY_STRUCTURE_KW, True ) - if self.SYNC_ACL_PERMISSIONS_KW in kwargs: - self.sync_acl_permissions = kwargs.pop(self.SYNC_ACL_PERMISSIONS_KW, False) + if self.PERMISSIONS_TRANSFER_KW in kwargs: + self.use_permissions_transfer = kwargs.pop(self.PERMISSIONS_TRANSFER_KW, False) super().__init__(**kwargs) @property @@ -110,7 +110,7 @@ def _filter_schema_invalid_properties( self.ab_file_name_col: {"type": "string"}, }, } - elif self.sync_acl_permissions: + elif self.use_permissions_transfer: return remote_file_permissions_schema else: return super()._filter_schema_invalid_properties(configured_catalog_json_schema) @@ -194,7 +194,7 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Airbyte yield stream_data_to_airbyte_message( self.name, record, is_file_transfer_message=True ) - elif self.sync_acl_permissions: + elif self.use_permissions_transfer: try: permissions_record = self.stream_reader.get_file_acl_permissions( file, logger=self.logger @@ -314,7 +314,7 @@ def get_json_schema(self) -> JsonSchema: def _get_raw_json_schema(self) -> JsonSchema: if self.use_file_transfer: return file_transfer_schema - elif self.sync_acl_permissions: + elif self.use_permissions_transfer: return remote_file_permissions_schema elif self.config.input_schema: return self.config.get_input_schema() # type: ignore diff --git a/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/unit_tests/sources/file_based/scenarios/csv_scenarios.py index c1f2898f9..f75dbe481 100644 --- 
a/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -508,20 +508,7 @@ "const": "use_records_transfer", "enum": ["use_records_transfer"], "type": "string", - }, - "sync_acl_permissions": { - "airbyte_hidden": True, - "default": False, - "description": "Joins Document allowlists to each stream.", - "title": "Include ACL Permissions", - "type": "boolean", - }, - "domain": { - "airbyte_hidden": True, - "description": "The domain of the identities.", - "title": "Domain", - "type": "string", - }, + } }, "description": "Recommended - Extract and load structured records into your destination of choice. This is the classic method of moving data in Airbyte. It allows for blocking and hashing individual fields or files from a structured schema. Data can be flattened, typed and deduped depending on the destination.", "required": ["delivery_type"], @@ -547,6 +534,28 @@ "description": "Copy raw files without parsing their contents. Bits are copied into the destination exactly as they appeared in the source. Recommended for use with unstructured text data, non-text and compressed files.", "required": ["delivery_type"], }, + { + "description": "Sends one identity stream and one for more permissions (ACL) streams to the destination. 
This data can be used in downstream systems to recreate permission restrictions mirroring the original source.", + "properties": { + "delivery_type": { + "const": "use_permissions_transfer", + "default": "use_permissions_transfer", + "enum": ["use_permissions_transfer"], + "title": "Delivery Type", + "type": "string", + }, + "include_identities_stream": { + "airbyte_hidden": True, + "default": True, + "description": "This data can be used in downstream systems to recreate permission restrictions mirroring the original source", + "title": "Include Identity Stream", + "type": "boolean", + }, + }, + "required": ["delivery_type"], + "title": "Replicate Permissions ACL", + "type": "object", + }, ], }, }, From 40c17871f8e5822a9ca232fd493484096f4f9a0f Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Mon, 3 Feb 2025 15:36:04 -0600 Subject: [PATCH 16/49] file-based: fix csv test --- unit_tests/sources/file_based/scenarios/csv_scenarios.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/unit_tests/sources/file_based/scenarios/csv_scenarios.py index f75dbe481..b1d74334a 100644 --- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -545,7 +545,6 @@ "type": "string", }, "include_identities_stream": { - "airbyte_hidden": True, "default": True, "description": "This data can be used in downstream systems to recreate permission restrictions mirroring the original source", "title": "Include Identity Stream", From 36e0bca5aa3faf66833a0f159d270eadb4a25e42 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Mon, 3 Feb 2025 21:41:27 +0000 Subject: [PATCH 17/49] Auto-fix lint and format issues --- airbyte_cdk/sources/file_based/file_based_source.py | 4 ++-- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_source.py 
b/airbyte_cdk/sources/file_based/file_based_source.py index e5cc7445f..24f9c7a27 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -34,10 +34,10 @@ ValidationPolicy, ) from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import ( - use_file_transfer, + include_identities_stream, preserve_directory_structure, + use_file_transfer, use_permissions_transfer, - include_identities_stream, ) from airbyte_cdk.sources.file_based.discovery_policy import ( AbstractDiscoveryPolicy, diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 12faac751..1c762205f 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -14,9 +14,9 @@ from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import ( - use_file_transfer, - preserve_directory_structure, include_identities_stream, + preserve_directory_structure, + use_file_transfer, ) from airbyte_cdk.sources.file_based.remote_file import RemoteFile From d30b1ad8932617769a230e5d0a8c972ae47c43de Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Mon, 3 Feb 2025 15:50:21 -0600 Subject: [PATCH 18/49] file-based: make new methods abstract --- airbyte_cdk/sources/file_based/file_based_stream_reader.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 1c762205f..099b401f7 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -185,16 +185,18 @@ def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> Li absolute_file_path = 
path.abspath(local_file_path) return [file_relative_path, local_file_path, absolute_file_path] + @abstractmethod def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]: """ This is required for connectors that will support syncing ACL Permissions from files. """ - return {} + ... + @abstractmethod def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: """ This is required for connectors that will support syncing identities. """ - yield {} + ... From b5cf88cd43a58ab18f9d342ca1d600f185e5fe37 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Wed, 5 Feb 2025 08:29:14 -0600 Subject: [PATCH 19/49] file-based: add check for identities stream --- airbyte_cdk/sources/file_based/file_based_source.py | 4 +++- airbyte_cdk/sources/file_based/stream/identities_stream.py | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index 24f9c7a27..dcd442c0f 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -170,7 +170,9 @@ def check_connection( tracebacks = [] for stream in streams: if isinstance(stream, IdentitiesStream): - # Probably need to check identities endpoint/api access but will skip for now. 
+ identity = next(iter(stream.load_identity_groups())) + if not identity: + errors.append("Unable to get identities for current configuration, please check your credentials") continue if not isinstance(stream, AbstractFileBasedStream): raise ValueError(f"Stream {stream} is not a file-based stream.") diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index cbf461e91..e9fcf6a3a 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -68,7 +68,7 @@ def read_records( stream_state: Optional[Mapping[str, Any]] = None, ) -> Iterable[Mapping[str, Any] | AirbyteMessage]: try: - identity_groups = self.stream_reader.load_identity_groups(logger=self.logger) + identity_groups = self.load_identity_groups() for record in identity_groups: yield stream_data_to_airbyte_message(self.name, record) except AirbyteTracedException as exc: @@ -84,6 +84,9 @@ def read_records( ), ) + def load_identity_groups(self): + return self.stream_reader.load_identity_groups(logger=self.logger) + @cache def get_json_schema(self) -> JsonSchema: return remote_file_identity_schema From 2a04f121d2a146f66c6af05910c8a8830ecd8df6 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Wed, 5 Feb 2025 08:30:46 -0600 Subject: [PATCH 20/49] file-based: ruff format --- airbyte_cdk/sources/file_based/file_based_source.py | 4 +++- airbyte_cdk/sources/file_based/stream/identities_stream.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index dcd442c0f..dbad35d02 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -172,7 +172,9 @@ def check_connection( if isinstance(stream, IdentitiesStream): identity = next(iter(stream.load_identity_groups())) if not 
identity: - errors.append("Unable to get identities for current configuration, please check your credentials") + errors.append( + "Unable to get identities for current configuration, please check your credentials" + ) continue if not isinstance(stream, AbstractFileBasedStream): raise ValueError(f"Stream {stream} is not a file-based stream.") diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index e9fcf6a3a..5f16046cc 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -4,7 +4,7 @@ import traceback from functools import cache -from typing import Any, Iterable, List, Mapping, MutableMapping, Optional +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Dict from airbyte_protocol_dataclasses.models import SyncMode @@ -84,7 +84,7 @@ def read_records( ), ) - def load_identity_groups(self): + def load_identity_groups(self) -> Iterable[Dict[str, Any]]: return self.stream_reader.load_identity_groups(logger=self.logger) @cache From 4c18889982581fec999e69f1a364cf0c5ff99f98 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Wed, 5 Feb 2025 14:32:46 +0000 Subject: [PATCH 21/49] Auto-fix lint and format issues --- airbyte_cdk/sources/file_based/stream/identities_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 5f16046cc..1dc18eede 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -4,7 +4,7 @@ import traceback from functools import cache -from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Dict +from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional from airbyte_protocol_dataclasses.models 
import SyncMode From 1271e3a7b7aa2489101c6bde3b95297ea2c659b3 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Wed, 5 Feb 2025 08:47:06 -0600 Subject: [PATCH 22/49] file-based: add notImplemented error --- .../sources/file_based/file_based_stream_reader.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 099b401f7..c9d91bccb 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -191,7 +191,9 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> This is required for connectors that will support syncing ACL Permissions from files. """ - ... + raise NotImplementedError( + f"{self.__class__.__name__} must implement get_file_acl_permissions()" + ) @abstractmethod def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: @@ -199,4 +201,6 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any This is required for connectors that will support syncing identities. """ - ... 
+ raise NotImplementedError( + f"{self.__class__.__name__} must implement load_identity_groups()" + ) From 4105c3c07576eb505f50bfeb8075e17d52b1a537 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 06:49:28 -0600 Subject: [PATCH 23/49] file-based: fix unit tests --- .../sources/file_based/file_based_stream_reader.py | 8 ++------ unit_tests/sources/file_based/in_memory_files_source.py | 6 ++++++ .../sources/file_based/test_file_based_stream_reader.py | 6 ++++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index c9d91bccb..099b401f7 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -191,9 +191,7 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> This is required for connectors that will support syncing ACL Permissions from files. """ - raise NotImplementedError( - f"{self.__class__.__name__} must implement get_file_acl_permissions()" - ) + ... @abstractmethod def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: @@ -201,6 +199,4 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any This is required for connectors that will support syncing identities. """ - raise NotImplementedError( - f"{self.__class__.__name__} must implement load_identity_groups()" - ) + ... 
diff --git a/unit_tests/sources/file_based/in_memory_files_source.py b/unit_tests/sources/file_based/in_memory_files_source.py index f3e3a90e2..8f4a3d2ca 100644 --- a/unit_tests/sources/file_based/in_memory_files_source.py +++ b/unit_tests/sources/file_based/in_memory_files_source.py @@ -145,6 +145,12 @@ def get_file( ) -> Dict[str, Any]: return {} + def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]: + return {} + + def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: + return [{}] + def open_file( self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger ) -> IOBase: diff --git a/unit_tests/sources/file_based/test_file_based_stream_reader.py b/unit_tests/sources/file_based/test_file_based_stream_reader.py index 725ce67b0..6830a31d6 100644 --- a/unit_tests/sources/file_based/test_file_based_stream_reader.py +++ b/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -84,6 +84,12 @@ def get_file( ) -> Dict[str, Any]: return {} + def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]: + return {} + + def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: + return [{}] + class TestSpec(AbstractFileBasedSpec): @classmethod From a6d1b627994f1c1859af5ec47c21a4be299929f7 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 06:58:53 -0600 Subject: [PATCH 24/49] file-based: add NotImplementedError --- .../sources/file_based/file_based_stream_reader.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 099b401f7..23894b5e1 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -191,7 +191,9 @@ def get_file_acl_permissions(self, file: 
RemoteFile, logger: logging.Logger) -> This is required for connectors that will support syncing ACL Permissions from files. """ - ... + raise NotImplementedError( + f"{self.__class__.__name__} required to support get_file_acl_permissions()" + ) @abstractmethod def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: @@ -199,4 +201,6 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any This is required for connectors that will support syncing identities. """ - ... + raise NotImplementedError( + f"{self.__class__.__name__} required to support load_identity_groups()" + ) From a7081d3a5c577c6952f93323151f2f95f5203f27 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 08:43:21 -0600 Subject: [PATCH 25/49] file-based: allow connector to provide permissions and identities schemas --- .../file_based/file_based_stream_reader.py | 7 ++++-- .../sources/file_based/schema_helpers.py | 25 ------------------- .../stream/default_file_based_stream.py | 5 ++-- .../file_based/stream/identities_stream.py | 3 +-- 4 files changed, 8 insertions(+), 32 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 23894b5e1..f3e6cacf6 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -19,6 +19,7 @@ use_file_transfer, ) from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import schemaless_schema class FileReadMode(Enum): @@ -28,6 +29,8 @@ class FileReadMode(Enum): class AbstractFileBasedStreamReader(ABC): DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" + REMOTE_FILE_PERMISSIONS_SCHEMA = schemaless_schema + REMOTE_FILE_IDENTITY_SCHEMA = schemaless_schema def __init__(self) -> None: self._config = None @@ -192,7 +195,7 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: 
logging.Logger) -> ACL Permissions from files. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support get_file_acl_permissions()" + f"{self.__class__.__name__} required to support get_file_acl_permissions(), please update REMOTE_FILE_PERMISSIONS_SCHEMA accordingly" ) @abstractmethod @@ -202,5 +205,5 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any identities. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support load_identity_groups()" + f"{self.__class__.__name__} required to support load_identity_groups(), please update REMOTE_FILE_IDENTITY_SCHEMA accordingly" ) diff --git a/airbyte_cdk/sources/file_based/schema_helpers.py b/airbyte_cdk/sources/file_based/schema_helpers.py index 459e6de73..1b653db67 100644 --- a/airbyte_cdk/sources/file_based/schema_helpers.py +++ b/airbyte_cdk/sources/file_based/schema_helpers.py @@ -23,31 +23,6 @@ "properties": {"data": {"type": "object"}, "file": {"type": "object"}}, } -remote_file_permissions_schema = { - "type": "object", - "properties": { - "id": {"type": "string"}, - "file_path": {"type": "string"}, - "allowed_identity_remote_ids": {"type": "array", "items": {"type": "string"}}, - "publicly_accessible": {"type": "boolean"}, - }, -} - -remote_file_identity_schema = { - "type": "object", - "properties": { - "id": {"type": "string"}, - "remote_id": {"type": "string"}, - "parent_id": {"type": ["null", "string"]}, - "name": {"type": ["null", "string"]}, - "description": {"type": ["null", "string"]}, - "email_address": {"type": ["null", "string"]}, - "member_email_addresses": {"type": ["null", "array"]}, - "type": {"type": "string"}, - "modified_at": {"type": "string"}, - }, -} - @total_ordering class ComparableType(Enum): diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 29bb24e19..2efdf8eab 100644 --- 
a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -29,7 +29,6 @@ SchemaType, file_transfer_schema, merge_schemas, - remote_file_permissions_schema, schemaless_schema, ) from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream @@ -111,7 +110,7 @@ def _filter_schema_invalid_properties( }, } elif self.use_permissions_transfer: - return remote_file_permissions_schema + return self.stream_reader.REMOTE_FILE_PERMISSIONS_SCHEMA else: return super()._filter_schema_invalid_properties(configured_catalog_json_schema) @@ -315,7 +314,7 @@ def _get_raw_json_schema(self) -> JsonSchema: if self.use_file_transfer: return file_transfer_schema elif self.use_permissions_transfer: - return remote_file_permissions_schema + return self.stream_reader.REMOTE_FILE_PERMISSIONS_SCHEMA elif self.config.input_schema: return self.config.get_input_schema() # type: ignore elif self.config.schemaless: diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 1dc18eede..dd39e0444 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -14,7 +14,6 @@ from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader -from airbyte_cdk.sources.file_based.schema_helpers import remote_file_identity_schema from airbyte_cdk.sources.file_based.types import StreamSlice from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.checkpoint import Cursor @@ -89,7 +88,7 @@ def load_identity_groups(self) -> Iterable[Dict[str, Any]]: @cache def get_json_schema(self) -> JsonSchema: - return 
remote_file_identity_schema + return self.stream_reader.REMOTE_FILE_IDENTITY_SCHEMA @property def name(self) -> str: From 7e4d73fdd47928abea23e127960c4e88fe25b52c Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 09:42:13 -0600 Subject: [PATCH 26/49] file-based: move permissions transfer mode to general with abstract Identitie stream, so file based Identity stream becomes an implementation --- .../config/abstract_file_based_spec.py | 18 +---- .../sources/file_based/file_based_source.py | 9 +-- .../sources/file_based/stream/__init__.py | 4 +- .../file_based/stream/identities_stream.py | 59 +------------- airbyte_cdk/sources/specs/transfer_modes.py | 25 ++++++ .../sources/streams/permissions/identities.py | 81 +++++++++++++++++++ 6 files changed, 117 insertions(+), 79 deletions(-) create mode 100644 airbyte_cdk/sources/specs/transfer_modes.py create mode 100644 airbyte_cdk/sources/streams/permissions/identities.py diff --git a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py index 19d0a075b..d9b67e34c 100644 --- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py @@ -11,26 +11,10 @@ from airbyte_cdk import OneOfOptionConfig from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig +from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions from airbyte_cdk.sources.utils import schema_helpers -class DeliverPermissions(BaseModel): - class Config(OneOfOptionConfig): - title = "Replicate Permissions ACL" - description = "Sends one identity stream and one for more permissions (ACL) streams to the destination. This data can be used in downstream systems to recreate permission restrictions mirroring the original source." 
- discriminator = "delivery_type" - - delivery_type: Literal["use_permissions_transfer"] = Field( - "use_permissions_transfer", const=True - ) - - include_identities_stream: bool = Field( - title="Include Identity Stream", - description="This data can be used in downstream systems to recreate permission restrictions mirroring the original source", - default=True, - ) - - class DeliverRecords(BaseModel): class Config(OneOfOptionConfig): title = "Replicate Records" diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index dbad35d02..60bfb254d 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -58,7 +58,7 @@ from airbyte_cdk.sources.file_based.stream import ( AbstractFileBasedStream, DefaultFileBasedStream, - IdentitiesStream, + FileIdentities, ) from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade from airbyte_cdk.sources.file_based.stream.concurrent.cursor import ( @@ -67,7 +67,6 @@ FileBasedFinalStateCursor, ) from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor -from airbyte_cdk.sources.file_based.stream.identities_stream import IDENTITIES_STREAM_NAME from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.concurrent.cursor import CursorField @@ -169,7 +168,7 @@ def check_connection( errors = [] tracebacks = [] for stream in streams: - if isinstance(stream, IdentitiesStream): + if isinstance(stream, FileIdentities): identity = next(iter(stream.load_identity_groups())) if not identity: errors.append( @@ -341,8 +340,8 @@ def _make_default_stream( def _make_identities_stream( self, ) -> Stream: - return IdentitiesStream( - catalog_schema=self.stream_schemas.get(IDENTITIES_STREAM_NAME), + return FileIdentities( + 
catalog_schema=self.stream_schemas.get(FileIdentities.IDENTITIES_STREAM_NAME), stream_reader=self.stream_reader, discovery_policy=self.discovery_policy, errors_collector=self.errors_collector, diff --git a/airbyte_cdk/sources/file_based/stream/__init__.py b/airbyte_cdk/sources/file_based/stream/__init__.py index 78c2b1062..0c1359343 100644 --- a/airbyte_cdk/sources/file_based/stream/__init__.py +++ b/airbyte_cdk/sources/file_based/stream/__init__.py @@ -1,5 +1,5 @@ from airbyte_cdk.sources.file_based.stream.abstract_file_based_stream import AbstractFileBasedStream from airbyte_cdk.sources.file_based.stream.default_file_based_stream import DefaultFileBasedStream -from airbyte_cdk.sources.file_based.stream.identities_stream import IdentitiesStream +from airbyte_cdk.sources.file_based.stream.identities_stream import FileIdentities -__all__ = ["AbstractFileBasedStream", "DefaultFileBasedStream", "IdentitiesStream"] +__all__ = ["AbstractFileBasedStream", "DefaultFileBasedStream", "FileIdentities"] diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index dd39e0444..7fdcd7216 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -2,29 +2,18 @@ # Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
# -import traceback from functools import cache -from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional +from typing import Any, Dict, Iterable, Mapping, MutableMapping, Optional -from airbyte_protocol_dataclasses.models import SyncMode - -from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level -from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.streams.permissions.identities import Identities from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy -from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError +from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader -from airbyte_cdk.sources.file_based.types import StreamSlice -from airbyte_cdk.sources.streams import Stream -from airbyte_cdk.sources.streams.checkpoint import Cursor from airbyte_cdk.sources.streams.core import JsonSchema -from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message -from airbyte_cdk.utils.traced_exception import AirbyteTracedException - -IDENTITIES_STREAM_NAME = "identities" -class IdentitiesStream(Stream): +class FileIdentities(Identities): """ The identities stream. A full refresh stream to sync identities from a certain domain. The stream reader manage the logic to get such data, which is implemented on connector side. 
@@ -46,53 +35,13 @@ def __init__( self.errors_collector = errors_collector self._cursor: MutableMapping[str, Any] = {} - @property - def state(self) -> MutableMapping[str, Any]: - return self._cursor - - @state.setter - def state(self, value: MutableMapping[str, Any]) -> None: - """State setter, accept state serialized by state getter.""" - self._cursor = value - @property def primary_key(self) -> PrimaryKeyType: return None - def read_records( - self, - sync_mode: SyncMode, - cursor_field: Optional[List[str]] = None, - stream_slice: Optional[StreamSlice] = None, - stream_state: Optional[Mapping[str, Any]] = None, - ) -> Iterable[Mapping[str, Any] | AirbyteMessage]: - try: - identity_groups = self.load_identity_groups() - for record in identity_groups: - yield stream_data_to_airbyte_message(self.name, record) - except AirbyteTracedException as exc: - # Re-raise the exception to stop the whole sync immediately as this is a fatal error - raise exc - except Exception: - yield AirbyteMessage( - type=MessageType.LOG, - log=AirbyteLogMessage( - level=Level.ERROR, - message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name}", - stack_trace=traceback.format_exc(), - ), - ) - def load_identity_groups(self) -> Iterable[Dict[str, Any]]: return self.stream_reader.load_identity_groups(logger=self.logger) @cache def get_json_schema(self) -> JsonSchema: return self.stream_reader.REMOTE_FILE_IDENTITY_SCHEMA - - @property - def name(self) -> str: - return IDENTITIES_STREAM_NAME - - def get_cursor(self) -> Optional[Cursor]: - return None diff --git a/airbyte_cdk/sources/specs/transfer_modes.py b/airbyte_cdk/sources/specs/transfer_modes.py new file mode 100644 index 000000000..a9fdea222 --- /dev/null +++ b/airbyte_cdk/sources/specs/transfer_modes.py @@ -0,0 +1,25 @@ +# +# Copyright (c) 2025 Airbyte, Inc., all rights reserved. 
+# + +from typing import Literal + +from pydantic.v1 import AnyUrl, BaseModel, Field +from airbyte_cdk import OneOfOptionConfig + + +class DeliverPermissions(BaseModel): + class Config(OneOfOptionConfig): + title = "Replicate Permissions ACL" + description = "Sends one identity stream and one for more permissions (ACL) streams to the destination. This data can be used in downstream systems to recreate permission restrictions mirroring the original source." + discriminator = "delivery_type" + + delivery_type: Literal["use_permissions_transfer"] = Field( + "use_permissions_transfer", const=True + ) + + include_identities_stream: bool = Field( + title="Include Identity Stream", + description="This data can be used in downstream systems to recreate permission restrictions mirroring the original source", + default=True, + ) diff --git a/airbyte_cdk/sources/streams/permissions/identities.py b/airbyte_cdk/sources/streams/permissions/identities.py new file mode 100644 index 000000000..c487474b5 --- /dev/null +++ b/airbyte_cdk/sources/streams/permissions/identities.py @@ -0,0 +1,81 @@ +# +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +# + +import traceback +from abc import ABC, abstractmethod +from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional + +from airbyte_protocol_dataclasses.models import SyncMode + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.sources.streams.checkpoint import Cursor +from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + +DEFAULT_IDENTITIES_STREAM_NAME = "identities" + + +class Identities(Stream, ABC): + """ + The identities stream. A full refresh stream to sync identities from a certain domain. + The load_identity_groups method manage the logic to get such data. 
+ """ + + IDENTITIES_STREAM_NAME = DEFAULT_IDENTITIES_STREAM_NAME + + is_resumable = False + + def __init__( + self, + catalog_schema: Optional[Mapping[str, Any]], + ): + super().__init__() + self.catalog_schema = catalog_schema + self._cursor: MutableMapping[str, Any] = {} + + @property + def state(self) -> MutableMapping[str, Any]: + return self._cursor + + @state.setter + def state(self, value: MutableMapping[str, Any]) -> None: + """State setter, accept state serialized by state getter.""" + self._cursor = value + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[Mapping[str, Any]] = None, + stream_state: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Mapping[str, Any] | AirbyteMessage]: + try: + identity_groups = self.load_identity_groups() + for record in identity_groups: + yield stream_data_to_airbyte_message(self.name, record) + except AirbyteTracedException as exc: + # Re-raise the exception to stop the whole sync immediately as this is a fatal error + raise exc + except Exception as e: + yield AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.ERROR, + message=f"Error trying to read identities: {e} stream={self.name}", + stack_trace=traceback.format_exc(), + ), + ) + + @abstractmethod + def load_identity_groups(self) -> Iterable[Dict[str, Any]]: + raise NotImplementedError("Implement this method to read identity records") + + @property + def name(self) -> str: + return self.IDENTITIES_STREAM_NAME + + def get_cursor(self) -> Optional[Cursor]: + return None From 0f5cc5f3fcbcb7c9092c91a2132bd6609d6cdc57 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Tue, 11 Feb 2025 15:49:15 +0000 Subject: [PATCH 27/49] Auto-fix lint and format issues --- airbyte_cdk/sources/file_based/stream/identities_stream.py | 2 +- airbyte_cdk/sources/specs/transfer_modes.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 7fdcd7216..e438a8cb8 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -5,12 +5,12 @@ from functools import cache from typing import Any, Dict, Iterable, Mapping, MutableMapping, Optional -from airbyte_cdk.sources.streams.permissions.identities import Identities from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader from airbyte_cdk.sources.streams.core import JsonSchema +from airbyte_cdk.sources.streams.permissions.identities import Identities class FileIdentities(Identities): diff --git a/airbyte_cdk/sources/specs/transfer_modes.py b/airbyte_cdk/sources/specs/transfer_modes.py index a9fdea222..7b5651e42 100644 --- a/airbyte_cdk/sources/specs/transfer_modes.py +++ b/airbyte_cdk/sources/specs/transfer_modes.py @@ -5,6 +5,7 @@ from typing import Literal from pydantic.v1 import AnyUrl, BaseModel, Field + from airbyte_cdk import OneOfOptionConfig From a36ec7d032872215ddb0259c4b9b101b95e17e02 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 09:56:05 -0600 Subject: [PATCH 28/49] file-based: remove unnecessary param in Abstract Identities --- airbyte_cdk/sources/streams/permissions/identities.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/streams/permissions/identities.py b/airbyte_cdk/sources/streams/permissions/identities.py index c487474b5..1988886ef 100644 --- a/airbyte_cdk/sources/streams/permissions/identities.py +++ b/airbyte_cdk/sources/streams/permissions/identities.py @@ -28,12 +28,8 @@ class
Identities(Stream, ABC): is_resumable = False - def __init__( - self, - catalog_schema: Optional[Mapping[str, Any]], - ): + def __init__(self): super().__init__() - self.catalog_schema = catalog_schema self._cursor: MutableMapping[str, Any] = {} @property From ad4c49c03b97c8002f66a80b0503ce4bfe9513a9 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 10:00:22 -0600 Subject: [PATCH 29/49] file-based: fix init return types --- airbyte_cdk/sources/file_based/stream/identities_stream.py | 2 +- airbyte_cdk/sources/streams/permissions/identities.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index e438a8cb8..0dc7cd3c6 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -27,7 +27,7 @@ def __init__( stream_reader: AbstractFileBasedStreamReader, discovery_policy: AbstractDiscoveryPolicy, errors_collector: FileBasedErrorsCollector, - ): + ) -> None: super().__init__() self.catalog_schema = catalog_schema self.stream_reader = stream_reader diff --git a/airbyte_cdk/sources/streams/permissions/identities.py b/airbyte_cdk/sources/streams/permissions/identities.py index 1988886ef..2896e6f10 100644 --- a/airbyte_cdk/sources/streams/permissions/identities.py +++ b/airbyte_cdk/sources/streams/permissions/identities.py @@ -28,7 +28,7 @@ class Identities(Stream, ABC): is_resumable = False - def __init__(self): + def __init__(self) -> None: super().__init__() self._cursor: MutableMapping[str, Any] = {} From 750935cb981bfbd6210b099b727247dd81860d31 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 13:24:52 -0600 Subject: [PATCH 30/49] file-based: update interfaces to obtain schemas for file_permissions and identities --- .../config/validate_config_transfer_modes.py | 16 +++++++++++----- 
.../file_based/file_based_stream_reader.py | 11 ++++++----- .../stream/default_file_based_stream.py | 4 ++-- .../file_based/stream/identities_stream.py | 5 ----- .../sources/file_based/in_memory_files_source.py | 4 ++++ .../file_based/test_file_based_stream_reader.py | 4 ++++ 6 files changed, 27 insertions(+), 17 deletions(-) diff --git a/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py index f14c36899..b3198fb39 100644 --- a/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +++ b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py @@ -4,10 +4,15 @@ from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec +DELIVERY_TYPE_KEY = "delivery_type" +DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE = "use_permissions_transfer" +PRESERVE_DIRECTORY_STRUCTURE_KEY = "preserve_directory_structure" +INCLUDE_IDENTITIES_STREAM_KEY = "include_identities_stream" + def use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool: return ( - hasattr(parsed_config.delivery_method, "delivery_type") + hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY) and parsed_config.delivery_method.delivery_type == "use_file_transfer" ) @@ -27,7 +32,7 @@ def preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool: """ if ( use_file_transfer(parsed_config) - and hasattr(parsed_config.delivery_method, "preserve_directory_structure") + and hasattr(parsed_config.delivery_method, PRESERVE_DIRECTORY_STRUCTURE_KEY) and parsed_config.delivery_method.preserve_directory_structure is not None ): return parsed_config.delivery_method.preserve_directory_structure @@ -36,15 +41,16 @@ def preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool: def use_permissions_transfer(parsed_config: AbstractFileBasedSpec) -> bool: return ( - hasattr(parsed_config.delivery_method, "delivery_type") - and 
parsed_config.delivery_method.delivery_type == "use_permissions_transfer" + hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY) + and parsed_config.delivery_method.delivery_type + == DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE ) def include_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool: if ( use_permissions_transfer(parsed_config) - and hasattr(parsed_config.delivery_method, "include_identities_stream") + and hasattr(parsed_config.delivery_method, INCLUDE_IDENTITIES_STREAM_KEY) and parsed_config.delivery_method.include_identities_stream is not None ): return parsed_config.delivery_method.include_identities_stream diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index f3e6cacf6..4b53a6068 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -19,7 +19,6 @@ use_file_transfer, ) from airbyte_cdk.sources.file_based.remote_file import RemoteFile -from airbyte_cdk.sources.file_based.schema_helpers import schemaless_schema class FileReadMode(Enum): @@ -29,8 +28,6 @@ class FileReadMode(Enum): class AbstractFileBasedStreamReader(ABC): DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" - REMOTE_FILE_PERMISSIONS_SCHEMA = schemaless_schema - REMOTE_FILE_IDENTITY_SCHEMA = schemaless_schema def __init__(self) -> None: self._config = None @@ -195,7 +192,7 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> ACL Permissions from files. 
""" raise NotImplementedError( - f"{self.__class__.__name__} required to support get_file_acl_permissions(), please update REMOTE_FILE_PERMISSIONS_SCHEMA accordingly" + f"{self.__class__.__name__} required to support get_file_acl_permissions(), please update file_permissions_schema accordingly to obtain the required schema for each stream on the source implementation" ) @abstractmethod @@ -205,5 +202,9 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any identities. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support load_identity_groups(), please update REMOTE_FILE_IDENTITY_SCHEMA accordingly" + f"{self.__class__.__name__} required to support load_identity_groups(), please add schema for your identities stream in schemas folder" ) + + @property + @abstractmethod + def file_permissions_schema(self) -> Dict[str, Any]: ... diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 2efdf8eab..9bb91b2ca 100644 --- a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -110,7 +110,7 @@ def _filter_schema_invalid_properties( }, } elif self.use_permissions_transfer: - return self.stream_reader.REMOTE_FILE_PERMISSIONS_SCHEMA + return self.stream_reader.file_permissions_schema else: return super()._filter_schema_invalid_properties(configured_catalog_json_schema) @@ -314,7 +314,7 @@ def _get_raw_json_schema(self) -> JsonSchema: if self.use_file_transfer: return file_transfer_schema elif self.use_permissions_transfer: - return self.stream_reader.REMOTE_FILE_PERMISSIONS_SCHEMA + return self.stream_reader.file_permissions_schema elif self.config.input_schema: return self.config.get_input_schema() # type: ignore elif self.config.schemaless: diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py 
b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 0dc7cd3c6..4215367a3 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -9,7 +9,6 @@ from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader -from airbyte_cdk.sources.streams.core import JsonSchema from airbyte_cdk.sources.streams.permissions.identities import Identities @@ -41,7 +40,3 @@ def primary_key(self) -> PrimaryKeyType: def load_identity_groups(self) -> Iterable[Dict[str, Any]]: return self.stream_reader.load_identity_groups(logger=self.logger) - - @cache - def get_json_schema(self) -> JsonSchema: - return self.stream_reader.REMOTE_FILE_IDENTITY_SCHEMA diff --git a/unit_tests/sources/file_based/in_memory_files_source.py b/unit_tests/sources/file_based/in_memory_files_source.py index 8f4a3d2ca..244d58c88 100644 --- a/unit_tests/sources/file_based/in_memory_files_source.py +++ b/unit_tests/sources/file_based/in_memory_files_source.py @@ -151,6 +151,10 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: return [{}] + @property + def file_permissions_schema(self) -> Dict[str, Any]: + return {"type": "object", "properties": {}} + def open_file( self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger ) -> IOBase: diff --git a/unit_tests/sources/file_based/test_file_based_stream_reader.py b/unit_tests/sources/file_based/test_file_based_stream_reader.py index 6830a31d6..d27055384 100644 --- a/unit_tests/sources/file_based/test_file_based_stream_reader.py +++ b/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -90,6 +90,10 @@ def 
get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: return [{}] + @property + def file_permissions_schema(self) -> Dict[str, Any]: + return {"type": "object", "properties": {}} + class TestSpec(AbstractFileBasedSpec): @classmethod From d4f2b1d1c37866ef385728192ca2a1ecf01d8625 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 13:38:35 -0600 Subject: [PATCH 31/49] file-based: update transfer mode validations --- .../config/validate_config_transfer_modes.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py index b3198fb39..37053b0c1 100644 --- a/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +++ b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py @@ -2,10 +2,15 @@ # Copyright (c) 2025 Airbyte, Inc., all rights reserved. 
# -from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec +from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import ( + AbstractFileBasedSpec, + DeliverRawFiles, +) +from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions DELIVERY_TYPE_KEY = "delivery_type" DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE = "use_permissions_transfer" +DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE = "use_file_transfer" PRESERVE_DIRECTORY_STRUCTURE_KEY = "preserve_directory_structure" INCLUDE_IDENTITIES_STREAM_KEY = "include_identities_stream" @@ -13,7 +18,7 @@ def use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool: return ( hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY) - and parsed_config.delivery_method.delivery_type == "use_file_transfer" + and parsed_config.delivery_method.delivery_type == DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE ) @@ -33,7 +38,7 @@ def preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool: if ( use_file_transfer(parsed_config) and hasattr(parsed_config.delivery_method, PRESERVE_DIRECTORY_STRUCTURE_KEY) - and parsed_config.delivery_method.preserve_directory_structure is not None + and isinstance(parsed_config.delivery_method, DeliverRawFiles) ): return parsed_config.delivery_method.preserve_directory_structure return True @@ -51,7 +56,7 @@ def include_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool: if ( use_permissions_transfer(parsed_config) and hasattr(parsed_config.delivery_method, INCLUDE_IDENTITIES_STREAM_KEY) - and parsed_config.delivery_method.include_identities_stream is not None + and isinstance(parsed_config.delivery_method, DeliverPermissions) ): return parsed_config.delivery_method.include_identities_stream return False From 496294938ba2ba16ed67aa3f7e6718b2c524d665 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 13:42:04 -0600 Subject: [PATCH 32/49] file-based: update docstrings for abstract methods ---
.../sources/file_based/file_based_stream_reader.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 4b53a6068..f70f758c9 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -192,7 +192,7 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> ACL Permissions from files. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support get_file_acl_permissions(), please update file_permissions_schema accordingly to obtain the required schema for each stream on the source implementation" + f"{self.__class__.__name__} required to support ACL permissions, please update file_permissions_schema accordingly to obtain the required schema for each stream on the source implementation" ) @abstractmethod @@ -202,9 +202,16 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any identities. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support load_identity_groups(), please add schema for your identities stream in schemas folder" + f"{self.__class__.__name__} required to support identities, please add schema for your identities stream in schemas folder" ) @property @abstractmethod - def file_permissions_schema(self) -> Dict[str, Any]: ... + def file_permissions_schema(self) -> Dict[str, Any]: + """ + This is required for connectors that will support syncing + ACL Permissions from files. 
+ """ + raise NotImplementedError( + f"{self.__class__.__name__} required to support ACL Permissions, please return required json schema for your permissions streams" + ) From ea31a2ddbade41d4f6c7908d9d88f763c3215f62 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 14:13:32 -0600 Subject: [PATCH 33/49] file-based: add method in stream reader to obtain Identities schema --- .../file_based/file_based_stream_reader.py | 17 ++++++++++++++--- .../file_based/stream/identities_stream.py | 5 +++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index f70f758c9..4b81d98a2 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -192,7 +192,7 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> ACL Permissions from files. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support ACL permissions, please update file_permissions_schema accordingly to obtain the required schema for each stream on the source implementation" + f"{self.__class__.__name__} required to support ACL permissions, please update file_permissions_schema accordingly." ) @abstractmethod @@ -202,7 +202,7 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any identities. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support identities, please add schema for your identities stream in schemas folder" + f"{self.__class__.__name__} required to support identities, please update identities_schema." ) @property @@ -213,5 +213,16 @@ def file_permissions_schema(self) -> Dict[str, Any]: ACL Permissions from files. 
""" raise NotImplementedError( - f"{self.__class__.__name__} required to support ACL Permissions, please return required json schema for your permissions streams" + f"{self.__class__.__name__} required to support ACL Permissions, please return required json schema for your permissions streams." + ) + + @property + @abstractmethod + def identities_schema(self) -> Dict[str, Any]: + """ + This is required for connectors that will support syncing + identities. + """ + raise NotImplementedError( + f"{self.__class__.__name__} required to support fetch Identities, please return required json schema for your Identities stream." ) diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 4215367a3..8b9983e98 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -10,6 +10,7 @@ from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader from airbyte_cdk.sources.streams.permissions.identities import Identities +from airbyte_cdk.sources.streams.core import JsonSchema class FileIdentities(Identities): @@ -40,3 +41,7 @@ def primary_key(self) -> PrimaryKeyType: def load_identity_groups(self) -> Iterable[Dict[str, Any]]: return self.stream_reader.load_identity_groups(logger=self.logger) + + @cache + def get_json_schema(self) -> JsonSchema: + return self.stream_reader.identities_schema From 428c6690ef96f49b2314b46ac3b1d3a69c1ce699 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Tue, 11 Feb 2025 20:22:08 +0000 Subject: [PATCH 34/49] Auto-fix lint and format issues --- airbyte_cdk/sources/file_based/stream/identities_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py 
b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 8b9983e98..14e14987d 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -9,8 +9,8 @@ from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader -from airbyte_cdk.sources.streams.permissions.identities import Identities from airbyte_cdk.sources.streams.core import JsonSchema +from airbyte_cdk.sources.streams.permissions.identities import Identities class FileIdentities(Identities): From 791241ccb3546eafe97ce4b4b41e18065f5f18a5 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 14:24:43 -0600 Subject: [PATCH 35/49] file-based: implement get schema method/property in unit tests --- unit_tests/sources/file_based/in_memory_files_source.py | 4 ++++ .../sources/file_based/test_file_based_stream_reader.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/unit_tests/sources/file_based/in_memory_files_source.py b/unit_tests/sources/file_based/in_memory_files_source.py index 244d58c88..c8ee78f0f 100644 --- a/unit_tests/sources/file_based/in_memory_files_source.py +++ b/unit_tests/sources/file_based/in_memory_files_source.py @@ -155,6 +155,10 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any def file_permissions_schema(self) -> Dict[str, Any]: return {"type": "object", "properties": {}} + @property + def identities_schema(self) -> Dict[str, Any]: + return {"type": "object", "properties": {}} + def open_file( self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger ) -> IOBase: diff --git a/unit_tests/sources/file_based/test_file_based_stream_reader.py b/unit_tests/sources/file_based/test_file_based_stream_reader.py index
d27055384..4a9d4e349 100644 --- a/unit_tests/sources/file_based/test_file_based_stream_reader.py +++ b/unit_tests/sources/file_based/test_file_based_stream_reader.py @@ -94,6 +94,10 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any def file_permissions_schema(self) -> Dict[str, Any]: return {"type": "object", "properties": {}} + @property + def identities_schema(self) -> Dict[str, Any]: + return {"type": "object", "properties": {}} + class TestSpec(AbstractFileBasedSpec): @classmethod From b74a0059d801a0f8d8796d7d01f3de60d1df80ba Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 14:36:59 -0600 Subject: [PATCH 36/49] file-based: update messages for not implemented errors --- .../sources/file_based/file_based_stream_reader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 4b81d98a2..789d0c3a9 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -192,7 +192,7 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> ACL Permissions from files. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support ACL permissions, please update file_permissions_schema accordingly." + f"{self.__class__.__name__} does not implement get_file_acl_permissions(). To support ACL permissions, implement this method and update file_permissions_schema." ) @abstractmethod @@ -202,7 +202,7 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any identities. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support identities, please update identities_schema." + f"{self.__class__.__name__} does not implement load_identity_groups(). To support identities, implement this method and update identities_schema." 
) @property @@ -213,7 +213,7 @@ def file_permissions_schema(self) -> Dict[str, Any]: ACL Permissions from files. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support ACL Permissions, please return required json schema for your permissions streams." + f"{self.__class__.__name__} does not implement file_permissions_schema, please return json schema for your permissions streams." ) @property @@ -224,5 +224,5 @@ def identities_schema(self) -> Dict[str, Any]: identities. """ raise NotImplementedError( - f"{self.__class__.__name__} required to support fetch Identities, please return required json schema for your Identities stream." + f"{self.__class__.__name__} does not implement identities_schema, please return json schema for your identities stream." ) From 2c08bb4121ad3626bfc95545f5c8ae31cf34c634 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Tue, 11 Feb 2025 23:05:37 -0600 Subject: [PATCH 37/49] file-based: create new stream for acls so we can save from if-else pain in default file based stream --- .../sources/file_based/file_based_source.py | 36 +++++++++++-- .../stream/default_file_based_stream.py | 31 ----------- .../stream/permissions_file_based_stream.py | 54 +++++++++++++++++++ 3 files changed, 86 insertions(+), 35 deletions(-) create mode 100644 airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index 60bfb254d..d8c9adb4b 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -67,6 +67,9 @@ FileBasedFinalStateCursor, ) from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.file_based.stream.permissions_file_based_stream import ( + PermissionsFileBasedStream, +) from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository from 
airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.concurrent.cursor import CursorField @@ -257,7 +260,7 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: message_repository=self.message_repository, ) stream = FileBasedStreamFacade.create_from_stream( - stream=self._make_default_stream( + stream=self._make_file_based_stream( stream_config=stream_config, cursor=cursor, parsed_config=parsed_config, @@ -288,7 +291,7 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: CursorField(DefaultFileBasedStream.ab_last_mod_col), ) stream = FileBasedStreamFacade.create_from_stream( - stream=self._make_default_stream( + stream=self._make_file_based_stream( stream_config=stream_config, cursor=cursor, parsed_config=parsed_config, @@ -300,7 +303,7 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: ) else: cursor = self.cursor_cls(stream_config) - stream = self._make_default_stream( + stream = self._make_file_based_stream( stream_config=stream_config, cursor=cursor, parsed_config=parsed_config, @@ -334,9 +337,34 @@ def _make_default_stream( cursor=cursor, use_file_transfer=use_file_transfer(parsed_config), preserve_directory_structure=preserve_directory_structure(parsed_config), - use_permissions_transfer=use_permissions_transfer(parsed_config), ) + def _make_permissions_stream( + self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor] + ) -> AbstractFileBasedStream: + return PermissionsFileBasedStream( + config=stream_config, + catalog_schema=self.stream_schemas.get(stream_config.name), + stream_reader=self.stream_reader, + availability_strategy=self.availability_strategy, + discovery_policy=self.discovery_policy, + parsers=self.parsers, + validation_policy=self._validate_and_get_validation_policy(stream_config), + errors_collector=self.errors_collector, + cursor=cursor, + ) + + def _make_file_based_stream( + self, + stream_config: FileBasedStreamConfig, + cursor: 
Optional[AbstractFileBasedCursor], + parsed_config: AbstractFileBasedSpec, + ) -> AbstractFileBasedStream: + if use_permissions_transfer(parsed_config): + return self._make_permissions_stream(stream_config, cursor) + else: + return self._make_default_stream(stream_config, cursor, parsed_config) + def _make_identities_stream( self, ) -> Stream: diff --git a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 9bb91b2ca..604322549 100644 --- a/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -47,7 +47,6 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): FILE_TRANSFER_KW = "use_file_transfer" PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure" - PERMISSIONS_TRANSFER_KW = "use_permissions_transfer" FILES_KEY = "files" DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" ab_last_mod_col = "_ab_source_file_last_modified" @@ -57,7 +56,6 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): airbyte_columns = [ab_last_mod_col, ab_file_name_col] use_file_transfer = False preserve_directory_structure = True - use_permissions_transfer = False def __init__(self, **kwargs: Any): if self.FILE_TRANSFER_KW in kwargs: @@ -66,8 +64,6 @@ def __init__(self, **kwargs: Any): self.preserve_directory_structure = kwargs.pop( self.PRESERVE_DIRECTORY_STRUCTURE_KW, True ) - if self.PERMISSIONS_TRANSFER_KW in kwargs: - self.use_permissions_transfer = kwargs.pop(self.PERMISSIONS_TRANSFER_KW, False) super().__init__(**kwargs) @property @@ -109,8 +105,6 @@ def _filter_schema_invalid_properties( self.ab_file_name_col: {"type": "string"}, }, } - elif self.use_permissions_transfer: - return self.stream_reader.file_permissions_schema else: return super()._filter_schema_invalid_properties(configured_catalog_json_schema) @@ -193,29 +187,6 @@ def read_records_from_slice(self, 
stream_slice: StreamSlice) -> Iterable[Airbyte yield stream_data_to_airbyte_message( self.name, record, is_file_transfer_message=True ) - elif self.use_permissions_transfer: - try: - permissions_record = self.stream_reader.get_file_acl_permissions( - file, logger=self.logger - ) - permissions_record = self.transform_record( - permissions_record, file, file_datetime_string - ) - yield stream_data_to_airbyte_message( - self.name, permissions_record, is_file_transfer_message=False - ) - except Exception as e: - self.logger.error( - f"Failed to retrieve permissions for file {file.uri}: {str(e)}" - ) - yield AirbyteMessage( - type=MessageType.LOG, - log=AirbyteLogMessage( - level=Level.ERROR, - message=f"Error retrieving files permissions: stream={self.name} file={file.uri}", - stack_trace=traceback.format_exc(), - ), - ) else: for record in parser.parse_records( self.config, file, self.stream_reader, self.logger, schema @@ -313,8 +284,6 @@ def get_json_schema(self) -> JsonSchema: def _get_raw_json_schema(self) -> JsonSchema: if self.use_file_transfer: return file_transfer_schema - elif self.use_permissions_transfer: - return self.stream_reader.file_permissions_schema elif self.config.input_schema: return self.config.get_input_schema() # type: ignore elif self.config.schemaless: diff --git a/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py new file mode 100644 index 000000000..d68bc01bd --- /dev/null +++ b/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2025 Airbyte, Inc., all rights reserved. 
+# + +import traceback +from typing import Any, Dict, Iterable + +from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.file_based.types import StreamSlice +from airbyte_cdk.sources.streams.core import JsonSchema +from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message +from airbyte_cdk.sources.file_based.stream import DefaultFileBasedStream + + +class PermissionsFileBasedStream(DefaultFileBasedStream): + """ + The permissions stream, stream_reader on source handles logic for schemas and ACLs permissions. + """ + + def _filter_schema_invalid_properties( + self, configured_catalog_json_schema: Dict[str, Any] + ) -> Dict[str, Any]: + return self.stream_reader.file_permissions_schema + + def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]: + """ + Yield permissions records from all remote files + """ + for file in stream_slice["files"]: + file_datetime_string = file.last_modified.strftime(self.DATE_TIME_FORMAT) + try: + permissions_record = self.stream_reader.get_file_acl_permissions( + file, logger=self.logger + ) + permissions_record = self.transform_record( + permissions_record, file, file_datetime_string + ) + yield stream_data_to_airbyte_message( + self.name, permissions_record, is_file_transfer_message=False + ) + except Exception as e: + self.logger.error(f"Failed to retrieve permissions for file {file.uri}: {str(e)}") + yield AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.ERROR, + message=f"Error retrieving files permissions: stream={self.name} file={file.uri}", + stack_trace=traceback.format_exc(), + ), + ) + + def _get_raw_json_schema(self) -> JsonSchema: + return self.stream_reader.file_permissions_schema From a71eb08f69d9048138415a6b443089a9c08d4b10 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Wed, 12 Feb 2025 05:11:57 +0000 Subject: [PATCH 38/49] 
Auto-fix lint and format issues --- .../sources/file_based/stream/permissions_file_based_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py index d68bc01bd..5984e9428 100644 --- a/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py @@ -7,10 +7,10 @@ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.sources.file_based.stream import DefaultFileBasedStream from airbyte_cdk.sources.file_based.types import StreamSlice from airbyte_cdk.sources.streams.core import JsonSchema from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message -from airbyte_cdk.sources.file_based.stream import DefaultFileBasedStream class PermissionsFileBasedStream(DefaultFileBasedStream): From a41934a5bc201d8e6d3d93a0f7f9b70b35e2b46f Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Wed, 12 Feb 2025 17:38:42 -0600 Subject: [PATCH 39/49] file-based: add unit tests for Permissions Stream --- .../test_permissions_file_based_stream.py | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py diff --git a/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py b/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py new file mode 100644 index 000000000..70b7ff5a4 --- /dev/null +++ b/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py @@ -0,0 +1,108 @@ +# +# Copyright (c) 2025 Airbyte, Inc., all rights reserved. 
+# + +import unittest +from copy import deepcopy +from datetime import datetime, timezone +from unittest.mock import Mock + +from airbyte_cdk.sources.file_based.availability_strategy import ( + AbstractFileBasedAvailabilityStrategy, +) +from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy +from airbyte_cdk.sources.file_based.exceptions import ( + FileBasedErrorsCollector, +) +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy +from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor +from airbyte_cdk.sources.file_based.stream.permissions_file_based_stream import ( + PermissionsFileBasedStream, +) + + +class MockFormat: + pass + + +class PermissionsFileBasedStreamTest(unittest.TestCase): + _NOW = datetime(2022, 10, 22, tzinfo=timezone.utc) + _A_RECORD = { + "id": "some_id", + "file_path": "Company_Files/Accounting/Financial_Statements/2023/February/Expenses_Feb.xlsx", + "allowed_identity_remote_ids": ["integration-test@somedomain.com"], + "publicly_accessible": False, + } + + _A_PERMISSIONS_SCHEMA = { + "type": "object", + "properties": { + "id": {"type": "string"}, + "remote_id": {"type": "string"}, + "parent_id": {"type": ["null", "string"]}, + "name": {"type": ["null", "string"]}, + "description": {"type": ["null", "string"]}, + "email_address": {"type": ["null", "string"]}, + "member_email_addresses": {"type": ["null", "array"]}, + "type": {"type": "string"}, + "modified_at": {"type": "string"}, + }, + } + + def setUp(self) -> None: + self._stream_config = Mock() + self._stream_config.format = MockFormat() + self._stream_config.name = "a stream name" + self._catalog_schema = Mock() + self._stream_reader = 
Mock(spec=AbstractFileBasedStreamReader) + self._availability_strategy = Mock(spec=AbstractFileBasedAvailabilityStrategy) + self._discovery_policy = Mock(spec=AbstractDiscoveryPolicy) + self._parser = Mock(spec=FileTypeParser) + self._validation_policy = Mock(spec=AbstractSchemaValidationPolicy) + self._validation_policy.name = "validation policy name" + self._cursor = Mock(spec=AbstractFileBasedCursor) + + self._stream_reader.file_permissions_schema = self._A_PERMISSIONS_SCHEMA + + self._stream = PermissionsFileBasedStream( + config=self._stream_config, + catalog_schema=self._catalog_schema, + stream_reader=self._stream_reader, + availability_strategy=self._availability_strategy, + discovery_policy=self._discovery_policy, + parsers={MockFormat: self._parser}, + validation_policy=self._validation_policy, + cursor=self._cursor, + errors_collector=FileBasedErrorsCollector(), + ) + + def test_when_read_records_from_slice_then_return_records(self) -> None: + # self._parser.parse_records.return_value = [self._A_RECORD] + self._stream_reader.get_file_acl_permissions.return_value = self._A_RECORD + messages = list( + self._stream.read_records_from_slice( + {"files": [RemoteFile(uri="uri", last_modified=self._NOW)]} + ) + ) + assert list(map(lambda message: message.record.data, messages)) == [self._A_RECORD] + + def test_when_transform_record_then_return_updated_record(self) -> None: + file = RemoteFile(uri="uri", last_modified=self._NOW) + last_updated = self._NOW.isoformat() + transformed_record = self._stream.transform_record(self._A_RECORD, file, last_updated) + assert transformed_record[self._stream.ab_last_mod_col] == last_updated + assert transformed_record[self._stream.ab_file_name_col] == file.uri + + def test_when_getting_schema(self): + returned_schema = self._stream.get_json_schema() + expected_schema = deepcopy(self._A_PERMISSIONS_SCHEMA) + expected_schema["properties"][PermissionsFileBasedStream.ab_last_mod_col] = { + "type": "string" + } + 
expected_schema["properties"][PermissionsFileBasedStream.ab_file_name_col] = { + "type": "string" + } + assert returned_schema == expected_schema From b764d46ec12054107d380db6c4c416668a457219 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 13 Feb 2025 08:03:17 -0600 Subject: [PATCH 40/49] file-based: add more tests for permissions file based stream --- .../stream/permissions_file_based_stream.py | 14 ++++++++++ .../test_permissions_file_based_stream.py | 28 ++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py index 5984e9428..d03b70c88 100644 --- a/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py @@ -27,12 +27,17 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Airbyte """ Yield permissions records from all remote files """ + for file in stream_slice["files"]: + no_permissions = False file_datetime_string = file.last_modified.strftime(self.DATE_TIME_FORMAT) try: permissions_record = self.stream_reader.get_file_acl_permissions( file, logger=self.logger ) + if not permissions_record: + no_permissions = True + continue permissions_record = self.transform_record( permissions_record, file, file_datetime_string ) @@ -49,6 +54,15 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Airbyte stack_trace=traceback.format_exc(), ), ) + finally: + if no_permissions: + yield AirbyteMessage( + type=MessageType.LOG, + log=AirbyteLogMessage( + level=Level.WARN, + message=f"Unable to fetch permissions. 
stream={self.name} file={file.uri}", + ), + ) def _get_raw_json_schema(self) -> JsonSchema: return self.stream_reader.file_permissions_schema diff --git a/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py b/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py index 70b7ff5a4..08f80eaf0 100644 --- a/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py +++ b/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py @@ -49,6 +49,7 @@ class PermissionsFileBasedStreamTest(unittest.TestCase): "member_email_addresses": {"type": ["null", "array"]}, "type": {"type": "string"}, "modified_at": {"type": "string"}, + "publicly_accessible": {"type": "boolean"}, }, } @@ -80,7 +81,6 @@ def setUp(self) -> None: ) def test_when_read_records_from_slice_then_return_records(self) -> None: - # self._parser.parse_records.return_value = [self._A_RECORD] self._stream_reader.get_file_acl_permissions.return_value = self._A_RECORD messages = list( self._stream.read_records_from_slice( @@ -106,3 +106,29 @@ def test_when_getting_schema(self): "type": "string" } assert returned_schema == expected_schema + + def test_when_read_records_from_slice_and_raise_exception(self) -> None: + self._stream_reader.get_file_acl_permissions.side_effect = Exception( + "ACL permissions retrieval failed" + ) + + messages = list( + self._stream.read_records_from_slice( + {"files": [RemoteFile(uri="uri", last_modified=self._NOW)]} + ) + ) + assert ( + messages[0].log.message + == "Error retrieving files permissions: stream=a stream name file=uri" + ) + + def test_when_read_records_from_slice_with_empty_permissions_then_return_empty(self) -> None: + self._stream_reader.get_file_acl_permissions.return_value = {} + messages = list( + self._stream.read_records_from_slice( + {"files": [RemoteFile(uri="uri", last_modified=self._NOW)]} + ) + ) + assert ( + messages[0].log.message == "Unable to fetch permissions. 
stream=a stream name file=uri" + ) From 9566b3ce40f44792d70710fff99cbe503e9c280a Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 13 Feb 2025 08:15:59 -0600 Subject: [PATCH 41/49] file-based: add docstrings to some methods --- .../config/validate_config_transfer_modes.py | 18 ++++++++++++++++++ .../stream/permissions_file_based_stream.py | 19 ++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py index 37053b0c1..5ac83b6ad 100644 --- a/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +++ b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py @@ -45,6 +45,15 @@ def preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool: def use_permissions_transfer(parsed_config: AbstractFileBasedSpec) -> bool: + """ + Determines whether to use permissions transfer to sync ACLs and Identities + + Args: + parsed_config: The parsed configuration containing delivery method settings + + Returns: + True if permissions transfer should be enabled, False otherwise + """ return ( hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY) and parsed_config.delivery_method.delivery_type @@ -53,6 +62,15 @@ def use_permissions_transfer(parsed_config: AbstractFileBasedSpec) -> bool: def include_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool: + """ + There are scenarios where user may not have access to identities but still is valuable to get ACLs + + Args: + parsed_config: The parsed configuration containing delivery method settings + + Returns: + True if we should include Identities stream. 
+ """ if ( use_permissions_transfer(parsed_config) and hasattr(parsed_config.delivery_method, INCLUDE_IDENTITIES_STREAM_KEY) diff --git a/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py b/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py index d03b70c88..75e201101 100644 --- a/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +++ b/airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py @@ -15,7 +15,15 @@ class PermissionsFileBasedStream(DefaultFileBasedStream): """ - The permissions stream, stream_reader on source handles logic for schemas and ACLs permissions. + A specialized stream for handling file-based ACL permissions. + + This stream works with the stream_reader to: + 1. Fetch ACL permissions for each file in the source + 2. Transform permissions into a standardized format + 3. Generate records containing permission information + + The stream_reader is responsible for the actual implementation of permission retrieval + and schema definition, while this class handles the streaming interface. """ def _filter_schema_invalid_properties( @@ -37,6 +45,9 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Airbyte ) if not permissions_record: no_permissions = True + self.logger.warning( + f"Unable to fetch permissions. stream={self.name} file={file.uri}" + ) continue permissions_record = self.transform_record( permissions_record, file, file_datetime_string @@ -65,4 +76,10 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Airbyte ) def _get_raw_json_schema(self) -> JsonSchema: + """ + Retrieve the raw JSON schema for file permissions from the stream reader. 
+ + Returns: + The file permissions schema that defines the structure of permission records + """ return self.stream_reader.file_permissions_schema From 8612d4b9deb126f68413a04e2e8a006757242060 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 13 Feb 2025 08:22:16 -0600 Subject: [PATCH 42/49] file-based: add docstrings to some methods --- airbyte_cdk/sources/file_based/file_based_source.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index d8c9adb4b..6eb219adc 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -360,8 +360,12 @@ def _make_file_based_stream( cursor: Optional[AbstractFileBasedCursor], parsed_config: AbstractFileBasedSpec, ) -> AbstractFileBasedStream: + """ + Creates different streams depending on the type of the transfer mode selected + """ if use_permissions_transfer(parsed_config): return self._make_permissions_stream(stream_config, cursor) + # we should have a stream for File transfer mode to decouple from DefaultFileBasedStream else: return self._make_default_stream(stream_config, cursor, parsed_config) From c94b704f7c5682a827ac0561b76b84792e233ec8 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 13 Feb 2025 09:03:14 -0600 Subject: [PATCH 43/49] file-based: user better example of schema for file permissions in test --- .../stream/test_permissions_file_based_stream.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py b/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py index 08f80eaf0..efc929c7e 100644 --- a/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py +++ b/unit_tests/sources/file_based/stream/test_permissions_file_based_stream.py @@ -41,14 +41,8 @@ class 
PermissionsFileBasedStreamTest(unittest.TestCase): "type": "object", "properties": { "id": {"type": "string"}, - "remote_id": {"type": "string"}, - "parent_id": {"type": ["null", "string"]}, - "name": {"type": ["null", "string"]}, - "description": {"type": ["null", "string"]}, - "email_address": {"type": ["null", "string"]}, - "member_email_addresses": {"type": ["null", "array"]}, - "type": {"type": "string"}, - "modified_at": {"type": "string"}, + "file_path": {"type": "string"}, + "allowed_identity_remote_ids": {"type": "array", "items": {"type": "string"}}, "publicly_accessible": {"type": "boolean"}, }, } From 722c7e08d257e3290f0a20bcbaf6c51691adde66 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 13 Feb 2025 09:59:44 -0600 Subject: [PATCH 44/49] file-based: add tests for identities stream --- .../sources/streams/permissions/identities.py | 4 +- .../concurrent/test_file_identities_stream.py | 94 +++++++++++++++++++ 2 files changed, 95 insertions(+), 3 deletions(-) create mode 100644 unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py diff --git a/airbyte_cdk/sources/streams/permissions/identities.py b/airbyte_cdk/sources/streams/permissions/identities.py index 2896e6f10..03dfe372f 100644 --- a/airbyte_cdk/sources/streams/permissions/identities.py +++ b/airbyte_cdk/sources/streams/permissions/identities.py @@ -15,8 +15,6 @@ from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message from airbyte_cdk.utils.traced_exception import AirbyteTracedException -DEFAULT_IDENTITIES_STREAM_NAME = "identities" - class Identities(Stream, ABC): """ @@ -24,7 +22,7 @@ class Identities(Stream, ABC): The load_identity_groups method manage the logic to get such data. 
""" - IDENTITIES_STREAM_NAME = DEFAULT_IDENTITIES_STREAM_NAME + IDENTITIES_STREAM_NAME = "identities" is_resumable = False diff --git a/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py b/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py new file mode 100644 index 000000000..431729bf6 --- /dev/null +++ b/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py @@ -0,0 +1,94 @@ +# +# Copyright (c) 2025 Airbyte, Inc., all rights reserved. +# + +import unittest +from datetime import datetime, timezone +from unittest.mock import Mock + +from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy +from airbyte_cdk.sources.file_based.exceptions import ( + FileBasedErrorsCollector, +) +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader +from airbyte_cdk.sources.file_based.stream import FileIdentities +from airbyte_protocol_dataclasses.models import SyncMode + + +class MockFormat: + pass + + +class IdentitiesFileBasedStreamTest(unittest.TestCase): + _NOW = datetime(2022, 10, 22, tzinfo=timezone.utc) + _A_RECORD = { + "id": "923496ab-3eee-47d2-a824-b237e630082a", + "remote_id": "user1@domain.com", + "name": "user ond", + "email_address": "user1@domain.com", + "member_email_addresses": ["user1@domain.com", "user1@domain.com.test-google-a.com"], + "type": "user", + "modified_at": "2025-02-12T23:06:45.304942+00:00", + } + + _GROUP_RECORD = { + "id": "ebf97e50-a010-4daa-b1ce-b47494e7fb10", + "remote_id": "team_work@domain.com", + "name": "team_work", + "email_address": "team_work@domain.com", + "member_email_addresses": ["user1@domain.com", "user2@domain.com"], + "type": "group", + "modified_at": "2025-02-12T23:06:45.604572+00:00", + } + + _IDENTITIES_SCHEMA = { + "type": "object", + "properties": { + "id": {"type": "string"}, + "remote_id": {"type": "string"}, + "parent_id": {"type": ["null", "string"]}, + "name": {"type": 
["null", "string"]}, + "description": {"type": ["null", "string"]}, + "email_address": {"type": ["null", "string"]}, + "member_email_addresses": {"type": ["null", "array"]}, + "type": {"type": "string"}, + "modified_at": {"type": "string"}, + }, + } + + def setUp(self) -> None: + self._catalog_schema = Mock() + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._discovery_policy = Mock(spec=AbstractDiscoveryPolicy) + + self._stream_reader.identities_schema = self._IDENTITIES_SCHEMA + + self._stream = FileIdentities( + catalog_schema=self._catalog_schema, + stream_reader=self._stream_reader, + discovery_policy=self._discovery_policy, + errors_collector=FileBasedErrorsCollector(), + ) + + def test_when_read_records_then_return_records(self) -> None: + self._stream_reader.load_identity_groups.return_value = [self._A_RECORD, self._GROUP_RECORD] + messages = list(self._stream.read_records(SyncMode.full_refresh)) + assert list(map(lambda message: message.record.data, messages)) == [ + self._A_RECORD, + self._GROUP_RECORD, + ] + + def test_when_getting_schema(self): + returned_schema = self._stream.get_json_schema() + assert returned_schema == self._IDENTITIES_SCHEMA + + def test_when_read_records_and_raise_exception(self) -> None: + self._stream_reader.load_identity_groups.side_effect = Exception( + "Identities retrieval failed" + ) + + messages = list(self._stream.read_records(SyncMode.full_refresh)) + assert ( + messages[0].log.message + == "Error trying to read identities: Identities retrieval failed stream=identities" + ) From 54d5ec6ab493453f77448c9957b553355f08e223 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 13 Feb 2025 16:02:52 +0000 Subject: [PATCH 45/49] Auto-fix lint and format issues --- .../stream/concurrent/test_file_identities_stream.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py 
b/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py index 431729bf6..488437b9d 100644 --- a/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py +++ b/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py @@ -6,13 +6,14 @@ from datetime import datetime, timezone from unittest.mock import Mock +from airbyte_protocol_dataclasses.models import SyncMode + from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy from airbyte_cdk.sources.file_based.exceptions import ( FileBasedErrorsCollector, ) from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader from airbyte_cdk.sources.file_based.stream import FileIdentities -from airbyte_protocol_dataclasses.models import SyncMode class MockFormat: From afdd60a5d57d663156916dc77557988d44aeedb7 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 13 Feb 2025 10:09:25 -0600 Subject: [PATCH 46/49] file-based: minor change in unit tests --- .../stream/concurrent/test_file_identities_stream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py b/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py index 488437b9d..88020180b 100644 --- a/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py +++ b/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py @@ -23,9 +23,9 @@ class MockFormat: class IdentitiesFileBasedStreamTest(unittest.TestCase): _NOW = datetime(2022, 10, 22, tzinfo=timezone.utc) _A_RECORD = { - "id": "923496ab-3eee-47d2-a824-b237e630082a", + "id": "userid1", "remote_id": "user1@domain.com", - "name": "user ond", + "name": "user one", "email_address": "user1@domain.com", "member_email_addresses": ["user1@domain.com", "user1@domain.com.test-google-a.com"], "type": "user", @@ -33,9 +33,9 @@ class 
IdentitiesFileBasedStreamTest(unittest.TestCase): } _GROUP_RECORD = { - "id": "ebf97e50-a010-4daa-b1ce-b47494e7fb10", + "id": "groupid1", "remote_id": "team_work@domain.com", - "name": "team_work", + "name": "team work", "email_address": "team_work@domain.com", "member_email_addresses": ["user1@domain.com", "user2@domain.com"], "type": "group", From 7b2ffce97ce2718c8b22cdcc73b417f6195a0d4a Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 13 Feb 2025 12:15:41 -0600 Subject: [PATCH 47/49] file-based: move test to correct folder --- .../stream/{concurrent => }/test_file_identities_stream.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename unit_tests/sources/file_based/stream/{concurrent => }/test_file_identities_stream.py (100%) diff --git a/unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py b/unit_tests/sources/file_based/stream/test_file_identities_stream.py similarity index 100% rename from unit_tests/sources/file_based/stream/concurrent/test_file_identities_stream.py rename to unit_tests/sources/file_based/stream/test_file_identities_stream.py From 1b729d7f1a807f57b5f4ba32057396bcd33595bb Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Fri, 14 Feb 2025 09:49:46 -0600 Subject: [PATCH 48/49] file-based: rename streams to follow pattern and add more docs to methods --- .../config/validate_config_transfer_modes.py | 1 + .../sources/file_based/file_based_source.py | 12 ++-- .../file_based/file_based_stream_reader.py | 68 ++++++++++++++++--- .../sources/file_based/stream/__init__.py | 12 +++- .../file_based/stream/identities_stream.py | 4 +- .../{identities.py => identitiesstream.py} | 2 +- .../stream/test_file_identities_stream.py | 4 +- 7 files changed, 81 insertions(+), 22 deletions(-) rename airbyte_cdk/sources/streams/permissions/{identities.py => identitiesstream.py} (98%) diff --git a/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py 
b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py index 5ac83b6ad..2b83eafbb 100644 --- a/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +++ b/airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py @@ -16,6 +16,7 @@ def use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool: + """Returns `True` if the configuration uses file transfer mode.""" return ( hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY) and parsed_config.delivery_method.delivery_type == DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE diff --git a/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte_cdk/sources/file_based/file_based_source.py index 6eb219adc..6cf0aa392 100644 --- a/airbyte_cdk/sources/file_based/file_based_source.py +++ b/airbyte_cdk/sources/file_based/file_based_source.py @@ -58,7 +58,8 @@ from airbyte_cdk.sources.file_based.stream import ( AbstractFileBasedStream, DefaultFileBasedStream, - FileIdentities, + FileIdentitiesStream, + PermissionsFileBasedStream, ) from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade from airbyte_cdk.sources.file_based.stream.concurrent.cursor import ( @@ -67,9 +68,6 @@ FileBasedFinalStateCursor, ) from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor -from airbyte_cdk.sources.file_based.stream.permissions_file_based_stream import ( - PermissionsFileBasedStream, -) from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.concurrent.cursor import CursorField @@ -171,7 +169,7 @@ def check_connection( errors = [] tracebacks = [] for stream in streams: - if isinstance(stream, FileIdentities): + if isinstance(stream, FileIdentitiesStream): identity = next(iter(stream.load_identity_groups())) if not identity: errors.append( @@ -372,8 +370,8 @@ def _make_file_based_stream( def 
_make_identities_stream( self, ) -> Stream: - return FileIdentities( - catalog_schema=self.stream_schemas.get(FileIdentities.IDENTITIES_STREAM_NAME), + return FileIdentitiesStream( + catalog_schema=self.stream_schemas.get(FileIdentitiesStream.IDENTITIES_STREAM_NAME), stream_reader=self.stream_reader, discovery_policy=self.discovery_policy, errors_collector=self.errors_collector, diff --git a/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte_cdk/sources/file_based/file_based_stream_reader.py index 789d0c3a9..d5cd759bf 100644 --- a/airbyte_cdk/sources/file_based/file_based_stream_reader.py +++ b/airbyte_cdk/sources/file_based/file_based_stream_reader.py @@ -188,8 +188,17 @@ def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> Li @abstractmethod def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]: """ - This is required for connectors that will support syncing - ACL Permissions from files. + This function should return the allow list for a given file, i.e. the list of all identities and their permission levels associated with it + + e.g. + def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger): + api_conn = some_api.conn(credentials=SOME_CREDENTIALS) + result = api_conn.get_file_permissions_info(file.id) + return MyPermissionsModel( + id=result["id"], + access_control_list = result["access_control_list"], + is_public = result["is_public"], + ).dict() """ raise NotImplementedError( f"{self.__class__.__name__} does not implement get_file_acl_permissions(). To support ACL permissions, implement this method and update file_permissions_schema." @@ -198,8 +207,22 @@ def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> @abstractmethod def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: """ - This is required for connectors that will support syncing - identities. 
+ This function should return the Identities in a given "space" or "domain" where the file metadata (ACLs) is fetched and the ACL items (Identities) exist. + + e.g. + def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]: + api_conn = some_api.conn(credentials=SOME_CREDENTIALS) + users_api = api_conn.users() + groups_api = api_conn.groups() + members_api = api_conn.members() + for user in users_api.list(): + yield my_identity_model(id=user.id, name=user.name, email_address=user.email, type="user").dict() + for group in groups_api.list(): + group_obj = my_identity_model(id=group.id, name=group.name, email_address=group.email, type="group") + for member in members_api.list(group=group): + group_obj.member_email_addresses = group_obj.member_email_addresses or [] + group_obj.member_email_addresses.append(member.email) + yield group_obj.dict() """ raise NotImplementedError( f"{self.__class__.__name__} does not implement load_identity_groups(). To support identities, implement this method and update identities_schema." @@ -209,8 +232,23 @@ def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any @abstractmethod def file_permissions_schema(self) -> Dict[str, Any]: """ - This is required for connectors that will support syncing - ACL Permissions from files. + This function should return the permissions schema for the file permissions stream. + + e.g. + def file_permissions_schema(self) -> Dict[str, Any]: + # you can also follow the pattern we have for python connectors and have a json file and read from there e.g.
schemas/identities.json + return { + "type": "object", + "properties": { + "id": { "type": "string" }, + "file_path": { "type": "string" }, + "access_control_list": { + "type": "array", + "items": { "type": "string" } + }, + "publicly_accessible": { "type": "boolean" } + } + } """ raise NotImplementedError( f"{self.__class__.__name__} does not implement file_permissions_schema, please return json schema for your permissions streams." @@ -220,8 +258,22 @@ def file_permissions_schema(self) -> Dict[str, Any]: @abstractmethod def identities_schema(self) -> Dict[str, Any]: """ - This is required for connectors that will support syncing - identities. + This function should return the identities schema for file identity stream. + + e.g. + def identities_schema(self) -> Dict[str, Any]: + # you can also follow the patter we have for python connectors and have a json file and read from there e.g. schemas/identities.json + return { + "type": "object", + "properties": { + "id": { "type": "string" }, + "remote_id": { "type": "string" }, + "name": { "type": ["null", "string"] }, + "email_address": { "type": ["null", "string"] }, + "member_email_addresses": { "type": ["null", "array"] }, + "type": { "type": "string" }, + } + } """ raise NotImplementedError( f"{self.__class__.__name__} does not implement identities_schema, please return json schema for your identities stream." 
diff --git a/airbyte_cdk/sources/file_based/stream/__init__.py b/airbyte_cdk/sources/file_based/stream/__init__.py index 0c1359343..8bba79029 100644 --- a/airbyte_cdk/sources/file_based/stream/__init__.py +++ b/airbyte_cdk/sources/file_based/stream/__init__.py @@ -1,5 +1,13 @@ from airbyte_cdk.sources.file_based.stream.abstract_file_based_stream import AbstractFileBasedStream from airbyte_cdk.sources.file_based.stream.default_file_based_stream import DefaultFileBasedStream -from airbyte_cdk.sources.file_based.stream.identities_stream import FileIdentities +from airbyte_cdk.sources.file_based.stream.identities_stream import FileIdentitiesStream +from airbyte_cdk.sources.file_based.stream.permissions_file_based_stream import ( + PermissionsFileBasedStream, +) -__all__ = ["AbstractFileBasedStream", "DefaultFileBasedStream", "FileIdentities"] +__all__ = [ + "AbstractFileBasedStream", + "DefaultFileBasedStream", + "FileIdentitiesStream", + "PermissionsFileBasedStream", +] diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 14e14987d..837140ff8 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -10,10 +10,10 @@ from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader from airbyte_cdk.sources.streams.core import JsonSchema -from airbyte_cdk.sources.streams.permissions.identities import Identities +from airbyte_cdk.sources.streams.permissions.identitiesstream import IdentitiesStream -class FileIdentities(Identities): +class FileIdentitiesStream(IdentitiesStream): """ The identities stream. A full refresh stream to sync identities from a certain domain. The stream reader manage the logic to get such data, which is implemented on connector side. 
diff --git a/airbyte_cdk/sources/streams/permissions/identities.py b/airbyte_cdk/sources/streams/permissions/identitiesstream.py similarity index 98% rename from airbyte_cdk/sources/streams/permissions/identities.py rename to airbyte_cdk/sources/streams/permissions/identitiesstream.py index 03dfe372f..8101234c5 100644 --- a/airbyte_cdk/sources/streams/permissions/identities.py +++ b/airbyte_cdk/sources/streams/permissions/identitiesstream.py @@ -16,7 +16,7 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException -class Identities(Stream, ABC): +class IdentitiesStream(Stream, ABC): """ The identities stream. A full refresh stream to sync identities from a certain domain. The load_identity_groups method manage the logic to get such data. diff --git a/unit_tests/sources/file_based/stream/test_file_identities_stream.py b/unit_tests/sources/file_based/stream/test_file_identities_stream.py index 88020180b..59ba53166 100644 --- a/unit_tests/sources/file_based/stream/test_file_identities_stream.py +++ b/unit_tests/sources/file_based/stream/test_file_identities_stream.py @@ -13,7 +13,7 @@ FileBasedErrorsCollector, ) from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader -from airbyte_cdk.sources.file_based.stream import FileIdentities +from airbyte_cdk.sources.file_based.stream import FileIdentitiesStream class MockFormat: @@ -64,7 +64,7 @@ def setUp(self) -> None: self._stream_reader.identities_schema = self._IDENTITIES_SCHEMA - self._stream = FileIdentities( + self._stream = FileIdentitiesStream( catalog_schema=self._catalog_schema, stream_reader=self._stream_reader, discovery_policy=self._discovery_policy, From 7924c3d1b3b5692b62cd20b4f01d464b78ffb71d Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Fri, 14 Feb 2025 09:54:45 -0600 Subject: [PATCH 49/49] file-based: rename stream file to correct pattern --- airbyte_cdk/sources/file_based/stream/identities_stream.py | 2 +- .../permissions/{identitiesstream.py => 
identities_stream.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename airbyte_cdk/sources/streams/permissions/{identitiesstream.py => identities_stream.py} (100%) diff --git a/airbyte_cdk/sources/file_based/stream/identities_stream.py b/airbyte_cdk/sources/file_based/stream/identities_stream.py index 837140ff8..d0c33baa1 100644 --- a/airbyte_cdk/sources/file_based/stream/identities_stream.py +++ b/airbyte_cdk/sources/file_based/stream/identities_stream.py @@ -10,7 +10,7 @@ from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader from airbyte_cdk.sources.streams.core import JsonSchema -from airbyte_cdk.sources.streams.permissions.identitiesstream import IdentitiesStream +from airbyte_cdk.sources.streams.permissions.identities_stream import IdentitiesStream class FileIdentitiesStream(IdentitiesStream): diff --git a/airbyte_cdk/sources/streams/permissions/identitiesstream.py b/airbyte_cdk/sources/streams/permissions/identities_stream.py similarity index 100% rename from airbyte_cdk/sources/streams/permissions/identitiesstream.py rename to airbyte_cdk/sources/streams/permissions/identities_stream.py