Skip to content

Commit c109297

Browse files
feat(file-based): changes for not mirroring paths (#205)
The Source config receives a new option: Preserve Sub-Directories in File Paths. By default this is enabled (the current behavior). The new option should only appear when the "Copy Raw Files" sync mode is enabled. When the option is disabled, the sync will: (1) Validate uniqueness. At the start of each read operation, the source will check all files that exist and are defined in the stream. This will be performed once per stream. If any files share the same file name, the operation will fail. (2) Sync without intermediate subdirectory information. During sync, the source will send relative filenames which exclude any path info between the extract root and the filename. To the destination, each file will appear to exist at the root of the extract location.
1 parent 76b5306 commit c109297

File tree

9 files changed

+375
-15
lines changed

9 files changed

+375
-15
lines changed

airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py

+11
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,17 @@ class Config(OneOfOptionConfig):
3131

3232
delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)
3333

34+
preserve_directory_structure: bool = Field(
35+
title="Preserve Sub-Directories in File Paths",
36+
description=(
37+
"If enabled, sends subdirectory folder structure "
38+
"along with source file names to the destination. "
39+
"Otherwise, files will be synced by their names only. "
40+
"This option is ignored when file-based replication is not enabled."
41+
),
42+
default=True,
43+
)
44+
3445

3546
class AbstractFileBasedSpec(BaseModel):
3647
"""

airbyte_cdk/sources/file_based/exceptions.py

+34
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,40 @@ class ErrorListingFiles(BaseFileBasedSourceError):
111111
pass
112112

113113

114+
class DuplicatedFilesError(BaseFileBasedSourceError):
    """Error raised when several files of one stream share the same file name.

    The generated message lists every duplicated file name together with the
    full paths that collide, so the user can rename or remove them.
    """

    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
        """Build the error and its user-facing message.

        Args:
            duplicated_files_names: Mappings of a duplicated file name to the
                list of paths that share that name.
            **kwargs: Forwarded to the base exception. Must contain ``stream``,
                the name of the stream in which the duplicates were found.

        Raises:
            KeyError: If ``stream`` is not present in ``kwargs``.
        """
        self._duplicated_files_names = duplicated_files_names
        self._stream_name: str = kwargs["stream"]
        super().__init__(self._format_duplicate_files_error_message(), **kwargs)

    def _format_duplicate_files_error_message(self) -> str:
        """Return the full error text: a fixed explanation plus one section per duplicated name."""
        duplicated_files_messages = []
        for duplicated_file in self._duplicated_files_names:
            for duplicated_file_name, file_paths in duplicated_file.items():
                file_duplicated_message = (
                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
                    # One bullet per colliding path (joining a single f-string
                    # would only re-emit the list repr on one line).
                    + "".join(f"\n - {file_path}" for file_path in file_paths)
                )
                duplicated_files_messages.append(file_duplicated_message)

        error_message = (
            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
            + "\n".join(duplicated_files_messages)
        )

        return error_message

    def __repr__(self) -> str:
        """Return a string representation of the exception.

        Only public instance attributes (names not starting with ``_``) are included.
        """
        class_name = self.__class__.__name__
        properties_str = ", ".join(
            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
        )
        return f"{class_name}({properties_str})"
146+
147+
114148
class CustomFileBasedException(AirbyteTracedException):
115149
"""
116150
A specialized exception for file-based connectors.

airbyte_cdk/sources/file_based/file_based_source.py

+28-5
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
242242
stream=self._make_default_stream(
243243
stream_config=stream_config,
244244
cursor=cursor,
245-
use_file_transfer=self._use_file_transfer(parsed_config),
245+
parsed_config=parsed_config,
246246
),
247247
source=self,
248248
logger=self.logger,
@@ -273,7 +273,7 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
273273
stream=self._make_default_stream(
274274
stream_config=stream_config,
275275
cursor=cursor,
276-
use_file_transfer=self._use_file_transfer(parsed_config),
276+
parsed_config=parsed_config,
277277
),
278278
source=self,
279279
logger=self.logger,
@@ -285,7 +285,7 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
285285
stream = self._make_default_stream(
286286
stream_config=stream_config,
287287
cursor=cursor,
288-
use_file_transfer=self._use_file_transfer(parsed_config),
288+
parsed_config=parsed_config,
289289
)
290290

291291
streams.append(stream)
@@ -298,7 +298,7 @@ def _make_default_stream(
298298
self,
299299
stream_config: FileBasedStreamConfig,
300300
cursor: Optional[AbstractFileBasedCursor],
301-
use_file_transfer: bool = False,
301+
parsed_config: AbstractFileBasedSpec,
302302
) -> AbstractFileBasedStream:
303303
return DefaultFileBasedStream(
304304
config=stream_config,
@@ -310,7 +310,8 @@ def _make_default_stream(
310310
validation_policy=self._validate_and_get_validation_policy(stream_config),
311311
errors_collector=self.errors_collector,
312312
cursor=cursor,
313-
use_file_transfer=use_file_transfer,
313+
use_file_transfer=self._use_file_transfer(parsed_config),
314+
preserve_directory_structure=self._preserve_directory_structure(parsed_config),
314315
)
315316

316317
def _get_stream_from_catalog(
@@ -385,3 +386,25 @@ def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
385386
and parsed_config.delivery_method.delivery_type == "use_file_transfer"
386387
)
387388
return use_file_transfer
389+
390+
@staticmethod
def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
    """
    Tell whether synced files should keep their sub-directory paths.

    The configured value is only honoured when file transfer is in use and the
    delivery method carries a non-null ``preserve_directory_structure`` value;
    in every other case the answer is True.

    Args:
        parsed_config: The parsed configuration containing delivery method settings

    Returns:
        The configured flag when available, otherwise True (the default)
    """
    if not FileBasedSource._use_file_transfer(parsed_config):
        return True

    # A missing attribute and an explicit None are treated alike: keep the default.
    configured_value = getattr(
        parsed_config.delivery_method, "preserve_directory_structure", None
    )
    if configured_value is None:
        return True
    return configured_value

airbyte_cdk/sources/file_based/file_based_stream_reader.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,17 @@ def use_file_transfer(self) -> bool:
135135
return use_file_transfer
136136
return False
137137

138+
def preserve_directory_structure(self) -> bool:
    """Return whether sub-directory paths are kept for transferred files.

    Falls back to True (preserve sub-directories) when file transfer is not in
    use, when no config is set, or when the delivery method has no non-null
    ``preserve_directory_structure`` value.
    """
    if not self.use_file_transfer() or not self.config:
        return True

    # Missing attribute and explicit None both mean "not configured".
    configured_value = getattr(
        self.config.delivery_method, "preserve_directory_structure", None
    )
    if configured_value is None:
        return True
    return configured_value
148+
138149
@abstractmethod
139150
def get_file(
140151
self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -159,10 +170,13 @@ def get_file(
159170
"""
160171
...
161172

162-
@staticmethod
163-
def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]:
164-
# Remove left slashes from source path format to make relative path for writing locally
165-
file_relative_path = file.uri.lstrip("/")
173+
def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
174+
preserve_directory_structure = self.preserve_directory_structure()
175+
if preserve_directory_structure:
176+
# Remove left slashes from source path format to make relative path for writing locally
177+
file_relative_path = file.uri.lstrip("/")
178+
else:
179+
file_relative_path = path.basename(file.uri)
166180
local_file_path = path.join(local_directory, file_relative_path)
167181

168182
# Ensure the local directory exists

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

+25-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
33
#
44
import logging
5+
import os
56
import traceback
67
from datetime import datetime
78
from io import BytesIO, IOBase
@@ -42,12 +43,34 @@
4243
unstructured_partition_docx = None
4344
unstructured_partition_pptx = None
4445

46+
# Preferred location for NLTK data, and the fallback used when it cannot be created.
AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
TMP_NLTK_DATA_DIR = "/tmp/nltk_data"


def get_nltk_temp_folder() -> str:
    """
    Create and return the directory in which NLTK data should be stored.

    /airbyte is tried first because /tmp is not currently writable for non-root
    connectors (this may be allowed in the future). If that directory cannot be
    created, /tmp is used instead, which covers local development.
    """
    try:
        os.makedirs(AIRBYTE_NLTK_DATA_DIR, exist_ok=True)
    except OSError:
        # Preferred directory is unavailable; an error from the fallback propagates.
        os.makedirs(TMP_NLTK_DATA_DIR, exist_ok=True)
        return TMP_NLTK_DATA_DIR
    return AIRBYTE_NLTK_DATA_DIR
62+
63+
4564
# Make sure the NLTK resources used by the unstructured parser are available,
# downloading into our own data directory only the ones that are missing.
nltk_data_dir = get_nltk_temp_folder()
nltk.data.path.append(nltk_data_dir)
for _nltk_resource, _nltk_package in (
    ("tokenizers/punkt.zip", "punkt"),
    ("tokenizers/punkt_tab.zip", "punkt_tab"),
    # The POS tagger is published under "taggers/", not "tokenizers/"; looking
    # it up in the wrong category always fails and forces a download attempt
    # on every import.
    ("taggers/averaged_perceptron_tagger_eng.zip", "averaged_perceptron_tagger_eng"),
):
    try:
        nltk.data.find(_nltk_resource)
    except LookupError:
        nltk.download(_nltk_package, download_dir=nltk_data_dir, quiet=True)
5174

5275

5376
def optional_decode(contents: Union[str, bytes]) -> str:

airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

+30-2
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,17 @@
55
import asyncio
66
import itertools
77
import traceback
8+
from collections import defaultdict
89
from copy import deepcopy
910
from functools import cache
10-
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union
11+
from os import path
12+
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
1113

1214
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
1315
from airbyte_cdk.models import Type as MessageType
1416
from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
1517
from airbyte_cdk.sources.file_based.exceptions import (
18+
DuplicatedFilesError,
1619
FileBasedSourceError,
1720
InvalidSchemaError,
1821
MissingSchemaError,
@@ -43,17 +46,24 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
4346
"""
4447

4548
FILE_TRANSFER_KW = "use_file_transfer"
49+
PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
50+
FILES_KEY = "files"
4651
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
4752
ab_last_mod_col = "_ab_source_file_last_modified"
4853
ab_file_name_col = "_ab_source_file_url"
4954
modified = "modified"
5055
source_file_url = "source_file_url"
5156
airbyte_columns = [ab_last_mod_col, ab_file_name_col]
5257
use_file_transfer = False
58+
preserve_directory_structure = True
5359

5460
def __init__(self, **kwargs: Any):
    """Consume the stream-specific keyword options, then delegate to the parent.

    ``use_file_transfer`` and ``preserve_directory_structure`` are removed from
    ``kwargs`` when present and stored on the instance; when absent, the
    class-level defaults stay in effect. All remaining keyword arguments are
    passed to the parent initializer.
    """
    option_keywords = (
        ("use_file_transfer", self.FILE_TRANSFER_KW),
        ("preserve_directory_structure", self.PRESERVE_DIRECTORY_STRUCTURE_KW),
    )
    for attribute_name, keyword in option_keywords:
        # Only override the class default when the caller supplied the option.
        if keyword in kwargs:
            setattr(self, attribute_name, kwargs.pop(keyword))
    super().__init__(**kwargs)
5868

5969
@property
@@ -98,15 +108,33 @@ def _filter_schema_invalid_properties(
98108
else:
99109
return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
100110

111+
def _duplicated_files_names(
    self, slices: List[dict[str, List["RemoteFile"]]]
) -> List[dict[str, List[str]]]:
    """Find file names that occur more than once across all slices.

    Files are grouped by the base name of their URI. The result contains one
    single-entry mapping ``{file_name: [uri, ...]}`` for every name shared by
    at least two files, in order of first appearance.
    """
    uris_by_name: Dict[str, List[str]] = {}
    all_uris = (
        remote_file.uri for file_slice in slices for remote_file in file_slice[self.FILES_KEY]
    )
    for uri in all_uris:
        uris_by_name.setdefault(path.basename(uri), []).append(uri)

    duplicates = []
    for file_name, uris in uris_by_name.items():
        if len(uris) > 1:
            duplicates.append({file_name: uris})
    return duplicates
122+
101123
def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
    """Return the files to sync, grouped into one slice per last-modified value.

    Files selected by the cursor are ordered by (last_modified, uri) and then
    grouped by last_modified. When directory structure is not preserved, the
    slices are additionally checked for files sharing the same file name.

    Raises:
        DuplicatedFilesError: If directory structure is not preserved and two
            or more files to sync have the same file name.
    """
    all_files = self.list_files()
    candidates = self._cursor.get_files_to_sync(all_files, self.logger)
    ordered = sorted(
        candidates, key=lambda remote_file: (remote_file.last_modified, remote_file.uri)
    )
    slices = [
        {self.FILES_KEY: list(same_timestamp_files)}
        for _, same_timestamp_files in itertools.groupby(
            ordered, lambda remote_file: remote_file.last_modified
        )
    ]

    # Nothing to validate when there are no files or paths are kept intact.
    if not slices or self.preserve_directory_structure:
        return slices

    duplicates = self._duplicated_files_names(slices)
    if duplicates:
        raise DuplicatedFilesError(stream=self.name, duplicated_files_names=duplicates)
    return slices
111139

112140
def transform_record(

unit_tests/sources/file_based/scenarios/csv_scenarios.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -523,7 +523,13 @@
523523
"const": "use_file_transfer",
524524
"enum": ["use_file_transfer"],
525525
"type": "string",
526-
}
526+
},
527+
"preserve_directory_structure": {
528+
"default": True,
529+
"description": "If enabled, sends subdirectory folder structure along with source file names to the destination. Otherwise, files will be synced by their names only. This option is ignored when file-based replication is not enabled.",
530+
"title": "Preserve Sub-Directories in File Paths",
531+
"type": "boolean",
532+
},
527533
},
528534
"description": "Copy raw files without parsing their contents. Bits are copied into the destination exactly as they appeared in the source. Recommended for use with unstructured text data, non-text and compressed files.",
529535
"required": ["delivery_type"],

0 commit comments

Comments
 (0)