|
5 | 5 | import asyncio
|
6 | 6 | import itertools
|
7 | 7 | import traceback
|
| 8 | +from collections import defaultdict |
8 | 9 | from copy import deepcopy
|
9 | 10 | from functools import cache
|
10 |
| -from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union |
| 11 | +from os import path |
| 12 | +from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union |
11 | 13 |
|
12 | 14 | from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
|
13 | 15 | from airbyte_cdk.models import Type as MessageType
|
14 | 16 | from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
|
15 | 17 | from airbyte_cdk.sources.file_based.exceptions import (
|
| 18 | + DuplicatedFilesError, |
16 | 19 | FileBasedSourceError,
|
17 | 20 | InvalidSchemaError,
|
18 | 21 | MissingSchemaError,
|
@@ -43,17 +46,24 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
43 | 46 | """
|
44 | 47 |
|
    # Kwarg name that toggles raw-file transfer mode (popped in __init__).
    FILE_TRANSFER_KW = "use_file_transfer"
    # Kwarg name controlling directory-structure preservation (popped in __init__).
    PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
    # Key under which each stream slice stores its list of files.
    FILES_KEY = "files"
    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
    # Airbyte metadata column names added to emitted records.
    ab_last_mod_col = "_ab_source_file_last_modified"
    ab_file_name_col = "_ab_source_file_url"
    modified = "modified"
    source_file_url = "source_file_url"
    airbyte_columns = [ab_last_mod_col, ab_file_name_col]
    # Class-level defaults; both may be overridden per-instance in __init__
    # when the corresponding kwarg is supplied.
    use_file_transfer = False
    # When False, compute_slices rejects slices containing duplicate file base
    # names (presumably because paths get flattened downstream — TODO confirm).
    preserve_directory_structure = True
53 | 59 |
|
54 | 60 | def __init__(self, **kwargs: Any):
|
55 | 61 | if self.FILE_TRANSFER_KW in kwargs:
|
56 | 62 | self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
|
| 63 | + if self.PRESERVE_DIRECTORY_STRUCTURE_KW in kwargs: |
| 64 | + self.preserve_directory_structure = kwargs.pop( |
| 65 | + self.PRESERVE_DIRECTORY_STRUCTURE_KW, True |
| 66 | + ) |
57 | 67 | super().__init__(**kwargs)
|
58 | 68 |
|
59 | 69 | @property
|
@@ -98,15 +108,33 @@ def _filter_schema_invalid_properties(
|
98 | 108 | else:
|
99 | 109 | return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
|
100 | 110 |
|
| 111 | + def _duplicated_files_names( |
| 112 | + self, slices: List[dict[str, List[RemoteFile]]] |
| 113 | + ) -> List[dict[str, List[str]]]: |
| 114 | + seen_file_names: Dict[str, List[str]] = defaultdict(list) |
| 115 | + for file_slice in slices: |
| 116 | + for file_found in file_slice[self.FILES_KEY]: |
| 117 | + file_name = path.basename(file_found.uri) |
| 118 | + seen_file_names[file_name].append(file_found.uri) |
| 119 | + return [ |
| 120 | + {file_name: paths} for file_name, paths in seen_file_names.items() if len(paths) > 1 |
| 121 | + ] |
| 122 | + |
101 | 123 | def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
|
102 | 124 | # Sort files by last_modified, uri and return them grouped by last_modified
|
103 | 125 | all_files = self.list_files()
|
104 | 126 | files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
|
105 | 127 | sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
|
106 | 128 | slices = [
|
107 |
| - {"files": list(group[1])} |
| 129 | + {self.FILES_KEY: list(group[1])} |
108 | 130 | for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
|
109 | 131 | ]
|
| 132 | + if slices and not self.preserve_directory_structure: |
| 133 | + duplicated_files_names = self._duplicated_files_names(slices) |
| 134 | + if duplicated_files_names: |
| 135 | + raise DuplicatedFilesError( |
| 136 | + stream=self.name, duplicated_files_names=duplicated_files_names |
| 137 | + ) |
110 | 138 | return slices
|
111 | 139 |
|
112 | 140 | def transform_record(
|
|
0 commit comments