Skip to content

Commit d2016c6

Browse files
authored
feat: Adds ZipfileDecoder component (#169)
1 parent b5ed82c commit d2016c6

File tree

6 files changed

+191
-5
lines changed

6 files changed

+191
-5
lines changed

airbyte_cdk/sources/declarative/declarative_component_schema.yaml

+26
Original file line numberDiff line numberDiff line change
@@ -1513,6 +1513,7 @@ definitions:
15131513
anyOf:
15141514
- "$ref": "#/definitions/JsonDecoder"
15151515
- "$ref": "#/definitions/XmlDecoder"
1516+
- "$ref": "#/definitions/CompositeRawDecoder"
15161517
$parameters:
15171518
type: object
15181519
additionalProperties: true
@@ -2071,6 +2072,26 @@ definitions:
20712072
$parameters:
20722073
type: object
20732074
additionalProperties: true
2075+
ZipfileDecoder:
2076+
title: Zipfile Decoder
2077+
description: Decoder for response data that is returned as zipfile(s).
2078+
type: object
2079+
additionalProperties: true
2080+
required:
2081+
- type
2082+
- parser
2083+
properties:
2084+
type:
2085+
type: string
2086+
enum: [ZipfileDecoder]
2087+
parser:
2088+
title: Parser
2089+
description: Parser to parse the decompressed data from the zipfile(s).
2090+
anyOf:
2091+
- "$ref": "#/definitions/GzipParser"
2092+
- "$ref": "#/definitions/JsonParser"
2093+
- "$ref": "#/definitions/JsonLineParser"
2094+
- "$ref": "#/definitions/CsvParser"
20742095
ListPartitionRouter:
20752096
title: List Partition Router
20762097
description: A Partition router that specifies a list of attributes where each attribute describes a portion of the complete data set for a stream. During a sync, each value is iterated over and can be used as input to outbound API requests.
@@ -2899,6 +2920,7 @@ definitions:
28992920
- "$ref": "#/definitions/XmlDecoder"
29002921
- "$ref": "#/definitions/GzipJsonDecoder"
29012922
- "$ref": "#/definitions/CompositeRawDecoder"
2923+
- "$ref": "#/definitions/ZipfileDecoder"
29022924
$parameters:
29032925
type: object
29042926
additionalProperties: true
@@ -3097,6 +3119,8 @@ definitions:
30973119
- "$ref": "#/definitions/IterableDecoder"
30983120
- "$ref": "#/definitions/XmlDecoder"
30993121
- "$ref": "#/definitions/GzipJsonDecoder"
3122+
- "$ref": "#/definitions/CompositeRawDecoder"
3123+
- "$ref": "#/definitions/ZipfileDecoder"
31003124
download_decoder:
31013125
title: Download Decoder
31023126
description: Component decoding the download response so records can be extracted.
@@ -3107,6 +3131,8 @@ definitions:
31073131
- "$ref": "#/definitions/IterableDecoder"
31083132
- "$ref": "#/definitions/XmlDecoder"
31093133
- "$ref": "#/definitions/GzipJsonDecoder"
3134+
- "$ref": "#/definitions/CompositeRawDecoder"
3135+
- "$ref": "#/definitions/ZipfileDecoder"
31103136
$parameters:
31113137
type: object
31123138
additionalProperties: true

airbyte_cdk/sources/declarative/decoders/__init__.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,12 @@
22
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
33
#
44

5-
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import CompositeRawDecoder
5+
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
6+
CompositeRawDecoder,
7+
GzipParser,
8+
JsonParser,
9+
Parser,
10+
)
611
from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
712
from airbyte_cdk.sources.declarative.decoders.json_decoder import (
813
GzipJsonDecoder,
@@ -15,15 +20,18 @@
1520
PaginationDecoderDecorator,
1621
)
1722
from airbyte_cdk.sources.declarative.decoders.xml_decoder import XmlDecoder
23+
from airbyte_cdk.sources.declarative.decoders.zipfile_decoder import ZipfileDecoder
1824

1925
__all__ = [
2026
"Decoder",
2127
"CompositeRawDecoder",
2228
"JsonDecoder",
29+
"JsonParser",
2330
"JsonlDecoder",
2431
"IterableDecoder",
2532
"GzipJsonDecoder",
2633
"NoopDecoder",
2734
"PaginationDecoderDecorator",
2835
"XmlDecoder",
36+
"ZipfileDecoder",
2937
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#
2+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
#
4+
5+
import logging
6+
import zipfile
7+
from dataclasses import dataclass
8+
from io import BytesIO
9+
from typing import Any, Generator, MutableMapping
10+
11+
import orjson
12+
import requests
13+
14+
from airbyte_cdk.models import FailureType
15+
from airbyte_cdk.sources.declarative.decoders import Decoder
16+
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
17+
Parser,
18+
)
19+
from airbyte_cdk.utils import AirbyteTracedException
20+
21+
logger = logging.getLogger("airbyte")
22+
23+
24+
@dataclass
25+
class ZipfileDecoder(Decoder):
26+
parser: Parser
27+
28+
def is_stream_response(self) -> bool:
29+
return False
30+
31+
def decode(
32+
self, response: requests.Response
33+
) -> Generator[MutableMapping[str, Any], None, None]:
34+
try:
35+
with zipfile.ZipFile(BytesIO(response.content)) as zip_file:
36+
for file_name in zip_file.namelist():
37+
unzipped_content = zip_file.read(file_name)
38+
buffered_content = BytesIO(unzipped_content)
39+
try:
40+
yield from self.parser.parse(buffered_content)
41+
except Exception as e:
42+
logger.error(
43+
f"Failed to parse file: {file_name} from zip file: {response.request.url} with exception {e}."
44+
)
45+
raise AirbyteTracedException(
46+
message=f"Failed to parse file: {file_name} from zip file.",
47+
internal_message=f"Failed to parse file: {file_name} from zip file: {response.request.url}.",
48+
failure_type=FailureType.system_error,
49+
) from e
50+
except zipfile.BadZipFile as e:
51+
logger.error(
52+
f"Received an invalid zip file in response to URL: {response.request.url}. "
53+
f"The size of the response body is: {len(response.content)}"
54+
)
55+
raise AirbyteTracedException(
56+
message="Received an invalid zip file in response.",
57+
internal_message=f"Received an invalid zip file in response to URL: {response.request.url}.",
58+
failure_type=FailureType.system_error,
59+
) from e

airbyte_cdk/sources/declarative/models/declarative_component_schema.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -1223,9 +1223,6 @@ class LegacySessionTokenAuthenticator(BaseModel):
12231223

12241224

12251225
class JsonParser(BaseModel):
1226-
class Config:
1227-
extra = Extra.allow
1228-
12291226
type: Literal["JsonParser"]
12301227
encoding: Optional[str] = "utf-8"
12311228

@@ -1661,6 +1658,18 @@ class CompositeErrorHandler(BaseModel):
16611658
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
16621659

16631660

1661+
class ZipfileDecoder(BaseModel):
1662+
class Config:
1663+
extra = Extra.allow
1664+
1665+
type: Literal["ZipfileDecoder"]
1666+
parser: Union[GzipParser, JsonParser, JsonLineParser, CsvParser] = Field(
1667+
...,
1668+
description="Parser to parse the decompressed data from the zipfile(s).",
1669+
title="Parser",
1670+
)
1671+
1672+
16641673
class CompositeRawDecoder(BaseModel):
16651674
type: Literal["CompositeRawDecoder"]
16661675
parser: Union[GzipParser, JsonParser, JsonLineParser, CsvParser]
@@ -1866,7 +1875,7 @@ class SessionTokenAuthenticator(BaseModel):
18661875
description="Authentication method to use for requests sent to the API, specifying how to inject the session token.",
18671876
title="Data Request Authentication",
18681877
)
1869-
decoder: Optional[Union[JsonDecoder, XmlDecoder]] = Field(
1878+
decoder: Optional[Union[JsonDecoder, XmlDecoder, CompositeRawDecoder]] = Field(
18701879
None, description="Component used to decode the response.", title="Decoder"
18711880
)
18721881
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
@@ -2071,6 +2080,7 @@ class SimpleRetriever(BaseModel):
20712080
XmlDecoder,
20722081
GzipJsonDecoder,
20732082
CompositeRawDecoder,
2083+
ZipfileDecoder,
20742084
]
20752085
] = Field(
20762086
None,
@@ -2147,6 +2157,8 @@ class AsyncRetriever(BaseModel):
21472157
IterableDecoder,
21482158
XmlDecoder,
21492159
GzipJsonDecoder,
2160+
CompositeRawDecoder,
2161+
ZipfileDecoder,
21502162
]
21512163
] = Field(
21522164
None,
@@ -2161,6 +2173,8 @@ class AsyncRetriever(BaseModel):
21612173
IterableDecoder,
21622174
XmlDecoder,
21632175
GzipJsonDecoder,
2176+
CompositeRawDecoder,
2177+
ZipfileDecoder,
21642178
]
21652179
] = Field(
21662180
None,

airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

+11
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
JsonlDecoder,
6767
PaginationDecoderDecorator,
6868
XmlDecoder,
69+
ZipfileDecoder,
6970
)
7071
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
7172
CompositeRawDecoder,
@@ -356,6 +357,9 @@
356357
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
357358
XmlDecoder as XmlDecoderModel,
358359
)
360+
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
361+
ZipfileDecoder as ZipfileDecoderModel,
362+
)
359363
from airbyte_cdk.sources.declarative.partition_routers import (
360364
CartesianProductStreamSlicer,
361365
ListPartitionRouter,
@@ -571,6 +575,7 @@ def _init_mappings(self) -> None:
571575
ConfigComponentsResolverModel: self.create_config_components_resolver,
572576
StreamConfigModel: self.create_stream_config,
573577
ComponentMappingDefinitionModel: self.create_components_mapping_definition,
578+
ZipfileDecoderModel: self.create_zipfile_decoder,
574579
}
575580

576581
# Needed for the case where we need to perform a second parse on the fields of a custom component
@@ -1800,6 +1805,12 @@ def create_gzipjson_decoder(
18001805
) -> GzipJsonDecoder:
18011806
return GzipJsonDecoder(parameters={}, encoding=model.encoding)
18021807

1808+
def create_zipfile_decoder(
1809+
self, model: ZipfileDecoderModel, config: Config, **kwargs: Any
1810+
) -> ZipfileDecoder:
1811+
parser = self._create_component_from_model(model=model.parser, config=config)
1812+
return ZipfileDecoder(parser=parser)
1813+
18031814
def create_gzip_parser(
18041815
self, model: GzipParserModel, config: Config, **kwargs: Any
18051816
) -> GzipParser:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#
2+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3+
#
4+
import gzip
5+
import json
6+
import zipfile
7+
from io import BytesIO
8+
from typing import Union
9+
10+
import pytest
11+
import requests
12+
13+
from airbyte_cdk.sources.declarative.decoders import GzipParser, JsonParser, ZipfileDecoder
14+
15+
16+
def create_zip_from_dict(data: Union[dict, list]) -> bytes:
17+
zip_buffer = BytesIO()
18+
with zipfile.ZipFile(zip_buffer, mode="w") as zip_file:
19+
zip_file.writestr("data.json", data)
20+
return zip_buffer.getvalue()
21+
22+
23+
def create_multi_zip_from_dict(data: list) -> bytes:
24+
zip_buffer = BytesIO()
25+
26+
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
27+
for i, content in enumerate(data):
28+
file_content = json.dumps(content).encode("utf-8")
29+
zip_file.writestr(f"file_{i}.json", file_content)
30+
return zip_buffer.getvalue()
31+
32+
33+
@pytest.mark.parametrize(
34+
"json_data",
35+
[
36+
{"test": "test"},
37+
{"responses": [{"id": 1}, {"id": 2}]},
38+
[{"id": 1}, {"id": 2}],
39+
{},
40+
],
41+
)
42+
def test_zipfile_decoder_with_single_file_response(requests_mock, json_data):
43+
zipfile_decoder = ZipfileDecoder(parser=GzipParser(inner_parser=JsonParser()))
44+
compressed_data = gzip.compress(json.dumps(json_data).encode())
45+
zipped_data = create_zip_from_dict(compressed_data)
46+
requests_mock.register_uri("GET", "https://airbyte.io/", content=zipped_data)
47+
response = requests.get("https://airbyte.io/")
48+
49+
if isinstance(json_data, list):
50+
for i, actual in enumerate(zipfile_decoder.decode(response=response)):
51+
assert actual == json_data[i]
52+
else:
53+
assert next(zipfile_decoder.decode(response=response)) == json_data
54+
55+
56+
def test_zipfile_decoder_with_multi_file_response(requests_mock):
57+
data_to_zip = [{"key1": "value1"}, {"key2": "value2"}, {"key3": "value3"}]
58+
59+
mocked_response = create_multi_zip_from_dict(data_to_zip)
60+
61+
decoder = ZipfileDecoder(parser=JsonParser())
62+
requests_mock.register_uri("GET", "https://airbyte.io/", content=mocked_response)
63+
response = requests.get("https://airbyte.io/")
64+
results = list(decoder.decode(response))
65+
66+
assert len(results) == 3
67+
for i, actual in enumerate(results):
68+
assert actual == data_to_zip[i]

0 commit comments

Comments
 (0)