Skip to content

Commit cb5a921

Browse files
maxi297octavia-squidington-iii
and
octavia-squidington-iii
authored
chore(decoder): clean decoders and make csvdecoder available (#326)
Co-authored-by: octavia-squidington-iii <[email protected]>
1 parent 74631d8 commit cb5a921

11 files changed

+139
-330
lines changed

airbyte_cdk/sources/declarative/declarative_component_schema.yaml

+22-80
Original file line numberDiff line numberDiff line change
@@ -1549,7 +1549,6 @@ definitions:
15491549
anyOf:
15501550
- "$ref": "#/definitions/JsonDecoder"
15511551
- "$ref": "#/definitions/XmlDecoder"
1552-
- "$ref": "#/definitions/CompositeRawDecoder"
15531552
$parameters:
15541553
type: object
15551554
additionalProperties: true
@@ -2133,43 +2132,26 @@ definitions:
21332132
$parameters:
21342133
type: object
21352134
additionalProperties: true
2136-
GzipJsonDecoder:
2137-
title: GzipJson Decoder
2138-
description: Use this if the response is Gzip compressed Json.
2139-
type: object
2140-
additionalProperties: true
2141-
required:
2142-
- type
2143-
properties:
2144-
type:
2145-
type: string
2146-
enum: [GzipJsonDecoder]
2147-
encoding:
2148-
type: string
2149-
default: utf-8
2150-
$parameters:
2151-
type: object
2152-
additionalProperties: true
21532135
ZipfileDecoder:
21542136
title: Zipfile Decoder
21552137
description: Decoder for response data that is returned as zipfile(s).
21562138
type: object
21572139
additionalProperties: true
21582140
required:
21592141
- type
2160-
- parser
2142+
- decoder
21612143
properties:
21622144
type:
21632145
type: string
21642146
enum: [ZipfileDecoder]
2165-
parser:
2147+
decoder:
21662148
title: Parser
21672149
description: Parser to parse the decompressed data from the zipfile(s).
21682150
anyOf:
2169-
- "$ref": "#/definitions/GzipParser"
2170-
- "$ref": "#/definitions/JsonParser"
2171-
- "$ref": "#/definitions/JsonLineParser"
2172-
- "$ref": "#/definitions/CsvParser"
2151+
- "$ref": "#/definitions/CsvDecoder"
2152+
- "$ref": "#/definitions/GzipDecoder"
2153+
- "$ref": "#/definitions/JsonDecoder"
2154+
- "$ref": "#/definitions/JsonlDecoder"
21732155
ListPartitionRouter:
21742156
title: List Partition Router
21752157
description: A Partition router that specifies a list of attributes where each attribute describes a portion of the complete data set for a stream. During a sync, each value is iterated over and can be used as input to outbound API requests.
@@ -3002,79 +2984,39 @@ definitions:
30022984
description: Component decoding the response so records can be extracted.
30032985
anyOf:
30042986
- "$ref": "#/definitions/CustomDecoder"
2987+
- "$ref": "#/definitions/CsvDecoder"
2988+
- "$ref": "#/definitions/GzipDecoder"
30052989
- "$ref": "#/definitions/JsonDecoder"
30062990
- "$ref": "#/definitions/JsonlDecoder"
30072991
- "$ref": "#/definitions/IterableDecoder"
30082992
- "$ref": "#/definitions/XmlDecoder"
3009-
- "$ref": "#/definitions/GzipJsonDecoder"
3010-
- "$ref": "#/definitions/CompositeRawDecoder"
30112993
- "$ref": "#/definitions/ZipfileDecoder"
30122994
$parameters:
30132995
type: object
30142996
additionalProperties: true
3015-
CompositeRawDecoder:
3016-
description: "(This is experimental, use at your own risk)"
3017-
type: object
3018-
required:
3019-
- type
3020-
- parser
3021-
properties:
3022-
type:
3023-
type: string
3024-
enum: [CompositeRawDecoder]
3025-
parser:
3026-
anyOf:
3027-
- "$ref": "#/definitions/GzipParser"
3028-
- "$ref": "#/definitions/JsonParser"
3029-
- "$ref": "#/definitions/JsonLineParser"
3030-
- "$ref": "#/definitions/CsvParser"
3031-
# PARSERS
3032-
GzipParser:
2997+
GzipDecoder:
30332998
type: object
30342999
required:
30353000
- type
3036-
- inner_parser
3001+
- decoder
30373002
properties:
30383003
type:
30393004
type: string
3040-
enum: [GzipParser]
3041-
inner_parser:
3005+
enum: [GzipDecoder]
3006+
decoder:
30423007
anyOf:
3043-
- "$ref": "#/definitions/JsonLineParser"
3044-
- "$ref": "#/definitions/CsvParser"
3045-
- "$ref": "#/definitions/JsonParser"
3046-
JsonParser:
3047-
title: JsonParser
3048-
description: Parser used for parsing str, bytes, or bytearray data and returning data in a dictionary format.
3049-
type: object
3050-
required:
3051-
- type
3052-
properties:
3053-
type:
3054-
type: string
3055-
enum: [JsonParser]
3056-
encoding:
3057-
type: string
3058-
default: utf-8
3059-
JsonLineParser:
3060-
type: object
3061-
required:
3062-
- type
3063-
properties:
3064-
type:
3065-
type: string
3066-
enum: [JsonLineParser]
3067-
encoding:
3068-
type: string
3069-
default: utf-8
3070-
CsvParser:
3008+
- "$ref": "#/definitions/CsvDecoder"
3009+
- "$ref": "#/definitions/GzipDecoder"
3010+
- "$ref": "#/definitions/JsonDecoder"
3011+
- "$ref": "#/definitions/JsonlDecoder"
3012+
CsvDecoder:
30713013
type: object
30723014
required:
30733015
- type
30743016
properties:
30753017
type:
30763018
type: string
3077-
enum: [CsvParser]
3019+
enum: [CsvDecoder]
30783020
encoding:
30793021
type: string
30803022
default: utf-8
@@ -3202,24 +3144,24 @@ definitions:
32023144
description: Component decoding the response so records can be extracted.
32033145
anyOf:
32043146
- "$ref": "#/definitions/CustomDecoder"
3147+
- "$ref": "#/definitions/CsvDecoder"
3148+
- "$ref": "#/definitions/GzipDecoder"
32053149
- "$ref": "#/definitions/JsonDecoder"
32063150
- "$ref": "#/definitions/JsonlDecoder"
32073151
- "$ref": "#/definitions/IterableDecoder"
32083152
- "$ref": "#/definitions/XmlDecoder"
3209-
- "$ref": "#/definitions/GzipJsonDecoder"
3210-
- "$ref": "#/definitions/CompositeRawDecoder"
32113153
- "$ref": "#/definitions/ZipfileDecoder"
32123154
download_decoder:
32133155
title: Download Decoder
32143156
description: Component decoding the download response so records can be extracted.
32153157
anyOf:
32163158
- "$ref": "#/definitions/CustomDecoder"
3159+
- "$ref": "#/definitions/CsvDecoder"
3160+
- "$ref": "#/definitions/GzipDecoder"
32173161
- "$ref": "#/definitions/JsonDecoder"
32183162
- "$ref": "#/definitions/JsonlDecoder"
32193163
- "$ref": "#/definitions/IterableDecoder"
32203164
- "$ref": "#/definitions/XmlDecoder"
3221-
- "$ref": "#/definitions/GzipJsonDecoder"
3222-
- "$ref": "#/definitions/CompositeRawDecoder"
32233165
- "$ref": "#/definitions/ZipfileDecoder"
32243166
$parameters:
32253167
type: object

airbyte_cdk/sources/declarative/decoders/__init__.py

-4
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,8 @@
1010
)
1111
from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
1212
from airbyte_cdk.sources.declarative.decoders.json_decoder import (
13-
GzipJsonDecoder,
1413
IterableDecoder,
1514
JsonDecoder,
16-
JsonlDecoder,
1715
)
1816
from airbyte_cdk.sources.declarative.decoders.noop_decoder import NoopDecoder
1917
from airbyte_cdk.sources.declarative.decoders.pagination_decoder_decorator import (
@@ -27,9 +25,7 @@
2725
"CompositeRawDecoder",
2826
"JsonDecoder",
2927
"JsonParser",
30-
"JsonlDecoder",
3128
"IterableDecoder",
32-
"GzipJsonDecoder",
3329
"NoopDecoder",
3430
"PaginationDecoderDecorator",
3531
"XmlDecoder",

airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import csv
22
import gzip
3+
import io
34
import json
45
import logging
56
from abc import ABC, abstractmethod
@@ -130,11 +131,15 @@ class CompositeRawDecoder(Decoder):
130131
"""
131132

132133
parser: Parser
134+
stream_response: bool = True
133135

134136
def is_stream_response(self) -> bool:
135-
return True
137+
return self.stream_response
136138

137139
def decode(
138140
self, response: requests.Response
139141
) -> Generator[MutableMapping[str, Any], None, None]:
140-
yield from self.parser.parse(data=response.raw) # type: ignore[arg-type]
142+
if self.is_stream_response():
143+
yield from self.parser.parse(data=response.raw) # type: ignore[arg-type]
144+
else:
145+
yield from self.parser.parse(data=io.BytesIO(response.content))

airbyte_cdk/sources/declarative/decoders/json_decoder.py

+12-58
Original file line numberDiff line numberDiff line change
@@ -10,47 +10,41 @@
1010
import orjson
1111
import requests
1212

13+
from airbyte_cdk.sources.declarative.decoders import CompositeRawDecoder, JsonParser
1314
from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
1415

1516
logger = logging.getLogger("airbyte")
1617

1718

18-
@dataclass
1919
class JsonDecoder(Decoder):
2020
"""
2121
Decoder strategy that returns the json-encoded content of a response, if any.
22+
23+
Ideally, this class would simply be replaced by the equivalent `CompositeRawDecoder(parser=JsonParser(), stream_response=False)`. However, JsonDecoder has specific historical behaviors that we were not sure could safely be removed, such as falling back on {} in case of errors.
2224
"""
2325

24-
parameters: InitVar[Mapping[str, Any]]
26+
def __init__(self, parameters: Mapping[str, Any]):
27+
self._decoder = CompositeRawDecoder(parser=JsonParser(), stream_response=False)
2528

2629
def is_stream_response(self) -> bool:
27-
return False
30+
return self._decoder.is_stream_response()
2831

2932
def decode(
3033
self, response: requests.Response
3134
) -> Generator[MutableMapping[str, Any], None, None]:
3235
"""
3336
If the response is an empty string or an empty list, the function will return a generator with an empty mapping.
3437
"""
38+
has_yielded = False
3539
try:
36-
body_json = response.json()
37-
yield from self.parse_body_json(body_json)
38-
except requests.exceptions.JSONDecodeError:
39-
logger.warning(
40-
f"Response cannot be parsed into json: {response.status_code=}, {response.text=}"
41-
)
40+
for element in self._decoder.decode(response):
41+
yield element
42+
has_yielded = True
43+
except Exception:
4244
yield {}
4345

44-
@staticmethod
45-
def parse_body_json(
46-
body_json: MutableMapping[str, Any] | List[MutableMapping[str, Any]],
47-
) -> Generator[MutableMapping[str, Any], None, None]:
48-
if not isinstance(body_json, list):
49-
body_json = [body_json]
50-
if len(body_json) == 0:
46+
if not has_yielded:
5147
yield {}
52-
else:
53-
yield from body_json
5448

5549

5650
@dataclass
@@ -69,43 +63,3 @@ def decode(
6963
) -> Generator[MutableMapping[str, Any], None, None]:
7064
for line in response.iter_lines():
7165
yield {"record": line.decode()}
72-
73-
74-
@dataclass
75-
class JsonlDecoder(Decoder):
76-
"""
77-
Decoder strategy that returns the json-encoded content of the response, if any.
78-
"""
79-
80-
parameters: InitVar[Mapping[str, Any]]
81-
82-
def is_stream_response(self) -> bool:
83-
return True
84-
85-
def decode(
86-
self, response: requests.Response
87-
) -> Generator[MutableMapping[str, Any], None, None]:
88-
# TODO???: set delimiter? usually it is `\n` but maybe it would be useful to set optional?
89-
# https://github.com/airbytehq/airbyte-internal-issues/issues/8436
90-
for record in response.iter_lines():
91-
yield orjson.loads(record)
92-
93-
94-
@dataclass
95-
class GzipJsonDecoder(JsonDecoder):
96-
encoding: Optional[str]
97-
98-
def __post_init__(self, parameters: Mapping[str, Any]) -> None:
99-
if self.encoding:
100-
try:
101-
codecs.lookup(self.encoding)
102-
except LookupError:
103-
raise ValueError(
104-
f"Invalid encoding '{self.encoding}'. Please check provided encoding"
105-
)
106-
107-
def decode(
108-
self, response: requests.Response
109-
) -> Generator[MutableMapping[str, Any], None, None]:
110-
raw_string = decompress(response.content).decode(encoding=self.encoding or "utf-8")
111-
yield from self.parse_body_json(orjson.loads(raw_string))

0 commit comments

Comments
 (0)