1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
1
5
import csv
2
6
import gzip
3
7
import io
4
8
import json
5
9
import logging
6
- from abc import ABC , abstractmethod
7
10
from dataclasses import dataclass
8
11
from io import BufferedIOBase , TextIOWrapper
9
- from typing import Any , Generator , MutableMapping , Optional
12
+ from typing import Any , Optional
10
13
11
14
import orjson
12
15
import requests
13
16
14
17
from airbyte_cdk .models import FailureType
15
- from airbyte_cdk .sources .declarative .decoders .decoder import Decoder
18
+ from airbyte_cdk .sources .declarative .decoders .decoder import DECODER_OUTPUT_TYPE , Decoder
19
+ from airbyte_cdk .sources .declarative .decoders .decoder_parser import (
20
+ PARSER_OUTPUT_TYPE ,
21
+ PARSERS_BY_HEADER_TYPE ,
22
+ PARSERS_TYPE ,
23
+ Parser ,
24
+ )
16
25
from airbyte_cdk .utils import AirbyteTracedException
17
26
18
27
logger = logging .getLogger ("airbyte" )
19
28
20
29
21
- @dataclass
22
- class Parser (ABC ):
23
- @abstractmethod
24
- def parse (
25
- self ,
26
- data : BufferedIOBase ,
27
- ) -> Generator [MutableMapping [str , Any ], None , None ]:
28
- """
29
- Parse data and yield dictionaries.
30
- """
31
- pass
32
-
33
-
34
30
@dataclass
class GzipParser(Parser):
    """
    Parser that decompresses a gzip byte stream and delegates the decompressed
    stream to `inner_parser`.
    """

    # Parser that receives the decompressed stream.
    inner_parser: Parser

    def parse(self, data: BufferedIOBase) -> PARSER_OUTPUT_TYPE:
        """
        Decompress gzipped bytes and pass decompressed data to the inner parser.

        Args:
            data (BufferedIOBase): The binary stream to read the gzipped payload from.

        Yields:
            Whatever `inner_parser.parse` yields for the decompressed stream.

        Note:
            - The data is not decoded by default: the inner parser is handed the
              `gzip.GzipFile` object wrapping `data`.
            - `data` is always wrapped in `gzip.GzipFile`; this method does not
              detect non-gzipped input, nor does it rewind `data` and fall back
              to passing it through unchanged.
        """
        with gzip.GzipFile(fileobj=data, mode="rb") as gzipobj:
            yield from self.inner_parser.parse(gzipobj)
47
47
@@ -50,7 +50,7 @@ def parse(
50
50
class JsonParser (Parser ):
51
51
encoding : str = "utf-8"
52
52
53
- def parse (self , data : BufferedIOBase ) -> Generator [ MutableMapping [ str , Any ], None , None ] :
53
+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
54
54
"""
55
55
Attempts to deserialize data using orjson library. As an extra layer of safety we fallback on the json library to deserialize the data.
56
56
"""
@@ -90,10 +90,7 @@ def _parse_json(self, raw_data: bytes) -> Optional[Any]:
90
90
class JsonLineParser (Parser ):
91
91
encoding : Optional [str ] = "utf-8"
92
92
93
- def parse (
94
- self ,
95
- data : BufferedIOBase ,
96
- ) -> Generator [MutableMapping [str , Any ], None , None ]:
93
+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
97
94
for line in data :
98
95
try :
99
96
yield json .loads (line .decode (encoding = self .encoding or "utf-8" ))
@@ -117,10 +114,7 @@ def _get_delimiter(self) -> Optional[str]:
117
114
118
115
return self .delimiter
119
116
120
- def parse (
121
- self ,
122
- data : BufferedIOBase ,
123
- ) -> Generator [MutableMapping [str , Any ], None , None ]:
117
+ def parse (self , data : BufferedIOBase ) -> PARSER_OUTPUT_TYPE :
124
118
"""
125
119
Parse CSV data from decompressed bytes.
126
120
"""
@@ -130,31 +124,95 @@ def parse(
130
124
yield row
131
125
132
126
133
- @dataclass
134
127
class CompositeRawDecoder(Decoder):
    """
    Decoder strategy to transform a requests.Response into a PARSER_OUTPUT_TYPE
    by passing response.raw (or the buffered response content) to parser(s).

    Note: response.raw is not decoded/decompressed by default. Parsers should be instantiated recursively.

    Example:
        composite_raw_decoder = CompositeRawDecoder(
            parser=GzipParser(
                inner_parser=JsonLineParser(encoding="iso-8859-1")
            )
        )
    """

    def __init__(
        self,
        parser: Parser,
        stream_response: bool = True,
        parsers_by_header: PARSERS_BY_HEADER_TYPE = None,
    ) -> None:
        """
        Args:
            parser (Parser): The default parser, used when no header-based parser matches.
            stream_response (bool): Whether `decode` reads from `response.raw` (True)
                or from the fully loaded `response.content` (False).
            parsers_by_header (PARSERS_BY_HEADER_TYPE): Mapping of header name to a mapping of
                header value to parser. A falsy value is treated as "no header-based parsers".
        """
        # since we moved from using `dataclass` to `__init__` method,
        # we need to keep using the `parser` to be able to resolve the dependencies
        # between the parsers correctly.
        self.parser = parser

        self._parsers_by_header = parsers_by_header if parsers_by_header else {}
        self._stream_response = stream_response

    @classmethod
    def by_headers(
        cls,
        parsers: PARSERS_TYPE,
        stream_response: bool,
        fallback_parser: Parser,
    ) -> "CompositeRawDecoder":
        """
        Create a CompositeRawDecoder instance based on header values.

        Args:
            parsers (PARSERS_TYPE): A list of tuples where each tuple contains headers, header values, and a parser.
            stream_response (bool): A flag indicating whether the response should be streamed.
            fallback_parser (Parser): A parser to use if no matching header is found.

        Returns:
            CompositeRawDecoder: An instance of CompositeRawDecoder configured with the provided parsers.
        """
        parsers_by_header = {}
        for headers, header_values, parser in parsers:
            for header in headers:
                # Merge rather than assign: the same header may be listed in several tuples
                # (with different header values), and a plain assignment would keep only
                # the mapping built from the last tuple.
                parsers_by_header.setdefault(header, {}).update(
                    {header_value: parser for header_value in header_values}
                )
        return cls(fallback_parser, stream_response, parsers_by_header)

    def is_stream_response(self) -> bool:
        """Return the `stream_response` flag this decoder was created with."""
        return self._stream_response

    def decode(self, response: requests.Response) -> DECODER_OUTPUT_TYPE:
        """
        Parse the response with the parser selected for its headers and yield the parsed records.

        Args:
            response (requests.Response): The HTTP response to decode.

        Yields:
            The items produced by the selected parser.
        """
        parser = self._select_parser(response)
        if self.is_stream_response():
            # urllib mentions that some interfaces don't play nice with auto_close
            # More info here: https://urllib3.readthedocs.io/en/stable/user-guide.html#using-io-wrappers-with-response-content
            # We have indeed observed some issues with CSV parsing.
            # Hence, we will manage the closing of the file ourselves until we find a better solution.
            response.raw.auto_close = False
            try:
                yield from parser.parse(
                    data=response.raw,  # type: ignore[arg-type]
                )
            finally:
                # Since auto_close is disabled we own the stream: close it even when the
                # parser raises or the consumer stops iterating before exhaustion.
                response.raw.close()
        else:
            yield from parser.parse(data=io.BytesIO(response.content))

    def _select_parser(self, response: requests.Response) -> Parser:
        """
        Selects the appropriate parser based on the response headers.

        This method iterates through the `_parsers_by_header` dictionary to find a matching parser
        based on the headers in the response. If a matching header and header value are found,
        the corresponding parser is returned. If no match is found, the default parser is returned.

        Args:
            response (requests.Response): The HTTP response object containing headers to check.

        Returns:
            Parser: The parser corresponding to the matched header value, or the default parser if no match is found.
        """
        for header, parser_by_header_value in self._parsers_by_header.items():
            if header not in response.headers:
                continue
            header_value = response.headers[header]
            if header_value in parser_by_header_value:
                return parser_by_header_value[header_value]
        return self.parser
0 commit comments