From ba02e40b2b757aa3c59480c2ce92fe4879b71e6e Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Fri, 17 May 2024 11:02:16 -0700 Subject: [PATCH 1/3] fix: handle anyOf(object, string) in json schema --- airbyte/types.py | 10 +++++++++- tests/unit_tests/test_type_translation.py | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/airbyte/types.py b/airbyte/types.py index d84ce545..c510b606 100644 --- a/airbyte/types.py +++ b/airbyte/types.py @@ -53,6 +53,14 @@ def _get_airbyte_type( # noqa: PLR0911 # Too many return statements non_null_types = [t for t in json_schema_type if t != "null"] if len(non_null_types) == 1: json_schema_type = non_null_types[0] + elif "object" in non_null_types: + # If one of the types is an object, we pick "object" as the type. + # For example, ["object", "string"] should be treated as "object". + json_schema_type = "object" + + if not isinstance(json_schema_type, str): + err_msg = f"Could not determine airbyte type from JSON schema: {json_schema_property_def}" + raise SQLTypeConversionError(err_msg) if json_schema_type == "string": if json_schema_format == "date": @@ -65,7 +73,7 @@ def _get_airbyte_type( # noqa: PLR0911 # Too many return statements return "time_without_timezone", None if json_schema_type in {"string", "number", "boolean", "integer"}: - return cast(str, json_schema_type), None + return json_schema_type, None if json_schema_type == "object": return "object", None diff --git a/tests/unit_tests/test_type_translation.py b/tests/unit_tests/test_type_translation.py index 2f165bb3..178282f3 100644 --- a/tests/unit_tests/test_type_translation.py +++ b/tests/unit_tests/test_type_translation.py @@ -54,6 +54,7 @@ ({"type": ["null", "array"], "items": {"type": "object"}}, types.JSON), ({"type": "object", "properties": {}}, types.JSON), ({"type": ["null", "object"], "properties": {}}, types.JSON), + ({"type": ["null", "string", "object"], "properties": {}}, types.JSON), # Malformed JSON schema seen in the wild: ({"type": "array", "items": {}}, types.JSON), ({"type": ["null", "array"], "items": {"items": {}}}, types.JSON), @@ -112,6 +113,7 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type): ({"type": ["null", "array"], "items": {"type": "object"}}, "array"), # Object type: ({"type": "object"}, "object"), + ({"type": ["null", "object", "string"]}, "object"), # Malformed JSON schema seen in the wild: ({"type": "array", "items": {"items": {}}}, "array"), ({"type": ["null", "array"], "items": {"items": {}}}, "array"), From 7e640543e9e49eba37b929f4607b8ef453602322 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Fri, 17 May 2024 11:05:32 -0700 Subject: [PATCH 2/3] chore: add example case to test fixture --- .../fixtures/source-test/source_test/run.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/fixtures/source-test/source_test/run.py b/tests/integration_tests/fixtures/source-test/source_test/run.py index 9b9802d1..f5d0eb78 100644 --- a/tests/integration_tests/fixtures/source-test/source_test/run.py +++ b/tests/integration_tests/fixtures/source-test/source_test/run.py @@ -19,6 +19,12 @@ "properties": { "Column1": {"type": "string"}, "Column2": {"type": "number"}, + "sometimes_object": { + "type": ["null", "string", "object"], + "properties": { + "nested_column": {"type": "string"}, + }, + }, }, }, }, @@ -87,7 +93,11 @@ sample_record1_stream1 = { "type": "RECORD", "record": { - "data": {"Column1": "value1", "Column2": 1}, + "data": { + "Column1": "value1", + "Column2": 1, + "sometimes_object": {"nested_column": "nested_value"}, + }, "stream": "stream1", "emitted_at": 1704067200, }, @@ -95,7 +105,11 @@ sample_record2_stream1 = { "type": "RECORD", "record": { - "data": {"Column1": "value2", "Column2": 2}, + "data": { + "Column1": "value2", + "Column2": 2, + "sometimes_object": "string_value", + }, "stream": "stream1", "emitted_at": 1704067200, }, From 3fb93a4cb7c5eea70f962e624eadd194d28b3068 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Fri, 17 May 2024 12:48:48 -0700 Subject: [PATCH 3/3] chore: update test to check as stringified-json --- tests/integration_tests/test_source_test_fixture.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/test_source_test_fixture.py b/tests/integration_tests/test_source_test_fixture.py index 076081cb..d0509592 100644 --- a/tests/integration_tests/test_source_test_fixture.py +++ b/tests/integration_tests/test_source_test_fixture.py @@ -107,8 +107,16 @@ def source_test(source_test_env) -> ab.Source: def expected_test_stream_data() -> dict[str, list[dict[str, str | int]]]: return { "stream1": [ - {"column1": "value1", "column2": 1}, - {"column1": "value2", "column2": 2}, + { + "column1": "value1", + "column2": 1, + "sometimes_object": '{"nested_column":"nested_value"}', + }, + { + "column1": "value2", + "column2": 2, + "sometimes_object": '"string_value"', + }, ], "stream2": [ {