Skip to content

Commit afb0000

Browse files
committed
iceberg/json_schema: support oneOf for nullable types
Add support for the oneOf keyword, limited to the T|null pattern used to express nullable fields. Constraints from oneOf branches are intersected with the parent schema for types, format, properties, and items. Properties and items must appear in only one branch; specifying them on both sides is rejected. Format must either match across branches or appear in only one.
1 parent e321e2b commit afb0000

File tree

5 files changed

+711
-8
lines changed

5 files changed

+711
-8
lines changed

src/v/iceberg/conversion/ir_json.cc

Lines changed: 110 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,13 @@ class type_set {
4949
static type_set none() { return type_set{}; }
5050

5151
void set(json_type t) { bits_.set(index(t)); }
52+
void intersect(const type_set& other) { bits_ &= other.bits_; }
5253
bool test(json_type t) const { return bits_.test(index(t)); }
5354

55+
bool is_null_only() const {
56+
return bits_.count() == 1 && bits_.test(index(json_type::null));
57+
}
58+
5459
/// Returns the single non-null type if exactly one is set.
5560
std::optional<json_type> single_non_null_type() const {
5661
auto copy = bits_;
@@ -113,12 +118,56 @@ struct constraint {
113118

114119
// Object properties, keyed by name.
115120
std::map<std::string, constraint> properties = {};
121+
bool additional_properties_allowed = true;
116122

117123
// Array item constraints.
118124
// - nullopt: no "items" keyword
119125
// - empty vector: "items": []
120126
// - non-empty: item schema(s) to validate
121127
std::optional<std::vector<constraint>> items = std::nullopt;
128+
129+
conversion_outcome<void> intersect(const constraint& other) {
130+
types.intersect(other.types);
131+
132+
if (format.has_value() && other.format.has_value()) {
133+
if (format != other.format) {
134+
return conversion_exception(
135+
"Conflicting format annotations across branches");
136+
}
137+
} else if (other.format.has_value()) {
138+
format = other.format;
139+
}
140+
141+
if (!properties.empty() && !other.properties.empty()) {
142+
return conversion_exception(
143+
"Intersecting constraints with properties on both sides "
144+
"is not supported");
145+
} else if (!other.properties.empty()) {
146+
if (!additional_properties_allowed) {
147+
return conversion_exception(
148+
"additionalProperties: false conflicts with properties "
149+
"defined in another branch");
150+
}
151+
properties = other.properties;
152+
} else if (
153+
!properties.empty() && !other.additional_properties_allowed) {
154+
return conversion_exception(
155+
"additionalProperties: false conflicts with properties "
156+
"defined in another branch");
157+
}
158+
additional_properties_allowed = additional_properties_allowed
159+
&& other.additional_properties_allowed;
160+
161+
if (items.has_value() && other.items.has_value()) {
162+
return conversion_exception(
163+
"Intersecting constraints with items on both sides is not "
164+
"supported");
165+
} else if (other.items.has_value()) {
166+
items = other.items;
167+
}
168+
169+
return outcome::success();
170+
}
122171
};
123172

124173
/// Context for the resolution phase.
@@ -149,6 +198,48 @@ collect(collect_context& ctx, const conversion::json_schema::subschema&);
149198

150199
conversion_outcome<field_type> resolve(resolution_context&, const constraint&);
151200

201+
// This is not full fidelity oneOf support - only T|null is supported.
202+
// In the general case, oneOf is a XOR over multiple schemas. For T|null
203+
// it is reduced to OR.
204+
conversion_outcome<constraint> collect_one_of_t_xor_null(
205+
collect_context& ctx,
206+
const conversion::json_schema::const_list_view& one_of) {
207+
constexpr std::string_view unsupported_one_of_msg
208+
= "oneOf keyword is supported only for exclusive T|null structures";
209+
if (one_of.size() != 2) {
210+
return conversion_exception(std::string{unsupported_one_of_msg});
211+
}
212+
213+
auto c1 = collect(ctx, one_of.at(0));
214+
if (c1.has_error()) {
215+
return c1.error();
216+
}
217+
auto c2 = collect(ctx, one_of.at(1));
218+
if (c2.has_error()) {
219+
return c2.error();
220+
}
221+
222+
// Accept only exclusive oneOf(T, null):
223+
// - exactly one branch is null-only
224+
// - the non-null branch must not include null
225+
const bool c1_is_null_only = c1.value().types.is_null_only();
226+
const bool c2_is_null_only = c2.value().types.is_null_only();
227+
228+
if (c1_is_null_only == c2_is_null_only) {
229+
return conversion_exception(std::string{unsupported_one_of_msg});
230+
}
231+
232+
const constraint& non_null_branch = c1_is_null_only ? c2.value()
233+
: c1.value();
234+
if (non_null_branch.types.test(json_type::null)) {
235+
return conversion_exception(std::string{unsupported_one_of_msg});
236+
}
237+
238+
auto c = non_null_branch;
239+
c.types.set(json_type::null);
240+
return c;
241+
}
242+
152243
/// Collect item constraints from a JSON Schema array.
153244
conversion_outcome<std::optional<std::vector<constraint>>> collect_items(
154245
collect_context& ctx, const conversion::json_schema::subschema& s) {
@@ -232,12 +323,14 @@ collect(collect_context& ctx, const conversion::json_schema::subschema& s) {
232323
c.properties[name] = std::move(prop_constraint.value());
233324
}
234325

235-
if (
236-
s.additional_properties()
237-
&& s.additional_properties()->get().boolean_subschema() != false) {
238-
return conversion_exception(
239-
"Only 'false' subschema is supported "
240-
"for additionalProperties keyword");
326+
if (s.additional_properties()) {
327+
if (s.additional_properties()->get().boolean_subschema() == false) {
328+
c.additional_properties_allowed = false;
329+
} else {
330+
return conversion_exception(
331+
"Only 'false' subschema is supported "
332+
"for additionalProperties keyword");
333+
}
241334
}
242335
}
243336

@@ -250,6 +343,17 @@ collect(collect_context& ctx, const conversion::json_schema::subschema& s) {
250343
c.items = std::move(items_result.value());
251344
}
252345

346+
if (!s.one_of().empty()) {
347+
auto one_of_constraint = collect_one_of_t_xor_null(ctx, s.one_of());
348+
if (one_of_constraint.has_error()) {
349+
return one_of_constraint.error();
350+
}
351+
auto intersect_result = c.intersect(one_of_constraint.value());
352+
if (intersect_result.has_error()) {
353+
return intersect_result.error();
354+
}
355+
}
356+
253357
return c;
254358
}
255359

src/v/iceberg/conversion/json_schema/frontend.cc

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,6 @@ constexpr auto banned_keywords = std::to_array({
300300
"else",
301301
"allOf",
302302
"anyOf",
303-
"oneOf",
304303
});
305304

306305
}; // namespace
@@ -525,6 +524,12 @@ class frontend::frontend_impl {
525524
}
526525
}
527526

527+
if (
528+
const auto one_of_node = find_keyword(
529+
node, "oneOf", {json_value_type::array})) {
530+
compile_one_of(ctx, *sub, *one_of_node);
531+
}
532+
528533
if (
529534
const auto format_node = find_keyword(
530535
node, "format", {json_value_type::string})) {
@@ -657,6 +662,33 @@ class frontend::frontend_impl {
657662
"The items keyword must be an object or an array of objects");
658663
}
659664
}
665+
666+
void compile_one_of(
667+
compile_context& ctx, subschema& sub, const json::Value& node) const {
668+
vassert(
669+
sub.one_of_.empty(),
670+
"oneOf subschema vector should be empty at this point");
671+
672+
if (!node.IsArray() || node.GetArray().Empty()) {
673+
throw std::runtime_error(
674+
"The oneOf keyword must be a non-empty array");
675+
}
676+
677+
std::vector<ss::shared_ptr<subschema>> one_of_subschemas;
678+
one_of_subschemas.reserve(node.GetArray().Size());
679+
for (const auto& item : node.GetArray()) {
680+
one_of_subschemas.push_back(compile_subschema(ctx, item));
681+
}
682+
683+
for (size_t i = 0; i < one_of_subschemas.size(); ++i) {
684+
auto& one_of_subschema = one_of_subschemas[i];
685+
auto [it, inserted] = sub.subschemas_.emplace(
686+
fmt::format("oneOf/{}", i), one_of_subschema);
687+
vassert(inserted, "unique insertion should have succeeded");
688+
}
689+
690+
sub.one_of_ = std::move(one_of_subschemas);
691+
}
660692
};
661693

662694
frontend::frontend()

src/v/iceberg/conversion/json_schema/ir.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,9 @@ class subschema {
303303
: std::nullopt;
304304
}
305305

306+
/// \return Subschemas in the "oneOf" keyword if specified.
307+
const_list_view one_of() const { return const_list_view(one_of_); }
308+
306309
/// \return Format of this schema object if specified.
307310
/// See https://www.learnjsonschema.com/2020-12/validation/format
308311
const std::optional<format>& format() const { return format_; }
@@ -333,6 +336,8 @@ class subschema {
333336
items_t items_;
334337
ss::shared_ptr<subschema> additional_items_;
335338

339+
std::vector<ss::shared_ptr<subschema>> one_of_;
340+
336341
std::optional<enum format> format_;
337342

338343
std::optional<std::string> ref_value_;

src/v/iceberg/conversion/json_schema/tests/frontend_test.cc

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -923,6 +923,87 @@ TEST(frontend_test, non_object_or_boolean_subschema) {
923923
StrEq("Subschema must be an object or a boolean")));
924924
}
925925

926+
TEST(frontend_test, one_of) {
927+
auto schema = frontend{}.compile(
928+
parse_json(R"({
929+
"$id": "https://example.com/root.json",
930+
"oneOf": [
931+
{ "type": "null" },
932+
{ "type": "string" },
933+
{ "type": "integer" }
934+
]
935+
})"),
936+
"https://example.com/irrelevant-base.json",
937+
dialect::draft7);
938+
939+
auto expected = R"(# (document root)
940+
base uri: https://example.com/root.json
941+
dialect: http://json-schema.org/draft-07/schema#
942+
#/oneOf/0
943+
base uri: https://example.com/root.json
944+
dialect: http://json-schema.org/draft-07/schema#
945+
types: [null]
946+
#/oneOf/1
947+
base uri: https://example.com/root.json
948+
dialect: http://json-schema.org/draft-07/schema#
949+
types: [string]
950+
#/oneOf/2
951+
base uri: https://example.com/root.json
952+
dialect: http://json-schema.org/draft-07/schema#
953+
types: [integer]
954+
)";
955+
956+
ASSERT_EQ(expected, ir_tree_printer::to_string(schema));
957+
ASSERT_EQ(schema.root().one_of().size(), 3);
958+
}
959+
960+
TEST(frontend_test, duplicate_one_of) {
961+
// Not required by json schema but we disallow duplicate keywords to reduce
962+
// ambiguity.
963+
EXPECT_THAT(
964+
[]() {
965+
frontend{}.compile(
966+
parse_json(R"({
967+
"$id": "https://example.com/root.json",
968+
"oneOf": [{ "type": "string" }],
969+
"oneOf": [{ "type": "integer" }]
970+
})"),
971+
"https://example.com/irrelevant-base.json",
972+
dialect::draft7);
973+
},
974+
ThrowsMessage<std::runtime_error>(StrEq("Duplicate keyword: oneOf")));
975+
}
976+
977+
TEST(frontend_test, one_of_empty_array) {
978+
EXPECT_THAT(
979+
[]() {
980+
frontend{}.compile(
981+
parse_json(R"({
982+
"$id": "https://example.com/root.json",
983+
"oneOf": []
984+
})"),
985+
"https://example.com/irrelevant-base.json",
986+
dialect::draft7);
987+
},
988+
ThrowsMessage<std::runtime_error>(
989+
StrEq("The oneOf keyword must be a non-empty array")));
990+
}
991+
992+
TEST(frontend_test, one_of_non_array) {
993+
EXPECT_THAT(
994+
[]() {
995+
frontend{}.compile(
996+
parse_json(R"({
997+
"$id": "https://example.com/root.json",
998+
"oneOf": "not an array"
999+
})"),
1000+
"https://example.com/irrelevant-base.json",
1001+
dialect::draft7);
1002+
},
1003+
ThrowsMessage<std::runtime_error>(
1004+
StrEq("Invalid type for keyword oneOf. Expected one of: [array].")));
1005+
}
1006+
9261007
TEST(frontend_test, non_string_dialect) {
9271008
EXPECT_THAT(
9281009
[]() {

0 commit comments

Comments
 (0)