Skip to content

Commit e009ba5

Browse files
authored
Merge pull request #29592 from nvartolomei/nv/json-schema-non-adjacent-duplicate-keywords
iceberg/json_schema: fix non-adjacent duplicate keywords
2 parents 18fbe06 + cec4bcf commit e009ba5

File tree

3 files changed

+222
-111
lines changed

3 files changed

+222
-111
lines changed

src/v/iceberg/conversion/json_schema/frontend.cc

Lines changed: 169 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@
2121

2222
#include <seastar/util/defer.hh>
2323

24+
#include <algorithm>
2425
#include <array>
2526
#include <memory>
27+
#include <ranges>
2628
#include <unordered_set>
2729
#include <variant>
2830

@@ -42,10 +44,100 @@ dialect dialect_from_schema_id(std::string_view schema_id) {
4244
return details::string_switch_table(schema_id, dialect_by_schema_id);
4345
}
4446

45-
// Prefer using this instead of using `FindMember` directly, as it checks for
46-
// duplicate keywords and throws an exception if a duplicate is found.
47+
constexpr auto banned_keywords = std::to_array({
48+
// Do not allow $dynamicRef keywords as would change the semantics of
49+
// the schema but we haven't implemented them yet.
50+
// Note for implementer: you'll also need to add encoding for fields when the
51+
// subschemas map is built. Make sure to add test cases with refs containing
52+
// characters that need escaping.
53+
"$dynamicRef",
54+
55+
// Not implementing for now for simplicity and because I don't think anyone
56+
// relies on it due to peculiarities of what this keyword does.
57+
// See: https://github.com/json-schema-org/json-schema-spec/issues/867
58+
"default",
59+
60+
// We can't use this in iceberg so don't spend time implementing it for now.
61+
"patternProperties",
62+
"dependencies",
63+
"if",
64+
"then",
65+
"else",
66+
"allOf",
67+
"anyOf",
68+
});
69+
70+
enum class keyword : uint8_t {
71+
schema, // "$schema"
72+
id, // "$id"
73+
id_draft4, // "id"
74+
ref, // "$ref"
75+
type, // "type"
76+
definitions, // "definitions"
77+
properties, // "properties"
78+
additional_properties, // "additionalProperties"
79+
items, // "items"
80+
additional_items, // "additionalItems"
81+
format, // "format"
82+
one_of, // "oneOf"
83+
};
84+
85+
constexpr auto keyword_table
86+
= std::to_array<std::pair<std::string_view, keyword>>({
87+
{"$schema", keyword::schema},
88+
{"$id", keyword::id},
89+
{"id", keyword::id_draft4},
90+
{"$ref", keyword::ref},
91+
{"type", keyword::type},
92+
{"definitions", keyword::definitions},
93+
{"properties", keyword::properties},
94+
{"additionalProperties", keyword::additional_properties},
95+
{"items", keyword::items},
96+
{"additionalItems", keyword::additional_items},
97+
{"format", keyword::format},
98+
{"oneOf", keyword::one_of},
99+
});
100+
101+
// The size assert below references the last enumerator explicitly. If you add
102+
// a new keyword after one_of, update the assert. Together with the ordering
103+
// assert (which guarantees keyword_table[i] maps to enum value i), these two
104+
// checks ensure a 1:1 mapping between the enum and the table.
105+
static_assert(
106+
keyword_table.size() == static_cast<size_t>(keyword::one_of) + 1,
107+
"keyword_table must have an entry for every keyword enum value");
108+
109+
static_assert(
110+
[]() consteval {
111+
for (size_t i = 0; i < keyword_table.size(); ++i) {
112+
if (static_cast<size_t>(keyword_table[i].second) != i) {
113+
return false;
114+
}
115+
}
116+
return true;
117+
}(),
118+
"keyword_table entries must be ordered by enum value");
119+
120+
static_assert(
121+
[]() consteval {
122+
for (const auto& [name, _] : keyword_table) {
123+
for (const auto& banned : banned_keywords) {
124+
if (name == banned) {
125+
return false;
126+
}
127+
}
128+
}
129+
return true;
130+
}(),
131+
"keyword_table and banned_keywords must be disjoint");
132+
133+
constexpr std::string_view keyword_name(keyword kw) {
134+
return keyword_table[static_cast<size_t>(kw)].first;
135+
}
136+
137+
// Indexes known JSON Schema keywords from a single pass over a node's members.
138+
// Detects duplicate and banned keywords during construction.
47139
//
48-
// This is important as user validator might have different behavior for how
140+
// This is important as user validators might have different behavior for how
49141
// duplicate object keys are handled and we might infer a schema which would
50142
// not fit user's data.
51143
//
@@ -65,56 +157,79 @@ dialect dialect_from_schema_id(std::string_view schema_id) {
65157
// Without rejecting schemas with duplicate keywords, we would end up with a
66158
// schema that accepts strings but the user data will contain integers if
67159
// data was validated with i.e. sourcemeta's validator.
68-
const json::Value* find_keyword(
69-
const json::Value& node,
70-
const char* keyword,
71-
std::vector<json_value_type> types = {}) {
72-
auto it = node.FindMember(keyword);
73-
if (it != node.MemberEnd()) {
74-
auto next = std::next(it);
75-
if (next != node.MemberEnd() && next->name == keyword) {
76-
throw std::runtime_error(
77-
fmt::format("Duplicate keyword: {}", keyword));
160+
class keyword_index {
161+
public:
162+
explicit keyword_index(const json::Value& node) {
163+
for (auto it = node.MemberBegin(); it != node.MemberEnd(); ++it) {
164+
std::string_view name{
165+
it->name.GetString(), it->name.GetStringLength()};
166+
auto entry = std::ranges::find(
167+
keyword_table,
168+
name,
169+
&std::pair<std::string_view, keyword>::first);
170+
if (entry != keyword_table.end()) {
171+
auto& slot = index_[static_cast<size_t>(entry->second)];
172+
if (slot != nullptr) {
173+
throw std::runtime_error(
174+
fmt::format("Duplicate keyword: {}", name));
175+
}
176+
slot = &it->value;
177+
} else if (std::ranges::contains(banned_keywords, name)) {
178+
throw std::runtime_error(
179+
fmt::format("The {} keyword is not allowed", name));
180+
}
181+
}
182+
}
183+
184+
const json::Value*
185+
find(keyword kw, std::initializer_list<json_value_type> types = {}) const {
186+
auto* val = index_[static_cast<size_t>(kw)];
187+
if (!val) {
188+
return nullptr;
78189
}
79-
if (!types.empty()) {
190+
if (types.size() > 0) {
80191
bool valid_type = false;
81192
for (const auto& type : types) {
82193
switch (type) {
83194
case json_value_type::object:
84-
valid_type |= it->value.IsObject();
195+
valid_type |= val->IsObject();
85196
break;
86197
case json_value_type::array:
87-
valid_type |= it->value.IsArray();
198+
valid_type |= val->IsArray();
88199
break;
89200
case json_value_type::string:
90-
valid_type |= it->value.IsString();
201+
valid_type |= val->IsString();
91202
break;
92203
case json_value_type::boolean:
93-
valid_type |= it->value.IsBool();
204+
valid_type |= val->IsBool();
94205
break;
95206
case json_value_type::integer:
96-
valid_type |= it->value.IsInt() || it->value.IsInt64();
207+
valid_type |= val->IsInt() || val->IsInt64();
97208
break;
98209
case json_value_type::number:
99-
valid_type |= it->value.IsNumber();
210+
valid_type |= val->IsNumber();
100211
break;
101212
case json_value_type::null:
102-
valid_type |= it->value.IsNull();
213+
valid_type |= val->IsNull();
103214
break;
104215
}
105216
}
106217
if (!valid_type) {
107218
throw std::runtime_error(
108219
fmt::format(
109220
"Invalid type for keyword {}. Expected one of: [{}].",
110-
keyword,
221+
keyword_name(kw),
111222
fmt::join(types, ", ")));
112223
}
113224
}
114-
return &it->value;
225+
return val;
115226
}
116-
return nullptr;
117-
}
227+
228+
private:
229+
/// Pointers to the values of keywords in the same order as the `keyword`
230+
/// enum. `nullptr` means the keyword is not present in the indexed node.
231+
std::array<const json::Value*, keyword_table.size()> index_{};
232+
};
118233

119234
class compile_context {
120235
public:
@@ -238,13 +353,16 @@ class compile_context {
238353
size_t depth_{0};
239354
};
240355

241-
bool maybe_push_context(compile_context& ctx, const json::Value& node) {
356+
bool maybe_push_context(
357+
compile_context& ctx,
358+
const json::Value& node,
359+
const keyword_index& keywords) {
242360
// Identify the dialect first so we know which keyword is used for base URI,
243361
// i.e. `id` (draft-04) or `$id` (draft-6).
244-
auto dialect_at_node = [&ctx, &node]() {
362+
auto dialect_at_node = [&ctx, &keywords]() {
245363
if (
246-
auto dialect_node = find_keyword(
247-
node, "$schema", {json_value_type::string})) {
364+
auto dialect_node = keywords.find(
365+
keyword::schema, {json_value_type::string})) {
248366
return dialect_from_schema_id(dialect_node->GetString());
249367
} else {
250368
// If the $schema keyword is not present, we use the dialect of the
@@ -258,10 +376,9 @@ bool maybe_push_context(compile_context& ctx, const json::Value& node) {
258376
fmt::format("Unsupported JSON Schema dialect: {}", dialect_at_node));
259377
}
260378

261-
auto id_keyword = dialect_at_node == dialect::draft4 ? "id" : "$id";
262-
if (
263-
auto id_node = find_keyword(
264-
node, id_keyword, {json_value_type::string})) {
379+
auto id_kw = dialect_at_node == dialect::draft4 ? keyword::id_draft4
380+
: keyword::id;
381+
if (auto id_node = keywords.find(id_kw, {json_value_type::string})) {
265382
// The $id keyword starts a new resource context.
266383
ctx.push(
267384
parse_base_uri(id_node->GetString()).Resolve(ctx.base_uri()),
@@ -279,29 +396,6 @@ bool maybe_push_context(compile_context& ctx, const json::Value& node) {
279396
}
280397
}
281398

282-
constexpr auto banned_keywords = std::to_array({
283-
// Do not allow $dynamicRef keywords as would change the semantics of
284-
// the schema but we haven't implemented them yet.
285-
// Note for implementer: you'll also need to add encoding for fields when the
286-
// subschemas map is built. Make sure to add test cases with refs containing
287-
// characters that need escaping.
288-
"$dynamicRef",
289-
290-
// Not implementing for now for simplicity and because I don't think anyone
291-
// relies on it due to peculiarities of what this keyword does.
292-
// See: https://github.com/json-schema-org/json-schema-spec/issues/867
293-
"default",
294-
295-
// We can't use this in iceberg so don't spend time implementing it for now.
296-
"patternProperties",
297-
"dependencies",
298-
"if",
299-
"then",
300-
"else",
301-
"allOf",
302-
"anyOf",
303-
});
304-
305399
}; // namespace
306400

307401
class frontend::frontend_impl {
@@ -441,9 +535,11 @@ class frontend::frontend_impl {
441535
return sub;
442536
}
443537

538+
keyword_index keywords(node);
539+
444540
if (
445-
const auto ref_node = find_keyword(
446-
node, "$ref", {json_value_type::string})) {
541+
const auto ref_node = keywords.find(
542+
keyword::ref, {json_value_type::string})) {
447543
if (ctx.empty()) {
448544
throw std::runtime_error(
449545
"$ref at the root of the schema in draft-07 is not allowed "
@@ -456,26 +552,18 @@ class frontend::frontend_impl {
456552
return sub;
457553
}
458554

459-
auto new_ctx = maybe_push_context(ctx, node);
555+
auto new_ctx = maybe_push_context(ctx, node, keywords);
460556

461557
auto sub = new_ctx
462558
? ss::dynamic_pointer_cast<subschema>(ctx.top().schema())
463559
: ss::make_shared<subschema>();
464560

465-
// Check for banned keywords.
466-
for (const auto& keyword : banned_keywords) {
467-
if (node.HasMember(keyword)) {
468-
throw std::runtime_error(
469-
fmt::format("The {} keyword is not allowed", keyword));
470-
}
471-
}
472-
473-
if (auto types_node = find_keyword(node, "type")) {
561+
if (auto types_node = keywords.find(keyword::type)) {
474562
compile_types(ctx, *sub, *types_node);
475563
}
476564

477-
if (const auto defs_node = find_keyword(
478-
node, "definitions", {json_value_type::object});
565+
if (const auto defs_node = keywords.find(
566+
keyword::definitions, {json_value_type::object});
479567
new_ctx && defs_node) {
480568
for (const auto& [k, v] : defs_node->GetObject()) {
481569
if (!v.IsObject()) {
@@ -495,26 +583,27 @@ class frontend::frontend_impl {
495583
}
496584

497585
if (
498-
auto props = find_keyword(
499-
node, "properties", {json_value_type::object})) {
586+
auto props = keywords.find(
587+
keyword::properties, {json_value_type::object})) {
500588
for (const auto& [k, v] : props->GetObject()) {
501589
compile_property(ctx, *sub, k.GetString(), v);
502590
}
503591
}
504592

505593
if (
506-
const auto additional_properties_node = find_keyword(
507-
node, "additionalProperties")) {
594+
const auto additional_properties_node = keywords.find(
595+
keyword::additional_properties)) {
508596
compile_additional_properties(
509597
ctx, *sub, *additional_properties_node);
510598
}
511599

512-
if (auto items_node = find_keyword(node, "items")) {
600+
if (auto items_node = keywords.find(keyword::items)) {
513601
compile_items(ctx, *sub, *items_node);
514602
}
515603

516604
if (
517-
const auto additional_items = find_keyword(node, "additionalItems")) {
605+
const auto additional_items = keywords.find(
606+
keyword::additional_items)) {
518607
sub->additional_items_ = compile_subschema(ctx, *additional_items);
519608
auto [it, inserted] = sub->subschemas_.emplace(
520609
"additionalItems", sub->additional_items_);
@@ -525,14 +614,14 @@ class frontend::frontend_impl {
525614
}
526615

527616
if (
528-
const auto one_of_node = find_keyword(
529-
node, "oneOf", {json_value_type::array})) {
617+
const auto one_of_node = keywords.find(
618+
keyword::one_of, {json_value_type::array})) {
530619
compile_one_of(ctx, *sub, *one_of_node);
531620
}
532621

533622
if (
534-
const auto format_node = find_keyword(
535-
node, "format", {json_value_type::string})) {
623+
const auto format_node = keywords.find(
624+
keyword::format, {json_value_type::string})) {
536625
if (unlikely(ctx.dialect() != dialect::draft7)) {
537626
// When support for more dialects is added format parsing should
538627
// be revisited. We should explicitly handle all known formats

0 commit comments

Comments
 (0)