2121
2222#include < seastar/util/defer.hh>
2323
24+ #include < algorithm>
2425#include < array>
2526#include < memory>
27+ #include < ranges>
2628#include < unordered_set>
2729#include < variant>
2830
@@ -42,10 +44,100 @@ dialect dialect_from_schema_id(std::string_view schema_id) {
4244 return details::string_switch_table (schema_id, dialect_by_schema_id);
4345}
4446
45- // Prefer using this instead of using `FindMember` directly, as it checks for
46- // duplicate keywords and throws an exception if a duplicate is found.
47+ constexpr auto banned_keywords = std::to_array({
48+ // Do not allow $dynamicRef keywords as would change the semantics of
49+ // the schema but we haven't implemented them yet.
50+ // Note for implementer: you'll also need to add encoding for fields when the
51+ // subschemas map is built. Make sure to add test cases with refs containing
52+ // characters that need escaping.
53+ " $dynamicRef" ,
54+
55+ // Not implementing for now for simplicity and because I don't think anyone
56+ // relies on it due to peculiarities of what this keyword does.
57+ // See: https://github.com/json-schema-org/json-schema-spec/issues/867
58+ " default" ,
59+
60+ // We can't use this in iceberg so don't spend time implementing it for now.
61+ " patternProperties" ,
62+ " dependencies" ,
63+ " if" ,
64+ " then" ,
65+ " else" ,
66+ " allOf" ,
67+ " anyOf" ,
68+ });
69+
70+ enum class keyword : uint8_t {
71+ schema, // "$schema"
72+ id, // "$id"
73+ id_draft4, // "id"
74+ ref, // "$ref"
75+ type, // "type"
76+ definitions, // "definitions"
77+ properties, // "properties"
78+ additional_properties, // "additionalProperties"
79+ items, // "items"
80+ additional_items, // "additionalItems"
81+ format, // "format"
82+ one_of, // "oneOf"
83+ };
84+
85+ constexpr auto keyword_table
86+ = std::to_array<std::pair<std::string_view, keyword>>({
87+ {" $schema" , keyword::schema},
88+ {" $id" , keyword::id},
89+ {" id" , keyword::id_draft4},
90+ {" $ref" , keyword::ref},
91+ {" type" , keyword::type},
92+ {" definitions" , keyword::definitions},
93+ {" properties" , keyword::properties},
94+ {" additionalProperties" , keyword::additional_properties},
95+ {" items" , keyword::items},
96+ {" additionalItems" , keyword::additional_items},
97+ {" format" , keyword::format},
98+ {" oneOf" , keyword::one_of},
99+ });
100+
101+ // The size assert below references the last enumerator explicitly. If you add
102+ // a new keyword after one_of, update the assert. Together with the ordering
103+ // assert (which guarantees keyword_table[i] maps to enum value i), these two
104+ // checks ensure a 1:1 mapping between the enum and the table.
105+ static_assert (
106+ keyword_table.size() == static_cast <size_t >(keyword::one_of) + 1 ,
107+ " keyword_table must have an entry for every keyword enum value" );
108+
109+ static_assert (
110+ []() consteval {
111+ for (size_t i = 0 ; i < keyword_table.size (); ++i) {
112+ if (static_cast <size_t >(keyword_table[i].second ) != i) {
113+ return false ;
114+ }
115+ }
116+ return true ;
117+ }(),
118+ " keyword_table entries must be ordered by enum value" );
119+
120+ static_assert (
121+ []() consteval {
122+ for (const auto & [name, _] : keyword_table) {
123+ for (const auto & banned : banned_keywords) {
124+ if (name == banned) {
125+ return false ;
126+ }
127+ }
128+ }
129+ return true ;
130+ }(),
131+ " keyword_table and banned_keywords must be disjoint" );
132+
133+ constexpr std::string_view keyword_name (keyword kw) {
134+ return keyword_table[static_cast <size_t >(kw)].first ;
135+ }
136+
137+ // Indexes known JSON Schema keywords from a single pass over a node's members.
138+ // Detects duplicate and banned keywords during construction.
47139//
48- // This is important as user validator might have different behavior for how
140+ // This is important as user validators might have different behavior for how
49141// duplicate object keys are handled and we might infer a schema which would
50142// not fit user's data.
51143//
@@ -65,56 +157,79 @@ dialect dialect_from_schema_id(std::string_view schema_id) {
65157// Without rejecting schemas with duplicate keywords, we would end up with a
66158// schema that accepts strings but the user data will contain integers if
67159// data was validated with i.e. sourcemeta's validator.
68- const json::Value* find_keyword (
69- const json::Value& node,
70- const char * keyword,
71- std::vector<json_value_type> types = {}) {
72- auto it = node.FindMember (keyword);
73- if (it != node.MemberEnd ()) {
74- auto next = std::next (it);
75- if (next != node.MemberEnd () && next->name == keyword) {
76- throw std::runtime_error (
77- fmt::format (" Duplicate keyword: {}" , keyword));
160+ class keyword_index {
161+ public:
162+ explicit keyword_index (const json::Value& node) {
163+ for (auto it = node.MemberBegin (); it != node.MemberEnd (); ++it) {
164+ std::string_view name{
165+ it->name .GetString (), it->name .GetStringLength ()};
166+ auto entry = std::ranges::find (
167+ keyword_table,
168+ name,
169+ &std::pair<std::string_view, keyword>::first);
170+ if (entry != keyword_table.end ()) {
171+ auto & slot = index_[static_cast <size_t >(entry->second )];
172+ if (slot != nullptr ) {
173+ throw std::runtime_error (
174+ fmt::format (" Duplicate keyword: {}" , name));
175+ }
176+ slot = &it->value ;
177+ } else if (std::ranges::contains (banned_keywords, name)) {
178+ throw std::runtime_error (
179+ fmt::format (" The {} keyword is not allowed" , name));
180+ }
181+ }
182+ }
183+
184+ const json::Value*
185+ find (keyword kw, std::initializer_list<json_value_type> types = {}) const {
186+ auto * val = index_[static_cast <size_t >(kw)];
187+ if (!val) {
188+ return nullptr ;
78189 }
79- if (! types.empty () ) {
190+ if (types.size () > 0 ) {
80191 bool valid_type = false ;
81192 for (const auto & type : types) {
82193 switch (type) {
83194 case json_value_type::object:
84- valid_type |= it-> value . IsObject ();
195+ valid_type |= val-> IsObject ();
85196 break ;
86197 case json_value_type::array:
87- valid_type |= it-> value . IsArray ();
198+ valid_type |= val-> IsArray ();
88199 break ;
89200 case json_value_type::string:
90- valid_type |= it-> value . IsString ();
201+ valid_type |= val-> IsString ();
91202 break ;
92203 case json_value_type::boolean:
93- valid_type |= it-> value . IsBool ();
204+ valid_type |= val-> IsBool ();
94205 break ;
95206 case json_value_type::integer:
96- valid_type |= it-> value . IsInt () || it-> value . IsInt64 ();
207+ valid_type |= val-> IsInt () || val-> IsInt64 ();
97208 break ;
98209 case json_value_type::number:
99- valid_type |= it-> value . IsNumber ();
210+ valid_type |= val-> IsNumber ();
100211 break ;
101212 case json_value_type::null:
102- valid_type |= it-> value . IsNull ();
213+ valid_type |= val-> IsNull ();
103214 break ;
104215 }
105216 }
106217 if (!valid_type) {
107218 throw std::runtime_error (
108219 fmt::format (
109220 " Invalid type for keyword {}. Expected one of: [{}]." ,
110- keyword ,
221+ keyword_name (kw) ,
111222 fmt::join (types, " , " )));
112223 }
113224 }
114- return &it-> value ;
225+ return val ;
115226 }
116- return nullptr ;
117- }
227+
228+ private:
229+ // / Pointers to the values of keywords in the same order as the `keyword`
230+ // / enum. `nullptr` means the keyword is not present in the indexed node.
231+ std::array<const json::Value*, keyword_table.size()> index_{};
232+ };
118233
119234class compile_context {
120235public:
@@ -238,13 +353,16 @@ class compile_context {
238353 size_t depth_{0 };
239354};
240355
241- bool maybe_push_context (compile_context& ctx, const json::Value& node) {
356+ bool maybe_push_context (
357+ compile_context& ctx,
358+ const json::Value& node,
359+ const keyword_index& keywords) {
242360 // Identify the dialect first so we know which keyword is used for base URI,
243361 // i.e. `id` (draft-04) or `$id` (draft-6).
244- auto dialect_at_node = [&ctx, &node ]() {
362+ auto dialect_at_node = [&ctx, &keywords ]() {
245363 if (
246- auto dialect_node = find_keyword (
247- node, " $ schema" , {json_value_type::string})) {
364+ auto dialect_node = keywords. find (
365+ keyword:: schema, {json_value_type::string})) {
248366 return dialect_from_schema_id (dialect_node->GetString ());
249367 } else {
250368 // If the $schema keyword is not present, we use the dialect of the
@@ -258,10 +376,9 @@ bool maybe_push_context(compile_context& ctx, const json::Value& node) {
258376 fmt::format (" Unsupported JSON Schema dialect: {}" , dialect_at_node));
259377 }
260378
261- auto id_keyword = dialect_at_node == dialect::draft4 ? " id" : " $id" ;
262- if (
263- auto id_node = find_keyword (
264- node, id_keyword, {json_value_type::string})) {
379+ auto id_kw = dialect_at_node == dialect::draft4 ? keyword::id_draft4
380+ : keyword::id;
381+ if (auto id_node = keywords.find (id_kw, {json_value_type::string})) {
265382 // The $id keyword starts a new resource context.
266383 ctx.push (
267384 parse_base_uri (id_node->GetString ()).Resolve (ctx.base_uri ()),
@@ -279,29 +396,6 @@ bool maybe_push_context(compile_context& ctx, const json::Value& node) {
279396 }
280397}
281398
282- constexpr auto banned_keywords = std::to_array({
283- // Do not allow $dynamicRef keywords as would change the semantics of
284- // the schema but we haven't implemented them yet.
285- // Note for implementer: you'll also need to add encoding for fields when the
286- // subschemas map is built. Make sure to add test cases with refs containing
287- // characters that need escaping.
288- " $dynamicRef" ,
289-
290- // Not implementing for now for simplicity and because I don't think anyone
291- // relies on it due to peculiarities of what this keyword does.
292- // See: https://github.com/json-schema-org/json-schema-spec/issues/867
293- " default" ,
294-
295- // We can't use this in iceberg so don't spend time implementing it for now.
296- " patternProperties" ,
297- " dependencies" ,
298- " if" ,
299- " then" ,
300- " else" ,
301- " allOf" ,
302- " anyOf" ,
303- });
304-
305399}; // namespace
306400
307401class frontend ::frontend_impl {
@@ -441,9 +535,11 @@ class frontend::frontend_impl {
441535 return sub;
442536 }
443537
538+ keyword_index keywords (node);
539+
444540 if (
445- const auto ref_node = find_keyword (
446- node, " $ ref" , {json_value_type::string})) {
541+ const auto ref_node = keywords. find (
542+ keyword:: ref, {json_value_type::string})) {
447543 if (ctx.empty ()) {
448544 throw std::runtime_error (
449545 " $ref at the root of the schema in draft-07 is not allowed "
@@ -456,26 +552,18 @@ class frontend::frontend_impl {
456552 return sub;
457553 }
458554
459- auto new_ctx = maybe_push_context (ctx, node);
555+ auto new_ctx = maybe_push_context (ctx, node, keywords );
460556
461557 auto sub = new_ctx
462558 ? ss::dynamic_pointer_cast<subschema>(ctx.top ().schema ())
463559 : ss::make_shared<subschema>();
464560
465- // Check for banned keywords.
466- for (const auto & keyword : banned_keywords) {
467- if (node.HasMember (keyword)) {
468- throw std::runtime_error (
469- fmt::format (" The {} keyword is not allowed" , keyword));
470- }
471- }
472-
473- if (auto types_node = find_keyword (node, " type" )) {
561+ if (auto types_node = keywords.find (keyword::type)) {
474562 compile_types (ctx, *sub, *types_node);
475563 }
476564
477- if (const auto defs_node = find_keyword (
478- node, " definitions" , {json_value_type::object});
565+ if (const auto defs_node = keywords. find (
566+ keyword:: definitions, {json_value_type::object});
479567 new_ctx && defs_node) {
480568 for (const auto & [k, v] : defs_node->GetObject ()) {
481569 if (!v.IsObject ()) {
@@ -495,26 +583,27 @@ class frontend::frontend_impl {
495583 }
496584
497585 if (
498- auto props = find_keyword (
499- node, " properties" , {json_value_type::object})) {
586+ auto props = keywords. find (
587+ keyword:: properties, {json_value_type::object})) {
500588 for (const auto & [k, v] : props->GetObject ()) {
501589 compile_property (ctx, *sub, k.GetString (), v);
502590 }
503591 }
504592
505593 if (
506- const auto additional_properties_node = find_keyword (
507- node, " additionalProperties " )) {
594+ const auto additional_properties_node = keywords. find (
595+ keyword::additional_properties )) {
508596 compile_additional_properties (
509597 ctx, *sub, *additional_properties_node);
510598 }
511599
512- if (auto items_node = find_keyword (node, " items" )) {
600+ if (auto items_node = keywords. find (keyword:: items)) {
513601 compile_items (ctx, *sub, *items_node);
514602 }
515603
516604 if (
517- const auto additional_items = find_keyword (node, " additionalItems" )) {
605+ const auto additional_items = keywords.find (
606+ keyword::additional_items)) {
518607 sub->additional_items_ = compile_subschema (ctx, *additional_items);
519608 auto [it, inserted] = sub->subschemas_ .emplace (
520609 " additionalItems" , sub->additional_items_ );
@@ -525,14 +614,14 @@ class frontend::frontend_impl {
525614 }
526615
527616 if (
528- const auto one_of_node = find_keyword (
529- node, " oneOf " , {json_value_type::array})) {
617+ const auto one_of_node = keywords. find (
618+ keyword::one_of , {json_value_type::array})) {
530619 compile_one_of (ctx, *sub, *one_of_node);
531620 }
532621
533622 if (
534- const auto format_node = find_keyword (
535- node, " format" , {json_value_type::string})) {
623+ const auto format_node = keywords. find (
624+ keyword:: format, {json_value_type::string})) {
536625 if (unlikely (ctx.dialect () != dialect::draft7)) {
537626 // When support for more dialects is added format parsing should
538627 // be revisited. We should explicitly handle all known formats
0 commit comments