diff --git a/src/dom/dom_indexer.cc b/src/dom/dom_indexer.cc index a8afc95..18509c0 100644 --- a/src/dom/dom_indexer.cc +++ b/src/dom/dom_indexer.cc @@ -37,4 +37,9 @@ std::optional DOMIndexer::GetNodesByClass(std::string_view class_name) return it != class_index_.end() ? std::make_optional(it->second) : std::nullopt; } +std::optional DOMIndexer::GetNodesByAttribute(std::string_view attribute_name) const { + auto it = attr_index_.find(std::string(attribute_name)); + return it != attr_index_.end() ? std::make_optional(it->second) : std::nullopt; +} + } // namespace arboris diff --git a/src/dom/dom_indexer.hpp b/src/dom/dom_indexer.hpp index 3322b1b..59a379e 100644 --- a/src/dom/dom_indexer.hpp +++ b/src/dom/dom_indexer.hpp @@ -30,12 +30,15 @@ class DOMIndexer { [[nodiscard]] NodePtr GetNodeById(std::string_view id) const; [[nodiscard]] std::optional GetNodesByTag(Tag tag) const; [[nodiscard]] std::optional GetNodesByClass(std::string_view class_name) const; + [[nodiscard]] std::optional GetNodesByAttribute(std::string_view attribute_name) const; private: // TODO(team): consider using std::list instead of std::vector for indexes std::unordered_map id_index_; std::unordered_map tag_index_; std::unordered_map class_index_; + + // TODO(team): consider indexing by value instead of name std::unordered_map attr_index_; }; diff --git a/src/dom/dom_query.cc b/src/dom/dom_query.cc index 7ca7a77..e8ae0a9 100644 --- a/src/dom/dom_query.cc +++ b/src/dom/dom_query.cc @@ -8,17 +8,31 @@ #include #include +#include #include +#include + +#include "utils/set_utils.hpp" namespace arboris { std::optional DOMQuery::Find(const QueryOptions& options) const { // TODO(team): Implement this - return DOMQuery(root_, dom_indexer_); + auto candidates = searchCandidatesFromIndexer(options); + if (candidates.empty()) { + return std::nullopt; + } + + for (const auto& candidate : candidates) { + if (matchAllConditions(candidate, options)) { + return DOMQuery(*candidate, dom_indexer_); + } + } + + return std::nullopt; } std::optional DOMQuery::Find(const std::string& id) const { - // TODO(team): Implement this NodePtr node = dom_indexer_.get().GetNodeById(id); if (node) { return DOMQuery(*node, dom_indexer_); @@ -30,9 +44,86 @@ std::optional DOMQuery::Find(const std::string& id) const { std::vector DOMQuery::FindAll(const QueryOptions& options) const { std::vector ret; - const auto& tag_filtered_list = dom_indexer_.get().GetNodesByTag(options.tag.value()); - + auto candidates = searchCandidatesFromIndexer(options); + for (const auto& candidate : candidates) { + if (matchAllConditions(candidate, options)) { + ret.push_back(DOMQuery(*candidate, dom_indexer_)); + } + } return ret; } +NodeList DOMQuery::searchCandidatesFromIndexer(const QueryOptions& options) const { + std::size_t min_size = std::numeric_limits::max(); + NodeList min_candidates; + + if (options.tag.has_value()) { + auto nodes = dom_indexer_.get().GetNodesByTag(options.tag.value()); + if (nodes.has_value()) { + if (nodes->size() < min_size) { + min_size = nodes->size(); + min_candidates = std::move(*nodes); + } + } + } + + if (options.classes.has_value()) { + for (const auto& class_name : *options.classes) { + auto nodes = dom_indexer_.get().GetNodesByClass(class_name); + if (nodes.has_value()) { + if (nodes->size() < min_size) { + min_size = nodes->size(); + min_candidates = std::move(*nodes); + } + } + } + } + + if (options.attributes.has_value()) { + for (const auto& [attribute_name, _] : options.attributes.value()) { + auto nodes = dom_indexer_.get().GetNodesByAttribute(attribute_name); + if (nodes.has_value()) { + if (nodes->size() < min_size) { + min_size = nodes->size(); + min_candidates = std::move(*nodes); + } + } + } + } + + if (min_candidates.empty()) { + return {}; + } + + return min_candidates; +} + +bool DOMQuery::matchAllConditions(const NodePtr& node, const QueryOptions& options) const { + if (!isSubNode(node)) { + return false; + } + + if (options.tag && node->tag() != options.tag.value()) { + return false; + } + + if (options.classes && !IsSubset(options.classes.value(), node->classes())) { + return false; + } + + if (options.attributes && !IsSubset(options.attributes.value(), node->attributes())) { + return false; + } + // TODO(team): Implement text condition matching + return true; +} + +bool DOMQuery::isSubNode(const NodePtr& node) const { + const uint32_t node_in = node->in(); + const uint32_t node_out = node->out(); + const uint32_t root_in = root_.get().in(); + const uint32_t root_out = root_.get().out(); + return node_in >= root_in && node_out <= root_out; +} + } // namespace arboris diff --git a/src/dom/dom_query.hpp b/src/dom/dom_query.hpp index 5daa6a3..cf8d910 100644 --- a/src/dom/dom_query.hpp +++ b/src/dom/dom_query.hpp @@ -39,8 +39,12 @@ class DOMQuery { std::vector FindAll(const QueryOptions& options) const; private: - std::reference_wrapper root_; - std::reference_wrapper dom_indexer_; + NodeList searchCandidatesFromIndexer(const QueryOptions& options) const; + bool matchAllConditions(const NodePtr& node, const QueryOptions& options) const; + inline bool isSubNode(const NodePtr& node) const; + + std::reference_wrapper root_; + std::reference_wrapper dom_indexer_; }; } // namespace arboris diff --git a/src/dom/tag_node.hpp b/src/dom/tag_node.hpp index e6878f0..2fc72de 100644 --- a/src/dom/tag_node.hpp +++ b/src/dom/tag_node.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -29,11 +30,11 @@ class TagNode final : public BaseNode { return children_; } - [[nodiscard]] const std::unordered_map& attributes() const noexcept { + [[nodiscard]] const AttributeMap& attributes() const noexcept { return html_token_.attributes; } - [[nodiscard]] const std::vector& classes() const noexcept { + [[nodiscard]] const ClassSet& classes() const noexcept { return html_token_.classes; } diff --git a/src/utils/html_tokens.hpp b/src/utils/html_tokens.hpp index b7b8b2c..9b92feb 100644 --- a/src/utils/html_tokens.hpp +++ b/src/utils/html_tokens.hpp @@ -9,14 +9,18 @@ #include #include -#include #include +#include +#include #include "utils/tag.hpp" #include "utils/tokens.hpp" namespace arboris { +using AttributeMap = std::unordered_map>; +using ClassSet = std::unordered_set; + struct BaseHtmlToken : public BaseToken {}; struct HtmlToken : public BaseHtmlToken { @@ -24,8 +28,8 @@ struct HtmlToken : public BaseHtmlToken { bool is_void_tag = false; // TODO(team): Consider using string_views with an external string pool - std::unordered_map attributes; - std::vector classes; + AttributeMap attributes; + ClassSet classes; std::string id; }; diff --git a/src/utils/query_options.hpp b/src/utils/query_options.hpp index 180b3ec..650ea82 100644 --- a/src/utils/query_options.hpp +++ b/src/utils/query_options.hpp @@ -8,13 +8,14 @@ #define SRC_UTILS_QUERY_OPTIONS_HPP_ #include -#include +#include #include #include #include #include #include "utils/tag.hpp" +#include "utils/html_tokens.hpp" namespace arboris { @@ -40,8 +41,8 @@ class TextQueryCondition { struct QueryOptions { std::optional tag; - std::optional> classes; - std::optional>> attributes; + std::optional classes; + std::optional attributes; std::optional text; }; diff --git a/src/utils/set_utils.hpp b/src/utils/set_utils.hpp new file mode 100644 index 0000000..6ff83ac --- /dev/null +++ b/src/utils/set_utils.hpp @@ -0,0 +1,48 @@ +/* + * Copyright 2025 Team Arboris + * Licensed under the Apache License, Version 2.0 + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#ifndef SRC_UTILS_SET_UTILS_HPP_ +#define SRC_UTILS_SET_UTILS_HPP_ + +#include +#include + +namespace arboris { + +template +bool IsSubset(const std::unordered_set& subset, + const std::unordered_set& super_set) { + if (subset.size() > super_set.size()) { + return false; + } + + for (const auto& x : subset) { + // std::unordered_set::contains is available in C++20 + if (!super_set.contains(x)) { + return false; + } + } + return true; +} + +template +bool IsSubset(const std::unordered_map>& subset, + const std::unordered_map>& super_set) { + for (const auto& [key, value_set] : subset) { + auto it = super_set.find(key); + if (it == super_set.end()) { + return false; + } + if (!IsSubset(value_set, it->second)) { + return false; + } + } + return true; +} + +} // namespace arboris + +#endif // SRC_UTILS_SET_UTILS_HPP_