Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/dom/dom_indexer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,9 @@ std::optional<NodeList> DOMIndexer::GetNodesByClass(std::string_view class_name)
return it != class_index_.end() ? std::make_optional(it->second) : std::nullopt;
}

std::optional<NodeList> DOMIndexer::GetNodesByAttribute(std::string_view attribute_name) const {
auto it = attr_index_.find(std::string(attribute_name));
return it != attr_index_.end() ? std::make_optional(it->second) : std::nullopt;
}

} // namespace arboris
3 changes: 3 additions & 0 deletions src/dom/dom_indexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,15 @@ class DOMIndexer {
[[nodiscard]] NodePtr GetNodeById(std::string_view id) const;
[[nodiscard]] std::optional<NodeList> GetNodesByTag(Tag tag) const;
[[nodiscard]] std::optional<NodeList> GetNodesByClass(std::string_view class_name) const;
[[nodiscard]] std::optional<NodeList> GetNodesByAttribute(std::string_view attribute_name) const;

private:
// TODO(team): consider using std::list instead of std::vector for indexes
std::unordered_map<std::string, NodePtr> id_index_;
std::unordered_map<Tag, NodeList> tag_index_;
std::unordered_map<std::string, NodeList> class_index_;

// TODO(team): consider indexing by value instead of name
std::unordered_map<std::string, NodeList> attr_index_;
};

Expand Down
99 changes: 95 additions & 4 deletions src/dom/dom_query.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,31 @@

#include <optional>
#include <string>
#include <limits>
#include <vector>
#include <utility>

#include "utils/set_utils.hpp"

namespace arboris {

// Returns a query rooted at the first indexed candidate that satisfies every
// condition in `options`, or std::nullopt when no candidate matches.
//
// TODO(review): review feedback on this change suggested narrowing the
// candidates progressively inside searchCandidatesFromIndexer (start from the
// tag list, then filter that list by class, and so on) rather than re-checking
// every condition here. The result may need some polishing to stay readable.
std::optional<DOMQuery> DOMQuery::Find(const QueryOptions& options) const {
  const auto candidates = searchCandidatesFromIndexer(options);

  // An empty candidate list skips the loop and falls through to std::nullopt.
  for (const auto& candidate : candidates) {
    if (matchAllConditions(candidate, options)) {
      return DOMQuery(*candidate, dom_indexer_);
    }
  }
  return std::nullopt;
}

std::optional<DOMQuery> DOMQuery::Find(const std::string& id) const {
// TODO(team): Implement this
NodePtr node = dom_indexer_.get().GetNodeById(id);
if (node) {
return DOMQuery(*node, dom_indexer_);
Expand All @@ -30,9 +44,86 @@ std::optional<DOMQuery> DOMQuery::Find(const std::string& id) const {
// Builds one query per indexed candidate that satisfies every condition in
// `options`. Returns an empty vector when nothing matches.
std::vector<DOMQuery> DOMQuery::FindAll(const QueryOptions& options) const {
  std::vector<DOMQuery> ret;

  // options.tag is optional, so it must not be read with value() unguarded:
  // that throws std::bad_optional_access for tag-less queries. Candidate
  // selection is delegated entirely to searchCandidatesFromIndexer.
  const auto candidates = searchCandidatesFromIndexer(options);
  for (const auto& candidate : candidates) {
    if (matchAllConditions(candidate, options)) {
      ret.push_back(DOMQuery(*candidate, dom_indexer_));
    }
  }
  return ret;
}

// Picks the smallest node list among the index lookups requested by `options`
// (one tag lookup, one lookup per class, one lookup per attribute name).
//
// The result is only a candidate set: nothing is filtered here, so callers
// still have to verify each node (see matchAllConditions). Lookups that miss
// the index are ignored. Returns an empty list when no lookup produced nodes,
// which includes the case where `options` sets no tag, classes or attributes.
NodeList DOMQuery::searchCandidatesFromIndexer(const QueryOptions& options) const {
  const auto& indexer = dom_indexer_.get();
  std::size_t min_size = std::numeric_limits<std::size_t>::max();
  NodeList min_candidates;

  // Keeps `nodes` only when the lookup hit and the list is strictly smaller
  // than the best one seen so far.
  const auto keep_if_smaller = [&min_size, &min_candidates](std::optional<NodeList> nodes) {
    if (nodes.has_value() && nodes->size() < min_size) {
      min_size = nodes->size();
      min_candidates = std::move(*nodes);
    }
  };

  if (options.tag.has_value()) {
    keep_if_smaller(indexer.GetNodesByTag(*options.tag));
  }

  if (options.classes.has_value()) {
    for (const auto& class_name : *options.classes) {
      keep_if_smaller(indexer.GetNodesByClass(class_name));
    }
  }

  if (options.attributes.has_value()) {
    // Only the attribute name is used for the lookup; values are not consulted here.
    for (const auto& attribute : *options.attributes) {
      keep_if_smaller(indexer.GetNodesByAttribute(attribute.first));
    }
  }

  return min_candidates;
}

// Returns true only when `node` passes isSubNode and satisfies every tag,
// class and attribute condition that `options` actually sets. Conditions left
// unset in `options` are not checked.
bool DOMQuery::matchAllConditions(const NodePtr& node, const QueryOptions& options) const {
  const bool outside_root = !isSubNode(node);
  if (outside_root) {
    return false;
  }

  const bool tag_mismatch = options.tag.has_value() && node->tag() != *options.tag;
  if (tag_mismatch) {
    return false;
  }

  // Every requested class must be present on the node.
  const bool classes_missing =
      options.classes.has_value() && !IsSubset(*options.classes, node->classes());
  if (classes_missing) {
    return false;
  }

  // Every requested attribute (name and values) must be present on the node.
  const bool attributes_missing =
      options.attributes.has_value() && !IsSubset(*options.attributes, node->attributes());
  // TODO(team): Implement text condition matching
  return !attributes_missing;
}

bool DOMQuery::isSubNode(const NodePtr& node) const {
const uint32_t node_in = node->in();
const uint32_t node_out = node->out();
const uint32_t root_in = root_.get().in();
const uint32_t root_out = root_.get().out();
return node_in >= root_in && node_out <= root_out;
}

} // namespace arboris
8 changes: 6 additions & 2 deletions src/dom/dom_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,12 @@ class DOMQuery {
std::vector<DOMQuery> FindAll(const QueryOptions& options) const;

private:
std::reference_wrapper<const TagNode> root_;
std::reference_wrapper<const DOMIndexer> dom_indexer_;
NodeList searchCandidatesFromIndexer(const QueryOptions& options) const;
bool matchAllConditions(const NodePtr& node, const QueryOptions& options) const;
inline bool isSubNode(const NodePtr& node) const;

std::reference_wrapper<const TagNode> root_;
std::reference_wrapper<const DOMIndexer> dom_indexer_;
};

} // namespace arboris
Expand Down
5 changes: 3 additions & 2 deletions src/dom/tag_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

Expand All @@ -29,11 +30,11 @@ class TagNode final : public BaseNode {
return children_;
}

[[nodiscard]] const std::unordered_map<std::string, std::string>& attributes() const noexcept {
[[nodiscard]] const AttributeMap& attributes() const noexcept {
return html_token_.attributes;
}

[[nodiscard]] const std::vector<std::string>& classes() const noexcept {
[[nodiscard]] const ClassSet& classes() const noexcept {
return html_token_.classes;
}

Expand Down
10 changes: 7 additions & 3 deletions src/utils/html_tokens.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,27 @@

#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
#include <unordered_map>
#include <unordered_set>

#include "utils/tag.hpp"
#include "utils/tokens.hpp"

namespace arboris {

using AttributeMap = std::unordered_map<std::string, std::unordered_set<std::string>>;
using ClassSet = std::unordered_set<std::string>;

struct BaseHtmlToken : public BaseToken {};

// HTML token payload: tag kind, void-tag flag, attributes, classes and id.
struct HtmlToken : public BaseHtmlToken {
  Tag tag = Tag::kUnknown;
  bool is_void_tag = false;

  // TODO(team): Consider using string_views with an external string pool
  // Each member is declared exactly once, using the shared aliases, so the
  // token and the query options have the same set-based shape.
  AttributeMap attributes;  // attribute name -> set of values
  ClassSet classes;
  std::string id;
};

Expand Down
7 changes: 4 additions & 3 deletions src/utils/query_options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@
#define SRC_UTILS_QUERY_OPTIONS_HPP_

#include <optional>
#include <vector>
#include <unordered_set>
#include <string>
#include <string_view>
#include <utility>
#include <functional>

#include "utils/tag.hpp"
#include "utils/html_tokens.hpp"

namespace arboris {

Expand All @@ -40,8 +41,8 @@ class TextQueryCondition {

// Conditions for a DOM query. Every field is optional.
struct QueryOptions {
  std::optional<Tag> tag;
  // Declared once each, using the same aliases as the HTML token so the
  // conditions can be compared against a node's sets directly.
  std::optional<ClassSet> classes;
  std::optional<AttributeMap> attributes;  // attribute name -> set of values
  std::optional<TextQueryCondition> text;
};

Expand Down
48 changes: 48 additions & 0 deletions src/utils/set_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright 2025 Team Arboris
* Licensed under the Apache License, Version 2.0
* http://www.apache.org/licenses/LICENSE-2.0
*/

#ifndef SRC_UTILS_SET_UTILS_HPP_
#define SRC_UTILS_SET_UTILS_HPP_

#include <unordered_map>
#include <unordered_set>

namespace arboris {

// Returns true when every element of `subset` is also present in `super_set`.
// An empty `subset` is contained in any set.
template <typename T>
bool IsSubset(const std::unordered_set<T>& subset,
              const std::unordered_set<T>& super_set) {
  // A set with more elements can never be contained in a smaller one.
  if (subset.size() > super_set.size()) {
    return false;
  }

  // find() instead of the C++20-only contains() keeps this header usable
  // from C++17 translation units as well.
  for (const auto& element : subset) {
    if (super_set.find(element) == super_set.end()) {
      return false;
    }
  }
  return true;
}

// Map overload: returns true when every key of `subset` exists in `super_set`
// and the value set stored under that key in `subset` is itself a subset of
// the value set stored under the same key in `super_set`.
template <typename T, typename U>
bool IsSubset(const std::unordered_map<T, std::unordered_set<U>>& subset,
              const std::unordered_map<T, std::unordered_set<U>>& super_set) {
  // Keys are unique, so more keys than the super map rules out containment.
  if (subset.size() > super_set.size()) {
    return false;
  }

  for (const auto& [key, value_set] : subset) {
    const auto match = super_set.find(key);
    if (match == super_set.end()) {
      return false;
    }
    if (!IsSubset(value_set, match->second)) {
      return false;
    }
  }
  return true;
}

} // namespace arboris

#endif // SRC_UTILS_SET_UTILS_HPP_