Replace nokogiri with jsoup #155

Draft · wants to merge 3 commits into main
1 change: 0 additions & 1 deletion Gemfile

@@ -37,7 +37,6 @@ group :default do
   gem 'bson', '~> 4.15.0', platform: :jruby
   gem 'bigdecimal', '~> 3.1.7', platform: :jruby
   gem 'json', '~> 2.7.2', platform: :jruby
-  gem 'nokogiri', '= 1.13.10', platform: :jruby
   gem 'racc', '~> 1.7.3', platform: :jruby
   gem 'strscan', '~> 3.1.0', platform: :jruby
   gem 'thread_safe', '~> 0.3.6', platform: :jruby
1 change: 0 additions & 1 deletion Gemfile.lock

@@ -171,7 +171,6 @@ DEPENDENCIES
   jar-dependencies (= 0.4.1)
   json (~> 2.7.2)
   json-schema (~> 4.3.0)
-  nokogiri (= 1.13.10)
   pry (~> 0.14.2)
   pry-nav
   pry-remote
2 changes: 1 addition & 1 deletion Jarfile

@@ -23,7 +23,7 @@ jar 'org.apache.commons:commons-lang3', '3.10'
 # Indirect dependencies that we needed to upgrade
 jar 'com.google.protobuf:protobuf-java', '3.19.6'
 jar 'com.github.junrar:junrar', '7.4.1'
-jar 'org.jsoup:jsoup', '1.14.3'
+jar 'org.jsoup:jsoup', '1.18.1'
 jar 'commons-io:commons-io', '2.11.0'
 jar 'org.apache.cxf:cxf-rt-transports-http', '3.4.10'
 jar 'org.apache.cxf:cxf-core', '3.4.10'
3 changes: 1 addition & 2 deletions Jars.lock

@@ -1,4 +1,3 @@
-org.snakeyaml:snakeyaml-engine:2.7:compile:
 com.github.crawler-commons:crawler-commons:1.2:compile:
 org.slf4j:slf4j-api:1.7.7:compile:
 org.apache.httpcomponents.client5:httpclient5:5.1:compile:
@@ -102,7 +101,7 @@ org.slf4j:slf4j-nop:1.7.26:compile:
 org.apache.commons:commons-lang3:3.10:compile:
 com.google.protobuf:protobuf-java:3.19.6:compile:
 com.github.junrar:junrar:7.4.1:compile:
-org.jsoup:jsoup:1.14.3:compile:
+org.jsoup:jsoup:1.18.1:compile:
 commons-io:commons-io:2.11.0:compile:
 org.apache.cxf:cxf-rt-transports-http:3.4.10:compile:
 jakarta.xml.ws:jakarta.xml.ws-api:2.3.3:compile:
10 changes: 9 additions & 1 deletion lib/crawler/content_engine/extractor.rb

@@ -49,7 +49,15 @@ def self.extract_from_crawl_result(rule, crawl_result)

      return [] unless crawl_result.is_a?(Crawler::Data::CrawlResult::HTML)

-      crawl_result.extract_by_selector(rule.selector, [])
+      case rule.type
+      when Crawler::Data::Extraction::Rule::SELECTOR_TYPE_CSS
+        crawl_result.extract_by_css_selector(rule.selector, [])
+      when Crawler::Data::Extraction::Rule::SELECTOR_TYPE_XPATH
+        crawl_result.extract_by_xpath_selector(rule.selector, [])
+      else
+        raise ArgumentError,
+              "Unexpected extraction rule selector type '#{rule.type}' for selector '#{rule.selector}'"
+      end
    end

    def self.cast_result(rule, occurrences)
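Note: below is a minimal JRuby sketch of the two jsoup lookup paths the new branches dispatch to. The HTML and selectors are invented for illustration; Jsoup.parse, Element#select, and Element#selectXpath are the jsoup calls involved.

require 'java'
java_import org.jsoup.Jsoup

doc = Jsoup.parse('<body><p class="intro">Hello</p><p>World</p></body>')

# CSS path: Element#select takes a CSS query and returns Elements
doc.select('p.intro').map(&:text)   # => ["Hello"]

# XPath path: Element#selectXpath evaluates an XPath expression against the tree
doc.selectXpath('//p').map(&:text)  # => ["Hello", "World"]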
26 changes: 13 additions & 13 deletions lib/crawler/content_engine/transformer.rb

@@ -13,34 +13,34 @@ module Transformer
    EXCLUDE_ATTR = 'data-elastic-exclude'
    EXCLUDE_ATTR_SELECTOR = "[#{EXCLUDE_ATTR}]".freeze

-    def self.transform(doc)
-      transform!(doc.dup)
+    def self.transform(tag)
+      transform!(tag.dup)
    end

-    def self.transform!(doc)
+    def self.transform!(tag)
      loop do
-        node = doc.has_attribute?(EXCLUDE_ATTR) ? doc : doc.at_css(EXCLUDE_ATTR_SELECTOR)
-        break unless node
+        node = tag.hasAttr(EXCLUDE_ATTR) ? tag : tag.selectFirst(EXCLUDE_ATTR_SELECTOR)
+        break if node.nil?

        traverse!(node, mode: :exclude)
      end

-      doc
+      tag
    end

    def self.traverse!(node, mode:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      # The exclusion attribute is used to determine what to traverse next in the parent loop,
      # so we should remove the attribute while traversing to avoid an infinite loop.
-      node.remove_attribute(EXCLUDE_ATTR) if node.has_attribute?(EXCLUDE_ATTR)
+      node.removeAttr(EXCLUDE_ATTR) if node.hasAttr(EXCLUDE_ATTR)

-      node.children.each do |child_node|
-        if child_node.text? && mode == :exclude
-          child_node.unlink
-        elsif child_node.element?
+      node.childNodes.each do |child_node|
+        if child_node.is_a?(Java::OrgJsoupNodes::TextNode) && mode == :exclude
+          child_node.remove
+        elsif child_node.is_a?(Java::OrgJsoupNodes::Element)
          new_mode =
-            if child_node.has_attribute?(INCLUDE_ATTR)
+            if child_node.hasAttr(INCLUDE_ATTR)
              :include
-            elsif child_node.has_attribute?(EXCLUDE_ATTR)
+            elsif child_node.hasAttr(EXCLUDE_ATTR)
              :exclude
            else
              mode # mode is unchanged
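Note: an illustrative check of the exclude/include semantics, assuming the crawler classes above are loaded; the markup is made up for the example.

require 'java'
java_import org.jsoup.Jsoup

html = '<body><p>kept</p>' \
       '<div data-elastic-exclude>dropped <span data-elastic-include>kept too</span></div></body>'

body = Jsoup.parse(html).body
puts Crawler::ContentEngine::Transformer.transform(body).text
# Expected: "kept kept too" (the excluded div's own text node is removed,
# while the nested data-elastic-include span survives)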
14 changes: 7 additions & 7 deletions lib/crawler/content_engine/utils.rb

@@ -34,7 +34,7 @@ module Utils
    def self.node_descendant_text(node, ignore_tags = NON_CONTENT_TAGS) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
      return '' unless node&.present?

-      unless node.respond_to?(:children) && node.respond_to?(:name) && node.respond_to?(:text?)
+      unless node.respond_to?(:childNodes) && node.respond_to?(:nodeName)
        raise ArgumentError, "Expecting something node-like but got a #{node.class}"
      end

@@ -54,17 +54,17 @@ def self.node_descendant_text(node, ignore_tags = NON_CONTENT_TAGS)

        # Remove tags that do not contain any text
        # (and which sometimes are treated as CDATA, generating garbage text in jruby)
-        next if ignore_tags.include?(node.name)
+        next if ignore_tags.include?(node.nodeName)

        # Tags, that need to be replaced by spaces according to the standards
        if replace_with_whitespace?(node)
          text << ' ' unless text.last == ' '
          next
        end

-        # Extract the text from all text nodes
-        if node.text?
-          content = node.content
+        # Extract the text from text nodes
+        if node.is_a?(Java::OrgJsoupNodes::TextNode)
+          content = node.text
          text << content.squish if content
          next
        end
@@ -73,7 +73,7 @@ def self.node_descendant_text(node, ignore_tags = NON_CONTENT_TAGS)
        to_process_stack << ' '

        # Recursion by adding the node's children to the stack and looping
-        node.children.reverse_each { |child| to_process_stack << child }
+        node.childNodes.reverse_each { |child| to_process_stack << child }

        # Add spaces after all tags
        to_process_stack << ' '
@@ -85,7 +85,7 @@ def self.node_descendant_text(node, ignore_tags = NON_CONTENT_TAGS)

    # Returns true, if the node should be replaced with a space when extracting text from a document
    def self.replace_with_whitespace?(node)
-      BREAK_ELEMENTS.include?(node.name)
+      BREAK_ELEMENTS.include?(node.nodeName)
    end

    # Limits the size of a given string value down to a given limit (in bytes)
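Note: the traversal above leans on a handful of jsoup node primitives. A small JRuby illustration (sample HTML invented):

require 'java'
java_import org.jsoup.Jsoup

node = Jsoup.parse('<p>Hello <b>world</b></p>').selectFirst('p')

node.nodeName         # => "p"
node.childNodes.size  # => 2 (a TextNode and the <b> Element)

first_child = node.childNodes.first
first_child.is_a?(Java::OrgJsoupNodes::TextNode)  # => true
first_child.text                                  # => "Hello "

Unlike Nokogiri, jsoup has no text? predicate on nodes, hence the is_a? checks against the concrete TextNode class.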
3 changes: 2 additions & 1 deletion lib/crawler/data/crawl_result/base.rb

@@ -8,13 +8,14 @@

 require 'bson'
 require 'digest'
-require 'nokogiri'

 module Crawler
   module Data
     # A CrawlResult contains the fetched and extracted content for some CrawlTask.
     module CrawlResult
       class Base
+        java_import org.jsoup.Jsoup

         attr_reader :id, :url, :status_code, :content_type, :start_time, :end_time, :duration

         delegate :normalized_url, :normalized_hash, to: :url
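Note: since Ruby constant lookup traverses the ancestor chain, the java_import here makes the Jsoup constant visible to subclasses such as CrawlResult::HTML below, which calls Jsoup.parse.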
34 changes: 22 additions & 12 deletions lib/crawler/data/crawl_result/html.rb

@@ -20,7 +20,7 @@ def initialize(status_code: 200, **kwargs)
      end

      def parsed_content
-        @parsed_content ||= Nokogiri::HTML(content)
+        @parsed_content ||= Jsoup.parse(content)
      end

      def to_s
@@ -61,7 +61,7 @@ def extract_links(limit: nil, skip_invalid: false)
        links = Set.new
        limit_reached = false

-        parsed_content.css('a[href]').each do |a|
+        parsed_content.select('a[href]').each do |a|
          # Parse the link
          link = Link.new(base_url:, node: a)

@@ -104,12 +104,12 @@ def canonical_link
      #---------------------------------------------------------------------------------------------
      # Returns +true+ if the page contains a robots nofollow meta tag
      def meta_nofollow?
-        !!parsed_content.at_css('meta[name=robots][content*=nofollow]')
+        !!parsed_content.selectFirst('meta[name=robots][content*=nofollow]')
      end

      # Returns +true+ if the page contains a robots noindex meta tag
      def meta_noindex?
-        !!parsed_content.at_css('meta[name=robots][content*=noindex]')
+        !!parsed_content.selectFirst('meta[name=robots][content*=noindex]')
      end

      # Returns the meta tag value for keywords
@@ -127,14 +127,14 @@ def meta_description(limit: 1024)
      #---------------------------------------------------------------------------------------------
      # Returns the title of the document, cleaned up for indexing
      def document_title(limit: 1000)
-        title_tag = parsed_content.css('title').first
+        title_tag = parsed_content.selectFirst('title')
        title = Crawler::ContentEngine::Utils.node_descendant_text(title_tag)
        Crawler::ContentEngine::Utils.limit_bytesize(title, limit)
      end

      # Returns the body of the document, cleaned up for indexing
      def document_body(limit: 5.megabytes)
-        body_tag = parsed_content.at_css('body')
+        body_tag = parsed_content.body
        return '' unless body_tag

        body_tag = Crawler::ContentEngine::Transformer.transform(body_tag)
@@ -144,11 +144,11 @@ def document_body(limit: 5.megabytes)

      # Returns an array of section headings from the page (using h1-h6 tags to find those)
      def headings(limit: 10)
-        body_tag = parsed_content.css('body').first
+        body_tag = parsed_content.body
        return [] unless body_tag

        Set.new.tap do |headings|
-          body_tag.css('h1, h2, h3, h4, h5, h6').each do |heading|
+          body_tag.select('h1, h2, h3, h4, h5, h6').each do |heading|
            heading = heading.text.to_s.squish
            next if heading.empty?

@@ -160,15 +160,25 @@ def headings(limit: 10)

      #---------------------------------------------------------------------------------------------
      def extract_attribute_value(tag_name, attribute_name)
-        parsed_content.css(tag_name)&.attr(attribute_name)&.content
+        parsed_content.select(tag_name)&.attr(attribute_name)
      end

      # Lookup for content using CSS selector
      #
-      # @param [String] CSS selector or XPath expression
+      # @param [String] selector - CSS selector
      # @return [Array<String>]
-      def extract_by_selector(selector, ignore_tags)
-        parsed_content.search(selector).map do |node|
+      def extract_by_css_selector(selector, ignore_tags)
+        parsed_content.select(selector).map do |node|
          Crawler::ContentEngine::Utils.node_descendant_text(node, ignore_tags)
        end
      end
+
+      # Lookup for content using XPath selector
+      #
+      # @param [String] selector - XPath selector
+      # @return [Array<String>]
+      def extract_by_xpath_selector(selector, ignore_tags)
+        parsed_content.selectXpath(selector).map do |node|
+          Crawler::ContentEngine::Utils.node_descendant_text(node, ignore_tags)
+        end
+      end
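Note: a quick JRuby sketch of the jsoup calls that replace the Nokogiri ones above (selectFirst instead of at_css, select instead of css/search, Document#body instead of at_css('body')). The HTML snippet is invented.

require 'java'
java_import org.jsoup.Jsoup

doc = Jsoup.parse(
  '<html><head><title>Hello</title>' \
  '<meta name="robots" content="noindex, nofollow"></head>' \
  '<body><h1>Heading</h1><p>Body text</p></body></html>'
)

!!doc.selectFirst('meta[name=robots][content*=nofollow]')  # => true
doc.selectFirst('title').text                              # => "Hello"
doc.body.select('h1, h2, h3, h4, h5, h6').map(&:text)      # => ["Heading"]

selectFirst returns Java null (nil in JRuby) when nothing matches, so the !! in meta_nofollow? still yields a clean boolean.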
46 changes: 40 additions & 6 deletions lib/crawler/data/extraction/rule.rb

@@ -12,6 +12,8 @@ module Crawler
   module Data
     module Extraction
       class Rule
+        java_import org.jsoup.Jsoup
+
        ACTION_TYPE_EXTRACT = 'extract'
        ACTION_TYPE_SET = 'set'
        ACTIONS = [ACTION_TYPE_EXTRACT, ACTION_TYPE_SET].freeze
@@ -24,7 +26,11 @@ class Rule
        SOURCES_HTML = 'html'
        SOURCES = [SOURCES_URL, SOURCES_HTML].freeze

-        attr_reader :action, :field_name, :selector, :join_as, :source, :value
+        SELECTOR_TYPE_CSS = 'css'
+        SELECTOR_TYPE_XPATH = 'xpath'
+        SELECTOR_TYPE_REGEXP = 'regexp'
+
+        attr_reader :action, :field_name, :selector, :join_as, :source, :value, :type

        def initialize(rule)
          @action = rule[:action]
@@ -33,6 +39,7 @@ def initialize(rule)
          @join_as = rule[:join_as]
          @source = rule[:source]
          @value = rule[:value]
+          @type = nil
          validate_rule
        end

@@ -86,20 +93,47 @@ def validate_selector
          raise ArgumentError, "Extraction rule selector can't be blank" if @selector.blank?

          if @source == SOURCES_HTML
-            begin
-              Nokogiri::HTML::DocumentFragment.parse('<a></a>').search(@selector)
-            rescue Nokogiri::CSS::SyntaxError, Nokogiri::XML::XPath::SyntaxError => e
-              raise ArgumentError, "Extraction rule selector `#{@selector}` is not a valid HTML selector: #{e.message}"
-            end
+            # For HTML we need to infer the selector type (xpath or css) based on the provided selector value,
+            # because jsoup has different parsing methods for each case.
+            css_error = validate_css_selector
+            return if css_error.nil?
+
+            xpath_error = validate_xpath_selector
+            return if xpath_error.nil?
+
+            # Only raise if neither were valid
+            raise ArgumentError, "#{css_error}; #{xpath_error}"
          else
            begin
              Regexp.new(@selector)
+              # At this point in time, URL selectors are always of type 'regexp'
+              @type = SELECTOR_TYPE_REGEXP
            rescue RegexpError => e
              raise ArgumentError,
                    "Extraction rule selector `#{@selector}` is not a valid regular expression: #{e.message}"
            end
          end
        end
+
+        def validate_css_selector
+          # If valid CSS selector, @type will be set to 'css', otherwise we return the error
+
+          Jsoup.parseBodyFragment('<a></a>').select(@selector)
+          @type = SELECTOR_TYPE_CSS
+          nil
+        rescue Java::OrgJsoupSelect::Selector::SelectorParseException => e
+          "Extraction rule selector `#{@selector}` is not a valid CSS selector: #{e.message}"
+        end
+
+        def validate_xpath_selector
+          # If valid XPath selector, @type will be set to 'xpath', otherwise we return the error
+
+          Jsoup.parseBodyFragment('<a></a>').selectXpath(@selector)
+          @type = SELECTOR_TYPE_XPATH
+          nil
+        rescue Java::OrgJsoupSelect::Selector::SelectorParseException => e
+          "Extraction rule selector `#{@selector}` is not a valid XPath selector: #{e.message}"
+        end
      end
    end
  end
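Note: a rough standalone sketch of the CSS-then-XPath inference implemented above; the selector strings are examples, and the '<a></a>' probe fragment mirrors the one used in validation.

require 'java'
java_import org.jsoup.Jsoup

def selector_type(selector)
  Jsoup.parseBodyFragment('<a></a>').select(selector)
  'css'
rescue Java::OrgJsoupSelect::Selector::SelectorParseException
  begin
    Jsoup.parseBodyFragment('<a></a>').selectXpath(selector)
    'xpath'
  rescue Java::OrgJsoupSelect::Selector::SelectorParseException
    nil
  end
end

selector_type('div.content > p')    # => "css"
selector_type('//div[@id="main"]')  # => "xpath" (rejected by the CSS parser first)
selector_type('///')                # => nil (valid in neither grammar)

One consequence of probing CSS first: a selector that parses under both grammars (a bare tag name like div is valid CSS and also a valid relative XPath) is always classified as CSS.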
10 changes: 5 additions & 5 deletions lib/crawler/data/link.rb

@@ -14,21 +14,21 @@ class Link

    # There are two ways to pass a link in:
    # - `link` - a string representation of a link
-    # - `node` - a Nokogiri::XML::Element object
+    # - `node` - a Java::OrgJsoupNodes::Element object
    def initialize(base_url:, node: nil, link: nil) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      raise ArgumentError, 'Base URL needs to be a URL object' unless base_url.is_a?(URL)
      raise ArgumentError, 'Needs an node or a string link argument' unless node || link
      raise ArgumentError, 'The :link argument needs to be a String' if link && !link.is_a?(String)

-      if node && !node.is_a?(Nokogiri::XML::Element)
+      if node && !node.is_a?(Java::OrgJsoupNodes::Element)
        raise ArgumentError,
-              'The :node argument needs to be a Nokogiri::XML::Element'
+              'The :node argument needs to be a Java::OrgJsoupNodes::Element'
      end
      raise ArgumentError, 'Needs only one link argument' if node && link

      @base_url = base_url
      @node = node
-      @link = node ? node['href'] : link
+      @link = node ? node.attr('abs:href') : link
      @error = nil
    end

@@ -91,7 +91,7 @@ def error
    # Returns an array with all the values of the rel attribute for the link
    # See https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel for details
    def rel
-      node ? node['rel'].to_s.squish.downcase.split : []
+      node ? node.attr('rel').squish.downcase.split : []
    end

    # Returns +true+ if the link contains a rel=nofollow attribute
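Note on the href change: Nokogiri's node['href'] returned the raw attribute, while jsoup's abs: prefix resolves it against the document's base URI. A small illustration (URLs invented):

require 'java'
java_import org.jsoup.Jsoup

doc = Jsoup.parse('<a href="/about" rel="NoFollow external">About</a>',
                  'https://example.com/docs/')

a = doc.selectFirst('a')
a.attr('href')                # => "/about" (raw attribute value)
a.attr('abs:href')            # => "https://example.com/about" (resolved against the base URI)
a.attr('rel').downcase.split  # => ["nofollow", "external"]

Also note that jsoup's attr returns an empty string (not nil) for a missing attribute, which is why the rel method can drop the defensive .to_s.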
8 binary files changed (not shown)