Replace nokogiri with jsoup #155

Draft · wants to merge 3 commits into main
1 change: 0 additions & 1 deletion Gemfile

@@ -37,7 +37,6 @@ group :default do
   gem 'bson', '~> 4.15.0', platform: :jruby
   gem 'bigdecimal', '~> 3.1.7', platform: :jruby
   gem 'json', '~> 2.7.2', platform: :jruby
-  gem 'nokogiri', '= 1.13.10', platform: :jruby
   gem 'racc', '~> 1.7.3', platform: :jruby
   gem 'strscan', '~> 3.1.0', platform: :jruby
   gem 'thread_safe', '~> 0.3.6', platform: :jruby
1 change: 0 additions & 1 deletion Gemfile.lock

@@ -171,7 +171,6 @@ DEPENDENCIES
   jar-dependencies (= 0.4.1)
   json (~> 2.7.2)
   json-schema (~> 4.3.0)
-  nokogiri (= 1.13.10)
   pry (~> 0.14.2)
   pry-nav
   pry-remote
2 changes: 1 addition & 1 deletion Jarfile

@@ -23,7 +23,7 @@ jar 'org.apache.commons:commons-lang3', '3.10'
 # Indirect dependencies that we needed to upgrade
 jar 'com.google.protobuf:protobuf-java', '3.19.6'
 jar 'com.github.junrar:junrar', '7.4.1'
-jar 'org.jsoup:jsoup', '1.14.3'
+jar 'org.jsoup:jsoup', '1.18.1'
 jar 'commons-io:commons-io', '2.11.0'
 jar 'org.apache.cxf:cxf-rt-transports-http', '3.4.10'
 jar 'org.apache.cxf:cxf-core', '3.4.10'
3 changes: 1 addition & 2 deletions Jars.lock

@@ -1,4 +1,3 @@
-org.snakeyaml:snakeyaml-engine:2.7:compile:
 com.github.crawler-commons:crawler-commons:1.2:compile:
 org.slf4j:slf4j-api:1.7.7:compile:
 org.apache.httpcomponents.client5:httpclient5:5.1:compile:
@@ -102,7 +101,7 @@ org.slf4j:slf4j-nop:1.7.26:compile:
 org.apache.commons:commons-lang3:3.10:compile:
 com.google.protobuf:protobuf-java:3.19.6:compile:
 com.github.junrar:junrar:7.4.1:compile:
-org.jsoup:jsoup:1.14.3:compile:
+org.jsoup:jsoup:1.18.1:compile:
 commons-io:commons-io:2.11.0:compile:
 org.apache.cxf:cxf-rt-transports-http:3.4.10:compile:
 jakarta.xml.ws:jakarta.xml.ws-api:2.3.3:compile:
10 changes: 9 additions & 1 deletion lib/crawler/content_engine/extractor.rb

@@ -49,7 +49,15 @@ def self.extract_from_crawl_result(rule, crawl_result)

      return [] unless crawl_result.is_a?(Crawler::Data::CrawlResult::HTML)

-      crawl_result.extract_by_selector(rule.selector, [])
+      case rule.type
+      when Crawler::Data::Extraction::Rule::SELECTOR_TYPE_CSS
+        crawl_result.extract_by_css_selector(rule.selector, [])
+      when Crawler::Data::Extraction::Rule::SELECTOR_TYPE_XPATH
+        crawl_result.extract_by_xpath_selector(rule.selector, [])
+      else
+        raise ArgumentError,
+              "Unexpected extraction rule selector type '#{rule.type}' for selector '#{rule.selector}'"
+      end
    end

    def self.cast_result(rule, occurrences)
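Note: below is a minimal JRuby sketch of the two jsoup lookup paths the new branches dispatch to. The HTML and selectors are invented for illustration; Jsoup.parse, Element#select, and Element#selectXpath are the jsoup calls involved.

require 'java'
java_import org.jsoup.Jsoup

doc = Jsoup.parse('<body><p class="intro">Hello</p><p>World</p></body>')

# CSS path: Element#select takes a CSS query and returns Elements
doc.select('p.intro').map(&:text)   # => ["Hello"]

# XPath path: Element#selectXpath evaluates an XPath expression against the tree
doc.selectXpath('//p').map(&:text)  # => ["Hello", "World"]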
26 changes: 13 additions & 13 deletions lib/crawler/content_engine/transformer.rb

@@ -13,34 +13,34 @@ module Transformer
    EXCLUDE_ATTR = 'data-elastic-exclude'
    EXCLUDE_ATTR_SELECTOR = "[#{EXCLUDE_ATTR}]".freeze

-    def self.transform(doc)
-      transform!(doc.dup)
+    def self.transform(tag)
+      transform!(tag.dup)
    end

-    def self.transform!(doc)
+    def self.transform!(tag)
      loop do
-        node = doc.has_attribute?(EXCLUDE_ATTR) ? doc : doc.at_css(EXCLUDE_ATTR_SELECTOR)
-        break unless node
+        node = tag.hasAttr(EXCLUDE_ATTR) ? tag : tag.selectFirst(EXCLUDE_ATTR_SELECTOR)
+        break if node.nil?

        traverse!(node, mode: :exclude)
      end

-      doc
+      tag
    end

    def self.traverse!(node, mode:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      # The exclusion attribute is used to determine what to traverse next in the parent loop,
      # so we should remove the attribute while traversing to avoid an infinite loop.
-      node.remove_attribute(EXCLUDE_ATTR) if node.has_attribute?(EXCLUDE_ATTR)
+      node.removeAttr(EXCLUDE_ATTR) if node.hasAttr(EXCLUDE_ATTR)

-      node.children.each do |child_node|
-        if child_node.text? && mode == :exclude
-          child_node.unlink
-        elsif child_node.element?
+      node.childNodes.each do |child_node|
+        if child_node.is_a?(Java::OrgJsoupNodes::TextNode) && mode == :exclude
+          child_node.remove
+        elsif child_node.is_a?(Java::OrgJsoupNodes::Element)
          new_mode =
-            if child_node.has_attribute?(INCLUDE_ATTR)
+            if child_node.hasAttr(INCLUDE_ATTR)
              :include
-            elsif child_node.has_attribute?(EXCLUDE_ATTR)
+            elsif child_node.hasAttr(EXCLUDE_ATTR)
              :exclude
            else
              mode # mode is unchanged
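Note: an illustrative check of the exclude/include semantics, assuming the crawler classes above are loaded; the markup is made up for the example.

require 'java'
java_import org.jsoup.Jsoup

html = '<body><p>kept</p>' \
       '<div data-elastic-exclude>dropped <span data-elastic-include>kept too</span></div></body>'

body = Jsoup.parse(html).body
puts Crawler::ContentEngine::Transformer.transform(body).text
# Expected: "kept kept too" (the excluded div's own text node is removed,
# while the nested data-elastic-include span survives)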
14 changes: 7 additions & 7 deletions lib/crawler/content_engine/utils.rb

@@ -34,7 +34,7 @@ module Utils
    def self.node_descendant_text(node, ignore_tags = NON_CONTENT_TAGS) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
      return '' unless node&.present?

-      unless node.respond_to?(:children) && node.respond_to?(:name) && node.respond_to?(:text?)
+      unless node.respond_to?(:childNodes) && node.respond_to?(:nodeName)
        raise ArgumentError, "Expecting something node-like but got a #{node.class}"
      end

@@ -54,17 +54,17 @@ def self.node_descendant_text(node, ignore_tags = NON_CONTENT_TAGS)

        # Remove tags that do not contain any text
        # (and which sometimes are treated as CDATA, generating garbage text in jruby)
-        next if ignore_tags.include?(node.name)
+        next if ignore_tags.include?(node.nodeName)

        # Tags, that need to be replaced by spaces according to the standards
        if replace_with_whitespace?(node)
          text << ' ' unless text.last == ' '
          next
        end

-        # Extract the text from all text nodes
-        if node.text?
-          content = node.content
+        # Extract the text from text nodes
+        if node.is_a?(Java::OrgJsoupNodes::TextNode)
+          content = node.text
          text << content.squish if content
          next
        end
@@ -73,7 +73,7 @@ def self.node_descendant_text(node, ignore_tags = NON_CONTENT_TAGS)
        to_process_stack << ' '

        # Recursion by adding the node's children to the stack and looping
-        node.children.reverse_each { |child| to_process_stack << child }
+        node.childNodes.reverse_each { |child| to_process_stack << child }

        # Add spaces after all tags
        to_process_stack << ' '
@@ -85,7 +85,7 @@ def self.node_descendant_text(node, ignore_tags = NON_CONTENT_TAGS)

    # Returns true, if the node should be replaced with a space when extracting text from a document
    def self.replace_with_whitespace?(node)
-      BREAK_ELEMENTS.include?(node.name)
+      BREAK_ELEMENTS.include?(node.nodeName)
    end

    # Limits the size of a given string value down to a given limit (in bytes)
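Note: the traversal above leans on a handful of jsoup node primitives. A small JRuby illustration (sample HTML invented):

require 'java'
java_import org.jsoup.Jsoup

node = Jsoup.parse('<p>Hello <b>world</b></p>').selectFirst('p')

node.nodeName         # => "p"
node.childNodes.size  # => 2 (a TextNode and the <b> Element)

first_child = node.childNodes.first
first_child.is_a?(Java::OrgJsoupNodes::TextNode)  # => true
first_child.text                                  # => "Hello "

Unlike Nokogiri, jsoup has no text? predicate on nodes, hence the is_a? checks against the concrete TextNode class.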
3 changes: 2 additions & 1 deletion lib/crawler/data/crawl_result/base.rb

@@ -8,13 +8,14 @@

 require 'bson'
 require 'digest'
-require 'nokogiri'

 module Crawler
   module Data
     # A CrawlResult contains the fetched and extracted content for some CrawlTask.
     module CrawlResult
       class Base
+        java_import org.jsoup.Jsoup

         attr_reader :id, :url, :status_code, :content_type, :start_time, :end_time, :duration

         delegate :normalized_url, :normalized_hash, to: :url
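Note: since Ruby constant lookup traverses the ancestor chain, the java_import here makes the Jsoup constant visible to subclasses such as CrawlResult::HTML below, which calls Jsoup.parse.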
34 changes: 22 additions & 12 deletions lib/crawler/data/crawl_result/html.rb

@@ -20,7 +20,7 @@ def initialize(status_code: 200, **kwargs)
      end

      def parsed_content
-        @parsed_content ||= Nokogiri::HTML(content)
+        @parsed_content ||= Jsoup.parse(content)
      end

      def to_s
@@ -61,7 +61,7 @@ def extract_links(limit: nil, skip_invalid: false)
        links = Set.new
        limit_reached = false

-        parsed_content.css('a[href]').each do |a|
+        parsed_content.select('a[href]').each do |a|
          # Parse the link
          link = Link.new(base_url:, node: a)

@@ -104,12 +104,12 @@ def canonical_link
      #---------------------------------------------------------------------------------------------
      # Returns +true+ if the page contains a robots nofollow meta tag
      def meta_nofollow?
-        !!parsed_content.at_css('meta[name=robots][content*=nofollow]')
+        !!parsed_content.selectFirst('meta[name=robots][content*=nofollow]')
      end

      # Returns +true+ if the page contains a robots noindex meta tag
      def meta_noindex?
-        !!parsed_content.at_css('meta[name=robots][content*=noindex]')
+        !!parsed_content.selectFirst('meta[name=robots][content*=noindex]')
      end

      # Returns the meta tag value for keywords
@@ -127,14 +127,14 @@ def meta_description(limit: 1024)
      #---------------------------------------------------------------------------------------------
      # Returns the title of the document, cleaned up for indexing
      def document_title(limit: 1000)
-        title_tag = parsed_content.css('title').first
+        title_tag = parsed_content.selectFirst('title')
        title = Crawler::ContentEngine::Utils.node_descendant_text(title_tag)
        Crawler::ContentEngine::Utils.limit_bytesize(title, limit)
      end

      # Returns the body of the document, cleaned up for indexing
      def document_body(limit: 5.megabytes)
-        body_tag = parsed_content.at_css('body')
+        body_tag = parsed_content.body
        return '' unless body_tag

        body_tag = Crawler::ContentEngine::Transformer.transform(body_tag)
@@ -144,11 +144,11 @@ def document_body(limit: 5.megabytes)

      # Returns an array of section headings from the page (using h1-h6 tags to find those)
      def headings(limit: 10)
-        body_tag = parsed_content.css('body').first
+        body_tag = parsed_content.body
        return [] unless body_tag

        Set.new.tap do |headings|
-          body_tag.css('h1, h2, h3, h4, h5, h6').each do |heading|
+          body_tag.select('h1, h2, h3, h4, h5, h6').each do |heading|
            heading = heading.text.to_s.squish
            next if heading.empty?

@@ -160,15 +160,25 @@ def headings(limit: 10)

      #---------------------------------------------------------------------------------------------
      def extract_attribute_value(tag_name, attribute_name)
-        parsed_content.css(tag_name)&.attr(attribute_name)&.content
+        parsed_content.select(tag_name)&.attr(attribute_name)
      end

      # Lookup for content using CSS selector
      #
-      # @param [String] CSS selector or XPath expression
+      # @param [String] selector - CSS selector
      # @return [Array<String>]
-      def extract_by_selector(selector, ignore_tags)
-        parsed_content.search(selector).map do |node|
+      def extract_by_css_selector(selector, ignore_tags)
+        parsed_content.select(selector).map do |node|
          Crawler::ContentEngine::Utils.node_descendant_text(node, ignore_tags)
        end
      end
+
+      # Lookup for content using XPath selector
+      #
+      # @param [String] selector - XPath selector
+      # @return [Array<String>]
+      def extract_by_xpath_selector(selector, ignore_tags)
+        parsed_content.selectXpath(selector).map do |node|
+          Crawler::ContentEngine::Utils.node_descendant_text(node, ignore_tags)
+        end
+      end
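Note: a quick JRuby sketch of the jsoup calls that replace the Nokogiri ones above (selectFirst instead of at_css, select instead of css/search, Document#body instead of at_css('body')). The HTML snippet is invented.

require 'java'
java_import org.jsoup.Jsoup

doc = Jsoup.parse(
  '<html><head><title>Hello</title>' \
  '<meta name="robots" content="noindex, nofollow"></head>' \
  '<body><h1>Heading</h1><p>Body text</p></body></html>'
)

!!doc.selectFirst('meta[name=robots][content*=nofollow]')  # => true
doc.selectFirst('title').text                              # => "Hello"
doc.body.select('h1, h2, h3, h4, h5, h6').map(&:text)      # => ["Heading"]

selectFirst returns Java null (nil in JRuby) when nothing matches, so the !! in meta_nofollow? still yields a clean boolean.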
46 changes: 40 additions & 6 deletions lib/crawler/data/extraction/rule.rb

@@ -12,6 +12,8 @@ module Crawler
   module Data
     module Extraction
       class Rule
+        java_import org.jsoup.Jsoup
+
        ACTION_TYPE_EXTRACT = 'extract'
        ACTION_TYPE_SET = 'set'
        ACTIONS = [ACTION_TYPE_EXTRACT, ACTION_TYPE_SET].freeze
@@ -24,7 +26,11 @@ class Rule
        SOURCES_HTML = 'html'
        SOURCES = [SOURCES_URL, SOURCES_HTML].freeze

-        attr_reader :action, :field_name, :selector, :join_as, :source, :value
+        SELECTOR_TYPE_CSS = 'css'
+        SELECTOR_TYPE_XPATH = 'xpath'
+        SELECTOR_TYPE_REGEXP = 'regexp'
+
+        attr_reader :action, :field_name, :selector, :join_as, :source, :value, :type

        def initialize(rule)
          @action = rule[:action]
@@ -33,6 +39,7 @@ def initialize(rule)
          @join_as = rule[:join_as]
          @source = rule[:source]
          @value = rule[:value]
+          @type = nil
          validate_rule
        end

@@ -86,20 +93,47 @@ def validate_selector
          raise ArgumentError, "Extraction rule selector can't be blank" if @selector.blank?

          if @source == SOURCES_HTML
-            begin
-              Nokogiri::HTML::DocumentFragment.parse('<a></a>').search(@selector)
-            rescue Nokogiri::CSS::SyntaxError, Nokogiri::XML::XPath::SyntaxError => e
-              raise ArgumentError, "Extraction rule selector `#{@selector}` is not a valid HTML selector: #{e.message}"
-            end
+            # For HTML we need to infer the selector type (xpath or css) based on the provided selector value,
+            # because jsoup has different parsing methods for each case.
+            css_error = validate_css_selector
+            return if css_error.nil?
+
+            xpath_error = validate_xpath_selector
+            return if xpath_error.nil?
+
+            # Only raise if neither were valid
+            raise ArgumentError, "#{css_error}; #{xpath_error}"
          else
            begin
              Regexp.new(@selector)
+              # At this point in time, URL selectors are always of type 'regexp'
+              @type = SELECTOR_TYPE_REGEXP
            rescue RegexpError => e
              raise ArgumentError,
                    "Extraction rule selector `#{@selector}` is not a valid regular expression: #{e.message}"
            end
          end
        end
+
+        def validate_css_selector
+          # If valid CSS selector, @type will be set to 'css', otherwise we return the error
+
+          Jsoup.parseBodyFragment('<a></a>').select(@selector)
+          @type = SELECTOR_TYPE_CSS
+          nil
+        rescue Java::OrgJsoupSelect::Selector::SelectorParseException => e
+          "Extraction rule selector `#{@selector}` is not a valid CSS selector: #{e.message}"
+        end
+
+        def validate_xpath_selector
+          # If valid XPath selector, @type will be set to 'xpath', otherwise we return the error
+
+          Jsoup.parseBodyFragment('<a></a>').selectXpath(@selector)
+          @type = SELECTOR_TYPE_XPATH
+          nil
+        rescue Java::OrgJsoupSelect::Selector::SelectorParseException => e
+          "Extraction rule selector `#{@selector}` is not a valid XPath selector: #{e.message}"
+        end
      end
    end
  end
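Note: a rough standalone sketch of the CSS-then-XPath inference implemented above; the selector strings are examples, and the '<a></a>' probe fragment mirrors the one used in validation.

require 'java'
java_import org.jsoup.Jsoup

def selector_type(selector)
  Jsoup.parseBodyFragment('<a></a>').select(selector)
  'css'
rescue Java::OrgJsoupSelect::Selector::SelectorParseException
  begin
    Jsoup.parseBodyFragment('<a></a>').selectXpath(selector)
    'xpath'
  rescue Java::OrgJsoupSelect::Selector::SelectorParseException
    nil
  end
end

selector_type('div.content > p')    # => "css"
selector_type('//div[@id="main"]')  # => "xpath" (rejected by the CSS parser first)
selector_type('///')                # => nil (valid in neither grammar)

One consequence of probing CSS first: a selector that parses under both grammars (a bare tag name like div is valid CSS and also a valid relative XPath) is always classified as CSS.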
10 changes: 5 additions & 5 deletions lib/crawler/data/link.rb

@@ -14,21 +14,21 @@ class Link

    # There are two ways to pass a link in:
    # - `link` - a string representation of a link
-    # - `node` - a Nokogiri::XML::Element object
+    # - `node` - a Java::OrgJsoupNodes::Element object
    def initialize(base_url:, node: nil, link: nil) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      raise ArgumentError, 'Base URL needs to be a URL object' unless base_url.is_a?(URL)
      raise ArgumentError, 'Needs an node or a string link argument' unless node || link
      raise ArgumentError, 'The :link argument needs to be a String' if link && !link.is_a?(String)

-      if node && !node.is_a?(Nokogiri::XML::Element)
+      if node && !node.is_a?(Java::OrgJsoupNodes::Element)
        raise ArgumentError,
-              'The :node argument needs to be a Nokogiri::XML::Element'
+              'The :node argument needs to be a Java::OrgJsoupNodes::Element'
      end
      raise ArgumentError, 'Needs only one link argument' if node && link

      @base_url = base_url
      @node = node
-      @link = node ? node['href'] : link
+      @link = node ? node.attr('abs:href') : link
      @error = nil
    end

@@ -91,7 +91,7 @@ def error
    # Returns an array with all the values of the rel attribute for the link
    # See https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel for details
    def rel
-      node ? node['rel'].to_s.squish.downcase.split : []
+      node ? node.attr('rel').squish.downcase.split : []
    end

    # Returns +true+ if the link contains a rel=nofollow attribute
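Note on the href change: Nokogiri's node['href'] returned the raw attribute, while jsoup's abs: prefix resolves it against the document's base URI. A small illustration (URLs invented):

require 'java'
java_import org.jsoup.Jsoup

doc = Jsoup.parse('<a href="/about" rel="NoFollow external">About</a>',
                  'https://example.com/docs/')

a = doc.selectFirst('a')
a.attr('href')                # => "/about" (raw attribute value)
a.attr('abs:href')            # => "https://example.com/about" (resolved against the base URI)
a.attr('rel').downcase.split  # => ["nofollow", "external"]

Also note that jsoup's attr returns an empty string (not nil) for a missing attribute, which is why the rel method can drop the defensive .to_s.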
8 binary files changed (not shown)