From 38b7ea3f207b5d241b7d62434753af7e58522da6 Mon Sep 17 00:00:00 2001 From: JoshAshby Date: Fri, 23 Aug 2019 22:01:24 -0600 Subject: [PATCH 1/3] experiment with making an easier to maintain crawler while also thinking about the future of being able to produce WARC files and have a better set of logs and information around requests and responses during caching to aid in debugging and also thinking about extendability and introducing newer crawlers while keeping the same data structures --- app/crawlers/classic.rb | 205 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 app/crawlers/classic.rb diff --git a/app/crawlers/classic.rb b/app/crawlers/classic.rb new file mode 100644 index 00000000..d84899d4 --- /dev/null +++ b/app/crawlers/classic.rb @@ -0,0 +1,205 @@ +require "tempfile" + +module HTTPRequestInResponse + refine HTTP::Response do + attr_accessor :req + end + + refine HTTP::Client do + alias_method :__perform__, :perform + + def perform(req, options) + __perform__(req, options).tap { |res| res.req = req } + end + end +end + +using HTTPRequestInResponse + +Request = Struct.new(:uri, :method, :headers, keyword_init: true) do +end + +Response = Struct.new(:status, :headers, :body, keyword_init: true) do +end + +class Document + extend Forwardable + + attr_accessor :uri, :request, :response, :content_type, :parent, :dependencies + + def initialize uri + @uri = uri + end + + def dependencies + @dependencies ||= {} + end + + def address + @address ||= Addressable::URI.parse uri + end + + # TODO: What if this is JSON and instead of xpath it's a json_path? 
+ def nokogiri + @nokogiri ||= Nokogiri::HTML(response.body) + rescue => e + debugger + end + + def_delegators :nokogiri, :xpath + + # TODO: move this logic to a "dependency resolver/parser" + def asset_links uri + [ + "//link[@rel='stylesheet']/@href", + "//script/@src", + "//img/@src" + ].flat_map(&nokogiri.method(:xpath)) + .map(&:to_s) + .uniq + .compact + .map(&Addressable::URI.method(:parse)) + .map { |link| (uri + link).to_s } + end + + def add_dependency link, dep + dep.parent = self + dependencies[link] = dep + end +end + +class Classic + class Client + extend Forwardable + + def config + @config ||= ActiveSupport::OrderedOptions.new.tap do |opts| + opts.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0" + + opts.headers = { + accept_language: "en-US,en;q=0.5", + accept: "text/html;q=0.9,*/*;q=0.8; charset=utf-8" + } + + opts.follow_redirects = true + end + end + + def_delegators :base, *HTTP::Request::METHODS + + private + + def base + @base ||= HTTP.headers({ user_agent: config.user_agent }.merge(config.headers)) + .yield_self { |client| config.follow_redirects ? client.follow : client } + end + end + + def crawl site + root = crawl_deps site + + fetch_favicon_for root + + return root + end + + private + + def crawl_deps site + root = request site + + fetch_assets_for root + + return root + end + + def client + @client ||= Client.new + end + + def fetch_favicon_for root + default_location = root.address.join("/favicon.ico").to_s + res = request default_location + + root.add_dependency default_location, res + return if (200...299).include? 
res.response.status + + root.xpath('//link[@rel="shortcut icon"]').find do |possible_favicon| + href = root.address.join(possible_favicon.attr("href")).to_s + + res = request href + + root.add_dependency href, res + end + end + + def fetch_assets_for root + root.asset_links(root.address).each do |link| + dep = crawl_deps link + + root.add_dependency link, dep + end + end + + def request uri + document = Document.new uri + + puts "Fetching uri: '#{ uri }'" + + res = client.get uri + + body, content_type = binary_temp_file do |temp_file| + write_body response: res, to: temp_file + content_type = get_content_type response: res, io: temp_file + + [temp_file.read, content_type] + end + + # TODO: There is a lot more info to capture from the request and + # response. + document.request = Request.new uri: uri, method: "GET", headers: {} #res.req.headers + document.response = Response.new status: res.code, headers: res.headers, body: body + + document.content_type = content_type + + return document + end + + def binary_temp_file + Tempfile.create do |temp_file| + temp_file.binmode + + yield temp_file + end + end + + # Streams the response body into the temp file to hopefully avoid some + # memory issues if this is a large document + def write_body response:, to: + io_handle = to + + response.body.each do |partial| + io_handle.write partial + end + + io_handle.rewind + end + + def get_content_type response:, io: + return response.content_type.mime_type if response.content_type.mime_type + + io_handle = io + io_handle.rewind + + # some websites *cough*offline.pink*cough* don't return a content type + # header, so we'll first see if the html doctype string is present and + # guess this is text/html or we'll fall back to assuming it's binary + # https://en.wikipedia.org/wiki/Content_sniffing + content_type ||= MimeMagic.by_magic(io_handle) + content_type ||= "application/octet-stream" + + io_handle.rewind + + content_type + end +end From 1788a01e593d359f2edea558c201fbee6581b77f 
Mon Sep 17 00:00:00 2001 From: JoshAshby Date: Tue, 27 Aug 2019 15:42:03 -0600 Subject: [PATCH 2/3] wip on finding some spots that should/can be broken out --- app/crawlers/classic.rb | 89 ++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/app/crawlers/classic.rb b/app/crawlers/classic.rb index d84899d4..c0a5c6aa 100644 --- a/app/crawlers/classic.rb +++ b/app/crawlers/classic.rb @@ -35,10 +35,6 @@ def dependencies @dependencies ||= {} end - def address - @address ||= Addressable::URI.parse uri - end - # TODO: What if this is JSON and instead of xpath it's a json_path? def nokogiri @nokogiri ||= Nokogiri::HTML(response.body) @@ -49,7 +45,9 @@ def nokogiri def_delegators :nokogiri, :xpath # TODO: move this logic to a "dependency resolver/parser" - def asset_links uri + def asset_links + root_uri = Addressable::URI.parse @uri + [ "//link[@rel='stylesheet']/@href", "//script/@src", @@ -59,12 +57,15 @@ def asset_links uri .uniq .compact .map(&Addressable::URI.method(:parse)) - .map { |link| (uri + link).to_s } + .map { |link| (root_uri + link).to_s } end - def add_dependency link, dep + def add_dependency dep + binding.pry unless dep.is_a? Document + fail "dependency must be a Document, got `#{ dep.class }`" unless dep.is_a? 
Document + dep.parent = self - dependencies[link] = dep + dependencies[dep.uri] = dep end end @@ -96,21 +97,13 @@ def base end def crawl site - root = crawl_deps site - - fetch_favicon_for root - - return root + crawl_deps(site).tap(&method(:fetch_favicon_for)) end private def crawl_deps site - root = request site - - fetch_assets_for root - - return root + request(site).tap(&method(:fetch_assets_for)) end def client @@ -118,54 +111,60 @@ def client end def fetch_favicon_for root - default_location = root.address.join("/favicon.ico").to_s - res = request default_location + root_address = Addressable::URI.parse root.uri - root.add_dependency default_location, res + res = root_address + .yield_self { |uri| uri.join "/favicon.ico" } + .yield_self(&:to_s) + .yield_self(&method(:request)) + + root.add_dependency res return if (200...299).include? res.response.status root.xpath('//link[@rel="shortcut icon"]').find do |possible_favicon| - href = root.address.join(possible_favicon.attr("href")).to_s + res = possible_favicon.attr("href") + .yield_self { |href| root_address.join href } # Ensures that a relative URI is converted to absolute + .yield_self(&:to_s) + .yield_self(&method(:request)) - res = request href + root.add_dependency res - root.add_dependency href, res + (200..299).include? 
res.response.status end end def fetch_assets_for root - root.asset_links(root.address).each do |link| - dep = crawl_deps link - - root.add_dependency link, dep + root.asset_links.each do |link| + root.add_dependency crawl_deps link end end def request uri - document = Document.new uri + Document.new(uri).tap do |document| + puts "Fetching uri: '#{ uri }'" - puts "Fetching uri: '#{ uri }'" + res = client.get uri - res = client.get uri + body, content_type = with_temp_file do |temp_file| + write_body response: res, to: temp_file + content_type = get_content_type response: res, io: temp_file - body, content_type = binary_temp_file do |temp_file| - write_body response: res, to: temp_file - content_type = get_content_type response: res, io: temp_file - - [temp_file.read, content_type] - end - - # TODO: There is a lot more info to capture from the request and - # response. - document.request = Request.new uri: uri, method: "GET", headers: {} #res.req.headers - document.response = Response.new status: res.code, headers: res.headers, body: body + [temp_file.read, content_type] + end - document.content_type = content_type + # TODO: There is a lot more info to capture from the request and + # response. + document.request = Request.new uri: uri, method: "GET", headers: {} #res.req.headers + document.response = Response.new status: res.code, headers: res.headers, body: body - return document + document.content_type = content_type + rescue HTTP::ConnectionError => e + debugger + end end - def binary_temp_file + # TODO: Could these three methods live elsewhere? 
+ def with_temp_file Tempfile.create do |temp_file| temp_file.binmode From 1e8ecaf838e117b02c76a98c9e3d7344e515e737 Mon Sep 17 00:00:00 2001 From: JoshAshby Date: Thu, 26 Mar 2020 12:14:56 -0600 Subject: [PATCH 3/3] break out dependency resolving to give it a try --- app/crawlers/classic.rb | 50 ++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/app/crawlers/classic.rb b/app/crawlers/classic.rb index c0a5c6aa..968422e4 100644 --- a/app/crawlers/classic.rb +++ b/app/crawlers/classic.rb @@ -35,18 +35,37 @@ def dependencies @dependencies ||= {} end - # TODO: What if this is JSON and instead of xpath it's a json_path? - def nokogiri - @nokogiri ||= Nokogiri::HTML(response.body) - rescue => e - debugger + def add_dependency dep + binding.pry unless dep.is_a? Document + fail "dependency must be a Document, got `#{ dep.class }`" unless dep.is_a? Document + + dep.parent = self + dependencies[dep.uri] = dep + end + + def inspect + "#" + end +end + +class DependencyResolver + def initialize doc + @doc = doc + end + + # TODO: how + def resolve + return resolve_html if @doc.content_type == "text/html" + + [] end - def_delegators :nokogiri, :xpath + protected + + def resolve_html + root_uri = Addressable::URI.parse @doc.uri - # TODO: move this logic to a "dependency resolver/parser" - def asset_links - root_uri = Addressable::URI.parse @uri + nokogiri = Nokogiri::HTML(@doc.response.body) [ "//link[@rel='stylesheet']/@href", @@ -56,17 +75,10 @@ def asset_links .map(&:to_s) .uniq .compact + .reject { |src| src.start_with? "data:" } # TODO: should this reject all non http/https instead? .map(&Addressable::URI.method(:parse)) .map { |link| (root_uri + link).to_s } end - - def add_dependency dep - binding.pry unless dep.is_a? Document - fail "dependency must be a Document, got `#{ dep.class }`" unless dep.is_a? 
Document - - dep.parent = self - dependencies[dep.uri] = dep - end end class Classic @@ -134,12 +146,14 @@ def fetch_favicon_for root end def fetch_assets_for root - root.asset_links.each do |link| + DependencyResolver.new(root).resolve.each do |link| root.add_dependency crawl_deps link end end def request uri + return if uri.start_with? "data:" + Document.new(uri).tap do |document| puts "Fetching uri: '#{ uri }'"