From 38b7ea3f207b5d241b7d62434753af7e58522da6 Mon Sep 17 00:00:00 2001
From: JoshAshby <joshuaashby@joshashby.com>
Date: Fri, 23 Aug 2019 22:01:24 -0600
Subject: [PATCH 1/3] experiment with making an easier to maintain crawler

while also thinking about the future of being able to produce WARC
files and have a better set of logs and information around requests
and responses during caching to aid in debugging

and also thinking about extensibility and introducing newer crawlers
while keeping the same data structures
---
 app/crawlers/classic.rb | 205 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 app/crawlers/classic.rb

diff --git a/app/crawlers/classic.rb b/app/crawlers/classic.rb
new file mode 100644
index 00000000..d84899d4
--- /dev/null
+++ b/app/crawlers/classic.rb
@@ -0,0 +1,205 @@
+require "tempfile"
+
+module HTTPRequestInResponse
+  refine HTTP::Response do
+    attr_accessor :req
+  end
+
+  refine HTTP::Client do
+    alias_method :__perform__, :perform
+
+    def perform(req, options)
+      __perform__(req, options).tap { |res| res.req = req }
+    end
+  end
+end
+
+using HTTPRequestInResponse
+
+Request = Struct.new(:uri, :method, :headers, keyword_init: true) do
+end
+
+Response = Struct.new(:status, :headers, :body, keyword_init: true) do
+end
+
+class Document
+  extend Forwardable
+
+  attr_accessor :uri, :request, :response, :content_type, :parent, :dependencies
+
+  def initialize uri
+    @uri = uri
+  end
+
+  def dependencies
+    @dependencies ||= {}
+  end
+
+  def address
+    @address ||= Addressable::URI.parse uri
+  end
+
+  # TODO: What if this is JSON and instead of xpath it's a json_path?
+  def nokogiri
+    @nokogiri ||= Nokogiri::HTML(response.body)
+  rescue => e
+    debugger
+  end
+
+  def_delegators :nokogiri, :xpath
+
+  # TODO: move this logic to a "dependency resolver/parser"
+  def asset_links uri
+    [
+      "//link[@rel='stylesheet']/@href",
+      "//script/@src",
+      "//img/@src"
+    ].flat_map(&nokogiri.method(:xpath))
+      .map(&:to_s)
+      .uniq
+      .compact
+      .map(&Addressable::URI.method(:parse))
+      .map { |link| (uri + link).to_s }
+  end
+
+  def add_dependency link, dep
+    dep.parent = self
+    dependencies[link] = dep
+  end
+end
+
+class Classic
+  class Client
+    extend Forwardable
+
+    def config
+      @config ||= ActiveSupport::OrderedOptions.new.tap do |opts|
+        opts.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0"
+
+        opts.headers = {
+          accept_language: "en-US,en;q=0.5",
+          accept: "text/html;q=0.9,*/*;q=0.8; charset=utf-8"
+        }
+
+        opts.follow_redirects = true
+      end
+    end
+
+    def_delegators :base, *HTTP::Request::METHODS
+
+    private
+
+    def base
+      @base ||= HTTP.headers({ user_agent: config.user_agent }.merge(config.headers))
+        .yield_self { |client| config.follow_redirects ? client.follow : client }
+    end
+  end
+
+  def crawl site
+    root = crawl_deps site
+
+    fetch_favicon_for root
+
+    return root
+  end
+
+  private
+
+  def crawl_deps site
+    root = request site
+
+    fetch_assets_for root
+
+    return root
+  end
+
+  def client
+    @client ||= Client.new
+  end
+
+  def fetch_favicon_for root
+    default_location = root.address.join("/favicon.ico").to_s
+    res = request default_location
+
+    root.add_dependency default_location, res
+    return if (200...299).include? res.response.status
+
+    root.xpath('//link[@rel="shortcut icon"]').find do |possible_favicon|
+      href = root.address.join(possible_favicon.attr("href")).to_s
+
+      res = request href
+
+      root.add_dependency href, res
+    end
+  end
+
+  def fetch_assets_for root
+    root.asset_links(root.address).each do |link|
+      dep = crawl_deps link
+
+      root.add_dependency link, dep
+    end
+  end
+
+  def request uri
+    document = Document.new uri
+
+    puts "Fetching uri: '#{ uri }'"
+
+    res = client.get uri
+
+    body, content_type = binary_temp_file do |temp_file|
+      write_body response: res, to: temp_file
+      content_type = get_content_type response: res, io: temp_file
+
+      [temp_file.read, content_type]
+    end
+
+    # TODO: There is a lot more info to capture from the request and
+    # response.
+    document.request = Request.new uri: uri, method: "GET", headers: {} #res.req.headers
+    document.response = Response.new status: res.code, headers: res.headers, body: body
+
+    document.content_type = content_type
+
+    return document
+  end
+
+  def binary_temp_file
+    Tempfile.create do |temp_file|
+      temp_file.binmode
+
+      yield temp_file
+    end
+  end
+
+  # Streams the response body into the temp file to hopefully avoid some
+  # memory issues if this is a large document
+  def write_body response:, to:
+    io_handle = to
+
+    response.body.each do |partial|
+      io_handle.write partial
+    end
+
+    io_handle.rewind
+  end
+
+  def get_content_type response:, io:
+    return response.content_type.mime_type if response.content_type.mime_type
+
+    io_handle = io
+    io_handle.rewind
+
+    # some websites *cough*offline.pink*cough* don't return a content type
+    # header, so we'll first see if the html doctype string is present and
+    # guess this is text/html or we'll fall back to assuming it's binary
+    # https://en.wikipedia.org/wiki/Content_sniffing
+    content_type ||= MimeMagic.by_magic(io_handle)
+    content_type ||= "application/octet-stream"
+
+    io_handle.rewind
+
+    content_type
+  end
+end

From 1788a01e593d359f2edea558c201fbee6581b77f Mon Sep 17 00:00:00 2001
From: JoshAshby <joshuaashby@joshashby.com>
Date: Tue, 27 Aug 2019 15:42:03 -0600
Subject: [PATCH 2/3] wip on finding some spots that should/can be broken out

---
 app/crawlers/classic.rb | 89 ++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 45 deletions(-)

diff --git a/app/crawlers/classic.rb b/app/crawlers/classic.rb
index d84899d4..c0a5c6aa 100644
--- a/app/crawlers/classic.rb
+++ b/app/crawlers/classic.rb
@@ -35,10 +35,6 @@ def dependencies
     @dependencies ||= {}
   end
 
-  def address
-    @address ||= Addressable::URI.parse uri
-  end
-
   # TODO: What if this is JSON and instead of xpath it's a json_path?
   def nokogiri
     @nokogiri ||= Nokogiri::HTML(response.body)
@@ -49,7 +45,9 @@ def nokogiri
   def_delegators :nokogiri, :xpath
 
   # TODO: move this logic to a "dependency resolver/parser"
-  def asset_links uri
+  def asset_links
+    root_uri = Addressable::URI.parse @uri
+
     [
       "//link[@rel='stylesheet']/@href",
       "//script/@src",
@@ -59,12 +57,15 @@ def asset_links uri
       .uniq
       .compact
       .map(&Addressable::URI.method(:parse))
-      .map { |link| (uri + link).to_s }
+      .map { |link| (root_uri + link).to_s }
   end
 
-  def add_dependency link, dep
+  def add_dependency dep
+    binding.pry unless dep.is_a? Document
+    fail "dependency must be a Document, got `#{ dep.class }`" unless dep.is_a? Document
+
     dep.parent = self
-    dependencies[link] = dep
+    dependencies[dep.uri] = dep
   end
 end
 
@@ -96,21 +97,13 @@ def base
   end
 
   def crawl site
-    root = crawl_deps site
-
-    fetch_favicon_for root
-
-    return root
+    crawl_deps(site).tap(&method(:fetch_favicon_for))
   end
 
   private
 
   def crawl_deps site
-    root = request site
-
-    fetch_assets_for root
-
-    return root
+    request(site).tap(&method(:fetch_assets_for))
   end
 
   def client
@@ -118,54 +111,60 @@ def client
   end
 
   def fetch_favicon_for root
-    default_location = root.address.join("/favicon.ico").to_s
-    res = request default_location
+    root_address = Addressable::URI.parse root.uri
 
-    root.add_dependency default_location, res
+    res = root_address
+      .yield_self { |uri| uri.join "/favicon.ico" }
+      .yield_self(&:to_s)
+      .yield_self(&method(:request))
+
+    root.add_dependency res
     return if (200...299).include? res.response.status
 
     root.xpath('//link[@rel="shortcut icon"]').find do |possible_favicon|
-      href = root.address.join(possible_favicon.attr("href")).to_s
+      res = possible_favicon.attr("href")
+        .yield_self { |href| root_address.join href } # Ensures that a relative URI is converted to absolute
+        .yield_self(&:to_s)
+        .yield_self(&method(:request))
 
-      res = request href
+      root.add_dependency res
 
-      root.add_dependency href, res
+      (200..299).include? res.response.status
     end
   end
 
   def fetch_assets_for root
-    root.asset_links(root.address).each do |link|
-      dep = crawl_deps link
-
-      root.add_dependency link, dep
+    root.asset_links.each do |link|
+      root.add_dependency crawl_deps link
     end
   end
 
   def request uri
-    document = Document.new uri
+    Document.new(uri).tap do |document|
+      puts "Fetching uri: '#{ uri }'"
 
-    puts "Fetching uri: '#{ uri }'"
+      res = client.get uri
 
-    res = client.get uri
+      body, content_type = with_temp_file do |temp_file|
+        write_body response: res, to: temp_file
+        content_type = get_content_type response: res, io: temp_file
 
-    body, content_type = binary_temp_file do |temp_file|
-      write_body response: res, to: temp_file
-      content_type = get_content_type response: res, io: temp_file
-
-      [temp_file.read, content_type]
-    end
-
-    # TODO: There is a lot more info to capture from the request and
-    # response.
-    document.request = Request.new uri: uri, method: "GET", headers: {} #res.req.headers
-    document.response = Response.new status: res.code, headers: res.headers, body: body
+        [temp_file.read, content_type]
+      end
 
-    document.content_type = content_type
+      # TODO: There is a lot more info to capture from the request and
+      # response.
+      document.request = Request.new uri: uri, method: "GET", headers: {} #res.req.headers
+      document.response = Response.new status: res.code, headers: res.headers, body: body
 
-    return document
+      document.content_type = content_type
+    rescue HTTP::ConnectionError => e
+      debugger
+    end
   end
 
-  def binary_temp_file
+  # TODO: Could these three methods live elsewhere?
+  def with_temp_file
     Tempfile.create do |temp_file|
       temp_file.binmode
 

From 1e8ecaf838e117b02c76a98c9e3d7344e515e737 Mon Sep 17 00:00:00 2001
From: JoshAshby <joshuaashby@joshashby.com>
Date: Thu, 26 Mar 2020 12:14:56 -0600
Subject: [PATCH 3/3] break out dependency resolving to give it a try

---
 app/crawlers/classic.rb | 50 ++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/app/crawlers/classic.rb b/app/crawlers/classic.rb
index c0a5c6aa..968422e4 100644
--- a/app/crawlers/classic.rb
+++ b/app/crawlers/classic.rb
@@ -35,18 +35,37 @@ def dependencies
     @dependencies ||= {}
   end
 
-  # TODO: What if this is JSON and instead of xpath it's a json_path?
-  def nokogiri
-    @nokogiri ||= Nokogiri::HTML(response.body)
-  rescue => e
-    debugger
+  def add_dependency dep
+    binding.pry unless dep.is_a? Document
+    fail "dependency must be a Document, got `#{ dep.class }`" unless dep.is_a? Document
+
+    dep.parent = self
+    dependencies[dep.uri] = dep
+  end
+
+  def inspect
+    "#<Document:TODO content-type=#{ content_type } uri=#{ uri }>"
+  end
+end
+
+class DependencyResolver
+  def initialize doc
+    @doc = doc
+  end
+
+  # TODO: how
+  def resolve
+    return resolve_html if @doc.content_type == "text/html"
+
+    []
   end
 
-  def_delegators :nokogiri, :xpath
+  protected
+
+  def resolve_html
+    root_uri = Addressable::URI.parse @doc.uri
 
-  # TODO: move this logic to a "dependency resolver/parser"
-  def asset_links
-    root_uri = Addressable::URI.parse @uri
+    nokogiri = Nokogiri::HTML(@doc.response.body)
 
     [
       "//link[@rel='stylesheet']/@href",
@@ -56,17 +75,10 @@ def asset_links
       .map(&:to_s)
       .uniq
       .compact
+      .reject { |src| src.start_with? "data:" } # TODO: should this reject all non http/https instead?
       .map(&Addressable::URI.method(:parse))
       .map { |link| (root_uri + link).to_s }
   end
-
-  def add_dependency dep
-    binding.pry unless dep.is_a? Document
-    fail "dependency must be a Document, got `#{ dep.class }`" unless dep.is_a? Document
-
-    dep.parent = self
-    dependencies[dep.uri] = dep
-  end
 end
 
 class Classic
@@ -134,12 +146,14 @@ def fetch_favicon_for root
   end
 
   def fetch_assets_for root
-    root.asset_links.each do |link|
+    DependencyResolver.new(root).resolve.each do |link|
       root.add_dependency crawl_deps link
     end
   end
 
   def request uri
+    return if uri.start_with? "data:"
+
     Document.new(uri).tap do |document|
       puts "Fetching uri: '#{ uri }'"