diff --git a/config/crawler.yml.example b/config/crawler.yml.example
index cf2c2737..36419df0 100644
--- a/config/crawler.yml.example
+++ b/config/crawler.yml.example
@@ -41,6 +41,8 @@
 #      join_as: array # How to concatenate multiple values, can be: array | string
 #      value: yes     # The value to use, only applicable if action is `set`
 #      source: html   # The source to extract from, can be: html | url
+#http_headers:
+#  - "Cookie": "jwt-token=token"
 #
 ## Where to send the results. Possible values are console, file, or elasticsearch
 #output_sink: elasticsearch
diff --git a/lib/crawler/api/config.rb b/lib/crawler/api/config.rb
index 0f0f22a6..bc86eb09 100644
--- a/lib/crawler/api/config.rb
+++ b/lib/crawler/api/config.rb
@@ -54,7 +54,8 @@ class Config # rubocop:disable Metrics/ClassLength
       :results_collection,  # An Enumerable collection for storing mock crawl results
       :user_agent,          # The User-Agent used for requests made from the crawler.
       :stats_dump_interval, # How often should we output stats in the logs during a crawl
-      :purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+      :purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+      :http_headers,        # Custom HTTP headers

       # Elasticsearch settings
       :elasticsearch,       # Elasticsearch connection settings
@@ -180,13 +181,15 @@ class Config # rubocop:disable Metrics/ClassLength
       extraction_rules: {},
       crawl_rules: {},

-      purge_crawl_enabled: true
+      purge_crawl_enabled: true,
+      http_headers: []
     }.freeze

     # Settings we are not allowed to log due to their sensitive nature
     SENSITIVE_FIELDS = %i[
       auth
       http_header_service
+      http_headers
       http_proxy_username
       http_proxy_password
       elasticsearch
diff --git a/lib/crawler/http_client.rb b/lib/crawler/http_client.rb
index c11823f3..e09fac4b 100644
--- a/lib/crawler/http_client.rb
+++ b/lib/crawler/http_client.rb
@@ -7,7 +7,6 @@
 # frozen_string_literal: true

 require 'weakref'
-
 java_import java.util.LinkedHashMap
 java_import java.security.KeyStore
 java_import javax.net.ssl.SSLContext
@@ -53,6 +52,7 @@ class HttpClient # rubocop:disable Metrics/ClassLength
     java_import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider
     java_import org.apache.hc.client5.http.impl.classic.HttpClientBuilder
     java_import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder
+    java_import org.apache.hc.core5.http.message.BasicHeader

     # Scoped this import to the class only to avoid conflicts with Ruby's Timeout module
     java_import org.apache.hc.core5.util.Timeout
@@ -188,9 +188,20 @@ def new_http_client # rubocop:disable Metrics/AbcSize
       builder.disable_content_compression unless config.compression_enabled
       builder.set_content_decoder_registry(content_decoders)
       builder.set_proxy(proxy_host)
+      builder.set_default_headers(http_headers)
       builder.build
     end

+    #-------------------------------------------------------------------------------------------------
+    # Loads a list of HTTP headers from the configuration
+    def http_headers
+      return [] if config.http_headers.empty?
+
+      config.http_headers.map do |header|
+        BasicHeader.new(header[0].to_s, header[1])
+      end
+    end
+
     #-------------------------------------------------------------------------------------------------
     def content_decoders
       CONTENT_DECODERS
diff --git a/lib/crawler/http_executor.rb b/lib/crawler/http_executor.rb
index 9e8813ad..e67ba72f 100644
--- a/lib/crawler/http_executor.rb
+++ b/lib/crawler/http_executor.rb
@@ -152,6 +152,7 @@ def http_client
         http_proxy_password: config.http_proxy_password,
         http_proxy_scheme: config.http_proxy_protocol,
         compression_enabled: config.compression_enabled,
+        http_headers: config.http_headers,
         logger:
       )
     end
diff --git a/lib/crawler/http_utils/config.rb b/lib/crawler/http_utils/config.rb
index 31d955c4..6091cd1a 100644
--- a/lib/crawler/http_utils/config.rb
+++ b/lib/crawler/http_utils/config.rb
@@ -35,6 +35,7 @@ class Config < SimpleDelegator
       http_proxy_scheme
       http_proxy_username
      http_proxy_password
+      http_headers
     ].freeze

     REQUIRED_OPTIONS = %i[
@@ -122,6 +123,10 @@ def compression_enabled
       fetch(:compression_enabled, true)
     end

+    def http_headers
+      fetch(:http_headers, [])
+    end
+
     private

     def crawler_default(setting)
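Usage note (editor's addition, not part of the diff): based on the example added to config/crawler.yml.example above, the new http_headers option would be enabled by uncommenting it in the crawl configuration, for example:

    http_headers:
      - "Cookie": "jwt-token=token"

The "Cookie" name and "jwt-token=token" value are placeholders carried over from the example file. Each configured header is handed to the Apache HttpClient builder via set_default_headers, so it should be attached to every request the crawler makes; the header names and values are also treated as sensitive and excluded from logging via SENSITIVE_FIELDS.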