diff --git a/src/data_crawling/crawlers/base.py b/src/data_crawling/crawlers/base.py
index d7e5cb4..c4fa1c4 100644
--- a/src/data_crawling/crawlers/base.py
+++ b/src/data_crawling/crawlers/base.py
@@ -32,6 +32,9 @@ def __init__(self, scroll_limit: int = 5) -> None:
         options.add_argument(f"--disk-cache-dir={mkdtemp()}")
         options.add_argument("--remote-debugging-port=9226")
 
+        user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
+        options.add_argument(f'user-agent={user_agent}')
+
         self.set_extra_driver_options(options)
 
         self.scroll_limit = scroll_limit
diff --git a/src/data_crawling/main.py b/src/data_crawling/main.py
index cf3b822..5b254e4 100644
--- a/src/data_crawling/main.py
+++ b/src/data_crawling/main.py
@@ -4,13 +4,13 @@
 from aws_lambda_powertools.utilities.typing import LambdaContext
 from core import lib
 from core.db.documents import UserDocument
-from crawlers import CustomArticleCrawler, GithubCrawler, LinkedInCrawler
+from crawlers import CustomArticleCrawler, GithubCrawler, LinkedInCrawler, MediumCrawler
 from dispatcher import CrawlerDispatcher
 
 logger = Logger(service="llm-twin-course/crawler")
 
 _dispatcher = CrawlerDispatcher()
-_dispatcher.register("medium", CustomArticleCrawler)
+_dispatcher.register("medium", MediumCrawler)
 _dispatcher.register("linkedin", LinkedInCrawler)
 _dispatcher.register("github", GithubCrawler)
 