diff --git a/docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py b/docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py index a5d80797fb..24cb5bb907 100644 --- a/docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py +++ b/docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py @@ -13,7 +13,7 @@ async def main() -> None: # Generator will generate real looking browser fingerprint based on the options. # Unspecified fingerprint options will be automatically selected by the generator. fingerprint_generator = DefaultFingerprintGenerator( - header_options=HeaderGeneratorOptions(browsers=['chromium']), + header_options=HeaderGeneratorOptions(browsers=['chrome']), screen_options=ScreenOptions(min_width=400), ) diff --git a/docs/guides/code_examples/browser_fingerprint/default_fingerprint_generator_with_args.py b/docs/guides/code_examples/browser_fingerprint/default_fingerprint_generator_with_args.py index 14241e76ab..10980c2eae 100644 --- a/docs/guides/code_examples/browser_fingerprint/default_fingerprint_generator_with_args.py +++ b/docs/guides/code_examples/browser_fingerprint/default_fingerprint_generator_with_args.py @@ -5,6 +5,6 @@ ) fingerprint_generator = DefaultFingerprintGenerator( - header_options=HeaderGeneratorOptions(browsers=['chromium']), + header_options=HeaderGeneratorOptions(browsers=['chrome']), screen_options=ScreenOptions(min_width=400), ) diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py index 8ab50123bc..aa584f5abe 100644 --- a/src/crawlee/browsers/_playwright_browser_controller.py +++ b/src/crawlee/browsers/_playwright_browser_controller.py @@ -13,6 +13,7 @@ from crawlee.browsers._browser_controller import BrowserController from crawlee.browsers._types import BrowserType from crawlee.fingerprint_suite import HeaderGenerator +from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type if TYPE_CHECKING: from collections.abc import Mapping @@ -225,7 +226,7 @@ async def _create_browser_context( 'sec-ch-ua-mobile', 'sec-ch-ua-platform', }, - browser_type=self.browser_type, + browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type), ) ) else: diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 4b5ffba9dc..609c05adc9 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -15,6 +15,7 @@ from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline from crawlee.errors import SessionError from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions +from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type from crawlee.sessions._cookies import PlaywrightCookieParam from crawlee.statistics import StatisticsState @@ -139,7 +140,11 @@ def __init__( # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments. else: if fingerprint_generator == 'default': - generator_browser_type = None if browser_type is None else [browser_type] + if not browser_type: + generator_browser_type = None + else: + generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)] + fingerprint_generator = DefaultFingerprintGenerator( header_options=HeaderGeneratorOptions(browsers=generator_browser_type) ) diff --git a/src/crawlee/fingerprint_suite/_browserforge_adapter.py b/src/crawlee/fingerprint_suite/_browserforge_adapter.py index aba4028e5f..7695decf1e 100644 --- a/src/crawlee/fingerprint_suite/_browserforge_adapter.py +++ b/src/crawlee/fingerprint_suite/_browserforge_adapter.py @@ -1,5 +1,6 @@ from __future__ import annotations +import random from collections.abc import Iterable from copy import deepcopy from functools import reduce @@ -10,7 +11,6 @@ from browserforge.fingerprints import Fingerprint as bf_Fingerprint from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator from browserforge.fingerprints import Screen -from browserforge.headers import Browser from browserforge.headers.generator import DATA_DIR, ListOrString from browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator from typing_extensions import override @@ -21,6 +21,8 @@ from ._fingerprint_generator import FingerprintGenerator if TYPE_CHECKING: + from browserforge.headers import Browser + from ._types import HeaderGeneratorOptions, ScreenOptions, SupportedBrowserType @@ -60,8 +62,8 @@ def generate( This patched version of the method adds additional quality checks on the output of the original method. It tries to generate headers several times until they match the requirements. - The `browser` parameter accepts `chromium` as a general category, which includes not only Google Chrome - but also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chromium` + The `browser` parameter accepts `chrome` as a general category, which includes not only Google Chrome + but also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chrome` input, such as: ``` Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) @@ -77,21 +79,18 @@ def generate( single_browser = self._get_single_browser_type(browser) - if single_browser == 'chromium': - # `BrowserForge` header generator considers `chromium` in general sense and therefore will generate also - # other `Chromium` based browser headers. This adapter desires only specific subset of `chromium` headers + if single_browser == 'chrome': + # `BrowserForge` header generator considers `chrome` in general sense and therefore will generate also + # other `chrome` based browser headers. This adapter desires only specific subset of `chrome` headers # that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers. # Increase max attempts as from `BrowserForge` header generator perspective even `chromium` # headers without `sec-...` headers are valid. max_attempts += 50 - # Browserforge uses term 'safari', we use term 'webkit' - bf_browser_type = 'safari' if single_browser == 'webkit' else single_browser - # Use browserforge to generate headers until it satisfies our additional requirements. for _attempt in range(max_attempts): generated_header: dict[str, str] = super().generate( - browser=bf_browser_type, + browser=single_browser, os=os, device=device, locale=locale, @@ -111,7 +110,7 @@ def generate( keyword in generated_header['User-Agent'] for keyword in self._get_expected_browser_keywords(single_browser) ): - if single_browser == 'chromium' and not self._contains_all_sec_headers(generated_header): + if single_browser == 'chrome' and not self._contains_all_sec_headers(generated_header): # Accept chromium header only with all sec headers. continue @@ -136,19 +135,16 @@ def _get_single_browser_type(self, browser: Iterable[str | Browser] | None) -> s Handling the original multitype would be pointlessly complex. """ # In our case we never pass more than one browser type. In general case more browsers are just bigger pool to - # select from, so narrowing it to the first one is still a valid action. - first_browser = ( - next(iter(browser)) if (isinstance(browser, Iterable) and not isinstance(browser, str)) else browser - ) - - if isinstance(first_browser, str): - single_name = first_browser - elif isinstance(first_browser, Browser): - single_name = first_browser.name - else: - single_name = None - - return single_name + # select from, so narrowing it to any of them is still a valid action. + + if isinstance(browser, Iterable): + return random.choice( + [ + single_browser if isinstance(single_browser, str) else single_browser.name + for single_browser in browser + ] + ) + return None class PatchedFingerprintGenerator(bf_FingerprintGenerator): @@ -245,9 +241,9 @@ class BrowserforgeHeaderGenerator: def __init__(self) -> None: self._generator = PatchedHeaderGenerator(locale=['en-US', 'en']) - def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str, str]: + def generate(self, browser_type: SupportedBrowserType = 'chrome') -> dict[str, str]: """Generate headers.""" - return self._generator.generate(browser=browser_type) + return self._generator.generate(browser=[browser_type]) def get_available_header_network() -> dict: diff --git a/src/crawlee/fingerprint_suite/_consts.py b/src/crawlee/fingerprint_suite/_consts.py index 6944cb67ed..d010e35419 100644 --- a/src/crawlee/fingerprint_suite/_consts.py +++ b/src/crawlee/fingerprint_suite/_consts.py @@ -5,8 +5,8 @@ COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9' BROWSER_TYPE_HEADER_KEYWORD = { - 'chromium': {'Chrome', 'CriOS'}, + 'chrome': {'Chrome', 'CriOS'}, 'firefox': {'Firefox', 'FxiOS'}, 'edge': {'Edg', 'Edge', 'EdgA', 'EdgiOS'}, - 'webkit': {'Safari'}, + 'safari': {'Safari'}, } diff --git a/src/crawlee/fingerprint_suite/_header_generator.py b/src/crawlee/fingerprint_suite/_header_generator.py index 414dc7b0f2..88431796cd 100644 --- a/src/crawlee/fingerprint_suite/_header_generator.py +++ b/src/crawlee/fingerprint_suite/_header_generator.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from crawlee._types import HttpHeaders from crawlee._utils.docs import docs_group @@ -10,6 +10,18 @@ from crawlee.fingerprint_suite._types import SupportedBrowserType +def fingerprint_browser_type_from_playwright_browser_type( + playwright_browser_type: Literal['chromium', 'firefox', 'webkit'], +) -> SupportedBrowserType: + if playwright_browser_type == 'chromium': + return 'chrome' + if playwright_browser_type == 'firefox': + return 'firefox' + if playwright_browser_type == 'webkit': + return 'safari' + raise ValueError('Unsupported browser type') + + @docs_group('Classes') class HeaderGenerator: """Generates realistic looking or browser-like HTTP headers.""" @@ -21,7 +33,7 @@ def _select_specific_headers(self, all_headers: dict[str, str], header_names: se return HttpHeaders({key: value for key, value in all_headers.items() if key in header_names}) def get_specific_headers( - self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chromium' + self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chrome' ) -> HttpHeaders: """Return subset of headers based on the selected `header_names`. @@ -50,10 +62,10 @@ def get_random_user_agent_header(self) -> HttpHeaders: def get_user_agent_header( self, *, - browser_type: SupportedBrowserType = 'chromium', + browser_type: SupportedBrowserType = 'chrome', ) -> HttpHeaders: """Get the User-Agent header based on the browser type.""" - if browser_type not in {'chromium', 'firefox', 'webkit', 'edge'}: + if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}: raise ValueError(f'Unsupported browser type: {browser_type}') all_headers = self._generator.generate(browser_type=browser_type) return self._select_specific_headers(all_headers, header_names={'User-Agent'}) @@ -61,10 +73,10 @@ def get_user_agent_header( def get_sec_ch_ua_headers( self, *, - browser_type: SupportedBrowserType = 'chromium', + browser_type: SupportedBrowserType = 'chrome', ) -> HttpHeaders: """Get the sec-ch-ua headers based on the browser type.""" - if browser_type not in {'chromium', 'firefox', 'webkit', 'edge'}: + if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}: raise ValueError(f'Unsupported browser type: {browser_type}') all_headers = self._generator.generate(browser_type=browser_type) return self._select_specific_headers( diff --git a/src/crawlee/fingerprint_suite/_types.py b/src/crawlee/fingerprint_suite/_types.py index 674cfa4eec..a468e1fecc 100644 --- a/src/crawlee/fingerprint_suite/_types.py +++ b/src/crawlee/fingerprint_suite/_types.py @@ -9,7 +9,7 @@ SupportedOperatingSystems = Literal['windows', 'macos', 'linux', 'android', 'ios'] SupportedDevices = Literal['desktop', 'mobile'] SupportedHttpVersion = Literal['1', '2'] -SupportedBrowserType = Literal['chromium', 'firefox', 'webkit', 'edge'] +SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge'] @docs_group('Data structures') diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 8a3bde6234..0605e7298d 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -151,7 +151,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: [ pytest.param(None, id='No fingerprint generator. Headers generated by header generator.'), pytest.param( - DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chromium'])), + DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chrome'])), id='Explicitly passed fingerprint generator.', ), pytest.param('default', id='Default fingerprint generator.'), @@ -182,8 +182,8 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await crawler.run(['https://test.com']) user_agent = headers.get('user-agent') - assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}) - assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type]), user_agent + assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}), user_agent + assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD['chrome']), user_agent assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua') assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile') diff --git a/tests/unit/fingerprint_suite/test_header_generator.py b/tests/unit/fingerprint_suite/test_header_generator.py index 73e98806ab..22bad4b08f 100644 --- a/tests/unit/fingerprint_suite/test_header_generator.py +++ b/tests/unit/fingerprint_suite/test_header_generator.py @@ -32,7 +32,7 @@ def test_get_random_user_agent_header() -> None: assert headers['User-Agent'] -@pytest.mark.parametrize('browser_type', ['chromium', 'firefox', 'edge', 'webkit']) +@pytest.mark.parametrize('browser_type', ['chrome', 'firefox', 'edge', 'safari']) def test_get_user_agent_header_stress_test(browser_type: SupportedBrowserType, header_network: dict) -> None: """Test that the User-Agent header is consistently generated correctly. @@ -55,9 +55,9 @@ def test_get_user_agent_header_invalid_browser_type() -> None: def test_get_sec_ch_ua_headers_chromium(header_network: dict) -> None: - """Test that Sec-Ch-Ua headers are generated correctly for Chromium.""" + """Test that Sec-Ch-Ua headers are generated correctly for Chrome.""" header_generator = HeaderGenerator() - headers = header_generator.get_sec_ch_ua_headers(browser_type='chromium') + headers = header_generator.get_sec_ch_ua_headers(browser_type='chrome') assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua') assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')