Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor!: Split BrowserType literal into two different literals based on context #1070

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ async def main() -> None:
# Generator will generate real looking browser fingerprint based on the options.
# Unspecified fingerprint options will be automatically selected by the generator.
fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=['chromium']),
header_options=HeaderGeneratorOptions(browsers=['chrome']),
screen_options=ScreenOptions(min_width=400),
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@
)

fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=['chromium']),
header_options=HeaderGeneratorOptions(browsers=['chrome']),
screen_options=ScreenOptions(min_width=400),
)
3 changes: 2 additions & 1 deletion src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from crawlee.browsers._browser_controller import BrowserController
from crawlee.browsers._types import BrowserType
from crawlee.fingerprint_suite import HeaderGenerator
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type

if TYPE_CHECKING:
from collections.abc import Mapping
Expand Down Expand Up @@ -225,7 +226,7 @@ async def _create_browser_context(
'sec-ch-ua-mobile',
'sec-ch-ua-platform',
},
browser_type=self.browser_type,
browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type),
)
)
else:
Expand Down
7 changes: 6 additions & 1 deletion src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
from crawlee.sessions._cookies import PlaywrightCookieParam
from crawlee.statistics import StatisticsState

Expand Down Expand Up @@ -139,7 +140,11 @@ def __init__(
# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
else:
if fingerprint_generator == 'default':
generator_browser_type = None if browser_type is None else [browser_type]
if not browser_type:
generator_browser_type = None
else:
generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]

fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
)
Expand Down
48 changes: 22 additions & 26 deletions src/crawlee/fingerprint_suite/_browserforge_adapter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import random
from collections.abc import Iterable
from copy import deepcopy
from functools import reduce
Expand All @@ -10,7 +11,6 @@
from browserforge.fingerprints import Fingerprint as bf_Fingerprint
from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator
from browserforge.fingerprints import Screen
from browserforge.headers import Browser
from browserforge.headers.generator import DATA_DIR, ListOrString
from browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator
from typing_extensions import override
Expand All @@ -21,6 +21,8 @@
from ._fingerprint_generator import FingerprintGenerator

if TYPE_CHECKING:
from browserforge.headers import Browser

from ._types import HeaderGeneratorOptions, ScreenOptions, SupportedBrowserType


Expand Down Expand Up @@ -60,8 +62,8 @@ def generate(
This patched version of the method adds additional quality checks on the output of the original method. It tries
to generate headers several times until they match the requirements.

The `browser` parameter accepts `chromium` as a general category, which includes not only Google Chrome
but also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chromium`
The `browser` parameter accepts `chrome` as a general category, which includes not only Google Chrome
but also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chrome`
input, such as:
```
Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)
Expand All @@ -77,21 +79,18 @@ def generate(

single_browser = self._get_single_browser_type(browser)

if single_browser == 'chromium':
# `BrowserForge` header generator considers `chromium` in general sense and therefore will generate also
# other `Chromium` based browser headers. This adapter desires only specific subset of `chromium` headers
if single_browser == 'chrome':
# `BrowserForge` header generator considers `chrome` in general sense and therefore will generate also
# other `chrome` based browser headers. This adapter desires only specific subset of `chrome` headers
# that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers.
# Increase max attempts as from `BrowserForge` header generator perspective even `chromium`
# headers without `sec-...` headers are valid.
max_attempts += 50

# Browserforge uses term 'safari', we use term 'webkit'
bf_browser_type = 'safari' if single_browser == 'webkit' else single_browser

# Use browserforge to generate headers until it satisfies our additional requirements.
for _attempt in range(max_attempts):
generated_header: dict[str, str] = super().generate(
browser=bf_browser_type,
browser=single_browser,
os=os,
device=device,
locale=locale,
Expand All @@ -111,7 +110,7 @@ def generate(
keyword in generated_header['User-Agent']
for keyword in self._get_expected_browser_keywords(single_browser)
):
if single_browser == 'chromium' and not self._contains_all_sec_headers(generated_header):
if single_browser == 'chrome' and not self._contains_all_sec_headers(generated_header):
# Accept chromium header only with all sec headers.
continue

Expand All @@ -136,19 +135,16 @@ def _get_single_browser_type(self, browser: Iterable[str | Browser] | None) -> s
Handling the original multitype would be pointlessly complex.
"""
# In our case we never pass more than one browser type. In general case more browsers are just bigger pool to
# select from, so narrowing it to the first one is still a valid action.
first_browser = (
next(iter(browser)) if (isinstance(browser, Iterable) and not isinstance(browser, str)) else browser
)

if isinstance(first_browser, str):
single_name = first_browser
elif isinstance(first_browser, Browser):
single_name = first_browser.name
else:
single_name = None

return single_name
# select from, so narrowing it to any of them is still a valid action.

if isinstance(browser, Iterable):
return random.choice(
[
single_browser if isinstance(single_browser, str) else single_browser.name
for single_browser in browser
]
)
return None


class PatchedFingerprintGenerator(bf_FingerprintGenerator):
Expand Down Expand Up @@ -245,9 +241,9 @@ class BrowserforgeHeaderGenerator:
def __init__(self) -> None:
self._generator = PatchedHeaderGenerator(locale=['en-US', 'en'])

def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str, str]:
def generate(self, browser_type: SupportedBrowserType = 'chrome') -> dict[str, str]:
"""Generate headers."""
return self._generator.generate(browser=browser_type)
return self._generator.generate(browser=[browser_type])


def get_available_header_network() -> dict:
Expand Down
4 changes: 2 additions & 2 deletions src/crawlee/fingerprint_suite/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'

BROWSER_TYPE_HEADER_KEYWORD = {
'chromium': {'Chrome', 'CriOS'},
'chrome': {'Chrome', 'CriOS'},
'firefox': {'Firefox', 'FxiOS'},
'edge': {'Edg', 'Edge', 'EdgA', 'EdgiOS'},
'webkit': {'Safari'},
'safari': {'Safari'},
}
24 changes: 18 additions & 6 deletions src/crawlee/fingerprint_suite/_header_generator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal

from crawlee._types import HttpHeaders
from crawlee._utils.docs import docs_group
Expand All @@ -10,6 +10,18 @@
from crawlee.fingerprint_suite._types import SupportedBrowserType


def fingerprint_browser_type_from_playwright_browser_type(
    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
) -> SupportedBrowserType:
    """Translate a Playwright browser type name into the fingerprint suite's browser type name.

    The two contexts use different vocabularies for the same browser families:
    `chromium` maps to `chrome`, `firefox` maps to `firefox` and `webkit` maps to `safari`.

    Args:
        playwright_browser_type: Browser type name as used on the Playwright side.

    Returns:
        The corresponding browser type name accepted by the header/fingerprint generators.

    Raises:
        ValueError: If `playwright_browser_type` is not one of the three known names.
    """
    if playwright_browser_type == 'chromium':
        return 'chrome'
    if playwright_browser_type == 'firefox':
        return 'firefox'
    if playwright_browser_type == 'webkit':
        return 'safari'
    # Include the offending value so the failure is diagnosable, matching the other
    # "Unsupported browser type: ..." errors raised by the header generator.
    raise ValueError(f'Unsupported browser type: {playwright_browser_type}')


@docs_group('Classes')
class HeaderGenerator:
"""Generates realistic looking or browser-like HTTP headers."""
Expand All @@ -21,7 +33,7 @@ def _select_specific_headers(self, all_headers: dict[str, str], header_names: se
return HttpHeaders({key: value for key, value in all_headers.items() if key in header_names})

def get_specific_headers(
self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chromium'
self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chrome'
) -> HttpHeaders:
"""Return subset of headers based on the selected `header_names`.

Expand Down Expand Up @@ -50,21 +62,21 @@ def get_random_user_agent_header(self) -> HttpHeaders:
def get_user_agent_header(
self,
*,
browser_type: SupportedBrowserType = 'chromium',
browser_type: SupportedBrowserType = 'chrome',
) -> HttpHeaders:
"""Get the User-Agent header based on the browser type."""
if browser_type not in {'chromium', 'firefox', 'webkit', 'edge'}:
if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}:
raise ValueError(f'Unsupported browser type: {browser_type}')
all_headers = self._generator.generate(browser_type=browser_type)
return self._select_specific_headers(all_headers, header_names={'User-Agent'})

def get_sec_ch_ua_headers(
self,
*,
browser_type: SupportedBrowserType = 'chromium',
browser_type: SupportedBrowserType = 'chrome',
) -> HttpHeaders:
"""Get the sec-ch-ua headers based on the browser type."""
if browser_type not in {'chromium', 'firefox', 'webkit', 'edge'}:
if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}:
raise ValueError(f'Unsupported browser type: {browser_type}')
all_headers = self._generator.generate(browser_type=browser_type)
return self._select_specific_headers(
Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/fingerprint_suite/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
SupportedOperatingSystems = Literal['windows', 'macos', 'linux', 'android', 'ios']
SupportedDevices = Literal['desktop', 'mobile']
SupportedHttpVersion = Literal['1', '2']
SupportedBrowserType = Literal['chromium', 'firefox', 'webkit', 'edge']
SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


@docs_group('Data structures')
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
[
pytest.param(None, id='No fingerprint generator. Headers generated by header generator.'),
pytest.param(
DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chromium'])),
DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chrome'])),
id='Explicitly passed fingerprint generator.',
),
pytest.param('default', id='Default fingerprint generator.'),
Expand Down Expand Up @@ -182,8 +182,8 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await crawler.run(['https://test.com'])

user_agent = headers.get('user-agent')
assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'})
assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type]), user_agent
assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}), user_agent
assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD['chrome']), user_agent

assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua')
assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/fingerprint_suite/test_header_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_get_random_user_agent_header() -> None:
assert headers['User-Agent']


@pytest.mark.parametrize('browser_type', ['chromium', 'firefox', 'edge', 'webkit'])
@pytest.mark.parametrize('browser_type', ['chrome', 'firefox', 'edge', 'safari'])
def test_get_user_agent_header_stress_test(browser_type: SupportedBrowserType, header_network: dict) -> None:
"""Test that the User-Agent header is consistently generated correctly.

Expand All @@ -55,9 +55,9 @@ def test_get_user_agent_header_invalid_browser_type() -> None:


def test_get_sec_ch_ua_headers_chromium(header_network: dict) -> None:
"""Test that Sec-Ch-Ua headers are generated correctly for Chromium."""
"""Test that Sec-Ch-Ua headers are generated correctly for Chrome."""
header_generator = HeaderGenerator()
headers = header_generator.get_sec_ch_ua_headers(browser_type='chromium')
headers = header_generator.get_sec_ch_ua_headers(browser_type='chrome')

assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua')
assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')
Expand Down
Loading