Commit ece9920

Merge commit "resolve" (2 parents: 5eebc2d + 309ed08)

29 files changed: +296 / -118 lines

CHANGELOG.md (15 additions, 0 deletions)

```diff
@@ -2,6 +2,21 @@
 
 All notable changes to this project will be documented in this file.
 
+## [0.6.5](https://github.com/apify/crawlee-python/releases/tag/v0.6.5) (2025-03-13)
+
+### 🐛 Bug Fixes
+
+- Update to `browserforge` workaround ([#1075](https://github.com/apify/crawlee-python/pull/1075)) ([2378cf8](https://github.com/apify/crawlee-python/commit/2378cf84ab1ed06473049a9ddfca2ba6f166306d)) by [@Pijukatel](https://github.com/Pijukatel)
+
+
+## [0.6.4](https://github.com/apify/crawlee-python/releases/tag/v0.6.4) (2025-03-12)
+
+### 🐛 Bug Fixes
+
+- Add a check thread before set `add_signal_handler` ([#1068](https://github.com/apify/crawlee-python/pull/1068)) ([6983bda](https://github.com/apify/crawlee-python/commit/6983bda2dbc202b3ecbf7db62b11deee007b4b5f)) by [@Mantisus](https://github.com/Mantisus)
+- Temporary workaround for `browserforge` import time code execution ([#1073](https://github.com/apify/crawlee-python/pull/1073)) ([17d914f](https://github.com/apify/crawlee-python/commit/17d914f78242078f88c07d686a567d1091255eb1)) by [@Pijukatel](https://github.com/Pijukatel)
+
+
 ## [0.6.3](https://github.com/apify/crawlee-python/releases/tag/v0.6.3) (2025-03-07)
 
 ### 🚀 Features
```

Makefile (0 additions, 1 deletion)

```diff
@@ -11,7 +11,6 @@ install-dev:
 	uv sync --all-extras
 	uv run pre-commit install
 	uv run playwright install
-	uv run python -m browserforge update
 
 build:
 	uv build --verbose
```

pyproject.toml (14 additions, 13 deletions)

```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "crawlee"
-version = "0.6.3"
+version = "0.6.5"
 description = "Crawlee for Python"
 authors = [{ name = "Apify Technologies s.r.o.", email = "[email protected]" }]
 license = { file = "LICENSE" }
@@ -33,19 +33,19 @@ keywords = [
     "scraping",
 ]
 dependencies = [
+    "apify_fingerprint_datapoints>=0.0.2",
     "browserforge>=1.2.3",
     "cachetools>=5.5.0",
     "colorama>=0.4.0",
-    "docutils>=0.21.0",
     "eval-type-backport>=0.2.0",
     "httpx[brotli,http2,zstd]>=0.27.0",
     "more-itertools>=10.2.0",
     "psutil>=6.0.0",
     "pydantic-settings>=2.2.0,<2.7.0",
     "pydantic>=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2",
     "pyee>=9.0.0",
-    "rich>=13.9.0",
     "sortedcollections>=2.1.0",
+    "sortedcontainers>=2.4.0",
     "tldextract>=5.1.0",
     "typing-extensions>=4.1.0",
     "yarl>=1.18.0",
@@ -65,7 +65,7 @@ all = [
     "playwright>=1.27.0",
     "scikit-learn==1.5.2; python_version == '3.9'",
     "scikit-learn>=1.6.0; python_version >= '3.10'",
-    'typer>=0.12.0'
+    'typer>=0.12.0',
 ]
 adaptive-crawler = [
     "jaro-winkler>=2.0.3",
@@ -74,7 +74,7 @@ adaptive-crawler = [
     "scikit-learn>=1.6.0; python_version >= '3.10'",
 ]
 beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
-cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "typer>=0.12.0"]
+cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
 curl-impersonate = ["curl-cffi>=0.9.0"]
 parsel = ["parsel>=1.10.0"]
 playwright = ["playwright>=1.27.0"]
@@ -93,7 +93,7 @@ crawlee = "crawlee._cli:cli"
 [dependency-groups]
 dev = [
     "build~=1.2.0",
-    "filelock~=3.17.0",
+    "filelock~=3.18.0",
     "ipdb~=0.13.0",
     "mypy~=1.15.0",
     "pre-commit~=4.1.0",
@@ -106,7 +106,7 @@ dev = [
     "pytest-xdist~=3.6.0",
     "pytest~=8.3.0",
     "respx~=0.22.0",
-    "ruff~=0.9.0",
+    "ruff~=0.11.0",
     "setuptools~=76.0.0", # setuptools are used by pytest, but not explicitly required
     "sortedcontainers-stubs~=2.4.0",
     "types-beautifulsoup4~=4.12.0.20240229",
@@ -226,12 +226,13 @@ warn_unused_ignores = true
 [[tool.mypy.overrides]]
 # Example codes are sometimes showing integration of crawlee with external tool, which is not dependency of crawlee.
 module = [
-    "apify",                # Example code shows integration of apify and crawlee.
-    "camoufox",             # Example code shows integration of camoufox and crawlee.
-    "flask",                # Example code shows deploy on Google Cloud.
-    "functions_framework",  # Example code shows deploy on Google Cloud.
-    "jaro",                 # Untyped and stubs not available
-    "sklearn.linear_model", # Untyped and stubs not available
+    "apify",                         # Example code shows integration of apify and crawlee.
+    "apify_fingerprint_datapoints",  # Untyped and stubs not available
+    "camoufox",                      # Example code shows integration of camoufox and crawlee.
+    "flask",                         # Example code shows deploy on Google Cloud.
+    "functions_framework",           # Example code shows deploy on Google Cloud.
+    "jaro",                          # Untyped and stubs not available
+    "sklearn.linear_model",          # Untyped and stubs not available
 ]
 ignore_missing_imports = true
```
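
Note the dependency shuffle above: `rich` leaves the core dependency list and reappears only in the `cli` extra (while `docutils` is dropped outright). A hedged sketch of what that implies for any non-CLI code that still wants `rich` (the progress-bar usage below is illustrative, not taken from crawlee):

```python
# rich is now only guaranteed when crawlee is installed with the `cli`
# extra, so anything outside the CLI should import it defensively.
try:
    from rich.progress import Progress
except ImportError:  # core install without the `cli` extra
    Progress = None  # type: ignore[assignment, misc]

if Progress is not None:
    with Progress() as progress:
        task = progress.add_task('crawling', total=100)
        progress.update(task, advance=100)
else:
    print('crawling... (rich not installed)')
```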

src/crawlee/_autoscaling/snapshotter.py (8 additions, 8 deletions)

```diff
@@ -186,7 +186,7 @@ def get_memory_sample(self, duration: timedelta | None = None) -> list[Snapshot]:
         Returns:
             A sample of memory snapshots.
         """
-        snapshots = cast(list[Snapshot], self._memory_snapshots)
+        snapshots = cast('list[Snapshot]', self._memory_snapshots)
         return self._get_sample(snapshots, duration)
 
     @ensure_context
@@ -199,7 +199,7 @@ def get_event_loop_sample(self, duration: timedelta | None = None) -> list[Snapshot]:
         Returns:
             A sample of event loop snapshots.
         """
-        snapshots = cast(list[Snapshot], self._event_loop_snapshots)
+        snapshots = cast('list[Snapshot]', self._event_loop_snapshots)
         return self._get_sample(snapshots, duration)
 
     @ensure_context
@@ -212,7 +212,7 @@ def get_cpu_sample(self, duration: timedelta | None = None) -> list[Snapshot]:
         Returns:
             A sample of CPU snapshots.
         """
-        snapshots = cast(list[Snapshot], self._cpu_snapshots)
+        snapshots = cast('list[Snapshot]', self._cpu_snapshots)
         return self._get_sample(snapshots, duration)
 
     @ensure_context
@@ -225,7 +225,7 @@ def get_client_sample(self, duration: timedelta | None = None) -> list[Snapshot]:
         Returns:
             A sample of client snapshots.
         """
-        snapshots = cast(list[Snapshot], self._client_snapshots)
+        snapshots = cast('list[Snapshot]', self._client_snapshots)
         return self._get_sample(snapshots, duration)
 
     @staticmethod
@@ -255,7 +255,7 @@ def _snapshot_cpu(self, event_data: EventSystemInfoData) -> None:
             created_at=event_data.cpu_info.created_at,
         )
 
-        snapshots = cast(list[Snapshot], self._cpu_snapshots)
+        snapshots = cast('list[Snapshot]', self._cpu_snapshots)
         self._prune_snapshots(snapshots, event_data.cpu_info.created_at)
         self._cpu_snapshots.add(snapshot)
 
@@ -275,7 +275,7 @@ def _snapshot_memory(self, event_data: EventSystemInfoData) -> None:
             created_at=event_data.memory_info.created_at,
        )
 
-        snapshots = cast(list[Snapshot], self._memory_snapshots)
+        snapshots = cast('list[Snapshot]', self._memory_snapshots)
         self._prune_snapshots(snapshots, snapshot.created_at)
         self._memory_snapshots.add(snapshot)
         self._evaluate_memory_load(event_data.memory_info.current_size, event_data.memory_info.created_at)
@@ -295,7 +295,7 @@ def _snapshot_event_loop(self) -> None:
         event_loop_delay = snapshot.created_at - previous_snapshot.created_at - self._EVENT_LOOP_SNAPSHOT_INTERVAL
         snapshot.delay = event_loop_delay
 
-        snapshots = cast(list[Snapshot], self._event_loop_snapshots)
+        snapshots = cast('list[Snapshot]', self._event_loop_snapshots)
         self._prune_snapshots(snapshots, snapshot.created_at)
         self._event_loop_snapshots.add(snapshot)
 
@@ -312,7 +312,7 @@ def _snapshot_client(self) -> None:
         error_count = rate_limit_errors.get(self._CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT, 0)
         snapshot = ClientSnapshot(error_count=error_count, max_error_count=self._max_client_errors)
 
-        snapshots = cast(list[Snapshot], self._client_snapshots)
+        snapshots = cast('list[Snapshot]', self._client_snapshots)
         self._prune_snapshots(snapshots, snapshot.created_at)
         self._client_snapshots.add(snapshot)
```
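
All eight hunks make the same mechanical change: the first argument to `typing.cast` becomes a string. A minimal sketch of the effect (my reading, consistent with ruff's TC006 rule; the `Snapshot` placeholder below is illustrative, not crawlee's class):

```python
from typing import cast


class Snapshot:  # placeholder standing in for crawlee's snapshot types
    pass


raw: object = [Snapshot()]

# Unquoted form: `list[Snapshot]` is built at runtime just to be thrown
# away, and `Snapshot` must be importable at runtime even as a type-only name.
snapshots = cast(list[Snapshot], raw)

# Quoted form: cast() returns its second argument unchanged and never
# evaluates the string, so the name can live behind `if TYPE_CHECKING:`.
snapshots = cast('list[Snapshot]', raw)
print(len(snapshots))
```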

src/crawlee/_browserforge_workaround.py (new file, 42 additions)

```python
# ruff: noqa: N802


def patch_browserforge() -> None:
    """Patches `browserforge` to use data from `apify_fingerprint_datapoints`.

    This avoids import-time or runtime file downloads.
    """
    # Temporary fix until https://github.com/daijro/browserforge/pull/29 is merged
    from pathlib import Path

    import apify_fingerprint_datapoints
    from browserforge import download

    download.DATA_DIRS: dict[str, Path] = {  # type:ignore[misc]
        'headers': apify_fingerprint_datapoints.get_header_network().parent,
        'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
    }

    def DownloadIfNotExists(**flags: bool) -> None:
        pass

    download.DownloadIfNotExists = DownloadIfNotExists

    import browserforge.bayesian_network

    class BayesianNetwork(browserforge.bayesian_network.BayesianNetwork):
        def __init__(self, path: Path) -> None:
            """Invert the mapping, as browserforge expects somewhat renamed file names."""
            if path.name in download.DATA_FILES['headers']:
                path = download.DATA_DIRS['headers'] / download.DATA_FILES['headers'][path.name]
            else:
                path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][path.name]
            super().__init__(path)

    browserforge.bayesian_network.BayesianNetwork = BayesianNetwork  # type:ignore[misc]

    import browserforge.headers.generator

    browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers']

    import browserforge.fingerprints.generator

    browserforge.fingerprints.generator.DATA_DIR = download.DATA_DIRS['fingerprints']
```
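
A usage sketch for the patch (assumed, based on the `browsers/__init__.py` change further below; `HeaderGenerator` is browserforge's public API): call `patch_browserforge()` before the generator modules load, and header generation then runs entirely from the bundled datapoints, with no download step.

```python
from crawlee._browserforge_workaround import patch_browserforge

patch_browserforge()  # must run before browserforge's generator modules load

from browserforge.headers import HeaderGenerator

# Headers come from apify_fingerprint_datapoints, entirely offline.
headers = HeaderGenerator().generate(browser='chrome')
print(headers)
```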

src/crawlee/_cli.py (3 additions, 3 deletions)

```diff
@@ -78,7 +78,7 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
 
 def _prompt_text(message: str, default: str) -> str:
     return cast(
-        str,
+        'str',
         ConsoleRender().render(
             inquirer.Text(
                 name='text',
@@ -93,7 +93,7 @@ def _prompt_text(message: str, default: str) -> str:
 def _prompt_choice(message: str, choices: list[str]) -> str:
     """Prompt the user to pick one from a list of choices."""
     return cast(
-        str,
+        'str',
         ConsoleRender().render(
             inquirer.List(
                 name='choice',
@@ -106,7 +106,7 @@ def _prompt_choice(message: str, choices: list[str]) -> str:
 
 def _prompt_bool(message: str, *, default: bool) -> bool:
     return cast(
-        bool,
+        'bool',
         ConsoleRender().render(
             inquirer.Confirm(
                 name='confirm',
```

src/crawlee/_request.py (2 additions, 2 deletions)

```diff
@@ -306,12 +306,12 @@ def get_query_param_from_url(self, param: str, *, default: str | None = None) ->
     @property
     def label(self) -> str | None:
         """A string used to differentiate between arbitrary request types."""
-        return cast(UserData, self.user_data).label
+        return cast('UserData', self.user_data).label
 
     @property
     def crawlee_data(self) -> CrawleeRequestData:
         """Crawlee-specific configuration stored in the `user_data`."""
-        user_data = cast(UserData, self.user_data)
+        user_data = cast('UserData', self.user_data)
         if user_data.crawlee_data is None:
             user_data.crawlee_data = CrawleeRequestData()
```

src/crawlee/_types.py (1 addition, 1 deletion)

```diff
@@ -244,7 +244,7 @@ async def get_value(self, key: str, default_value: T | None = None) -> T | None:
 
     async def get_value(self, key: str, default_value: T | None = None) -> T | None:
         if key in self.updates:
-            return cast(T, self.updates[key].content)
+            return cast('T', self.updates[key].content)
 
         return await self._actual_key_value_store.get_value(key, default_value)
```

src/crawlee/_utils/console.py (new file, 67 additions)

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Sequence

BORDER = {'TL': '┌', 'TR': '┐', 'BL': '└', 'BR': '┘', 'H': '─', 'V': '│', 'TM': '┬', 'BM': '┴'}


def make_table(rows: Sequence[Sequence[str]], width: int = 100) -> str:
    """Creates a text table using Unicode characters.

    Args:
        rows: A list of tuples/lists to be displayed in the table.
        width: Maximum width of the table.
    """
    if not rows:
        return ''

    num_cols = max(len(row) for row in rows)

    if num_cols == 0:
        return ''

    # Normalize the row size by filling missing columns with empty values
    normalized_rows = [list(row) + [''] * (num_cols - len(row)) for row in rows]
    col_widths = [max(len(str(row[i])) for row in normalized_rows) for i in range(num_cols)]
    total_width = sum(col_widths) + (3 * num_cols) + 1

    # If the table size is larger than `width`, set all columns to the same length
    col_widths = col_widths if total_width <= width else [max(3, (width - (3 * num_cols) - 1) // num_cols)] * num_cols

    # Initialize borders
    top_parts, bottom_parts = [BORDER['TL']], [BORDER['BL']]

    for i in range(num_cols):
        h_border = BORDER['H'] * (col_widths[i] + 2)
        top_parts.append(h_border)
        bottom_parts.append(h_border)

        if i < num_cols - 1:
            top_parts.append(BORDER['TM'])
            bottom_parts.append(BORDER['BM'])
        else:
            top_parts.append(BORDER['TR'])
            bottom_parts.append(BORDER['BR'])

    top_border, bottom_border = ''.join(top_parts), ''.join(bottom_parts)

    result = [top_border]

    for row in normalized_rows:
        cells = []

        for i, cell in enumerate(row):
            # Trim the content if the length exceeds the width of the column
            norm_cell = f'{cell[: col_widths[i] - 3]}...' if len(cell) > col_widths[i] else cell.ljust(col_widths[i])
            cells.append(norm_cell)

        # row: │ cell1 │ cell2 │ ...
        row_str = BORDER['V'] + ''.join(f' {cell} {BORDER["V"]}' for cell in cells)
        result.append(row_str)

    result.append(bottom_border)

    return '\n'.join(result)
```
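
A quick usage sketch (my example values): `make_table` returns the rendered table as one string, padding cells per column and truncating with `...` when the total width would exceed `width`. This plain-text helper is presumably what lets the core package drop `rich` (see the pyproject.toml change above).

```python
from crawlee._utils.console import make_table

print(make_table([('requests', '120'), ('failures', '3')], width=40))
# ┌──────────┬─────┐
# │ requests │ 120 │
# │ failures │ 3   │
# └──────────┴─────┘
```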

src/crawlee/_utils/models.py (1 addition, 1 deletion)

```diff
@@ -14,7 +14,7 @@ def _timedelta_to_ms(td: timedelta | None) -> float | None:
         return float('inf')
     if td is None:
         return td
-    return int(round(td.total_seconds() * 1000))
+    return round(td.total_seconds() * 1000)
 
 
 def _timedelta_to_secs(td: timedelta | None) -> float | None:
```
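
The `int(...)` wrapper was redundant: with the default `ndigits=None`, Python's `round` already returns an `int`. A one-liner to confirm:

```python
ms = round(1234.6)
assert isinstance(ms, int) and ms == 1235
# round() applies banker's rounding at exact halves: round(0.5) == 0
```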

src/crawlee/browsers/__init__.py (7 additions, 0 deletions)

```diff
@@ -1,8 +1,15 @@
+# ruff: noqa: E402, TID252
+
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
 _install_import_hook(__name__)
 
+# Due to patch_browserforge
+from .._browserforge_workaround import patch_browserforge
+
+patch_browserforge()
+
 # The following imports are wrapped in try_import to handle optional dependencies,
 # ensuring the module can still function even if these dependencies are missing.
 with _try_import(__name__, 'BrowserPool'):
```

src/crawlee/browsers/_playwright_browser_controller.py (2 additions, 2 deletions)

```diff
@@ -11,13 +11,13 @@
 
 from crawlee._utils.docs import docs_group
 from crawlee.browsers._browser_controller import BrowserController
-from crawlee.browsers._types import BrowserType
 from crawlee.fingerprint_suite import HeaderGenerator
 
 if TYPE_CHECKING:
     from collections.abc import Mapping
 
     from crawlee.browsers._playwright_browser import PlaywrightPersistentBrowser
+    from crawlee.browsers._types import BrowserType
     from crawlee.fingerprint_suite import FingerprintGenerator
     from crawlee.proxy_configuration import ProxyInfo
@@ -107,7 +107,7 @@ def is_browser_connected(self) -> bool:
     @property
     @override
     def browser_type(self) -> BrowserType:
-        return cast(BrowserType, self._browser.browser_type.name)
+        return cast('BrowserType', self._browser.browser_type.name)
 
     @override
     async def new_page(
```
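
The import move pairs with the quoted cast: once `BrowserType` appears only in annotations and string casts, it can be imported under `TYPE_CHECKING` and never at runtime. A minimal sketch of the pattern (the names below are placeholders, not crawlee's):

```python
from __future__ import annotations

from typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
    # Imported only for the type checker; absent at runtime.
    from collections.abc import Sequence


def first(items: Sequence[str]) -> str:
    # The quoted cast never evaluates 'str', so no runtime import is needed.
    return cast('str', items[0])


print(first(['a', 'b']))
```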
