diff --git a/.github/workflows/templates_e2e_tests.yaml b/.github/workflows/templates_e2e_tests.yaml new file mode 100644 index 0000000000..741fffe003 --- /dev/null +++ b/.github/workflows/templates_e2e_tests.yaml @@ -0,0 +1,53 @@ +name: Templates end-to-end tests + +on: + workflow_dispatch: + secrets: + APIFY_TEST_USER_API_TOKEN: + description: API token of the Python testing user on Apify + required: true + +jobs: + end_to_end_tests: + name: End-to-end tests + strategy: + fail-fast: false + + runs-on: "ubuntu-latest" + env: + python-version: "3.13" + node-version: "22" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup node + uses: actions/setup-node@v4 + with: + node-version: ${{ env.node-version }} + + - name: Install dependencies + run: npm install -g apify-cli + + - name: Set up Python ${{ env.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python-version }} + + # Poetry is installed so that poetry.lock can be patched with the custom wheel file for Poetry-based templates + - name: Install poetry + run: pipx install poetry + + - name: Set up uv package manager + uses: astral-sh/setup-uv@v5 + with: + python-version: ${{ env.python-version }} + + - name: Install Python dependencies + run: make install-dev + + - name: Run templates end-to-end tests + run: make e2e-templates-tests + env: + APIFY_TEST_USER_API_TOKEN: ${{ secrets.APIFY_TEST_USER_API_TOKEN }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e22fa1b791..d757c66d65 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -72,13 +72,19 @@ To run unit tests with HTML coverage report: make unit-tests-cov ``` - + +To run end-to-end tests: + +```sh +make e2e-templates-tests +``` ## Documentation diff --git a/Makefile b/Makefile index 8a2a54397f..b605db74b0 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ integration-tests format check-code build-api-reference run-docs # This is the default for local testing, but GitHub workflows override it to a higher value in CI -INTEGRATION_TESTS_CONCURRENCY = 1 +E2E_TESTS_CONCURRENCY = 1 clean: rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage @@ -32,8 +32,8 @@ unit-tests: unit-tests-cov: uv run pytest --numprocesses=auto --verbose --cov=src/crawlee --cov-report=html tests/unit -integration-tests: - uv run pytest --numprocesses=$(INTEGRATION_TESTS_CONCURRENCY) --verbose tests/integration +e2e-templates-tests: + uv run pytest --numprocesses=$(E2E_TESTS_CONCURRENCY) --verbose tests/e2e/project_template format: uv run ruff check --fix diff --git a/pyproject.toml b/pyproject.toml index 40c6a5763c..7c7190c285 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,8 @@ crawlee = "crawlee._cli:cli" [dependency-groups] dev = [ + "apify_client", # For e2e tests. + "build~=1.2.2", # For e2e tests. 
"mypy~=1.15.0", "pre-commit~=4.2.0", "proxy-py~=2.4.0", @@ -119,7 +121,7 @@ packages = ["src/crawlee"] [tool.ruff] line-length = 120 include = ["src/**/*.py", "tests/**/*.py", "docs/**/*.py", "website/**/*.py"] -extend-exclude = ["project_template"] +extend-exclude = ["src/crawlee/project_template"] [tool.ruff.lint] select = ["ALL"] @@ -207,7 +209,7 @@ timeout = 1200 [tool.mypy] python_version = "3.9" plugins = ["pydantic.mypy"] -exclude = ["project_template"] +exclude = ["src/crawlee/project_template"] files = ["src", "tests", "docs", "website"] check_untyped_defs = true disallow_incomplete_defs = true @@ -229,6 +231,8 @@ module = [ "jaro", # Untyped and stubs not available "loguru", # Example code shows integration of loguru and crawlee for JSON logging. "sklearn.linear_model", # Untyped and stubs not available + "cookiecutter.*", # Untyped and stubs not available + "inquirer.*", # Untyped and stubs not available ] ignore_missing_imports = true diff --git a/src/crawlee/_cli.py b/src/crawlee/_cli.py index 47ee6fe5bf..dbd6b7944f 100644 --- a/src/crawlee/_cli.py +++ b/src/crawlee/_cli.py @@ -7,10 +7,10 @@ from typing import Annotated, Optional, cast try: - import inquirer # type: ignore[import-untyped] + import inquirer import typer - from cookiecutter.main import cookiecutter # type: ignore[import-untyped] - from inquirer.render.console import ConsoleRender # type: ignore[import-untyped] + from cookiecutter.main import cookiecutter + from inquirer.render.console import ConsoleRender from rich.progress import Progress, SpinnerColumn, TextColumn except ModuleNotFoundError as exc: raise ImportError( @@ -21,7 +21,8 @@ cli = typer.Typer(no_args_is_help=True) template_directory = importlib.resources.files('crawlee') / 'project_template' -cookiecutter_json = json.load((template_directory / 'cookiecutter.json').open()) +with open(str(template_directory / 'cookiecutter.json')) as f: + cookiecutter_json = json.load(f) crawler_choices = cookiecutter_json['crawler_type'] http_client_choices = cookiecutter_json['http_client'] diff --git a/src/crawlee/project_template/hooks/post_gen_project.py b/src/crawlee/project_template/hooks/post_gen_project.py index 877f234b6e..e076ff9308 100644 --- a/src/crawlee/project_template/hooks/post_gen_project.py +++ b/src/crawlee/project_template/hooks/post_gen_project.py @@ -2,7 +2,6 @@ import subprocess from pathlib import Path -Path('_pyproject.toml').rename('pyproject.toml') # % if cookiecutter.package_manager in ['poetry', 'uv'] Path('requirements.txt').unlink() diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile index 65383fa740..cde1497b4b 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile @@ -20,8 +20,7 @@ RUN pip install -U pip setuptools \ # Second, copy just poetry.lock and pyproject.toml into the Actor image, # since those should be the only files that affects the dependency install in the next step, # in order to speed up the build -COPY pyproject.toml ./ -COPY poetry.lock ./ +COPY pyproject.toml poetry.lock ./ # Install the dependencies RUN echo "Python version:" \ diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/_pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml similarity index 100% rename from src/crawlee/project_template/{{cookiecutter.project_name}}/_pyproject.toml rename to 
src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py new file mode 100644 index 0000000000..cf4edb4d6f --- /dev/null +++ b/tests/e2e/conftest.py @@ -0,0 +1,44 @@ +import subprocess +from pathlib import Path + +import pytest +from filelock import FileLock + +_CRAWLEE_ROOT_PATH = Path(__file__).parent.parent.parent.resolve() + + +@pytest.fixture(scope='session') +def crawlee_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun_uid: str) -> Path: + """Build the package wheel if it hasn't been built yet, and return the path to the wheel.""" + # Make sure the wheel is not being built concurrently across all the pytest-xdist runners + # by locking the build process with a lock file. + with FileLock(tmp_path_factory.getbasetemp().parent / 'crawlee_wheel_build.lock'): + # Make sure the wheel is built exactly once across all the pytest-xdist runners, + # using an indicator file that records that the wheel was already built. + was_wheel_built_this_test_run_file = tmp_path_factory.getbasetemp() / f'wheel_was_built_in_run_{testrun_uid}' + if not was_wheel_built_this_test_run_file.exists(): + subprocess.run( + args='python -m build', + cwd=_CRAWLEE_ROOT_PATH, + shell=True, + check=True, + capture_output=True, + ) + was_wheel_built_this_test_run_file.touch() + + # Read the current package version, necessary for getting the right wheel filename. + pyproject_toml_file = (_CRAWLEE_ROOT_PATH / 'pyproject.toml').read_text(encoding='utf-8') + for line in pyproject_toml_file.splitlines(): + if line.startswith('version = '): + delim = '"' if '"' in line else "'" + crawlee_version = line.split(delim)[1] + break + else: + raise RuntimeError('Unable to find version string.') + + wheel_path = _CRAWLEE_ROOT_PATH / 'dist' / f'crawlee-{crawlee_version}-py3-none-any.whl' + + # Sanity check that the built wheel exists at the expected path. + assert wheel_path.exists() + + return wheel_path diff --git a/tests/e2e/project_template/test_static_crawlers_templates.py b/tests/e2e/project_template/test_static_crawlers_templates.py new file mode 100644 index 0000000000..6bd27bf87e --- /dev/null +++ b/tests/e2e/project_template/test_static_crawlers_templates.py @@ -0,0 +1,85 @@ +import os +import re +import subprocess +from pathlib import Path + +import pytest +from apify_client import ApifyClientAsync +from cookiecutter.main import cookiecutter + +from crawlee._cli import default_start_url, template_directory +from crawlee._utils.crypto import crypto_random_object_id +from tests.e2e.project_template.utils import patch_crawlee_version_in_pyproject_toml_based_project + +# To run these tests locally, make sure you have apify-cli installed and available on the PATH. 
+# https://docs.apify.com/cli/docs/installation + + +@pytest.mark.parametrize('http_client', ['httpx', 'curl-impersonate']) +@pytest.mark.parametrize('crawler_type', ['parsel', 'beautifulsoup']) +@pytest.mark.parametrize('package_manager', ['uv', 'poetry']) +async def test_static_crawler_actor_at_apify( + tmp_path: Path, crawlee_wheel_path: Path, package_manager: str, crawler_type: str, http_client: str +) -> None: + # Generate a unique actor name + actor_name = f'crawlee-python-template-e2e-test-{crypto_random_object_id(8).lower()}' + + # Create a project from the template + cookiecutter( + template=str(template_directory), + no_input=True, + extra_context={ + 'project_name': actor_name, + 'package_manager': package_manager, + 'crawler_type': crawler_type, + 'http_client': http_client, + 'enable_apify_integration': True, + 'start_url': default_start_url, + }, + accept_hooks=False, # Skip the post-generation hook, so the environment of the new project is not installed. + output_dir=tmp_path, + ) + + patch_crawlee_version_in_pyproject_toml_based_project( + project_path=tmp_path / actor_name, wheel_path=crawlee_wheel_path + ) + + # Build the actor using the same sequence of CLI commands a user would run + subprocess.run( # noqa: ASYNC221, S603 + ['apify', 'login', '-t', os.environ['APIFY_TEST_USER_API_TOKEN']], # noqa: S607 + capture_output=True, + check=True, + cwd=tmp_path / actor_name, + ) + subprocess.run(['apify', 'init', '-y', actor_name], capture_output=True, check=True, cwd=tmp_path / actor_name) # noqa: ASYNC221, S603, S607 + + build_process = subprocess.run(['apify', 'push'], capture_output=True, check=False, cwd=tmp_path / actor_name) # noqa: ASYNC221, S603, S607 + # Get the actor ID from the build log. + actor_id_regexp = re.compile(r'https:\/\/console\.apify\.com\/actors\/(.*)#\/builds\/\d*\.\d*\.\d*') + # Note: the Apify CLI writes the build log to stderr, not stdout. + actor_id = re.findall(actor_id_regexp, build_process.stderr.decode())[0] + + client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN')) + actor = client.actor(actor_id) + + # Run the actor + try: + assert build_process.returncode == 0 + started_run_data = await actor.start() + actor_run = client.run(started_run_data['id']) + + finished_run_data = await actor_run.wait_for_finish() + actor_run_log = await actor_run.log().get() + finally: + # Delete the actor once it is no longer needed. + await actor.delete() + + # Assertions + additional_run_info = f'Full actor run log: {actor_run_log}' + assert actor_run_log + assert finished_run_data + assert finished_run_data['status'] == 'SUCCEEDED', additional_run_info + assert ( + 'Crawler.stop() was called with following reason: The crawler has reached its limit of 50 requests per crawl.' 
+ ) in actor_run_log, additional_run_info + assert int(re.findall(r'requests_finished\s*│\s*(\d*)', actor_run_log)[-1]) >= 50, additional_run_info diff --git a/tests/e2e/project_template/utils.py b/tests/e2e/project_template/utils.py new file mode 100644 index 0000000000..5cb6b18291 --- /dev/null +++ b/tests/e2e/project_template/utils.py @@ -0,0 +1,52 @@ +import re +import shutil +import subprocess +from pathlib import Path + + +def patch_crawlee_version_in_pyproject_toml_based_project(project_path: Path, wheel_path: Path) -> None: + """Ensure that the test uses the crawlee version built from the current source, not the one from PyPI.""" + # Copy the prepared .whl file into the project. + shutil.copy(wheel_path, project_path) + + # Get any crawlee extras declared in the project. + with open(project_path / 'pyproject.toml') as f: + pyproject = f.read() + crawlee_extras = next(iter(re.findall(r'crawlee(\[.*\])', pyproject)), '') + + # Inject the crawlee wheel file into the Docker image and update the project to depend on it. + with open(project_path / 'Dockerfile') as f: + modified_lines = [] + for line in f: + modified_lines.append(line) + if line.startswith('COPY pyproject.toml'): + if 'uv.lock' in line: + package_manager = 'uv' + elif 'poetry.lock' in line: + package_manager = 'poetry' + else: + raise RuntimeError('This does not look like a uv or poetry based project.') + + # Create the lock file that the Docker build expects to exist (even though it will be patched in the image). + subprocess.run( + args=[package_manager, 'lock'], + cwd=str(project_path), + check=True, + capture_output=True, + ) + + # Add commands that copy the .whl into the Docker image and update the project with it. + # The patching happens in the Dockerfile because Poetry does not properly support relative paths for wheel + # packages, so the absolute path (inside the container) is generated by running the `add` command in the container. + modified_lines.extend( + [ + f'COPY {wheel_path.name} ./', + # If the crawlee version was not bumped, Poetry might lazily keep the already-installed crawlee, + # so make sure that one is patched as well. 
+ f'RUN pip install ./{wheel_path.name}{crawlee_extras} --force-reinstall', + f'RUN {package_manager} add ./{wheel_path.name}{crawlee_extras}', + f'RUN {package_manager} lock', + ] + ) + with open(project_path / 'Dockerfile', 'w') as f: + f.write('\n'.join(modified_lines)) diff --git a/tests/integration/README.md b/tests/integration/README.md deleted file mode 100644 index 0ca287e976..0000000000 --- a/tests/integration/README.md +++ /dev/null @@ -1 +0,0 @@ -# Integration tests diff --git a/uv.lock b/uv.lock index 5a8c00b019..d7dcddf75f 100644 --- a/uv.lock +++ b/uv.lock @@ -40,6 +40,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/46/eb/e7f063ad1fec6b3178a3cd82d1a3c4de82cccf283fc42746168188e1cdd5/anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a", size = 96041 }, ] +[[package]] +name = "apify-client" +version = "1.9.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "apify-shared" }, + { name = "httpx" }, + { name = "more-itertools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/64/95/5456d114f83749f74969577824abc74fc3ab5e1722acab37f77b0b1800c6/apify_client-1.9.2.tar.gz", hash = "sha256:af76b78c3153263040615daec0619765e067466bbb82e569afe799ad72c53050", size = 46276 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/3d/d71b7c79c8da2ca2ccd2aa43cea0cfb2d17475ea8fafb4e7d7a78b24c02c/apify_client-1.9.2-py3-none-any.whl", hash = "sha256:a441fb59b5ec1c42aead73284c90304029442ddc26e764c151b8dc7f15e38600", size = 72505 }, +] + [[package]] name = "apify-fingerprint-datapoints" version = "0.0.2" @@ -49,6 +63,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/29/0bc61fa8d357bd5f0f14c7250f19c984c3ed402575e8e2283ff58ba851a6/apify_fingerprint_datapoints-0.0.2-py3-none-any.whl", hash = "sha256:fc0a3707353d98064ac51541880290dc2dbbd2898affcf770d6762836812b9c1", size = 836233 }, ] +[[package]] +name = "apify-shared" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/67/f4/4ab1dbbc9fe4c3c2467275df66fda137b254c750d1de95cd8ba75a34830a/apify_shared-1.3.1.tar.gz", hash = "sha256:58f123f5627a0cb9e8e7ac0e6f3f27bed76d93af17c80ae2d19fad9c83783bc7", size = 13560 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/23/bfe508e33844f4ac9256d72164166281a76f7a1f21745f3e5a76411333aa/apify_shared-1.3.1-py3-none-any.whl", hash = "sha256:3d2630934683911e0fb1e741389abcb885aebfd282f17670e16a0e175af68d7d", size = 12379 }, +] + [[package]] name = "arrow" version = "1.3.0" @@ -266,6 +289,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8b/53/c60eb5bd26cf8689e361031bebc431437bc988555e80ba52d48c12c1d866/browserforge-1.2.3-py3-none-any.whl", hash = "sha256:a6c71ed4688b2f1b0bee757ca82ddad0007cbba68a71eca66ca607dde382f132", size = 39626 }, ] +[[package]] +name = "build" +version = "1.2.2.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "os_name == 'nt'" }, + { name = "importlib-metadata", marker = "python_full_version < '3.10.2'" }, + { name = "packaging" }, + { name = "pyproject-hooks" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/46/aeab111f8e06793e4f0e421fcad593d547fb8313b50990f31681ee2fb1ad/build-1.2.2.post1.tar.gz", hash = "sha256:b36993e92ca9375a219c99e606a122ff365a760a2d4bba0caa09bd5278b608b7", size = 46701 } +wheels = [ + { 
url = "https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl", hash = "sha256:1d61c0887fa860c01971625baae8bdd338e517b836a2f70dd1f7aa3a6b2fc5b5", size = 22950 }, +] + [[package]] name = "cachetools" version = "5.5.2" @@ -625,6 +664,8 @@ playwright = [ [package.dev-dependencies] dev = [ + { name = "apify-client" }, + { name = "build" }, { name = "mypy" }, { name = "pre-commit" }, { name = "proxy-py" }, @@ -693,6 +734,8 @@ provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impe [package.metadata.requires-dev] dev = [ + { name = "apify-client" }, + { name = "build", specifier = "~=1.2.2" }, { name = "mypy", specifier = "~=1.15.0" }, { name = "pre-commit", specifier = "~=4.2.0" }, { name = "proxy-py", specifier = "~=2.4.0" }, @@ -1104,6 +1147,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, ] +[[package]] +name = "importlib-metadata" +version = "8.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/08/c1395a292bb23fd03bdf572a1357c5a733d3eecbab877641ceacab23db6e/importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580", size = 55767 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/79/9d/0fb148dc4d6fa4a7dd1d8378168d9b4cd8d4560a6fbf6f0121c5fc34eb68/importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e", size = 26971 }, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -2075,6 +2130,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, ] +[[package]] +name = "pyproject-hooks" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/82/28175b2414effca1cdac8dc99f76d660e7a4fb0ceefa4b4ab8f5f6742925/pyproject_hooks-1.2.0.tar.gz", hash = "sha256:1e859bd5c40fae9448642dd871adf459e5e2084186e8d2c2a79a824c970da1f8", size = 19228 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913", size = 10216 }, +] + [[package]] name = "pytest" version = "8.3.5" @@ -3265,6 +3329,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f5/4b/a06e0ec3d155924f77835ed2d167ebd3b211a7b0853da1cf8d8414d784ef/yarl-1.18.3-py3-none-any.whl", hash = "sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b", size = 45109 }, ] +[[package]] +name = "zipp" +version = "3.21.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/50/bad581df71744867e9468ebd0bcd6505de3b275e06f202c2cb016e3ff56f/zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4", size = 24545 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/b7/1a/7e4798e9339adc931158c9d69ecc34f5e6791489d469f5e50ec15e35f458/zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931", size = 9630 }, +] + [[package]] name = "zstandard" version = "0.23.0"