diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9aca857..986cb85 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,6 +35,16 @@ repos: rev: 0.7.1 hooks: - id: nbstripout +- repo: https://github.com/executablebooks/mdformat + rev: 0.7.17 + hooks: + - id: mdformat + additional_dependencies: [ + mdformat-myst, + mdformat-black, + mdformat-pyproject, + ] + files: (docs/.) # Conflicts with admonitions. # - repo: https://github.com/executablebooks/mdformat # rev: 0.7.17 diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..1478786 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,17 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.10" + +sphinx: + configuration: docs/source/conf.py + fail_on_warning: true + +python: + install: + - method: pip + path: . + extra_requirements: + - docs diff --git a/MANIFEST.in b/MANIFEST.in index 1eb48e3..b3ea241 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ +prune docs prune tests exclude *.md diff --git a/README.md b/README.md index ee44b20..61b96e6 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,11 @@ [![image](https://img.shields.io/github/actions/workflow/status/pytask-dev/pytask-parallel/main.yml?branch=main)](https://github.com/pytask-dev/pytask-parallel/actions?query=branch%3Amain) [![image](https://codecov.io/gh/pytask-dev/pytask-parallel/branch/main/graph/badge.svg)](https://codecov.io/gh/pytask-dev/pytask-parallel) [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/pytask-dev/pytask-parallel/main.svg)](https://results.pre-commit.ci/latest/github/pytask-dev/pytask-parallel/main) -[![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) ______________________________________________________________________ 
-Parallelize the execution of tasks with `pytask-parallel` which is a plugin for +Parallelize the execution of tasks with `pytask-parallel`, a plugin for [pytask](https://github.com/pytask-dev/pytask). ## Installation @@ -28,11 +28,14 @@ $ pip install pytask-parallel $ conda install -c conda-forge pytask-parallel ``` -By default, the plugin uses `concurrent.futures.ProcessPoolExecutor`. +By default, the plugin uses loky's reusable executor. -It is also possible to select the executor from loky or `ThreadPoolExecutor` from the -[concurrent.futures](https://docs.python.org/3/library/concurrent.futures.html) module -as backends to execute tasks asynchronously. +The following backends are available: + +- loky's [`get_reusable_executor`](https://loky.readthedocs.io/en/stable/API.html#loky.get_reusable_executor) +- `ProcessPoolExecutor` or `ThreadPoolExecutor` from + [concurrent.futures](https://docs.python.org/3/library/concurrent.futures.html) +- dask's [`ClientExecutor`](https://distributed.dask.org/en/stable/api.html#distributed.Client.get_executor), which in combination with [coiled](https://docs.coiled.io/user_guide/index.html) lets you spawn clusters and workers on AWS, GCP, and other providers with minimal configuration. ## Usage @@ -65,71 +68,10 @@ You can also set the options in a `pyproject.toml`. [tool.pytask.ini_options] n_workers = 1 -parallel_backend = "processes" # or loky or threads -``` - -## Custom Executor - -> [!NOTE] -> -> The interface for custom executors is rudimentary right now and there is not a lot of -> support by public functions. Please, give some feedback if you are trying or managed -> to use a custom backend. -> -> Also, please contribute your custom executors if you consider them useful to others. - -pytask-parallel allows you to use your parallel backend as long as it follows the -interface defined by -[`concurrent.futures.Executor`](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.Executor). 
- -In some cases, adding a new backend can be as easy as registering a builder function -that receives some arguments (currently only `n_workers`) and returns the instantiated -executor. - -```python -from concurrent.futures import Executor -from my_project.executor import CustomExecutor - -from pytask_parallel import ParallelBackend, registry - - -def build_custom_executor(n_workers: int) -> Executor: - return CustomExecutor(max_workers=n_workers) - - -registry.register_parallel_backend(ParallelBackend.CUSTOM, build_custom_executor) -``` - -Now, build the project requesting your custom backend. - -```console -pytask --parallel-backend custom +parallel_backend = "loky" # or processes or threads ``` -Realistically, it is not the only necessary adjustment for a nice user experience. There -are two other important things. pytask-parallel does not implement them by default since -it seems more tightly coupled to your backend. - -1. A wrapper for the executed function that captures warnings, catches exceptions and - saves products of the task (within the child process!). - - As an example, see - [`def _execute_task()`](https://github.com/pytask-dev/pytask-parallel/blob/c441dbb75fa6ab3ab17d8ad5061840c802dc1c41/src/pytask_parallel/processes.py#L91-L155) - that does all that for the processes and loky backend. - -1. To apply the wrapper, you need to write a custom hook implementation for - `def pytask_execute_task()`. See - [`def pytask_execute_task()`](https://github.com/pytask-dev/pytask-parallel/blob/c441dbb75fa6ab3ab17d8ad5061840c802dc1c41/src/pytask_parallel/processes.py#L41-L65) - for an example. Use the - [`hook_module`](https://pytask-dev.readthedocs.io/en/stable/how_to_guides/extending_pytask.html#using-hook-module-and-hook-module) - configuration value to register your implementation. 
-Another example of an implementation can be found as a -[test](https://github.com/pytask-dev/pytask-parallel/blob/c441dbb75fa6ab3ab17d8ad5061840c802dc1c41/tests/test_backends.py#L35-L78). - -## Some implementation details - -### Parallelization and Debugging +## Parallelization and Debugging It is not possible to combine parallelization with debugging. That is why `--pdb` or `--trace` deactivate parallelization. @@ -137,26 +79,11 @@ It is not possible to combine parallelization with debugging. That is why `--pdb If you parallelize the execution of your tasks using two or more workers, do not use `breakpoint()` or `import pdb; pdb.set_trace()` since both will cause exceptions. -### Threads and warnings +## Documentation -Capturing warnings is not thread-safe. Therefore, warnings cannot be captured reliably -when tasks are parallelized with `--parallel-backend threads`. +You can find the documentation at <https://pytask-parallel.readthedocs.io/en/stable>. ## Changes -Consult the [release notes](CHANGES.md) to find out about what is new. - -## Development - -- `pytask-parallel` does not call the `pytask_execute_task_protocol` hook - specification/entry-point because `pytask_execute_task_setup` and - `pytask_execute_task` need to be separated from `pytask_execute_task_teardown`. Thus, - plugins that change this hook specification may not interact well with the - parallelization. - -- Two PRs for CPython try to re-enable setting custom reducers which should have been - working but does not. Here are the references. - - - https://bugs.python.org/issue28053 - - https://github.com/python/cpython/pull/9959 - - https://github.com/python/cpython/pull/15058 +Consult the [release notes](https://pytask-parallel.readthedocs.io/en/stable/changes.html) to +find out what is new. 
diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..8b6275a --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -W --keep-going +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..9534b01 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/_static/images/pytask.ico b/docs/source/_static/images/pytask.ico new file mode 100644 index 0000000..caaf9f9 Binary files /dev/null and b/docs/source/_static/images/pytask.ico differ diff --git a/docs/source/_static/images/pytask.png b/docs/source/_static/images/pytask.png new file mode 100644 index 0000000..bbc40b8 Binary files /dev/null and b/docs/source/_static/images/pytask.png differ diff --git a/docs/source/_static/images/pytask.svg b/docs/source/_static/images/pytask.svg new file mode 100644 index 0000000..88ad046 --- /dev/null +++ b/docs/source/_static/images/pytask.svg @@ -0,0 +1,29 @@ + + + + + + + + + + + + diff --git a/docs/source/_static/images/pytask_w_text.png b/docs/source/_static/images/pytask_w_text.png new file mode 100644 index 0000000..2f7ab1b Binary files /dev/null and b/docs/source/_static/images/pytask_w_text.png differ diff --git a/docs/source/_static/images/pytask_w_text_dark.svg b/docs/source/_static/images/pytask_w_text_dark.svg new file mode 100644 index 0000000..ef06dc0 --- /dev/null +++ b/docs/source/_static/images/pytask_w_text_dark.svg @@ -0,0 +1,32 @@ + + + + + + + + + + + +pytask + diff --git a/docs/source/_static/images/pytask_w_text_light.svg b/docs/source/_static/images/pytask_w_text_light.svg new file mode 100644 index 0000000..eca9a69 --- /dev/null +++ b/docs/source/_static/images/pytask_w_text_light.svg @@ -0,0 +1,32 @@ + + + + + + + + + + + +pytask + diff --git a/CHANGES.md b/docs/source/changes.md similarity index 96% rename from CHANGES.md rename to docs/source/changes.md index f07f9b6..ec9cb68 100644 --- a/CHANGES.md +++ b/docs/source/changes.md @@ -13,6 +13,8 @@ releases are available on [PyPI](https://pypi.org/project/pytask-parallel) and 
- {pull}`89` restructures the package. - {pull}`92` redirects stdout and stderr from processes and loky and shows them in error reports. +- {pull}`93` adds documentation on readthedocs. +- {pull}`94` implements `ParallelBackend.NONE` as the default backend. ## 0.4.1 - 2024-01-12 diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..f537001 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,211 @@ +"""Configuration file for the Sphinx documentation builder. + +This file only contains a selection of the most common options. For a full list see the +documentation: https://www.sphinx-doc.org/en/master/usage/configuration.html + +""" + +from __future__ import annotations + +import inspect +import os +import sys +import warnings +from importlib.metadata import version +from pathlib import Path +from typing import TYPE_CHECKING + +import pytask_parallel + +if TYPE_CHECKING: + import sphinx + + +# -- Project information --------------------------------------------------------------- + +project = "pytask_parallel" +author = "Tobias Raabe" +copyright = f"2020, {author}" # noqa: A001 + +# The version, including alpha/beta/rc tags, but not commit hash and datestamps +release = version("pytask_parallel") +# The short X.Y version. +version = ".".join(release.split(".")[:2]) + +# -- General configuration ------------------------------------------------------------- + +master_doc = "index" + +# Add any Sphinx extension module names here, as strings. They can be extensions coming +# with Sphinx (named 'sphinx.ext.*') or your custom ones. 
+extensions = [ + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.extlinks", + "sphinx.ext.intersphinx", + "sphinx.ext.linkcode", + "sphinx.ext.napoleon", + "sphinxext.opengraph", + "sphinx_copybutton", + "sphinx_click", + "sphinx_toolbox.more_autodoc.autoprotocol", + "nbsphinx", + "myst_parser", + "sphinx_design", +] + +# List of patterns, relative to source directory, that match files and directories to +# ignore when looking for source files. This pattern also affects html_static_path and +# html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] + + +pygments_style = "sphinx" +pygments_dark_style = "monokai" + +# -- Extensions configuration ---------------------------------------------------------- + +# Configuration for autodoc. +add_module_names = True + +# Remove prefixed $ for bash, >>> for Python prompts, and In [1]: for IPython prompts. 
+copybutton_prompt_text = r"\$ |>>> |In \[\d\]: " +copybutton_prompt_is_regexp = True + +_repo = "https://github.com/pytask-dev/pytask-parallel" +extlinks = { + "pypi": ("https://pypi.org/project/%s/", "%s"), + "issue": (f"{_repo}/issues/%s", "#%s"), + "pull": (f"{_repo}/pull/%s", "#%s"), + "user": ("https://github.com/%s", "@%s"), +} + +intersphinx_mapping = { + "click": ("https://click.palletsprojects.com/en/8.0.x/", None), + "coiled": ("https://docs.coiled.io/", None), + "dask": ("https://docs.dask.org/en/stable/", None), + "distributed": ("https://distributed.dask.org/en/stable/", None), + "python": ("https://docs.python.org/3.10", None), +} + +# MyST +myst_enable_extensions = ["deflist", "dollarmath"] +myst_footnote_transition = False + +# Open Graph +ogp_social_cards = {"image": "_static/images/pytask_w_text.png"} + + +# Linkcode, based on numpy doc/source/conf.py +def linkcode_resolve(domain: str, info: dict[str, str]) -> str: # noqa: C901, PLR0912 + """Determine the URL corresponding to Python object.""" + if domain != "py": + return None + + modname = info["module"] + fullname = info["fullname"] + + submod = sys.modules.get(modname) + if submod is None: + return None + + obj = submod + for part in fullname.split("."): + try: + with warnings.catch_warnings(): + # Accessing deprecated objects will generate noisy warnings + warnings.simplefilter("ignore", FutureWarning) + obj = getattr(obj, part) + except AttributeError: # noqa: PERF203 + return None + + try: + fn = inspect.getsourcefile(inspect.unwrap(obj)) + except TypeError: + try: # property + fn = inspect.getsourcefile(inspect.unwrap(obj.fget)) + except (AttributeError, TypeError): + fn = None + if not fn: + return None + + try: + source, lineno = inspect.getsourcelines(obj) + except TypeError: + try: # property + source, lineno = inspect.getsourcelines(obj.fget) + except (AttributeError, TypeError): + lineno = None + except OSError: + lineno = None + + linespec = f"#L{lineno}-L{lineno + len(source) - 
1}" if lineno else "" + + fn = os.path.relpath(fn, start=Path(pytask_parallel.__file__).parent) + + if "+" in pytask_parallel.__version__: + return f"https://github.com/pytask-dev/pytask-parallel/blob/main/src/pytask_parallel/{fn}{linespec}" + return ( + f"https://github.com/pytask-dev/pytask-parallel/blob/" + f"v{pytask_parallel.__version__}/src/pytask_parallel/{fn}{linespec}" + ) + + +# -- Options for HTML output ----------------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for a list of +# built-in themes. +html_theme = "furo" + +# Add any paths that contain custom static files (such as style sheets) here, relative +# to this directory. They are copied after the built-in static files, so a file named +# "default.css" will overwrite the built-in "default.css". +html_css_files = ["css/termynal.css", "css/termynal_custom.css", "css/custom.css"] + +html_js_files = ["js/termynal.js", "js/custom.js"] + +# The name of an image file (within the static path) to use as favicon of the docs. +# This file should be a Windows icon file (.ico) being 16x16 or 32x32 pixels large. +html_favicon = "_static/images/pytask.ico" + +# Add any paths that contain custom static files (such as style sheets) here, relative +# to this directory. They are copied after the builtin static files, so a file named +# "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# If false, no module index is generated. +html_domain_indices = True + +# If false, no index is generated. +html_use_index = True + +# If true, the index is split into individual pages for each letter. +html_split_index = False + +# If true, links to the reST sources are added to the pages. +html_show_sourcelink = False + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
+html_show_copyright = True + +html_theme_options = { + "sidebar_hide_name": True, + "navigation_with_keys": True, + "light_logo": "images/pytask_w_text_light.svg", + "dark_logo": "images/pytask_w_text_dark.svg", +} + + +def setup(app: sphinx.application.Sphinx) -> None: + """Configure sphinx.""" + app.add_object_type( + "confval", + "confval", + objname="configuration value", + indextemplate="pair: %s; configuration value", + ) diff --git a/docs/source/custom_executors.md b/docs/source/custom_executors.md new file mode 100644 index 0000000..f44b377 --- /dev/null +++ b/docs/source/custom_executors.md @@ -0,0 +1,57 @@ +# Custom Executors + +```{caution} +The interface for custom executors is rudimentary right now. Please, give some feedback +if you managed to implement a custom executor or have suggestions for improvement. + +Please, also consider contributing your executor to pytask-parallel if you believe it +could be helpful to other people. Start by creating an issue or a draft PR. +``` + +pytask-parallel allows you to use your parallel backend as long as it follows the +interface defined by {class}`~concurrent.futures.Executor`. + +In some cases, adding a new backend can be as easy as registering a builder function +that receives some arguments (currently only `n_workers`) and returns the instantiated +executor. + +```python +from concurrent.futures import Executor +from my_project.executor import CustomExecutor + +from pytask_parallel import ParallelBackend, registry + + +def build_custom_executor(n_workers: int) -> Executor: + return CustomExecutor(max_workers=n_workers) + + +registry.register_parallel_backend(ParallelBackend.CUSTOM, build_custom_executor) +``` + +Now, build the project requesting your custom backend. + +```console +pytask --parallel-backend custom +``` + +Realistically, it is not the only necessary adjustment for a nice user experience. There +are two other important things. 
pytask-parallel does not implement them by default since +it seems more tightly coupled to your backend. + +1. A wrapper for the executed function that captures warnings, catches exceptions and + saves products of the task (within the child process!). + + As an example, see + [`def _execute_task()`](https://github.com/pytask-dev/pytask-parallel/blob/c441dbb75fa6ab3ab17d8ad5061840c802dc1c41/src/pytask_parallel/processes.py#L91-L155) + that does all that for the processes and loky backend. + +1. To apply the wrapper, you need to write a custom hook implementation for + `def pytask_execute_task()`. See + [`def pytask_execute_task()`](https://github.com/pytask-dev/pytask-parallel/blob/c441dbb75fa6ab3ab17d8ad5061840c802dc1c41/src/pytask_parallel/processes.py#L41-L65) + for an example. Use the + [`hook_module`](https://pytask-dev.readthedocs.io/en/stable/how_to_guides/extending_pytask.html#using-hook-module-and-hook-module) + configuration value to register your implementation. + +Another example of an implementation can be found as a +[test](https://github.com/pytask-dev/pytask-parallel/blob/c441dbb75fa6ab3ab17d8ad5061840c802dc1c41/tests/test_backends.py#L35-L78). diff --git a/docs/source/dask.md b/docs/source/dask.md new file mode 100644 index 0000000..e0b2ac1 --- /dev/null +++ b/docs/source/dask.md @@ -0,0 +1,128 @@ +# Dask + +```{caution} +Currently, the dask backend can only be used if your workflow code is organized in a +package due to how pytask imports your code and dask serializes task functions +([issue](https://github.com/dask/distributed/issues/8607)). +``` + +Dask is a flexible library for parallel and distributed computing. You probably know it +from its {class}`dask.dataframe` that allows lazy processing of big data. Here, we use +{mod}`distributed` that provides an interface similar to +{class}`~concurrent.futures.Executor` to parallelize our execution. + +There are a couple of ways in how we can use dask. 
+ +## Local + +By default, using dask as the parallel backend will launch a +{class}`distributed.LocalCluster` with processes on your local machine. + +`````{tab-set} +````{tab-item} CLI +```console +pytask --parallel-backend dask -n 2 +``` +```` +````{tab-item} Configuration +```toml +[tool.pytask.ini_options] +parallel_backend = "dask" +n_workers = 2 +``` +```` +````` + +## Local or Remote - Connecting to a Scheduler + +It is also possible to connect to an existing scheduler and use it to execute tasks. The +scheduler can be launched on your local machine or in some remote environment. It also +has the benefit of being able to inspect the dask dashboard for more information on the +execution. + +Start by launching a scheduler in some terminal on some machine. + +```console +dask scheduler +``` + +After the launch, the IP of the scheduler will be displayed. Copy it. Then, open more +terminals to launch as many dask workers as you like with + +```console +dask worker <scheduler-ip> +``` + +Finally, write a function to build the dask client and register it as the dask backend. +Place the code somewhere in your codebase, preferably, where you store the main +configuration of your project in `config.py` or another module that will be imported +during execution. + +```python +from pytask_parallel import ParallelBackend +from pytask_parallel import registry +from concurrent.futures import Executor +from dask.distributed import Client + + +def _build_dask_executor(n_workers: int) -> Executor: + return Client(address="<scheduler-ip>").get_executor() + + +registry.register_parallel_backend(ParallelBackend.DASK, _build_dask_executor) +``` + +You can also register it as the custom executor using +{class}`pytask_parallel.ParallelBackend.CUSTOM` to switch back to the default dask +executor quickly. + +```{seealso} +You can find more information in the documentation for +[`dask.distributed`](https://distributed.dask.org/en/stable/). 
+``` + +## Remote - Using cloud providers with coiled + +[coiled](https://www.coiled.io/) is a product built on top of dask that eases the +deployment of your workflow to many cloud providers like AWS, GCP, and Azure. + +They offer a [free monthly tier](https://www.coiled.io/pricing) where you only +need to pay the costs for your cloud provider and you can get started without a credit +card. + +Furthermore, they offer the following benefits which are especially helpful to people +who are not familiar with cloud providers or remote computing. + +- A [four step short process](https://docs.coiled.io/user_guide/setup/index.html) to set + up your local environment and configure your cloud provider. +- coiled manages your resources by spawning workers if you need them and shutting them + down if they are idle. +- Synchronization of your local environment to remote workers. + +So, how can you run your pytask workflow on a cloud infrastructure with coiled? + +1. Follow their [guide on getting + started](https://docs.coiled.io/user_guide/setup/index.html) by creating a coiled + account and syncing it with your cloud provider. + +1. Register a function that builds an executor using {class}`coiled.Cluster`. + + ```python + import coiled + from pytask_parallel import ParallelBackend + from pytask_parallel import registry + from concurrent.futures import Executor + + + def _build_coiled_executor(n_workers: int) -> Executor: + return coiled.Cluster(n_workers=n_workers).get_client().get_executor() + + + registry.register_parallel_backend(ParallelBackend.CUSTOM, _build_coiled_executor) + ``` + +1. 
Execute your workflow with + + ```console + pytask --parallel-backend custom + ``` diff --git a/docs/source/developers_guide.md b/docs/source/developers_guide.md new file mode 100644 index 0000000..f2fa790 --- /dev/null +++ b/docs/source/developers_guide.md @@ -0,0 +1,14 @@ +# Developer's Guide + +`pytask-parallel` does not call the `pytask_execute_task_protocol` hook +specification/entry-point because `pytask_execute_task_setup` and +`pytask_execute_task` need to be separated from `pytask_execute_task_teardown`. Thus, +plugins that change this hook specification may not interact well with the +parallelization. + +Two PRs for CPython try to re-enable setting custom reducers which should have been +working but does not. Here are the references. + +- https://bugs.python.org/issue28053 +- https://github.com/python/cpython/pull/9959 +- https://github.com/python/cpython/pull/15058 diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000..6209512 --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,31 @@ +# pytask-parallel + + + +[![PyPI](https://img.shields.io/pypi/v/pytask-parallel?color=blue)](https://pypi.org/project/pytask-parallel) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pytask-parallel)](https://pypi.org/project/pytask-parallel) +[![image](https://img.shields.io/conda/vn/conda-forge/pytask-parallel.svg)](https://anaconda.org/conda-forge/pytask-parallel) +[![image](https://img.shields.io/conda/pn/conda-forge/pytask-parallel.svg)](https://anaconda.org/conda-forge/pytask-parallel) +[![PyPI - License](https://img.shields.io/pypi/l/pytask-parallel)](https://pypi.org/project/pytask-parallel) +[![image](https://img.shields.io/github/actions/workflow/status/pytask-dev/pytask-parallel/main.yml?branch=main)](https://github.com/pytask-dev/pytask-parallel/actions?query=branch%3Amain) 
+[![image](https://codecov.io/gh/pytask-dev/pytask-parallel/branch/main/graph/badge.svg)](https://codecov.io/gh/pytask-dev/pytask-parallel) +[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/pytask-dev/pytask-parallel/main.svg)](https://results.pre-commit.ci/latest/github/pytask-dev/pytask-parallel/main) +[![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) + +pytask-parallel allows you to execute workflows defined with +[pytask](https://pytask-dev.readthedocs.io/) in parallel using local or remote clusters. + +## Documentation + +```{toctree} +--- +maxdepth: 1 +--- +quickstart +dask +custom_executors +developers_guide +changes +On Github <https://github.com/pytask-dev/pytask-parallel> +``` diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md new file mode 100644 index 0000000..b780fcc --- /dev/null +++ b/docs/source/quickstart.md @@ -0,0 +1,179 @@ +# Quickstart + +## Installation + +pytask-parallel is available on [PyPI](https://pypi.org/project/pytask-parallel) and +[Anaconda.org](https://anaconda.org/conda-forge/pytask-parallel). Install it with + +```console +$ pip install pytask-parallel + +# or + +$ conda install -c conda-forge pytask-parallel +``` + +## Usage + +When the plugin is only installed and pytask executed, the tasks are not run in +parallel. + +For parallelization with the default backend [loky](https://loky.readthedocs.io/), you need to launch multiple workers. + +`````{tab-set} +````{tab-item} CLI +:sync: cli + +```console +pytask -n 2 +pytask --n-workers 2 + +# Starts os.cpu_count() - 1 workers. +pytask -n auto +``` +```` +````{tab-item} Configuration +:sync: configuration + +```toml +[tool.pytask.ini_options] +n_workers = 2 + +# Starts os.cpu_count() - 1 workers. 
+n_workers = "auto" +``` +```` +````` + +To use a different backend, pass the `--parallel-backend` option. The following command +will execute the workflow with one worker and the loky backend. + +`````{tab-set} +````{tab-item} CLI +:sync: cli + +```console +pytask --parallel-backend loky +``` +```` +````{tab-item} Configuration +:sync: configuration + +```toml +[tool.pytask.ini_options] +parallel_backend = "loky" +``` +```` +````` + +## Backends + +```{important} +It is not possible to combine parallelization with debugging. That is why `--pdb` or +`--trace` deactivate parallelization. + +If you parallelize the execution of your tasks using two or more workers, do not use +`breakpoint()` or `import pdb; pdb.set_trace()` since both will cause exceptions. +``` + +### loky + +There are multiple backends available. The default is the backend provided by loky which +aims to be a more robust implementation of {class}`~multiprocessing.pool.Pool` and +{class}`~concurrent.futures.ProcessPoolExecutor`. + +```console +pytask --parallel-backend loky +``` + +As it spawns workers in new processes to run the tasks, it is especially suited for +CPU-bound tasks. ([Here](https://stackoverflow.com/a/868577/7523785) is an +explanation of what CPU- or IO-bound means.) + +### `concurrent.futures` + +You can use the values `threads` and `processes` to use the +{class}`~concurrent.futures.ThreadPoolExecutor` or the +{class}`~concurrent.futures.ProcessPoolExecutor` respectively. + +The {class}`~concurrent.futures.ThreadPoolExecutor` might be an interesting option for +you if you have many IO-bound tasks and you do not need to create many expensive +processes. 
+ +`````{tab-set} +````{tab-item} CLI +:sync: cli + +```console +pytask --parallel-backend threads +``` +```` +````{tab-item} Configuration +:sync: configuration + +```toml +[tool.pytask.ini_options] +parallel_backend = "threads" +``` +```` +````` + +`````{tab-set} +````{tab-item} CLI +:sync: cli + +```console +pytask --parallel-backend processes +``` +```` +````{tab-item} Configuration +:sync: configuration + +```toml +[tool.pytask.ini_options] +parallel_backend = "processes" +``` +```` +````` + +```{important} +Capturing warnings is not thread-safe. Therefore, warnings cannot be captured reliably +when tasks are parallelized with `--parallel-backend threads`. +``` + +### dask + coiled + +dask and coiled together provide the option to execute your workflow on cloud providers +like AWS, GCP or Azure. Check out the [dedicated guide](dask.md) if you are interested +in that. + +Using the default mode, dask will spawn multiple local workers to process the tasks. + +`````{tab-set} +````{tab-item} CLI +:sync: cli + +```console +pytask --parallel-backend dask +``` +```` +````{tab-item} Configuration +:sync: configuration + +```toml +[tool.pytask.ini_options] +parallel_backend = "dask" +``` +```` +````` + +### Custom executors + +You can also use any custom executor that implements the +{class}`~concurrent.futures.Executor` interface. Read more about it in +[](custom_executors.md). + +```{important} +Please, consider contributing your executor to pytask-parallel if you believe it could +be helpful to other people. Start by creating an issue or a draft PR. +``` diff --git a/environment.yml b/environment.yml index dba1d27..a7ad075 100644 --- a/environment.yml +++ b/environment.yml @@ -28,5 +28,16 @@ dependencies: - nbmake - pytest-cov + # Documentation + - furo + - myst-parser + - nbsphinx + - sphinx + - sphinx-click + - sphinx-copybutton + - sphinx-design >=0.3.0 + - sphinx-toolbox + - sphinxext-opengraph + - pip: - -e . 
diff --git a/pyproject.toml b/pyproject.toml index 189d796..d28f4ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,19 @@ email = "raabe@posteo.de" [project.optional-dependencies] dask = ["dask[complete]", "distributed"] +docs = [ + "furo", + "ipython", + "matplotlib", + "myst-parser", + "nbsphinx", + "sphinx", + "sphinx-click", + "sphinx-copybutton", + "sphinx-design>=0.3", + "sphinx-toolbox", + "sphinxext-opengraph", +] test = [ "pytask-parallel[all]", "nbmake", @@ -106,6 +119,7 @@ select = ["ALL"] [tool.ruff.lint.per-file-ignores] "tests/*" = ["D", "ANN", "PLR2004", "S101"] +"docs/source/conf.py" = ["INP001"] [tool.ruff.lint.isort] force-single-line = true diff --git a/tox.ini b/tox.ini index a0c7cd6..cb55d00 100644 --- a/tox.ini +++ b/tox.ini @@ -11,3 +11,9 @@ deps = git+https://github.com/pytask-dev/pytask.git@main commands = pytest {posargs} + +[testenv:docs] +extras = docs, test +commands = + - sphinx-build -n -T -b html -d {envtmpdir}/doctrees docs/source docs/build/html + - sphinx-build -n -T -b doctest -d {envtmpdir}/doctrees docs/source docs/build/html