Improve documentation for data catalogs. (#606)

tobiasraabe · web-flow · commit 8038fcf43097 · 2024-05-12T14:04:42.000Z
diff --git a/.gitignore b/.gitignore
@@ -26,3 +26,4 @@ tests/test_jupyter/*.txt
 .pytest_cache
 .ruff_cache
 .venv
+docs/jupyter_execute
diff --git a/docs/source/changes.md b/docs/source/changes.md
@@ -47,6 +47,7 @@ releases are available on [PyPI](https://pypi.org/project/pytask) and
 - {pull}`603` fixes an example in the documentation about capturing warnings.
 - {pull}`604` fixes some examples with `PythonNode`s in the documentation.
 - {pull}`605` improves checks and CI.
+- {pull}`606` improves the documentation for data catalogs.
 - {pull}`609` allows a pending status for tasks. Useful for async backends implemented
   in pytask-parallel.
 - {pull}`611` removes the initial task execution status from
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -51,8 +51,7 @@
     "sphinx_copybutton",
     "sphinx_click",
     "sphinx_toolbox.more_autodoc.autoprotocol",
-    "nbsphinx",
-    "myst_parser",
+    "myst_nb",
     "sphinx_design",
 ]
 
diff --git a/docs/source/how_to_guides/bp_scaling_tasks.md b/docs/source/how_to_guides/bp_scaling_tasks.md
@@ -39,9 +39,6 @@ my_project
 │           ├────config.py
 │           └────task_estimate_models.py
 │
-│
-├───setup.py
-│
 ├───.pytask
 │   └────...
 │
diff --git a/docs/source/how_to_guides/the_data_catalog.md b/docs/source/how_to_guides/the_data_catalog.md
@@ -1,9 +1,8 @@
 # The `DataCatalog` - Revisited
 
-An introduction to the data catalog can be found in the
-[tutorial](../tutorials/using_a_data_catalog.md).
-
-This guide explains some details that were left out of the tutorial.
+This guide explains more details about the {class}`~pytask.DataCatalog` that were left
+out of the [tutorial](../tutorials/using_a_data_catalog.md). Please read the tutorial
+first for a basic understanding.
 
 ## Changing the default node
 
@@ -15,54 +14,64 @@ For example, use the {class}`~pytask.PythonNode` as the default.
 
 ```python
 from pytask import PythonNode
+from pytask import DataCatalog
 
 
 data_catalog = DataCatalog(default_node=PythonNode)
 ```
 
-Or, learn to write your own node by reading {doc}`writing_custom_nodes`.
+Or, learn to write your node by reading {doc}`writing_custom_nodes`.
 
-Here, is an example for a `PickleNode` that uses cloudpickle instead of the normal
-`pickle` module.
+Here is an example of a {class}`~pytask.PickleNode` that uses cloudpickle instead of
+the normal {mod}`pickle` module.
 
 ```{literalinclude} ../../../docs_src/how_to_guides/the_data_catalog.py
 ```
 
 ## Changing the name and the default path
 
-By default, the data catalogs store their data in a directory `.pytask/data_catalogs`.
-If you use a `pyproject.toml` with a `[tool.pytask.ini_options]` section, then the
+By default, data catalogs store their data in a directory `.pytask/data_catalogs`. If
+you use a `pyproject.toml` with a `[tool.pytask.ini_options]` section, then the
 `.pytask` folder is in the same folder as the configuration file.
 
 The default name for a catalog is `"default"` and so you will find its data in
 `.pytask/data_catalogs/default`. If you assign a different name like
 `"data_management"`, you will find the data in `.pytask/data_catalogs/data_management`.
 
 ```python
+from pytask import DataCatalog
+
+
 data_catalog = DataCatalog(name="data_management")
 ```
 
+```{note}
+The name of a data catalog is restricted to letters, numbers, hyphens and underscores.
+```
+
 You can also change the path where the data catalogs will be stored by changing the
 `path` attribute. Here, we store the data catalog's data next to the module where the
 data catalog is defined in `.data`.
 
 ```python
 from pathlib import Path
+from pytask import DataCatalog
 
 
 data_catalog = DataCatalog(path=Path(__file__).parent / ".data")
 ```
 
 ## Multiple data catalogs
 
-You can use multiple data catalogs when you want to separate your datasets across
-multiple catalogs or when you want to use the same names multiple times (although it is
-not recommended!).
+You can use multiple data catalogs when you want to separate your datasets or to avoid
+name collisions of data catalog entries.
 
 Make sure you assign different names to the data catalogs so that their data is stored
 in different directories.
 
 ```python
+from pytask import DataCatalog
+
 # Stored in .pytask/data_catalogs/a
 data_catalog_a = DataCatalog(name="a")
 
@@ -71,3 +80,53 @@ data_catalog_b = DataCatalog(name="b")
 ```
 
 Or, use different paths as explained above.
+
+## Nested data catalogs
+
+Name collisions can also occur when you are using multiple levels of repetitions, for
+example, when you are fitting multiple models to multiple data sets.
+
+You can structure your data catalogs like this.
+
+```python
+from pytask import DataCatalog
+
+
+MODEL_NAMES = ("ols", "logistic_regression")
+DATA_NAMES = ("data_1", "data_2")
+
+
+nested_data_catalogs = {
+    model_name: {
+        data_name: DataCatalog(name=f"{model_name}-{data_name}")
+        for data_name in DATA_NAMES
+    }
+    for model_name in MODEL_NAMES
+}
+```
+
+The task could look like this.
+
+```python
+from pathlib import Path
+from pytask import task
+from typing_extensions import Annotated
+
+from my_project.config import DATA_NAMES
+from my_project.config import MODEL_NAMES
+from my_project.config import nested_data_catalogs
+
+
+for model_name in MODEL_NAMES:
+    for data_name in DATA_NAMES:
+
+        @task
+        def fit_model(
+            path: Path = Path("...", data_name)
+        ) -> Annotated[
+            Any, nested_data_catalogs[model_name][data_name]["fitted_model"]
+        ]:
+            data = ...
+            fitted_model = ...
+            return fitted_model
+```
diff --git a/docs/source/reference_guides/api.md b/docs/source/reference_guides/api.md
@@ -228,7 +228,9 @@ Task are currently represented by the following classes:
 
 ```{eval-rst}
 .. autoclass:: pytask.Task
+   :members:
 .. autoclass:: pytask.TaskWithoutPath
+   :members:
 ```
 
 Currently, there are no different types of tasks since changing the `.function`
@@ -325,6 +327,9 @@ resolution and execution.
 
     An indicator to mark arguments of tasks as products.
 
+    >>> from pathlib import Path
+    >>> from pytask import Product
+    >>> from typing_extensions import Annotated
     >>> def task_example(path: Annotated[Path, Product]) -> None:
     ...     path.write_text("Hello, World!")
 
diff --git a/docs/source/tutorials/using_a_data_catalog.md b/docs/source/tutorials/using_a_data_catalog.md
@@ -10,14 +10,14 @@ Two things will quickly become a nuisance in bigger projects.
    they are just intermediate representations.
 
 As a solution, pytask offers a {class}`~pytask.DataCatalog` which is a purely optional
-feature. The tutorial focuses on the main features. To learn about all features, read
-the [how-to guide](../how_to_guides/the_data_catalog.md).
+feature. The tutorial focuses on the main features. To learn about all the features,
+read the [how-to guide](../how_to_guides/the_data_catalog.md).
 
 Let us focus on the previous example and see how the {class}`~pytask.DataCatalog` helps
 us.
 
-The project structure is the same as in the previous example with the exception of the
-`.pytask` folder and the missing `data.pkl` in `bld`.
+The project structure is the same as in the previous example except for the `.pytask`
+folder and the missing `data.pkl` in `bld`.
 
 ```text
 my_project
@@ -44,15 +44,51 @@ At first, we define the data catalog in `config.py`.
 ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_1.py
 ```
 
-## `task_data_preparation`
+## `task_create_random_data`
 
-Next, we will use the data catalog to save the product of the task in
-`task_data_preparation.py`.
+Next, we look at the module `task_data_preparation.py` and its task
+`task_create_random_data`. The task creates a dataframe with simulated data that should
+be stored on the disk.
 
-Instead of using a path, we set the location of the product in the data catalog with
-`data_catalog["data"]`. If the key does not exist, the data catalog will automatically
-create a {class}`~pytask.PickleNode` that allows you to save any Python object to a
-`pickle` file. The `pickle` file is stored within the `.pytask` folder.
+In the previous tutorial, we learned to use {class}`~pathlib.Path`s to define products
+of our tasks. Here we see again the signature of the task function.
+
+`````{tab-set}
+
+````{tab-item} Python 3.10+
+:sync: python310plus
+
+```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_products_py310.py
+:lines: 10-12
+```
+````
+
+````{tab-item} Python 3.8+
+:sync: python38plus
+
+```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_products_py38.py
+:lines: 10-12
+```
+````
+
+````{tab-item} produces
+:sync: produces
+
+```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_products_produces.py
+:lines: 8
+```
+````
+`````
+
+When we want to use the data catalog, we replace `BLD / "data.pkl"` with an entry of the
+data catalog like `data_catalog["data"]`. If there is no entry with the name `"data"`
+yet, the data catalog will automatically create a {class}`~pytask.PickleNode`. The
+node allows you to save any Python object to a `pickle` file.
+
+You probably noticed that we did not need to define a path. That is because the data
+catalog takes care of that and stores the `pickle` file in the `.pytask` folder.
+
+Using `data_catalog["data"]` is thus equivalent to using `PickleNode(path=Path(...))`.
 
 The following tabs show you how to use the data catalog given the interface you prefer.
 
@@ -125,10 +161,6 @@ Following one of the interfaces gives you immediate access to the
 ````{tab-item} Python 3.10+
 :sync: python310plus
 
-Use `data_catalog["data"]` as an default argument to access the
-{class}`~pytask.PickleNode` within the task. When you are done transforming your
-{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`.
-
 ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_3_py310.py
 :emphasize-lines: 12
 ```
@@ -138,10 +170,6 @@ Use `data_catalog["data"]` as an default argument to access the
 ````{tab-item} Python 3.8+
 :sync: python38plus
 
-Use `data_catalog["data"]` as an default argument to access the
-{class}`~pytask.PickleNode` within the task. When you are done transforming your
-{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`.
-
 ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_3_py38.py
 :emphasize-lines: 12
 ```
@@ -160,7 +188,8 @@ In most projects, you have other data sets that you would like to access via the
 catalog. To add them, call the {meth}`~pytask.DataCatalog.add` method and supply a name
 and a path.
 
-Let's add `file.csv` to the data catalog.
+Let's add `file.csv` with the name `"csv"` to the data catalog and use it to create
+`data_catalog["transformed_csv"]`.
 
 ```text
 my_project
@@ -174,8 +203,6 @@ my_project
 │       ├────task_data_preparation.py
 │       └────task_plot_data.py
 │
-├───setup.py
-│
 ├───.pytask
 │   └────...
 │
@@ -184,13 +211,24 @@ my_project
     └────plot.png
 ```
 
-The path can be absolute or relative to the module of the data catalog.
+We can use a relative or an absolute path to define the location of the file. A relative
+path means the location is relative to the module of the data catalog.
 
 ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_4.py
 ```
 
-You can now use the data catalog as in previous example and use the
-{class}`~~pathlib.Path` in the task.
+You can now use the data catalog as in the previous example and use the
+{class}`~pathlib.Path` in the task.
+
+```{note}
+The value of `data_catalog["csv"]` inside the task becomes a
+{class}`~pathlib.Path`. That is because a {class}`~pathlib.Path` in
+{meth}`~pytask.DataCatalog.add` is not parsed to a {class}`~pytask.PickleNode` but to
+a {class}`~pytask.PathNode`.
+
+Read {doc}`../how_to_guides/writing_custom_nodes` for more information about the
+different node types, which are not relevant for now.
+```
 
 `````{tab-set}
 
@@ -224,9 +262,14 @@ You can now use the data catalog as in previous example and use the
 
 ## Developing with the `DataCatalog`
 
-You can also use the data catalog in a Jupyter notebook or in the terminal in the Python
-interpreter. Simply import the data catalog, select a node and call the
-{meth}`~pytask.PNode.load` method of a node to access its value.
+You can also use the data catalog in a Jupyter Notebook or in the Python interpreter in
+the terminal. This can be super helpful when you develop tasks interactively in a
+Jupyter Notebook.
+
+Simply import the data catalog, select a node and call the {meth}`~pytask.PNode.load`
+method of a node to access its value.
+
+Here is an example with a terminal.
 
 ```pycon
 >>> from myproject.config import data_catalog
diff --git a/docs_src/tutorials/using_a_data_catalog_4.py b/docs_src/tutorials/using_a_data_catalog_4.py
@@ -10,4 +10,3 @@
 
 # Use either a relative or an absolute path.
 data_catalog.add("csv", Path("file.csv"))
-data_catalog.add("transformed_csv", BLD / "file.pkl")
diff --git a/pyproject.toml b/pyproject.toml
@@ -52,7 +52,7 @@ docs = [
     "ipython",
     "matplotlib",
     "myst-parser",
-    "nbsphinx",
+    "myst-nb",
     "sphinx",
     "sphinx-click",
     "sphinx-copybutton",
@@ -92,6 +92,10 @@ build-backend = "hatchling.build"
 managed = true
 dev-dependencies = ["tox-uv>=1.7.0"]
 
+[tool.rye.scripts]
+clean-docs = { cmd = "rm -rf docs/build" }
+build-docs = { cmd = "sphinx-build -b html docs/source docs/build" }
+
 [tool.hatch.build.hooks.vcs]
 version-file = "src/_pytask/_version.py"
 
diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py
diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py
diff --git a/src/_pytask/outcomes.py b/src/_pytask/outcomes.py
diff --git a/tox.ini b/tox.ini

Original file line number	Diff line number	Diff line change
`@@ -51,8 +51,7 @@`
`51`	`51`	`"sphinx_copybutton",`
`52`	`52`	`"sphinx_click",`
`53`	`53`	`"sphinx_toolbox.more_autodoc.autoprotocol",`
`54`		`- "nbsphinx",`
`55`		`- "myst_parser",`
	`54`	`+ "myst_nb",`
`56`	`55`	`"sphinx_design",`
`57`	`56`	`]`
`58`	`57`
Original file line number	Diff line number	Diff line change
`@@ -10,4 +10,3 @@`
`10`	`10`
`11`	`11`	`# Use either a relative or an absolute path.`
`12`	`12`	`data_catalog.add("csv", Path("file.csv"))`
`13`		`-data_catalog.add("transformed_csv", BLD / "file.pkl")`