From a7d0e7d6e37cd1728c8080e733e836a19cd741be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Fri, 28 Apr 2023 13:08:06 +0200 Subject: [PATCH] Deprecate `kedro.extras.datasets` and add top-level docs for `kedro_datasets` (#2546) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Juan Luis Cano Rodríguez --- .gitignore | 1 + .readthedocs.yml | 2 - RELEASE.md | 3 +- docs/build-docs.sh | 4 -- docs/kedro-datasets-docs.sh | 13 ----- docs/source/conf.py | 4 +- docs/source/data/data_catalog.md | 6 +-- docs/source/data/kedro_io.md | 6 +-- docs/source/extend_kedro/common_use_cases.md | 4 +- docs/source/extend_kedro/custom_datasets.md | 8 +-- docs/source/get_started/kedro_concepts.md | 2 +- docs/source/index.rst | 1 + .../integrations/pyspark_integration.md | 8 +-- docs/source/kedro.datasets.rst | 52 ------------------- docs/source/kedro.extras.rst | 20 +++++++ docs/source/kedro_datasets.rst | 52 +++++++++++++++++++ docs/source/tutorial/add_another_pipeline.md | 2 +- docs/source/tutorial/create_a_pipeline.md | 2 +- docs/source/tutorial/set_up_data.md | 2 +- .../visualise_charts_with_plotly.md | 2 +- kedro/extras/datasets/README.md | 4 ++ kedro/extras/datasets/__init__.py | 16 ++++++ kedro/io/core.py | 3 ++ pyproject.toml | 2 +- 24 files changed, 123 insertions(+), 96 deletions(-) delete mode 100755 docs/kedro-datasets-docs.sh delete mode 100644 docs/source/kedro.datasets.rst create mode 100644 docs/source/kedro.extras.rst create mode 100644 docs/source/kedro_datasets.rst diff --git a/.gitignore b/.gitignore index 8e06e36735..2fdac4dbdb 100644 --- a/.gitignore +++ b/.gitignore @@ -138,6 +138,7 @@ venv.bak/ # Additional files created by sphinx.ext.autosummary # Some of them are actually tracked to control the output /docs/source/kedro.* +/docs/source/kedro_datasets.* # mypy .mypy_cache/ diff --git a/.readthedocs.yml b/.readthedocs.yml index 771a7351b9..4b1c5f4824 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -5,7 +5,6 @@ # Required version: 2 -# .readthedocs.yml hook to copy kedro-datasets to kedro.datasets before building the docs build: os: ubuntu-22.04 tools: @@ -16,7 +15,6 @@ build: jobs: post_create_environment: - npm install -g @mermaid-js/mermaid-cli - - ./docs/kedro-datasets-docs.sh pre_build: - python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck diff --git a/RELEASE.md b/RELEASE.md index 746cbb4fe0..747475d5cb 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -22,11 +22,12 @@ ### Documentation changes * Improvements to Sphinx toolchain including incrementing to use a newer version. * Improvements to documentation on visualising Kedro projects on Databricks, and additional documentation about the development workflow for Kedro projects on Databricks. -* Updated Technnical Steering Committee membership documentation. +* Updated Technical Steering Committee membership documentation. * Revised documentation section about linting and formatting and extended to give details of `flake8` configuration. * Updated table of contents for documentation to reduce scrolling. * Expanded FAQ documentation. * Added a 404 page to documentation. +* Added deprecation warnings about the removal of `kedro.extras.datasets`. 
## Breaking changes to the API diff --git a/docs/build-docs.sh b/docs/build-docs.sh index 5575b6b577..d55076e118 100755 --- a/docs/build-docs.sh +++ b/docs/build-docs.sh @@ -7,10 +7,6 @@ set -o nounset action=$1 -# Reinstall kedro-datasets locally -rm -rf kedro/datasets -bash docs/kedro-datasets-docs.sh - if [ "$action" == "linkcheck" ]; then sphinx-build -WETan -j auto -D language=en -b linkcheck -d docs/build/doctrees docs/source docs/build/linkcheck elif [ "$action" == "docs" ]; then diff --git a/docs/kedro-datasets-docs.sh b/docs/kedro-datasets-docs.sh deleted file mode 100755 index 10054034ab..0000000000 --- a/docs/kedro-datasets-docs.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -# Script to copy kedro-datasets to kedro.datasets before the documentation build for Kedro in ReadTheDocs. - -# Exit script if you try to use an uninitialized variable. -set -o nounset - -# Exit script if a statement returns a non-true return value. -set -o errexit - -pip install kedro-datasets -pip install --no-deps -t kedro/to_delete kedro-datasets -mv kedro/to_delete/kedro_datasets kedro/datasets -rm -r kedro/to_delete diff --git a/docs/source/conf.py b/docs/source/conf.py index 74b070cf70..db411cb073 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -131,7 +131,7 @@ "integer -- return number of occurrences of value", "integer -- return first index of value.", "kedro.extras.datasets.pandas.json_dataset.JSONDataSet", - "kedro.datasets.pandas.json_dataset.JSONDataSet", + "kedro_datasets.pandas.json_dataset.JSONDataSet", "pluggy._manager.PluginManager", "_DI", "_DO", @@ -309,7 +309,7 @@ "kedro.config", "kedro.extras.datasets", "kedro.extras.logging", - "kedro.datasets", + "kedro_datasets", ] diff --git a/docs/source/data/data_catalog.md b/docs/source/data/data_catalog.md index 7b9fcdafc7..3ea11b2a27 100644 --- a/docs/source/data/data_catalog.md +++ b/docs/source/data/data_catalog.md @@ -2,7 +2,7 @@ This section introduces `catalog.yml`, the project-shareable Data Catalog. The file is located in `conf/base` and is a registry of all data sources available for use by a project; it manages loading and saving of data. -All supported data connectors are available in [`kedro-datasets`](/kedro.datasets). +All supported data connectors are available in [`kedro-datasets`](/kedro_datasets). ## Use the Data Catalog within Kedro configuration @@ -261,7 +261,7 @@ scooters_query: index_col: [name] ``` -When you use [`pandas.SQLTableDataSet`](/kedro.datasets.pandas.SQLTableDataSet) or [`pandas.SQLQueryDataSet`](/kedro.datasets.pandas.SQLQueryDataSet), you must provide a database connection string. In the above example, we pass it using the `scooters_credentials` key from the credentials (see the details in the [Feeding in credentials](#feeding-in-credentials) section below). `scooters_credentials` must have a top-level key `con` containing a [SQLAlchemy compatible](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) connection string. As an alternative to credentials, you could explicitly put `con` into `load_args` and `save_args` (`pandas.SQLTableDataSet` only). +When you use [`pandas.SQLTableDataSet`](/kedro_datasets.pandas.SQLTableDataSet) or [`pandas.SQLQueryDataSet`](/kedro_datasets.pandas.SQLQueryDataSet), you must provide a database connection string. In the above example, we pass it using the `scooters_credentials` key from the credentials (see the details in the [Feeding in credentials](#feeding-in-credentials) section below). 
`scooters_credentials` must have a top-level key `con` containing a [SQLAlchemy compatible](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) connection string. As an alternative to credentials, you could explicitly put `con` into `load_args` and `save_args` (`pandas.SQLTableDataSet` only). ### Example 14: Loads data from an API endpoint, example US corn yield data from USDA @@ -535,7 +535,7 @@ The code API allows you to: ### Configure a Data Catalog -In a file like `catalog.py`, you can construct a `DataCatalog` object programmatically. In the following, we are using several pre-built data loaders documented in the [API reference documentation](/kedro.datasets). +In a file like `catalog.py`, you can construct a `DataCatalog` object programmatically. In the following, we are using several pre-built data loaders documented in the [API reference documentation](/kedro_datasets). ```python from kedro.io import DataCatalog diff --git a/docs/source/data/kedro_io.md b/docs/source/data/kedro_io.md index 4107491273..90afeaa132 100644 --- a/docs/source/data/kedro_io.md +++ b/docs/source/data/kedro_io.md @@ -41,8 +41,8 @@ For contributors, if you would like to submit a new dataset, you must extend the In order to enable versioning, you need to update the `catalog.yml` config file and set the `versioned` attribute to `true` for the given dataset. If this is a custom dataset, the implementation must also: 1. extend `kedro.io.core.AbstractVersionedDataSet` AND 2. add `version` namedtuple as an argument to its `__init__` method AND - 3. call `super().__init__()` with positional arguments `filepath`, `version`, and, optionally, with `glob` and `exists` functions if it uses a non-local filesystem (see [kedro_datasets.pandas.CSVDataSet](/kedro.datasets.pandas.CSVDataSet) as an example) AND - 4. modify its `_describe`, `_load` and `_save` methods respectively to support versioning (see [`kedro_datasets.pandas.CSVDataSet`](/kedro.datasets.pandas.CSVDataSet) for an example implementation) + 3. call `super().__init__()` with positional arguments `filepath`, `version`, and, optionally, with `glob` and `exists` functions if it uses a non-local filesystem (see [kedro_datasets.pandas.CSVDataSet](/kedro_datasets.pandas.CSVDataSet) as an example) AND + 4. modify its `_describe`, `_load` and `_save` methods respectively to support versioning (see [`kedro_datasets.pandas.CSVDataSet`](/kedro_datasets.pandas.CSVDataSet) for an example implementation) ```{note} If a new version of a dataset is created mid-run, for instance by an external system adding new files, it will not interfere in the current run, i.e. the load version stays the same throughout subsequent loads. @@ -239,7 +239,7 @@ Although HTTP(S) is a supported file system in the dataset implementations, it d ## Partitioned dataset -These days, distributed systems play an increasingly important role in ETL data pipelines. They significantly increase the processing throughput, enabling us to work with much larger volumes of input data. However, these benefits sometimes come at a cost. When dealing with the input data generated by such distributed systems, you might encounter a situation where your Kedro node needs to read the data from a directory full of uniform files of the same type (e.g. JSON, CSV, Parquet, etc.) rather than from a single file. Tools like `PySpark` and the corresponding [SparkDataSet](/kedro.datasets.spark.SparkDataSet) cater for such use cases, but the use of Spark is not always feasible. 
+These days, distributed systems play an increasingly important role in ETL data pipelines. They significantly increase the processing throughput, enabling us to work with much larger volumes of input data. However, these benefits sometimes come at a cost. When dealing with the input data generated by such distributed systems, you might encounter a situation where your Kedro node needs to read the data from a directory full of uniform files of the same type (e.g. JSON, CSV, Parquet, etc.) rather than from a single file. Tools like `PySpark` and the corresponding [SparkDataSet](/kedro_datasets.spark.SparkDataSet) cater for such use cases, but the use of Spark is not always feasible. This is why Kedro provides a built-in [PartitionedDataSet](/kedro.io.PartitionedDataSet), with the following features: diff --git a/docs/source/extend_kedro/common_use_cases.md b/docs/source/extend_kedro/common_use_cases.md index a9afc08e22..04b36d6ca5 100644 --- a/docs/source/extend_kedro/common_use_cases.md +++ b/docs/source/extend_kedro/common_use_cases.md @@ -4,7 +4,7 @@ Kedro has a few built-in mechanisms for you to extend its behaviour. This docume ## Use Case 1: How to add extra behaviour to Kedro's execution timeline -The execution timeline of a Kedro pipeline can be thought of as a sequence of actions performed by various Kedro library components, such as the [DataSets](/kedro.datasets), [DataCatalog](/kedro.io.DataCatalog), [Pipeline](/kedro.pipeline.Pipeline), [Node](/kedro.pipeline.node.Node) and [KedroContext](/kedro.framework.context.KedroContext). +The execution timeline of a Kedro pipeline can be thought of as a sequence of actions performed by various Kedro library components, such as the [DataSets](/kedro_datasets), [DataCatalog](/kedro.io.DataCatalog), [Pipeline](/kedro.pipeline.Pipeline), [Node](/kedro.pipeline.node.Node) and [KedroContext](/kedro.framework.context.KedroContext). At different points in the lifecycle of these components, you might want to add extra behaviour: for example, you could add extra computation for profiling purposes _before_ and _after_ a node runs, or _before_ and _after_ the I/O actions of a dataset, namely the `load` and `save` actions. @@ -12,7 +12,7 @@ This can now achieved by using [Hooks](../hooks/introduction.md), to define the ## Use Case 2: How to integrate Kedro with additional data sources -You can use [DataSets](/kedro.datasets) to interface with various different data sources. If the data source you plan to use is not supported out of the box by Kedro, you can [create a custom dataset](custom_datasets.md). +You can use [DataSets](/kedro_datasets) to interface with various different data sources. If the data source you plan to use is not supported out of the box by Kedro, you can [create a custom dataset](custom_datasets.md). ## Use Case 3: How to add or modify CLI commands diff --git a/docs/source/extend_kedro/custom_datasets.md b/docs/source/extend_kedro/custom_datasets.md index 7a064b1eb1..9e4b0713eb 100644 --- a/docs/source/extend_kedro/custom_datasets.md +++ b/docs/source/extend_kedro/custom_datasets.md @@ -1,6 +1,6 @@ # Custom datasets -[Kedro supports many datasets](/kedro.datasets) out of the box, but you may find that you need to create a custom dataset. For example, you may need to handle a proprietary data format or filesystem in your pipeline, or perhaps you have found a particular use case for a dataset that Kedro does not support. This tutorial explains how to create a custom dataset to read and save image data. 
+[Kedro supports many datasets](/kedro_datasets) out of the box, but you may find that you need to create a custom dataset. For example, you may need to handle a proprietary data format or filesystem in your pipeline, or perhaps you have found a particular use case for a dataset that Kedro does not support. This tutorial explains how to create a custom dataset to read and save image data. ## Scenario @@ -504,7 +504,7 @@ You may also want to consult the [in-depth documentation about the Versioning AP Kedro datasets should work with the [SequentialRunner](/kedro.runner.SequentialRunner) and the [ParallelRunner](/kedro.runner.ParallelRunner), so they must be fully serialisable by the [Python multiprocessing package](https://docs.python.org/3/library/multiprocessing.html). This means that your datasets should not make use of lambda functions, nested functions, closures etc. If you are using custom decorators, you need to ensure that they are using [`functools.wraps()`](https://docs.python.org/3/library/functools.html#functools.wraps). -There is one dataset that is an exception: [SparkDataSet](/kedro.datasets.spark.SparkDataSet). The explanation for this exception is that [Apache Spark](https://spark.apache.org/) uses its own parallelism and therefore doesn't work with Kedro [ParallelRunner](/kedro.runner.ParallelRunner). For parallelism within a Kedro project that leverages Spark please consider the alternative [ThreadRunner](/kedro.runner.ThreadRunner). +There is one dataset that is an exception: [SparkDataSet](/kedro_datasets.spark.SparkDataSet). The explanation for this exception is that [Apache Spark](https://spark.apache.org/) uses its own parallelism and therefore doesn't work with Kedro [ParallelRunner](/kedro.runner.ParallelRunner). For parallelism within a Kedro project that leverages Spark please consider the alternative [ThreadRunner](/kedro.runner.ThreadRunner). To verify whether your dataset is serialisable by `multiprocessing`, use the console or an iPython session to try dumping it using `multiprocessing.reduction.ForkingPickler`: @@ -562,7 +562,7 @@ class ImageDataSet(AbstractVersionedDataSet): ... ``` -We provide additional examples of [how to use parameters through the data catalog's YAML API](../data/data_catalog.md#use-the-data-catalog-with-the-yaml-api). For an example of how to use these parameters in your dataset's constructor, please see the [SparkDataSet](/kedro.datasets.spark.SparkDataSet)'s implementation. +We provide additional examples of [how to use parameters through the data catalog's YAML API](../data/data_catalog.md#use-the-data-catalog-with-the-yaml-api). For an example of how to use these parameters in your dataset's constructor, please see the [SparkDataSet](/kedro_datasets.spark.SparkDataSet)'s implementation. ## How to contribute a custom dataset implementation @@ -592,7 +592,7 @@ kedro-plugins/kedro-datasets/kedro_datasets/image ```{note} There are two special considerations when contributing a dataset: - 1. Add the dataset to `kedro.datasets.rst` so it shows up in the API documentation. + 1. Add the dataset to `kedro_datasets.rst` so it shows up in the API documentation. 2. Add the dataset to `static/jsonschema/kedro-catalog-X.json` for IDE validation. 
``` diff --git a/docs/source/get_started/kedro_concepts.md b/docs/source/get_started/kedro_concepts.md index 90b5944255..4a6d771da0 100644 --- a/docs/source/get_started/kedro_concepts.md +++ b/docs/source/get_started/kedro_concepts.md @@ -55,7 +55,7 @@ greeting_pipeline = pipeline([return_greeting_node, join_statements_node]) The Kedro Data Catalog is the registry of all data sources that the project can use to manage loading and saving data. It maps the names of node inputs and outputs as keys in a `DataCatalog`, a Kedro class that can be specialised for different types of data storage. -[Kedro provides different built-in datasets](/kedro.datasets) for numerous file types and file systems, so you don’t have to write the logic for reading/writing data. +[Kedro provides different built-in datasets](/kedro_datasets) for numerous file types and file systems, so you don’t have to write the logic for reading/writing data. ## Kedro project directory structure diff --git a/docs/source/index.rst b/docs/source/index.rst index bbda085210..c0d42d31c0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -148,6 +148,7 @@ API documentation :recursive: kedro + kedro_datasets Indices and tables ================== diff --git a/docs/source/integrations/pyspark_integration.md b/docs/source/integrations/pyspark_integration.md index f0382a6b01..c900243961 100644 --- a/docs/source/integrations/pyspark_integration.md +++ b/docs/source/integrations/pyspark_integration.md @@ -66,10 +66,10 @@ HOOKS = (SparkHooks(),) We recommend using Kedro's built-in Spark datasets to load raw data into Spark's [DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html), as well as to write them back to storage. Some of our built-in Spark datasets include: -* [spark.DeltaTableDataSet](/kedro.datasets.spark.DeltaTableDataSet) -* [spark.SparkDataSet](/kedro.datasets.spark.SparkDataSet) -* [spark.SparkJDBCDataSet](/kedro.datasets.spark.SparkJDBCDataSet) -* [spark.SparkHiveDataSet](/kedro.datasets.spark.SparkHiveDataSet) +* [spark.DeltaTableDataSet](/kedro_datasets.spark.DeltaTableDataSet) +* [spark.SparkDataSet](/kedro_datasets.spark.SparkDataSet) +* [spark.SparkJDBCDataSet](/kedro_datasets.spark.SparkJDBCDataSet) +* [spark.SparkHiveDataSet](/kedro_datasets.spark.SparkHiveDataSet) The example below illustrates how to use `spark.SparkDataSet` to read a CSV file located in S3 into a `DataFrame` in `conf/base/catalog.yml`: diff --git a/docs/source/kedro.datasets.rst b/docs/source/kedro.datasets.rst deleted file mode 100644 index ed43ec92cf..0000000000 --- a/docs/source/kedro.datasets.rst +++ /dev/null @@ -1,52 +0,0 @@ -kedro.datasets -===================== - -.. rubric:: Description - -.. automodule:: kedro.datasets - -.. rubric:: Classes - -.. 
autosummary:: - :toctree: - :template: autosummary/class.rst - - kedro.datasets.api.APIDataSet - kedro.datasets.biosequence.BioSequenceDataSet - kedro.datasets.dask.ParquetDataSet - kedro.datasets.email.EmailMessageDataSet - kedro.datasets.geopandas.GeoJSONDataSet - kedro.datasets.holoviews.HoloviewsWriter - kedro.datasets.json.JSONDataSet - kedro.datasets.matplotlib.MatplotlibWriter - kedro.datasets.networkx.GMLDataSet - kedro.datasets.networkx.GraphMLDataSet - kedro.datasets.networkx.JSONDataSet - kedro.datasets.pandas.CSVDataSet - kedro.datasets.pandas.ExcelDataSet - kedro.datasets.pandas.FeatherDataSet - kedro.datasets.pandas.GBQQueryDataSet - kedro.datasets.pandas.GBQTableDataSet - kedro.datasets.pandas.GenericDataSet - kedro.datasets.pandas.HDFDataSet - kedro.datasets.pandas.JSONDataSet - kedro.datasets.pandas.ParquetDataSet - kedro.datasets.pandas.SQLQueryDataSet - kedro.datasets.pandas.SQLTableDataSet - kedro.datasets.pandas.XMLDataSet - kedro.datasets.pickle.PickleDataSet - kedro.datasets.pillow.ImageDataSet - kedro.datasets.plotly.JSONDataSet - kedro.datasets.plotly.PlotlyDataSet - kedro.datasets.redis.PickleDataSet - kedro.datasets.spark.DeltaTableDataSet - kedro.datasets.spark.SparkDataSet - kedro.datasets.spark.SparkHiveDataSet - kedro.datasets.spark.SparkJDBCDataSet - kedro.datasets.svmlight.SVMLightDataSet - kedro.datasets.tensorflow.TensorFlowModelDataset - kedro.datasets.text.TextDataSet - kedro.datasets.tracking.JSONDataSet - kedro.datasets.tracking.MetricsDataSet - kedro.datasets.video.VideoDataSet - kedro.datasets.yaml.YAMLDataSet diff --git a/docs/source/kedro.extras.rst b/docs/source/kedro.extras.rst new file mode 100644 index 0000000000..0980b1a41b --- /dev/null +++ b/docs/source/kedro.extras.rst @@ -0,0 +1,20 @@ +kedro.extras +============ + +.. rubric:: Description + +.. automodule:: kedro.extras + +.. rubric:: Modules + +.. autosummary:: + :toctree: + :recursive: + + kedro.extras.extensions + kedro.extras.logging + +.. toctree:: + :hidden: + + kedro.extras.datasets diff --git a/docs/source/kedro_datasets.rst b/docs/source/kedro_datasets.rst new file mode 100644 index 0000000000..999e726bb3 --- /dev/null +++ b/docs/source/kedro_datasets.rst @@ -0,0 +1,52 @@ +kedro_datasets +============== + +.. rubric:: Description + +.. automodule:: kedro_datasets + +.. rubric:: Classes + +.. 
autosummary:: + :toctree: + :template: autosummary/class.rst + + kedro_datasets.api.APIDataSet + kedro_datasets.biosequence.BioSequenceDataSet + kedro_datasets.dask.ParquetDataSet + kedro_datasets.email.EmailMessageDataSet + kedro_datasets.geopandas.GeoJSONDataSet + kedro_datasets.holoviews.HoloviewsWriter + kedro_datasets.json.JSONDataSet + kedro_datasets.matplotlib.MatplotlibWriter + kedro_datasets.networkx.GMLDataSet + kedro_datasets.networkx.GraphMLDataSet + kedro_datasets.networkx.JSONDataSet + kedro_datasets.pandas.CSVDataSet + kedro_datasets.pandas.ExcelDataSet + kedro_datasets.pandas.FeatherDataSet + kedro_datasets.pandas.GBQQueryDataSet + kedro_datasets.pandas.GBQTableDataSet + kedro_datasets.pandas.GenericDataSet + kedro_datasets.pandas.HDFDataSet + kedro_datasets.pandas.JSONDataSet + kedro_datasets.pandas.ParquetDataSet + kedro_datasets.pandas.SQLQueryDataSet + kedro_datasets.pandas.SQLTableDataSet + kedro_datasets.pandas.XMLDataSet + kedro_datasets.pickle.PickleDataSet + kedro_datasets.pillow.ImageDataSet + kedro_datasets.plotly.JSONDataSet + kedro_datasets.plotly.PlotlyDataSet + kedro_datasets.redis.PickleDataSet + kedro_datasets.spark.DeltaTableDataSet + kedro_datasets.spark.SparkDataSet + kedro_datasets.spark.SparkHiveDataSet + kedro_datasets.spark.SparkJDBCDataSet + kedro_datasets.svmlight.SVMLightDataSet + kedro_datasets.tensorflow.TensorFlowModelDataset + kedro_datasets.text.TextDataSet + kedro_datasets.tracking.JSONDataSet + kedro_datasets.tracking.MetricsDataSet + kedro_datasets.video.VideoDataSet + kedro_datasets.yaml.YAMLDataSet diff --git a/docs/source/tutorial/add_another_pipeline.md b/docs/source/tutorial/add_another_pipeline.md index 34f29009b9..d85620dfa7 100644 --- a/docs/source/tutorial/add_another_pipeline.md +++ b/docs/source/tutorial/add_another_pipeline.md @@ -518,7 +518,7 @@ kedro run --runner=ThreadRunner kedro run --runner=module.path.to.my.runner ``` -`ParallelRunner` performs task parallelisation via multiprocessing, while `ThreadRunner` is intended for use with remote execution engines such as [Spark](../integrations/pyspark_integration.md) and [Dask](/kedro.datasets.dask.ParquetDataSet). +`ParallelRunner` performs task parallelisation via multiprocessing, while `ThreadRunner` is intended for use with remote execution engines such as [Spark](../integrations/pyspark_integration.md) and [Dask](/kedro_datasets.dask.ParquetDataSet). You can find out more about the runners Kedro provides, and how to create your own, in the [pipeline documentation about runners](../nodes_and_pipelines/run_a_pipeline.md). atasets to work with different data formats (including CSV, Excel, and Parquet) diff --git a/docs/source/tutorial/create_a_pipeline.md b/docs/source/tutorial/create_a_pipeline.md index 2e1a1d2edf..d0173a1cc9 100644 --- a/docs/source/tutorial/create_a_pipeline.md +++ b/docs/source/tutorial/create_a_pipeline.md @@ -177,7 +177,7 @@ You should see output similar to the following: ## Preprocessed data registration -Each of the nodes outputs a new dataset (`preprocessed_companies` and `preprocessed_shuttles`). Kedro saves these outputs in Parquet format [pandas.ParquetDataSet](/kedro.datasets.pandas.ParquetDataSet) because they are registered within the [Data Catalog](../resources/glossary.md#data-catalog) as you can see in `conf/base/catalog.yml`: +Each of the nodes outputs a new dataset (`preprocessed_companies` and `preprocessed_shuttles`). 
Kedro saves these outputs in Parquet format [pandas.ParquetDataSet](/kedro_datasets.pandas.ParquetDataSet) because they are registered within the [Data Catalog](../resources/glossary.md#data-catalog) as you can see in `conf/base/catalog.yml`:
Click to expand diff --git a/docs/source/tutorial/set_up_data.md b/docs/source/tutorial/set_up_data.md index a282abe4cd..364818b3a1 100644 --- a/docs/source/tutorial/set_up_data.md +++ b/docs/source/tutorial/set_up_data.md @@ -118,7 +118,7 @@ When you have finished, close `ipython` session with `exit()`. ### Custom data -[Kedro supports numerous datasets](/kedro.datasets) out of the box, but you can also add support for any proprietary data format or filesystem. +[Kedro supports numerous datasets](/kedro_datasets) out of the box, but you can also add support for any proprietary data format or filesystem. You can find further information about [how to add support for custom datasets](../extend_kedro/custom_datasets.md) in specific documentation covering advanced usage. diff --git a/docs/source/visualisation/visualise_charts_with_plotly.md b/docs/source/visualisation/visualise_charts_with_plotly.md index 2821900a49..5b14d2c635 100644 --- a/docs/source/visualisation/visualise_charts_with_plotly.md +++ b/docs/source/visualisation/visualise_charts_with_plotly.md @@ -173,7 +173,7 @@ Integrating Matplotlib into Kedro-Viz allows you to output charts as part of pip The MatplotlibWriter dataset converts Matplotlib objects to image files. This means that Matplotlib charts within Kedro-Viz are static and not interactive, unlike the Plotly charts seen above. ``` -You can view Matplotlib charts in Kedro-Viz when you use the [Kedro MatplotLibWriter dataset](/kedro.datasets.matplotlib.MatplotlibWriter). +You can view Matplotlib charts in Kedro-Viz when you use the [Kedro MatplotLibWriter dataset](/kedro_datasets.matplotlib.MatplotlibWriter). ### Update the dependencies diff --git a/kedro/extras/datasets/README.md b/kedro/extras/datasets/README.md index bb9ebfa528..3058ac4ab2 100644 --- a/kedro/extras/datasets/README.md +++ b/kedro/extras/datasets/README.md @@ -1,5 +1,9 @@ # Datasets +> **Warning** +> `kedro.extras.datasets` is deprecated and will be removed in Kedro 0.19, +> install `kedro-datasets` instead by running `pip install kedro-datasets`. + Welcome to `kedro.extras.datasets`, the home of Kedro's data connectors. Here you will find `AbstractDataSet` implementations created by QuantumBlack and external contributors. ## What `AbstractDataSet` implementations are supported? diff --git a/kedro/extras/datasets/__init__.py b/kedro/extras/datasets/__init__.py index bdaf8bf37e..5397e3da98 100644 --- a/kedro/extras/datasets/__init__.py +++ b/kedro/extras/datasets/__init__.py @@ -1,3 +1,19 @@ """``kedro.extras.datasets`` is where you can find all of Kedro's data connectors. These data connectors are implementations of the ``AbstractDataSet``. + +.. warning:: + + ``kedro.extras.datasets`` is deprecated and will be removed in Kedro 0.19. + Refer to :py:mod:`kedro_datasets` for the documentation, and + install ``kedro-datasets`` to avoid breakage by running ``pip install kedro-datasets``. + """ + +from warnings import warn as _warn + +_warn( + "`kedro.extras.datasets` is deprecated and will be removed in Kedro 0.19, " + "install `kedro-datasets` instead by running `pip install kedro-datasets`.", + DeprecationWarning, + stacklevel=2, +) diff --git a/kedro/io/core.py b/kedro/io/core.py index dc64e83e5a..d467e346ee 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -347,6 +347,9 @@ class Version(namedtuple("Version", ["load", "save"])): "intermediate data sets where possible to avoid this warning." 
 )
+# `kedro_datasets` is probed before `kedro.extras.datasets`,
+# hence the DeprecationWarning will not be shown
+# if the dataset is available in the former
 _DEFAULT_PACKAGES = ["kedro.io.", "kedro_datasets.", "kedro.extras.datasets.", ""]

diff --git a/pyproject.toml b/pyproject.toml
index ba356a7e7d..8cdf924f8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 # PEP-518 https://peps.python.org/pep-0518/
 [build-system]
 # Minimum requirements for the build system to execute.
-requires = ["setuptools>=65.5.1", "wheel"] # PEP 518 specifications.
+requires = ["setuptools>=65.5.1"] # PEP 518 specifications

 [project]
 name = "kedro"
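
For readers of this patch, a minimal sketch of how the new deprecation behaviour can be observed — not part of the change itself. It assumes a fresh Python interpreter with this commit applied and uses only the standard `warnings` module. Because `_DEFAULT_PACKAGES` lists `kedro_datasets.` before `kedro.extras.datasets.`, catalog entries written with bare class paths (for example `pandas.CSVDataSet`) resolve against `kedro-datasets` first, so the warning below only surfaces when the legacy package is imported directly or used as a fallback.

```python
# Sketch: observing the DeprecationWarning added in kedro/extras/datasets/__init__.py.
# Assumes a fresh interpreter (the warning fires once, at first import of the module).
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")  # ensure DeprecationWarning is not silenced
    import kedro.extras.datasets  # noqa: F401  # triggers the module-level warn() call

for w in caught:
    if issubclass(w.category, DeprecationWarning):
        print(w.message)
# Expected output (wording as added by this patch):
# `kedro.extras.datasets` is deprecated and will be removed in Kedro 0.19,
# install `kedro-datasets` instead by running `pip install kedro-datasets`.
```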