From 20071e5a2696a2b87358b901ec5e176cafc53bb2 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 17 Jun 2024 12:34:52 +0100 Subject: [PATCH 1/4] docs(website): perform casts for PyTorch as needed --- docs/tutorial/scikit-learn.qmd | 6 +----- docs/tutorial/xgboost.qmd | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/docs/tutorial/scikit-learn.qmd b/docs/tutorial/scikit-learn.qmd index 042f0e1..53a5637 100644 --- a/docs/tutorial/scikit-learn.qmd +++ b/docs/tutorial/scikit-learn.qmd @@ -81,8 +81,7 @@ weather flight_data = ( flights.mutate( # Convert the arrival delay to a factor - # By default, PyTorch expects the target to have a Long datatype - arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast("int64"), + arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0), # We will use the date (not date-time) in the recipe below date=flights.time_hour.date(), ) @@ -167,9 +166,6 @@ flights_rec = ml.Recipe( ml.DropZeroVariance(ml.everything()), ml.MutateAt("dep_time", ibis._.hour() * 60 + ibis._.minute()), ml.MutateAt(ml.timestamp(), ibis._.epoch_seconds()), - # By default, PyTorch requires that the type of `X` is `np.float32`. - # https://discuss.pytorch.org/t/mat1-and-mat2-must-have-the-same-dtype-but-got-double-and-float/197555/2 - ml.Cast(ml.numeric(), "float32"), ) ``` diff --git a/docs/tutorial/xgboost.qmd b/docs/tutorial/xgboost.qmd index 8f37f48..9ef7a27 100644 --- a/docs/tutorial/xgboost.qmd +++ b/docs/tutorial/xgboost.qmd @@ -81,8 +81,7 @@ weather flight_data = ( flights.mutate( # Convert the arrival delay to a factor - # By default, PyTorch expects the target to have a Long datatype - arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast("int64"), + arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0), # We will use the date (not date-time) in the recipe below date=flights.time_hour.date(), ) @@ -167,9 +166,6 @@ flights_rec = ml.Recipe( ml.DropZeroVariance(ml.everything()), ml.MutateAt("dep_time", ibis._.hour() * 60 + ibis._.minute()), ml.MutateAt(ml.timestamp(), ibis._.epoch_seconds()), - # By default, PyTorch requires that the type of `X` is `np.float32`. - # https://discuss.pytorch.org/t/mat1-and-mat2-must-have-the-same-dtype-but-got-double-and-float/197555/2 - ml.Cast(ml.numeric(), "float32"), ) ``` From a4919fb973e5a725dc9678d78f67697d393183f8 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 17 Jun 2024 12:57:29 +0100 Subject: [PATCH 2/4] docs: acknowledge original source for the tutorial --- docs/tutorial/_acknowledgments.md | 3 +++ docs/tutorial/pytorch.qmd | 2 ++ docs/tutorial/scikit-learn.qmd | 2 ++ docs/tutorial/xgboost.qmd | 2 ++ examples/Preprocess your data with recipes.ipynb | 10 ++++++++++ 5 files changed, 19 insertions(+) create mode 100644 docs/tutorial/_acknowledgments.md diff --git a/docs/tutorial/_acknowledgments.md b/docs/tutorial/_acknowledgments.md new file mode 100644 index 0000000..af0fdb7 --- /dev/null +++ b/docs/tutorial/_acknowledgments.md @@ -0,0 +1,3 @@ +## Acknowledgments + +This tutorial is derived from the [tidymodels article of the same name](https://www.tidymodels.org/start/recipes/). The transformation logic is very similar, and much of the text is copied verbatim. diff --git a/docs/tutorial/pytorch.qmd b/docs/tutorial/pytorch.qmd index b2c8c97..b616210 100644 --- a/docs/tutorial/pytorch.qmd +++ b/docs/tutorial/pytorch.qmd @@ -240,3 +240,5 @@ X_test = test_data.drop("arr_delay") y_test = test_data.arr_delay pipe.score(X_test, y_test) ``` + +{{< include _acknowledgments.md >}} diff --git a/docs/tutorial/scikit-learn.qmd b/docs/tutorial/scikit-learn.qmd index 53a5637..5131659 100644 --- a/docs/tutorial/scikit-learn.qmd +++ b/docs/tutorial/scikit-learn.qmd @@ -207,3 +207,5 @@ X_test = test_data.drop("arr_delay") y_test = test_data.arr_delay pipe.score(X_test, y_test) ``` + +{{< include _acknowledgments.md >}} diff --git a/docs/tutorial/xgboost.qmd b/docs/tutorial/xgboost.qmd index 9ef7a27..70b231b 100644 --- a/docs/tutorial/xgboost.qmd +++ b/docs/tutorial/xgboost.qmd @@ -207,3 +207,5 @@ X_test = test_data.drop("arr_delay") y_test = test_data.arr_delay pipe.score(X_test, y_test) ``` + +{{< include _acknowledgments.md >}} diff --git a/examples/Preprocess your data with recipes.ipynb b/examples/Preprocess your data with recipes.ipynb index 14d1139..0263a68 100644 --- a/examples/Preprocess your data with recipes.ipynb +++ b/examples/Preprocess your data with recipes.ipynb @@ -1221,6 +1221,16 @@ "y_test = test_data.arr_delay\n", "pipe.score(X_test, y_test)" ] + }, + { + "cell_type": "markdown", + "id": "cc21b842-b85c-4ed9-af03-1feace909172", + "metadata": {}, + "source": [ + "## Acknowledgments\n", + "\n", + "This tutorial is derived from the [tidymodels article of the same name](https://www.tidymodels.org/start/recipes/). The transformation logic is very similar, and much of the text is copied verbatim." + ] } ], "metadata": { From 5337593cdbcdf3881b4f23464c47c35d817d8195 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 17 Jun 2024 13:01:07 +0100 Subject: [PATCH 3/4] docs: replace "data set" with "dataset" everywhere --- docs/tutorial/pytorch.qmd | 4 ++-- docs/tutorial/scikit-learn.qmd | 4 ++-- docs/tutorial/xgboost.qmd | 4 ++-- examples/Preprocess your data with recipes.ipynb | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/tutorial/pytorch.qmd b/docs/tutorial/pytorch.qmd index b616210..94a09dd 100644 --- a/docs/tutorial/pytorch.qmd +++ b/docs/tutorial/pytorch.qmd @@ -28,7 +28,7 @@ pip install 'ibis-framework[duckdb,examples]' ibis-ml skorch torch ## The New York City flight data -Let's use the [nycflights13 data](https://github.com/hadley/nycflights13) to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let's start by loading the data and making a few changes to the variables: +Let's use the [nycflights13 data](https://github.com/hadley/nycflights13) to predict whether a plane arrives more than 30 minutes late. This dataset contains information on 325,819 flights departing near New York City in 2013. Let's start by loading the data and making a few changes to the variables: ```{python} #| output: false @@ -107,7 +107,7 @@ flight_data = ( flight_data ``` -We can see that about 16% of the flights in this data set arrived more than 30 minutes late. +We can see that about 16% of the flights in this dataset arrived more than 30 minutes late. ```{python} flight_data.arr_delay.value_counts().rename(n="arr_delay_count").mutate( diff --git a/docs/tutorial/scikit-learn.qmd b/docs/tutorial/scikit-learn.qmd index 5131659..cefdf96 100644 --- a/docs/tutorial/scikit-learn.qmd +++ b/docs/tutorial/scikit-learn.qmd @@ -28,7 +28,7 @@ pip install 'ibis-framework[duckdb,examples]' ibis-ml scikit-learn ## The New York City flight data -Let's use the [nycflights13 data](https://github.com/hadley/nycflights13) to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let's start by loading the data and making a few changes to the variables: +Let's use the [nycflights13 data](https://github.com/hadley/nycflights13) to predict whether a plane arrives more than 30 minutes late. This dataset contains information on 325,819 flights departing near New York City in 2013. Let's start by loading the data and making a few changes to the variables: ```{python} #| output: false @@ -106,7 +106,7 @@ flight_data = ( flight_data ``` -We can see that about 16% of the flights in this data set arrived more than 30 minutes late. +We can see that about 16% of the flights in this dataset arrived more than 30 minutes late. ```{python} flight_data.arr_delay.value_counts().rename(n="arr_delay_count").mutate( diff --git a/docs/tutorial/xgboost.qmd b/docs/tutorial/xgboost.qmd index 70b231b..8f53aff 100644 --- a/docs/tutorial/xgboost.qmd +++ b/docs/tutorial/xgboost.qmd @@ -28,7 +28,7 @@ pip install 'ibis-framework[duckdb,examples]' ibis-ml 'xgboost[scikit-learn]' ## The New York City flight data -Let's use the [nycflights13 data](https://github.com/hadley/nycflights13) to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let's start by loading the data and making a few changes to the variables: +Let's use the [nycflights13 data](https://github.com/hadley/nycflights13) to predict whether a plane arrives more than 30 minutes late. This dataset contains information on 325,819 flights departing near New York City in 2013. Let's start by loading the data and making a few changes to the variables: ```{python} #| output: false @@ -106,7 +106,7 @@ flight_data = ( flight_data ``` -We can see that about 16% of the flights in this data set arrived more than 30 minutes late. +We can see that about 16% of the flights in this dataset arrived more than 30 minutes late. ```{python} flight_data.arr_delay.value_counts().rename(n="arr_delay_count").mutate( diff --git a/examples/Preprocess your data with recipes.ipynb b/examples/Preprocess your data with recipes.ipynb index 0263a68..6c4a4b7 100644 --- a/examples/Preprocess your data with recipes.ipynb +++ b/examples/Preprocess your data with recipes.ipynb @@ -33,7 +33,7 @@ "source": [ "## The New York City flight data\n", "\n", - "Let's use the [nycflights13 data](https://github.com/hadley/nycflights13) to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let's start by loading the data and making a few changes to the variables:" + "Let's use the [nycflights13 data](https://github.com/hadley/nycflights13) to predict whether a plane arrives more than 30 minutes late. This dataset contains information on 325,819 flights departing near New York City in 2013. Let's start by loading the data and making a few changes to the variables:" ] }, { @@ -317,7 +317,7 @@ "id": "722b2213-3b84-4f03-9006-59bf72591613", "metadata": {}, "source": [ - "We can see that about 16% of the flights in this data set arrived more than 30 minutes late." + "We can see that about 16% of the flights in this dataset arrived more than 30 minutes late." ] }, { From d9fca48ba3d96ee8e264366ccd2545524b7ffa12 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 17 Jun 2024 14:59:09 +0100 Subject: [PATCH 4/4] docs(website): move support matrix under reference --- docs/_quarto.yml | 6 ++++-- .../support-matrix/index.qmd} | 8 ++++---- docs/{ => reference/support-matrix}/step_config.yml | 0 docs/{ => reference/support-matrix}/support_matrix.py | 0 4 files changed, 8 insertions(+), 6 deletions(-) rename docs/{support_matrix.qmd => reference/support-matrix/index.qmd} (94%) rename docs/{ => reference/support-matrix}/step_config.yml (100%) rename docs/{ => reference/support-matrix}/support_matrix.py (100%) diff --git a/docs/_quarto.yml b/docs/_quarto.yml index f00b0e4..48b0785 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -48,8 +48,6 @@ website: - text: "Tutorial" href: tutorial/index.qmd - sidebar:reference - - text: "Support Matrix" - href: support_matrix.qmd tools: - icon: github menu: @@ -82,6 +80,10 @@ website: - reference/steps-temporal.qmd - reference/steps-other.qmd + - section: Support + contents: + - reference/support-matrix/index.qmd + format: html: theme: diff --git a/docs/support_matrix.qmd b/docs/reference/support-matrix/index.qmd similarity index 94% rename from docs/support_matrix.qmd rename to docs/reference/support-matrix/index.qmd index efc5438..77ba107 100644 --- a/docs/support_matrix.qmd +++ b/docs/reference/support-matrix/index.qmd @@ -20,7 +20,7 @@ varies: 1. ✅ Fully supported 2. 🚫 Not supported 3. 🟡 Partial support (hover over for more information) -4. 🔍 Support varies by operation or data type across different backends (check [operation support matrix](https://ibis-project.org/support_matrix) for details) +4. 🔍 Support varies by operation or data type across different backends (check the [Ibis operation support matrix](https://ibis-project.org/backends/support/matrix) for details) ::: ::: @@ -65,11 +65,10 @@ dict(value=len(ibis.util.backend_entry_points()) - 3, color="green", icon="datab ```{python} from itables import show from support_matrix import make_support_matrix -import re - matrix = make_support_matrix() + def custom_replace(value): if value is True: return "✅" @@ -80,10 +79,11 @@ def custom_replace(value): else: return f"🟡" + show( matrix.applymap(custom_replace), ordering=False, paging=False, buttons=["copy", "excel", "csv"], ) -``` \ No newline at end of file +``` diff --git a/docs/step_config.yml b/docs/reference/support-matrix/step_config.yml similarity index 100% rename from docs/step_config.yml rename to docs/reference/support-matrix/step_config.yml diff --git a/docs/support_matrix.py b/docs/reference/support-matrix/support_matrix.py similarity index 100% rename from docs/support_matrix.py rename to docs/reference/support-matrix/support_matrix.py