diff --git a/index.html b/index.html index 80fd74f..f09354e 100644 --- a/index.html +++ b/index.html @@ -263,7 +263,7 @@

Install IbisML

Create your first recipe

With recipes, you can define sequences of feature engineering steps to get your data ready for modeling. For example, create a recipe to replace missing values using the mean of each numeric column and then normalize numeric data to have a standard deviation of one and a mean of zero.

-
+
import ibis_ml as ml
 
 imputer = ml.ImputeMean(ml.numeric())
@@ -271,14 +271,14 @@ 

Create your first rec = ml.Recipe(imputer, scaler)

A recipe can be chained in a Pipeline like any other transformer.

-
+
from sklearn.pipeline import Pipeline
 from sklearn.svm import SVC
 
 pipe = Pipeline([("rec", rec), ("svc", SVC())])

The pipeline can be used as any other estimator and avoids leaking the test set into the train set.

-
+
from sklearn.datasets import make_classification
 from sklearn.model_selection import train_test_split
 
diff --git a/search.json b/search.json
index be3d46f..dd07f04 100644
--- a/search.json
+++ b/search.json
@@ -11,21 +11,21 @@
     "href": "tutorial/xgboost.html#introduction",
     "title": "Preprocess your data with recipes",
     "section": "Introduction",
-    "text": "Introduction\n…\n\nimport ibis\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.create_table(\n    \"flights\", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True\n)\ncon.create_table(\n    \"weather\", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True\n)\n\nYou can now see the example dataset copied over to the database:\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.list_tables()\n\n['flights', 'weather']\n\n\nWe’ll turn on interactive mode, which partially executes queries to give users a preview of the results.\n\nibis.options.interactive = True\n\n\nflights = con.table(\"flights\")\nflights = flights.mutate(\n    dep_time=(\n        flights.dep_time.lpad(4, \"0\").substr(0, 2)\n        + \":\"\n        + flights.dep_time.substr(-2, 2)\n        + \":00\"\n    ).try_cast(\"time\"),\n    arr_delay=flights.arr_delay.try_cast(int),\n    air_time=flights.air_time.try_cast(int),\n)\nflights\n\n┏━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ year  ┃ month ┃ day   ┃ dep_time ┃ sched_dep_time ┃ dep_delay ┃ arr_time ┃ sched_arr_time ┃ arr_delay ┃ carrier ┃ flight ┃ tailnum ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ hour  ┃ minute ┃ time_hour           ┃\n┡━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ int64 │ int64 │ int64 │ time     │ int64          │ string    │ string   │ int64          │ int64     │ string  │ int64  │ string  │ string │ string │ int64    │ int64    │ int64 │ int64  │ timestamp(6)        │\n├───────┼───────┼───────┼──────────┼────────────────┼───────────┼──────────┼────────────────┼───────────┼─────────┼────────┼─────────┼────────┼────────┼──────────┼──────────┼───────┼────────┼─────────────────────┤\n│  2013 │     1 │     1 │ 05:17:00 │            515 │ 2         │ 830      │            819 │        11 │ UA      │   1545 │ N14228  │ EWR    │ IAH    │      227 │     1400 │     5 │     15 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:33:00 │            529 │ 4         │ 850      │            830 │        20 │ UA      │   1714 │ N24211  │ LGA    │ IAH    │      227 │     1416 │     5 │     29 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:42:00 │            540 │ 2         │ 923      │            850 │        33 │ AA      │   1141 │ N619AA  │ JFK    │ MIA    │      160 │     1089 │     5 │     40 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:44:00 │            545 │ -1        │ 1004     │           1022 │       -18 │ B6      │    725 │ N804JB  │ JFK    │ BQN    │      183 │     1576 │     5 │     45 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            600 │ -6        │ 812      │            837 │       -25 │ DL      │    461 │ N668DN  │ LGA    │ ATL    │      116 │      762 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            558 │ -4        │ 740      │            728 │        12 │ UA      │   1696 │ N39463  │ EWR    │ ORD    │      150 │      719 │     5 │     58 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:55:00 │            600 │ -5        │ 913      │            854 │        19 │ B6      │    507 │ N516JB  │ EWR    │ FLL    │      158 │     1065 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 709      │            723 │       -14 │ EV      │   5708 │ N829AS  │ LGA    │ IAD    │       53 │      229 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 838      │            846 │        -8 │ B6      │     79 │ N593JB  │ JFK    │ MCO    │      140 │      944 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:58:00 │            600 │ -2        │ 753      │            745 │         8 │ AA      │    301 │ N3ALAA  │ LGA    │ ORD    │      138 │      733 │     6 │      0 │ 2013-01-01 11:00:00 │\n│     … │     … │     … │ …        │              … │ …         │ …        │              … │         … │ …       │      … │ …       │ …      │ …      │        … │        … │     … │      … │ …                   │\n└───────┴───────┴───────┴──────────┴────────────────┴───────────┴──────────┴────────────────┴───────────┴─────────┴────────┴─────────┴────────┴────────┴──────────┴──────────┴───────┴────────┴─────────────────────┘\n\n\n\n\nweather = con.table(\"weather\")\nweather\n\n┏━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ origin ┃ year  ┃ month ┃ day   ┃ hour  ┃ temp   ┃ dewp   ┃ humid  ┃ wind_dir ┃ wind_speed         ┃ wind_gust ┃ precip  ┃ pressure ┃ visib   ┃ time_hour           ┃\n┡━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ string │ int64 │ int64 │ int64 │ int64 │ string │ string │ string │ string   │ string             │ string    │ float64 │ string   │ float64 │ timestamp(6)        │\n├────────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼──────────┼────────────────────┼───────────┼─────────┼──────────┼─────────┼─────────────────────┤\n│ EWR    │  2013 │     1 │     1 │     1 │ 39.02  │ 26.06  │ 59.37  │ 270      │ 10.357019999999999 │ NA        │     0.0 │ 1012     │    10.0 │ 2013-01-01 06:00:00 │\n│ EWR    │  2013 │     1 │     1 │     2 │ 39.02  │ 26.96  │ 61.63  │ 250      │ 8.05546            │ NA        │     0.0 │ 1012.3   │    10.0 │ 2013-01-01 07:00:00 │\n│ EWR    │  2013 │     1 │     1 │     3 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.5   │    10.0 │ 2013-01-01 08:00:00 │\n│ EWR    │  2013 │     1 │     1 │     4 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 12.658579999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 09:00:00 │\n│ EWR    │  2013 │     1 │     1 │     5 │ 39.02  │ 28.04  │ 64.43  │ 260      │ 12.658579999999999 │ NA        │     0.0 │ 1011.9   │    10.0 │ 2013-01-01 10:00:00 │\n│ EWR    │  2013 │     1 │     1 │     6 │ 37.94  │ 28.04  │ 67.21  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 11:00:00 │\n│ EWR    │  2013 │     1 │     1 │     7 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 14.960139999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 12:00:00 │\n│ EWR    │  2013 │     1 │     1 │     8 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 10.357019999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 13:00:00 │\n│ EWR    │  2013 │     1 │     1 │     9 │ 39.92  │ 28.04  │ 62.21  │ 260      │ 14.960139999999999 │ NA        │     0.0 │ 1012.7   │    10.0 │ 2013-01-01 14:00:00 │\n│ EWR    │  2013 │     1 │     1 │    10 │ 41     │ 28.04  │ 59.65  │ 260      │ 13.809359999999998 │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 15:00:00 │\n│ …      │     … │     … │     … │     … │ …      │ …      │ …      │ …        │ …                  │ …         │       … │ …        │       … │ …                   │\n└────────┴───────┴───────┴───────┴───────┴────────┴────────┴────────┴──────────┴────────────────────┴───────────┴─────────┴──────────┴─────────┴─────────────────────┘"
+    "text": "Introduction\nIn this article, we’ll explore Recipes, which are designed to help you preprocess your data before training your model. Recipes are built as a series of preprocessing steps, such as:\n\nconverting qualitative predictors to indicator variables (also known as dummy variables),\ntransforming data to be on a different scale (e.g., taking the logarithm of a variable),\ntransforming whole groups of predictors together,\nextracting key features from raw variables (e.g., getting the day of the week out of a date variable),\n\nand so on. If you are familiar with scikit-learn’s dataset transformations, a lot of this might sound familiar and like what a transformer already does. Recipes can be used to do many of the same things, but they can scale your workloads on any Ibis-supported backend. This article shows how to use recipes for modeling.\nTo use code in this article, you will need to install the following packages: Ibis, IbisML, and XGBoost.\npip install 'ibis-framework[duckdb,examples]' ibis-ml 'xgboost[scikit-learn]'"
   },
   {
     "objectID": "tutorial/xgboost.html#the-new-york-city-flight-data",
     "href": "tutorial/xgboost.html#the-new-york-city-flight-data",
     "title": "Preprocess your data with recipes",
     "section": "The New York City flight data",
-    "text": "The New York City flight data\nLet’s use the nycflights13 data to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let’s start by loading the data and making a few changes to the variables:\n\nflight_data = (\n    flights.mutate(\n        # Convert the arrival delay to a factor\n        # By default, PyTorch expects the target to have a Long datatype\n        arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast(\"int64\"),\n        # We will use the date (not date-time) in the recipe below\n        date=flights.time_hour.date(),\n    )\n    # Include the weather data\n    .inner_join(weather, [\"origin\", \"time_hour\"])\n    # Only retain the specific columns we will use\n    .select(\n        \"dep_time\",\n        \"flight\",\n        \"origin\",\n        \"dest\",\n        \"air_time\",\n        \"distance\",\n        \"carrier\",\n        \"date\",\n        \"arr_delay\",\n        \"time_hour\",\n    )\n    # Exclude missing data\n    .dropna()\n)\nflight_data\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤\n│ 05:57:00 │    461 │ LGA    │ ATL    │      100 │      762 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 05:58:00 │   4424 │ EWR    │ RDU    │       63 │      416 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 05:58:00 │   6177 │ EWR    │ IAD    │       45 │      212 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:00:00 │    731 │ LGA    │ DTW    │       78 │      502 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:01:00 │    684 │ EWR    │ LAX    │      316 │     2454 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:01:00 │    301 │ LGA    │ ORD    │      164 │      733 │ AA      │ 2013-06-26 │         1 │ 2013-06-26 10:00:00 │\n│ 06:01:00 │   1837 │ LGA    │ MIA    │      148 │     1096 │ AA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:01:00 │   1279 │ LGA    │ MEM    │      128 │      963 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:02:00 │   1691 │ JFK    │ LAX    │      309 │     2475 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:04:00 │   1447 │ JFK    │ CLT    │       75 │      541 │ US      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘\n\n\n\nWe can see that about 16% of the flights in this data set arrived more than 30 minutes late.\n\nflight_data.arr_delay.value_counts().rename(n=\"arr_delay_count\").mutate(\n    prop=ibis._.n / ibis._.n.sum()\n)\n\n┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓\n┃ arr_delay ┃ n      ┃ prop     ┃\n┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━┩\n│ int64     │ int64  │ float64  │\n├───────────┼────────┼──────────┤\n│         0 │ 273279 │ 0.838745 │\n│         1 │  52540 │ 0.161255 │\n└───────────┴────────┴──────────┘"
+    "text": "The New York City flight data\nLet’s use the nycflights13 data to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let’s start by loading the data and making a few changes to the variables:\n\nimport ibis\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.create_table(\n    \"flights\", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True\n)\ncon.create_table(\n    \"weather\", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True\n)\n\nYou can now see the example dataset copied over to the database:\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.list_tables()\n\n['flights', 'weather']\n\n\nWe’ll turn on interactive mode, which partially executes queries to give users a preview of the results.\n\nibis.options.interactive = True\n\n\nflights = con.table(\"flights\")\nflights = flights.mutate(\n    dep_time=(\n        flights.dep_time.lpad(4, \"0\").substr(0, 2)\n        + \":\"\n        + flights.dep_time.substr(-2, 2)\n        + \":00\"\n    ).try_cast(\"time\"),\n    arr_delay=flights.arr_delay.try_cast(int),\n    air_time=flights.air_time.try_cast(int),\n)\nflights\n\n┏━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ year  ┃ month ┃ day   ┃ dep_time ┃ sched_dep_time ┃ dep_delay ┃ arr_time ┃ sched_arr_time ┃ arr_delay ┃ carrier ┃ flight ┃ tailnum ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ hour  ┃ minute ┃ time_hour           ┃\n┡━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ int64 │ int64 │ int64 │ time     │ int64          │ string    │ string   │ int64          │ int64     │ string  │ int64  │ string  │ string │ string │ int64    │ int64    │ int64 │ int64  │ timestamp(6)        │\n├───────┼───────┼───────┼──────────┼────────────────┼───────────┼──────────┼────────────────┼───────────┼─────────┼────────┼─────────┼────────┼────────┼──────────┼──────────┼───────┼────────┼─────────────────────┤\n│  2013 │     1 │     1 │ 05:17:00 │            515 │ 2         │ 830      │            819 │        11 │ UA      │   1545 │ N14228  │ EWR    │ IAH    │      227 │     1400 │     5 │     15 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:33:00 │            529 │ 4         │ 850      │            830 │        20 │ UA      │   1714 │ N24211  │ LGA    │ IAH    │      227 │     1416 │     5 │     29 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:42:00 │            540 │ 2         │ 923      │            850 │        33 │ AA      │   1141 │ N619AA  │ JFK    │ MIA    │      160 │     1089 │     5 │     40 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:44:00 │            545 │ -1        │ 1004     │           1022 │       -18 │ B6      │    725 │ N804JB  │ JFK    │ BQN    │      183 │     1576 │     5 │     45 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            600 │ -6        │ 812      │            837 │       -25 │ DL      │    461 │ N668DN  │ LGA    │ ATL    │      116 │      762 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            558 │ -4        │ 740      │            728 │        12 │ UA      │   1696 │ N39463  │ EWR    │ ORD    │      150 │      719 │     5 │     58 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:55:00 │            600 │ -5        │ 913      │            854 │        19 │ B6      │    507 │ N516JB  │ EWR    │ FLL    │      158 │     1065 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 709      │            723 │       -14 │ EV      │   5708 │ N829AS  │ LGA    │ IAD    │       53 │      229 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 838      │            846 │        -8 │ B6      │     79 │ N593JB  │ JFK    │ MCO    │      140 │      944 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:58:00 │            600 │ -2        │ 753      │            745 │         8 │ AA      │    301 │ N3ALAA  │ LGA    │ ORD    │      138 │      733 │     6 │      0 │ 2013-01-01 11:00:00 │\n│     … │     … │     … │ …        │              … │ …         │ …        │              … │         … │ …       │      … │ …       │ …      │ …      │        … │        … │     … │      … │ …                   │\n└───────┴───────┴───────┴──────────┴────────────────┴───────────┴──────────┴────────────────┴───────────┴─────────┴────────┴─────────┴────────┴────────┴──────────┴──────────┴───────┴────────┴─────────────────────┘\n\n\n\n\nweather = con.table(\"weather\")\nweather\n\n┏━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ origin ┃ year  ┃ month ┃ day   ┃ hour  ┃ temp   ┃ dewp   ┃ humid  ┃ wind_dir ┃ wind_speed         ┃ wind_gust ┃ precip  ┃ pressure ┃ visib   ┃ time_hour           ┃\n┡━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ string │ int64 │ int64 │ int64 │ int64 │ string │ string │ string │ string   │ string             │ string    │ float64 │ string   │ float64 │ timestamp(6)        │\n├────────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼──────────┼────────────────────┼───────────┼─────────┼──────────┼─────────┼─────────────────────┤\n│ EWR    │  2013 │     1 │     1 │     1 │ 39.02  │ 26.06  │ 59.37  │ 270      │ 10.357019999999999 │ NA        │     0.0 │ 1012     │    10.0 │ 2013-01-01 06:00:00 │\n│ EWR    │  2013 │     1 │     1 │     2 │ 39.02  │ 26.96  │ 61.63  │ 250      │ 8.05546            │ NA        │     0.0 │ 1012.3   │    10.0 │ 2013-01-01 07:00:00 │\n│ EWR    │  2013 │     1 │     1 │     3 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.5   │    10.0 │ 2013-01-01 08:00:00 │\n│ EWR    │  2013 │     1 │     1 │     4 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 12.658579999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 09:00:00 │\n│ EWR    │  2013 │     1 │     1 │     5 │ 39.02  │ 28.04  │ 64.43  │ 260      │ 12.658579999999999 │ NA        │     0.0 │ 1011.9   │    10.0 │ 2013-01-01 10:00:00 │\n│ EWR    │  2013 │     1 │     1 │     6 │ 37.94  │ 28.04  │ 67.21  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 11:00:00 │\n│ EWR    │  2013 │     1 │     1 │     7 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 14.960139999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 12:00:00 │\n│ EWR    │  2013 │     1 │     1 │     8 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 10.357019999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 13:00:00 │\n│ EWR    │  2013 │     1 │     1 │     9 │ 39.92  │ 28.04  │ 62.21  │ 260      │ 14.960139999999999 │ NA        │     0.0 │ 1012.7   │    10.0 │ 2013-01-01 14:00:00 │\n│ EWR    │  2013 │     1 │     1 │    10 │ 41     │ 28.04  │ 59.65  │ 260      │ 13.809359999999998 │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 15:00:00 │\n│ …      │     … │     … │     … │     … │ …      │ …      │ …      │ …        │ …                  │ …         │       … │ …        │       … │ …                   │\n└────────┴───────┴───────┴───────┴───────┴────────┴────────┴────────┴──────────┴────────────────────┴───────────┴─────────┴──────────┴─────────┴─────────────────────┘\n\n\n\n\nflight_data = (\n    flights.mutate(\n        # Convert the arrival delay to a factor\n        # By default, PyTorch expects the target to have a Long datatype\n        arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast(\"int64\"),\n        # We will use the date (not date-time) in the recipe below\n        date=flights.time_hour.date(),\n    )\n    # Include the weather data\n    .inner_join(weather, [\"origin\", \"time_hour\"])\n    # Only retain the specific columns we will use\n    .select(\n        \"dep_time\",\n        \"flight\",\n        \"origin\",\n        \"dest\",\n        \"air_time\",\n        \"distance\",\n        \"carrier\",\n        \"date\",\n        \"arr_delay\",\n        \"time_hour\",\n    )\n    # Exclude missing data\n    .dropna()\n)\nflight_data\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤\n│ 05:17:00 │   1545 │ EWR    │ IAH    │      227 │     1400 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 10:00:00 │\n│ 05:54:00 │    461 │ LGA    │ ATL    │      116 │      762 │ DL      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:54:00 │   1696 │ EWR    │ ORD    │      150 │      719 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 10:00:00 │\n│ 05:55:00 │    507 │ EWR    │ FLL    │      158 │     1065 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:57:00 │   5708 │ LGA    │ IAD    │       53 │      229 │ EV      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:57:00 │     79 │ JFK    │ MCO    │      140 │      944 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:58:00 │    301 │ LGA    │ ORD    │      138 │      733 │ AA      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:58:00 │     49 │ JFK    │ PBI    │      149 │     1028 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:58:00 │     71 │ JFK    │ TPA    │      158 │     1005 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:58:00 │    194 │ JFK    │ LAX    │      345 │     2475 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘\n\n\n\nWe can see that about 16% of the flights in this data set arrived more than 30 minutes late.\n\nflight_data.arr_delay.value_counts().rename(n=\"arr_delay_count\").mutate(\n    prop=ibis._.n / ibis._.n.sum()\n)\n\n┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓\n┃ arr_delay ┃ n      ┃ prop     ┃\n┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━┩\n│ int64     │ int64  │ float64  │\n├───────────┼────────┼──────────┤\n│         0 │ 273279 │ 0.838745 │\n│         1 │  52540 │ 0.161255 │\n└───────────┴────────┴──────────┘"
   },
   {
     "objectID": "tutorial/xgboost.html#data-splitting",
     "href": "tutorial/xgboost.html#data-splitting",
     "title": "Preprocess your data with recipes",
     "section": "Data splitting",
-    "text": "Data splitting\nTo get started, let’s split this single dataset into two: a training set and a testing set. We’ll keep most of the rows in the original dataset (subset chosen randomly) in the training set. The training data will be used to fit the model, and the testing set will be used to measure model performance.\nBecause the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.\n\nflight_data_with_unique_key = flight_data.mutate(\n    unique_key=ibis.literal(\",\").join(\n        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]\n    )\n)\nflight_data_with_unique_key\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┤\n│ 05:57:00 │    461 │ LGA    │ ATL    │      100 │      762 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,461,2013-06-26  │\n│ 05:58:00 │   4424 │ EWR    │ RDU    │       63 │      416 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ EV,4424,2013-06-26 │\n│ 05:58:00 │   6177 │ EWR    │ IAD    │       45 │      212 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ EV,6177,2013-06-26 │\n│ 06:00:00 │    731 │ LGA    │ DTW    │       78 │      502 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,731,2013-06-26  │\n│ 06:01:00 │    684 │ EWR    │ LAX    │      316 │     2454 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ UA,684,2013-06-26  │\n│ 06:01:00 │    301 │ LGA    │ ORD    │      164 │      733 │ AA      │ 2013-06-26 │         1 │ 2013-06-26 10:00:00 │ AA,301,2013-06-26  │\n│ 06:01:00 │   1837 │ LGA    │ MIA    │      148 │     1096 │ AA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ AA,1837,2013-06-26 │\n│ 06:01:00 │   1279 │ LGA    │ MEM    │      128 │      963 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,1279,2013-06-26 │\n│ 06:02:00 │   1691 │ JFK    │ LAX    │      309 │     2475 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ UA,1691,2013-06-26 │\n│ 06:04:00 │   1447 │ JFK    │ CLT    │       75 │      541 │ US      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ US,1447,2013-06-26 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┘\n\n\n\n\n# FIXME(deepyaman): Proposed key isn't unique for actual departure date.\nflight_data_with_unique_key.group_by(\"unique_key\").mutate(\n    cnt=flight_data_with_unique_key.count()\n)[ibis._.cnt > 1]\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃ cnt   ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │ int64 │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┼───────┤\n│ 19:48:00 │   3450 │ JFK    │ JAX    │      116 │      828 │ 9E      │ 2013-05-02 │         0 │ 2013-05-02 00:00:00 │ 9E,3450,2013-05-02 │     2 │\n│ 19:55:00 │   3450 │ JFK    │ JAX    │      107 │      828 │ 9E      │ 2013-05-02 │         0 │ 2013-05-02 23:00:00 │ 9E,3450,2013-05-02 │     2 │\n│ 18:27:00 │   4033 │ LGA    │ TYS    │       98 │      647 │ 9E      │ 2013-12-28 │         0 │ 2013-12-28 22:00:00 │ 9E,4033,2013-12-28 │     2 │\n│ 19:01:00 │   4033 │ LGA    │ TYS    │      100 │      647 │ 9E      │ 2013-12-28 │         0 │ 2013-12-28 00:00:00 │ 9E,4033,2013-12-28 │     2 │\n│ 17:57:00 │   1211 │ LGA    │ RSW    │      148 │     1080 │ DL      │ 2013-12-03 │         0 │ 2013-12-03 22:00:00 │ DL,1211,2013-12-03 │     2 │\n│ 19:43:00 │   1211 │ LGA    │ RSW    │      152 │     1080 │ DL      │ 2013-12-03 │         0 │ 2013-12-03 00:00:00 │ DL,1211,2013-12-03 │     2 │\n│ 05:59:00 │   1318 │ EWR    │ DTW    │       99 │      488 │ DL      │ 2013-01-03 │         0 │ 2013-01-03 11:00:00 │ DL,1318,2013-01-03 │     2 │\n│ 20:28:00 │   1318 │ JFK    │ FLL    │      158 │     1069 │ DL      │ 2013-01-03 │         0 │ 2013-01-03 01:00:00 │ DL,1318,2013-01-03 │     2 │\n│ 19:19:00 │   2139 │ LGA    │ MIA    │      163 │     1096 │ DL      │ 2013-12-23 │         0 │ 2013-12-23 00:00:00 │ DL,2139,2013-12-23 │     2 │\n│ 18:55:00 │   2139 │ LGA    │ MIA    │      175 │     1096 │ DL      │ 2013-12-23 │         1 │ 2013-12-23 23:00:00 │ DL,2139,2013-12-23 │     2 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │     … │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┴───────┘\n\n\n\n\nimport random\n\n# Fix the random numbers by setting the seed\n# This enables the analysis to be reproducible when random numbers are used\nrandom.seed(222)\n\n# Put 3/4 of the data into the training set\nrandom_key = str(random.getrandbits(256))\ndata_split = flight_data_with_unique_key.mutate(\n    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3\n)\n\n# Create data frames for the two sets:\ntrain_data = data_split[data_split.train].drop(\"unique_key\", \"train\")\ntest_data = data_split[~data_split.train].drop(\"unique_key\", \"train\")"
+    "text": "Data splitting\nTo get started, let’s split this single dataset into two: a training set and a testing set. We’ll keep most of the rows in the original dataset (subset chosen randomly) in the training set. The training data will be used to fit the model, and the testing set will be used to measure model performance.\nBecause the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.\n\nflight_data_with_unique_key = flight_data.mutate(\n    unique_key=ibis.literal(\",\").join(\n        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]\n    )\n)\nflight_data_with_unique_key\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┤\n│ 05:17:00 │   1545 │ EWR    │ IAH    │      227 │     1400 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 10:00:00 │ UA,1545,2013-01-01 │\n│ 05:54:00 │    461 │ LGA    │ ATL    │      116 │      762 │ DL      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ DL,461,2013-01-01  │\n│ 05:54:00 │   1696 │ EWR    │ ORD    │      150 │      719 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 10:00:00 │ UA,1696,2013-01-01 │\n│ 05:55:00 │    507 │ EWR    │ FLL    │      158 │     1065 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ B6,507,2013-01-01  │\n│ 05:57:00 │   5708 │ LGA    │ IAD    │       53 │      229 │ EV      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ EV,5708,2013-01-01 │\n│ 05:57:00 │     79 │ JFK    │ MCO    │      140 │      944 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ B6,79,2013-01-01   │\n│ 05:58:00 │    301 │ LGA    │ ORD    │      138 │      733 │ AA      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ AA,301,2013-01-01  │\n│ 05:58:00 │     49 │ JFK    │ PBI    │      149 │     1028 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ B6,49,2013-01-01   │\n│ 05:58:00 │     71 │ JFK    │ TPA    │      158 │     1005 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ B6,71,2013-01-01   │\n│ 05:58:00 │    194 │ JFK    │ LAX    │      345 │     2475 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ UA,194,2013-01-01  │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┘\n\n\n\n\n# FIXME(deepyaman): Proposed key isn't unique for actual departure date.\nflight_data_with_unique_key.group_by(\"unique_key\").mutate(\n    cnt=flight_data_with_unique_key.count()\n)[ibis._.cnt > 1]\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃ cnt   ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │ int64 │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┼───────┤\n│ 19:59:00 │   1022 │ EWR    │ IAH    │      167 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 23:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 20:00:00 │   1022 │ EWR    │ IAH    │      186 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 00:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 19:12:00 │   1023 │ LGA    │ ORD    │      112 │      733 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 23:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 21:16:00 │   1023 │ EWR    │ IAH    │      175 │     1400 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 01:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 15:18:00 │   1052 │ EWR    │ IAH    │      174 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 19:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 21:22:00 │   1052 │ EWR    │ IAH    │      173 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 01:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 18:39:00 │   1053 │ EWR    │ CLE    │       72 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 23:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 19:27:00 │   1053 │ EWR    │ CLE    │       69 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 00:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 17:20:00 │   1071 │ EWR    │ PHX    │      281 │     2133 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 22:00:00 │ UA,1071,2013-02-26 │     2 │\n│ 20:16:00 │   1071 │ EWR    │ BQN    │      196 │     1585 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 01:00:00 │ UA,1071,2013-02-26 │     2 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │     … │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┴───────┘\n\n\n\n\nimport random\n\n# Fix the random numbers by setting the seed\n# This enables the analysis to be reproducible when random numbers are used\nrandom.seed(222)\n\n# Put 3/4 of the data into the training set\nrandom_key = str(random.getrandbits(256))\ndata_split = flight_data_with_unique_key.mutate(\n    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3\n)\n\n# Create data frames for the two sets:\ntrain_data = data_split[data_split.train].drop(\"unique_key\", \"train\")\ntest_data = data_split[~data_split.train].drop(\"unique_key\", \"train\")"
   },
   {
     "objectID": "tutorial/xgboost.html#create-features",
@@ -46,7 +46,7 @@
     "href": "tutorial/xgboost.html#use-a-trained-workflow-to-predict",
     "title": "Preprocess your data with recipes",
     "section": "Use a trained workflow to predict",
-    "text": "Use a trained workflow to predict\n…\n\nX_test = test_data.drop(\"arr_delay\")\ny_test = test_data.arr_delay\npipe.score(X_test, y_test)\n\n0.8352055332090651"
+    "text": "Use a trained workflow to predict\n…\n\nX_test = test_data.drop(\"arr_delay\")\ny_test = test_data.arr_delay\npipe.score(X_test, y_test)\n\n0.8332066123810458"
   },
   {
     "objectID": "tutorial/pytorch.html",
@@ -60,21 +60,21 @@
     "href": "tutorial/pytorch.html#introduction",
     "title": "Preprocess your data with recipes",
     "section": "Introduction",
-    "text": "Introduction\n…\n\nimport ibis\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.create_table(\n    \"flights\", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True\n)\ncon.create_table(\n    \"weather\", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True\n)\n\nYou can now see the example dataset copied over to the database:\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.list_tables()\n\n['flights', 'weather']\n\n\nWe’ll turn on interactive mode, which partially executes queries to give users a preview of the results.\n\nibis.options.interactive = True\n\n\nflights = con.table(\"flights\")\nflights = flights.mutate(\n    dep_time=(\n        flights.dep_time.lpad(4, \"0\").substr(0, 2)\n        + \":\"\n        + flights.dep_time.substr(-2, 2)\n        + \":00\"\n    ).try_cast(\"time\"),\n    arr_delay=flights.arr_delay.try_cast(int),\n    air_time=flights.air_time.try_cast(int),\n)\nflights\n\n┏━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ year  ┃ month ┃ day   ┃ dep_time ┃ sched_dep_time ┃ dep_delay ┃ arr_time ┃ sched_arr_time ┃ arr_delay ┃ carrier ┃ flight ┃ tailnum ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ hour  ┃ minute ┃ time_hour           ┃\n┡━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ int64 │ int64 │ int64 │ time     │ int64          │ string    │ string   │ int64          │ int64     │ string  │ int64  │ string  │ string │ string │ int64    │ int64    │ int64 │ int64  │ timestamp(6)        │\n├───────┼───────┼───────┼──────────┼────────────────┼───────────┼──────────┼────────────────┼───────────┼─────────┼────────┼─────────┼────────┼────────┼──────────┼──────────┼───────┼────────┼─────────────────────┤\n│  2013 │     1 │     1 │ 05:17:00 │            515 │ 2         │ 830      │            819 │        11 │ UA      │   1545 │ N14228  │ EWR    │ IAH    │      227 │     1400 │     5 │     15 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:33:00 │            529 │ 4         │ 850      │            830 │        20 │ UA      │   1714 │ N24211  │ LGA    │ IAH    │      227 │     1416 │     5 │     29 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:42:00 │            540 │ 2         │ 923      │            850 │        33 │ AA      │   1141 │ N619AA  │ JFK    │ MIA    │      160 │     1089 │     5 │     40 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:44:00 │            545 │ -1        │ 1004     │           1022 │       -18 │ B6      │    725 │ N804JB  │ JFK    │ BQN    │      183 │     1576 │     5 │     45 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            600 │ -6        │ 812      │            837 │       -25 │ DL      │    461 │ N668DN  │ LGA    │ ATL    │      116 │      762 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            558 │ -4        │ 740      │            728 │        12 │ UA      │   1696 │ N39463  │ EWR    │ ORD    │      150 │      719 │     5 │     58 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:55:00 │            600 │ -5        │ 913      │            854 │        19 │ B6      │    507 │ N516JB  │ EWR    │ FLL    │      158 │     1065 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 709      │            723 │       -14 │ EV      │   5708 │ N829AS  │ LGA    │ IAD    │       53 │      229 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 838      │            846 │        -8 │ B6      │     79 │ N593JB  │ JFK    │ MCO    │      140 │      944 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:58:00 │            600 │ -2        │ 753      │            745 │         8 │ AA      │    301 │ N3ALAA  │ LGA    │ ORD    │      138 │      733 │     6 │      0 │ 2013-01-01 11:00:00 │\n│     … │     … │     … │ …        │              … │ …         │ …        │              … │         … │ …       │      … │ …       │ …      │ …      │        … │        … │     … │      … │ …                   │\n└───────┴───────┴───────┴──────────┴────────────────┴───────────┴──────────┴────────────────┴───────────┴─────────┴────────┴─────────┴────────┴────────┴──────────┴──────────┴───────┴────────┴─────────────────────┘\n\n\n\n\nweather = con.table(\"weather\")\nweather\n\n┏━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ origin ┃ year  ┃ month ┃ day   ┃ hour  ┃ temp   ┃ dewp   ┃ humid  ┃ wind_dir ┃ wind_speed         ┃ wind_gust ┃ precip  ┃ pressure ┃ visib   ┃ time_hour           ┃\n┡━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ string │ int64 │ int64 │ int64 │ int64 │ string │ string │ string │ string   │ string             │ string    │ float64 │ string   │ float64 │ timestamp(6)        │\n├────────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼──────────┼────────────────────┼───────────┼─────────┼──────────┼─────────┼─────────────────────┤\n│ EWR    │  2013 │     1 │     1 │     1 │ 39.02  │ 26.06  │ 59.37  │ 270      │ 10.357019999999999 │ NA        │     0.0 │ 1012     │    10.0 │ 2013-01-01 06:00:00 │\n│ EWR    │  2013 │     1 │     1 │     2 │ 39.02  │ 26.96  │ 61.63  │ 250      │ 8.05546            │ NA        │     0.0 │ 1012.3   │    10.0 │ 2013-01-01 07:00:00 │\n│ EWR    │  2013 │     1 │     1 │     3 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.5   │    10.0 │ 2013-01-01 08:00:00 │\n│ EWR    │  2013 │     1 │     1 │     4 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 12.658579999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 09:00:00 │\n│ EWR    │  2013 │     1 │     1 │     5 │ 39.02  │ 28.04  │ 64.43  │ 260      │ 12.658579999999999 │ NA        │     0.0 │ 1011.9   │    10.0 │ 2013-01-01 10:00:00 │\n│ EWR    │  2013 │     1 │     1 │     6 │ 37.94  │ 28.04  │ 67.21  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 11:00:00 │\n│ EWR    │  2013 │     1 │     1 │     7 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 14.960139999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 12:00:00 │\n│ EWR    │  2013 │     1 │     1 │     8 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 10.357019999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 13:00:00 │\n│ EWR    │  2013 │     1 │     1 │     9 │ 39.92  │ 28.04  │ 62.21  │ 260      │ 14.960139999999999 │ NA        │     0.0 │ 1012.7   │    10.0 │ 2013-01-01 14:00:00 │\n│ EWR    │  2013 │     1 │     1 │    10 │ 41     │ 28.04  │ 59.65  │ 260      │ 13.809359999999998 │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 15:00:00 │\n│ …      │     … │     … │     … │     … │ …      │ …      │ …      │ …        │ …                  │ …         │       … │ …        │       … │ …                   │\n└────────┴───────┴───────┴───────┴───────┴────────┴────────┴────────┴──────────┴────────────────────┴───────────┴─────────┴──────────┴─────────┴─────────────────────┘"
+    "text": "Introduction\nIn this article, we’ll explore Recipes, which are designed to help you preprocess your data before training your model. Recipes are built as a series of preprocessing steps, such as:\n\nconverting qualitative predictors to indicator variables (also known as dummy variables),\ntransforming data to be on a different scale (e.g., taking the logarithm of a variable),\ntransforming whole groups of predictors together,\nextracting key features from raw variables (e.g., getting the day of the week out of a date variable),\n\nand so on. If you are familiar with scikit-learn’s dataset transformations, a lot of this might sound familiar and like what a transformer already does. Recipes can be used to do many of the same things, but they can scale your workloads on any Ibis-supported backend. This article shows how to use recipes for modeling.\nTo use code in this article, you will need to install the following packages: Ibis, IbisML, and skorch, a high-level library for PyTorch that provides full scikit-learn compatibility.\npip install 'ibis-framework[duckdb,examples]' ibis-ml skorch torch"
   },
   {
     "objectID": "tutorial/pytorch.html#the-new-york-city-flight-data",
     "href": "tutorial/pytorch.html#the-new-york-city-flight-data",
     "title": "Preprocess your data with recipes",
     "section": "The New York City flight data",
-    "text": "The New York City flight data\nLet’s use the nycflights13 data to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let’s start by loading the data and making a few changes to the variables:\n\nflight_data = (\n    flights.mutate(\n        # Convert the arrival delay to a factor\n        # By default, PyTorch expects the target to have a Long datatype\n        arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast(\"int64\"),\n        # We will use the date (not date-time) in the recipe below\n        date=flights.time_hour.date(),\n    )\n    # Include the weather data\n    .inner_join(weather, [\"origin\", \"time_hour\"])\n    # Only retain the specific columns we will use\n    .select(\n        \"dep_time\",\n        \"flight\",\n        \"origin\",\n        \"dest\",\n        \"air_time\",\n        \"distance\",\n        \"carrier\",\n        \"date\",\n        \"arr_delay\",\n        \"time_hour\",\n    )\n    # Exclude missing data\n    .dropna()\n)\nflight_data\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤\n│ 10:45:00 │     67 │ EWR    │ ORD    │      120 │      719 │ UA      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:48:00 │    373 │ LGA    │ FLL    │      179 │     1076 │ B6      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:48:00 │    764 │ EWR    │ IAH    │      207 │     1400 │ UA      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:51:00 │   2044 │ LGA    │ MIA    │      171 │     1096 │ DL      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ 10:51:00 │   2171 │ LGA    │ DCA    │       40 │      214 │ US      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ 10:57:00 │   1275 │ JFK    │ SLC    │      286 │     1990 │ DL      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ 10:57:00 │    366 │ LGA    │ STL    │      135 │      888 │ WN      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ 10:57:00 │   1550 │ EWR    │ SFO    │      338 │     2565 │ UA      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:58:00 │   4694 │ EWR    │ MKE    │      113 │      725 │ EV      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:58:00 │   1647 │ LGA    │ ATL    │      117 │      762 │ DL      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘\n\n\n\nWe can see that about 16% of the flights in this data set arrived more than 30 minutes late.\n\nflight_data.arr_delay.value_counts().rename(n=\"arr_delay_count\").mutate(\n    prop=ibis._.n / ibis._.n.sum()\n)\n\n┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓\n┃ arr_delay ┃ n      ┃ prop     ┃\n┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━┩\n│ int64     │ int64  │ float64  │\n├───────────┼────────┼──────────┤\n│         0 │ 273279 │ 0.838745 │\n│         1 │  52540 │ 0.161255 │\n└───────────┴────────┴──────────┘"
+    "text": "The New York City flight data\nLet’s use the nycflights13 data to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let’s start by loading the data and making a few changes to the variables:\n\nimport ibis\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.create_table(\n    \"flights\", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True\n)\ncon.create_table(\n    \"weather\", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True\n)\n\nYou can now see the example dataset copied over to the database:\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.list_tables()\n\n['flights', 'weather']\n\n\nWe’ll turn on interactive mode, which partially executes queries to give users a preview of the results.\n\nibis.options.interactive = True\n\n\nflights = con.table(\"flights\")\nflights = flights.mutate(\n    dep_time=(\n        flights.dep_time.lpad(4, \"0\").substr(0, 2)\n        + \":\"\n        + flights.dep_time.substr(-2, 2)\n        + \":00\"\n    ).try_cast(\"time\"),\n    arr_delay=flights.arr_delay.try_cast(int),\n    air_time=flights.air_time.try_cast(int),\n)\nflights\n\n┏━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ year  ┃ month ┃ day   ┃ dep_time ┃ sched_dep_time ┃ dep_delay ┃ arr_time ┃ sched_arr_time ┃ arr_delay ┃ carrier ┃ flight ┃ tailnum ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ hour  ┃ minute ┃ time_hour           ┃\n┡━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ int64 │ int64 │ int64 │ time     │ int64          │ string    │ string   │ int64          │ int64     │ string  │ int64  │ string  │ string │ string │ int64    │ int64    │ int64 │ int64  │ timestamp(6)        │\n├───────┼───────┼───────┼──────────┼────────────────┼───────────┼──────────┼────────────────┼───────────┼─────────┼────────┼─────────┼────────┼────────┼──────────┼──────────┼───────┼────────┼─────────────────────┤\n│  2013 │     1 │     1 │ 05:17:00 │            515 │ 2         │ 830      │            819 │        11 │ UA      │   1545 │ N14228  │ EWR    │ IAH    │      227 │     1400 │     5 │     15 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:33:00 │            529 │ 4         │ 850      │            830 │        20 │ UA      │   1714 │ N24211  │ LGA    │ IAH    │      227 │     1416 │     5 │     29 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:42:00 │            540 │ 2         │ 923      │            850 │        33 │ AA      │   1141 │ N619AA  │ JFK    │ MIA    │      160 │     1089 │     5 │     40 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:44:00 │            545 │ -1        │ 1004     │           1022 │       -18 │ B6      │    725 │ N804JB  │ JFK    │ BQN    │      183 │     1576 │     5 │     45 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            600 │ -6        │ 812      │            837 │       -25 │ DL      │    461 │ N668DN  │ LGA    │ ATL    │      116 │      762 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            558 │ -4        │ 740      │            728 │        12 │ UA      │   1696 │ N39463  │ EWR    │ ORD    │      150 │      719 │     5 │     58 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:55:00 │            600 │ -5        │ 913      │            854 │        19 │ B6      │    507 │ N516JB  │ EWR    │ FLL    │      158 │     1065 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 709      │            723 │       -14 │ EV      │   5708 │ N829AS  │ LGA    │ IAD    │       53 │      229 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 838      │            846 │        -8 │ B6      │     79 │ N593JB  │ JFK    │ MCO    │      140 │      944 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:58:00 │            600 │ -2        │ 753      │            745 │         8 │ AA      │    301 │ N3ALAA  │ LGA    │ ORD    │      138 │      733 │     6 │      0 │ 2013-01-01 11:00:00 │\n│     … │     … │     … │ …        │              … │ …         │ …        │              … │         … │ …       │      … │ …       │ …      │ …      │        … │        … │     … │      … │ …                   │\n└───────┴───────┴───────┴──────────┴────────────────┴───────────┴──────────┴────────────────┴───────────┴─────────┴────────┴─────────┴────────┴────────┴──────────┴──────────┴───────┴────────┴─────────────────────┘\n\n\n\n\nweather = con.table(\"weather\")\nweather\n\n┏━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ origin ┃ year  ┃ month ┃ day   ┃ hour  ┃ temp   ┃ dewp   ┃ humid  ┃ wind_dir ┃ wind_speed         ┃ wind_gust ┃ precip  ┃ pressure ┃ visib   ┃ time_hour           ┃\n┡━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ string │ int64 │ int64 │ int64 │ int64 │ string │ string │ string │ string   │ string             │ string    │ float64 │ string   │ float64 │ timestamp(6)        │\n├────────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼──────────┼────────────────────┼───────────┼─────────┼──────────┼─────────┼─────────────────────┤\n│ EWR    │  2013 │     1 │     1 │     1 │ 39.02  │ 26.06  │ 59.37  │ 270      │ 10.357019999999999 │ NA        │     0.0 │ 1012     │    10.0 │ 2013-01-01 06:00:00 │\n│ EWR    │  2013 │     1 │     1 │     2 │ 39.02  │ 26.96  │ 61.63  │ 250      │ 8.05546            │ NA        │     0.0 │ 1012.3   │    10.0 │ 2013-01-01 07:00:00 │\n│ EWR    │  2013 │     1 │     1 │     3 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.5   │    10.0 │ 2013-01-01 08:00:00 │\n│ EWR    │  2013 │     1 │     1 │     4 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 12.658579999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 09:00:00 │\n│ EWR    │  2013 │     1 │     1 │     5 │ 39.02  │ 28.04  │ 64.43  │ 260      │ 12.658579999999999 │ NA        │     0.0 │ 1011.9   │    10.0 │ 2013-01-01 10:00:00 │\n│ EWR    │  2013 │     1 │     1 │     6 │ 37.94  │ 28.04  │ 67.21  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 11:00:00 │\n│ EWR    │  2013 │     1 │     1 │     7 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 14.960139999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 12:00:00 │\n│ EWR    │  2013 │     1 │     1 │     8 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 10.357019999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 13:00:00 │\n│ EWR    │  2013 │     1 │     1 │     9 │ 39.92  │ 28.04  │ 62.21  │ 260      │ 14.960139999999999 │ NA        │     0.0 │ 1012.7   │    10.0 │ 2013-01-01 14:00:00 │\n│ EWR    │  2013 │     1 │     1 │    10 │ 41     │ 28.04  │ 59.65  │ 260      │ 13.809359999999998 │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 15:00:00 │\n│ …      │     … │     … │     … │     … │ …      │ …      │ …      │ …        │ …                  │ …         │       … │ …        │       … │ …                   │\n└────────┴───────┴───────┴───────┴───────┴────────┴────────┴────────┴──────────┴────────────────────┴───────────┴─────────┴──────────┴─────────┴─────────────────────┘\n\n\n\n\nflight_data = (\n    flights.mutate(\n        # Convert the arrival delay to a factor\n        # By default, PyTorch expects the target to have a Long datatype\n        arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast(\"int64\"),\n        # We will use the date (not date-time) in the recipe below\n        date=flights.time_hour.date(),\n    )\n    # Include the weather data\n    .inner_join(weather, [\"origin\", \"time_hour\"])\n    # Only retain the specific columns we will use\n    .select(\n        \"dep_time\",\n        \"flight\",\n        \"origin\",\n        \"dest\",\n        \"air_time\",\n        \"distance\",\n        \"carrier\",\n        \"date\",\n        \"arr_delay\",\n        \"time_hour\",\n    )\n    # Exclude missing data\n    .dropna()\n)\nflight_data\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤\n│ 05:17:00 │   1545 │ EWR    │ IAH    │      227 │     1400 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 10:00:00 │\n│ 05:54:00 │    461 │ LGA    │ ATL    │      116 │      762 │ DL      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:54:00 │   1696 │ EWR    │ ORD    │      150 │      719 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 10:00:00 │\n│ 05:55:00 │    507 │ EWR    │ FLL    │      158 │     1065 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:57:00 │   5708 │ LGA    │ IAD    │       53 │      229 │ EV      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:57:00 │     79 │ JFK    │ MCO    │      140 │      944 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:58:00 │    301 │ LGA    │ ORD    │      138 │      733 │ AA      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:58:00 │     49 │ JFK    │ PBI    │      149 │     1028 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:58:00 │     71 │ JFK    │ TPA    │      158 │     1005 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ 05:58:00 │    194 │ JFK    │ LAX    │      345 │     2475 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘\n\n\n\nWe can see that about 16% of the flights in this data set arrived more than 30 minutes late.\n\nflight_data.arr_delay.value_counts().rename(n=\"arr_delay_count\").mutate(\n    prop=ibis._.n / ibis._.n.sum()\n)\n\n┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓\n┃ arr_delay ┃ n      ┃ prop     ┃\n┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━┩\n│ int64     │ int64  │ float64  │\n├───────────┼────────┼──────────┤\n│         0 │ 273279 │ 0.838745 │\n│         1 │  52540 │ 0.161255 │\n└───────────┴────────┴──────────┘"
   },
   {
     "objectID": "tutorial/pytorch.html#data-splitting",
     "href": "tutorial/pytorch.html#data-splitting",
     "title": "Preprocess your data with recipes",
     "section": "Data splitting",
-    "text": "Data splitting\nTo get started, let’s split this single dataset into two: a training set and a testing set. We’ll keep most of the rows in the original dataset (subset chosen randomly) in the training set. The training data will be used to fit the model, and the testing set will be used to measure model performance.\nBecause the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.\n\nflight_data_with_unique_key = flight_data.mutate(\n    unique_key=ibis.literal(\",\").join(\n        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]\n    )\n)\nflight_data_with_unique_key\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┤\n│ 05:57:00 │    461 │ LGA    │ ATL    │      100 │      762 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,461,2013-06-26  │\n│ 05:58:00 │   4424 │ EWR    │ RDU    │       63 │      416 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ EV,4424,2013-06-26 │\n│ 05:58:00 │   6177 │ EWR    │ IAD    │       45 │      212 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ EV,6177,2013-06-26 │\n│ 06:00:00 │    731 │ LGA    │ DTW    │       78 │      502 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,731,2013-06-26  │\n│ 06:01:00 │    684 │ EWR    │ LAX    │      316 │     2454 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ UA,684,2013-06-26  │\n│ 06:01:00 │    301 │ LGA    │ ORD    │      164 │      733 │ AA      │ 2013-06-26 │         1 │ 2013-06-26 10:00:00 │ AA,301,2013-06-26  │\n│ 06:01:00 │   1837 │ LGA    │ MIA    │      148 │     1096 │ AA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ AA,1837,2013-06-26 │\n│ 06:01:00 │   1279 │ LGA    │ MEM    │      128 │      963 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,1279,2013-06-26 │\n│ 06:02:00 │   1691 │ JFK    │ LAX    │      309 │     2475 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ UA,1691,2013-06-26 │\n│ 06:04:00 │   1447 │ JFK    │ CLT    │       75 │      541 │ US      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ US,1447,2013-06-26 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┘\n\n\n\n\n# FIXME(deepyaman): Proposed key isn't unique for actual departure date.\nflight_data_with_unique_key.group_by(\"unique_key\").mutate(\n    cnt=flight_data_with_unique_key.count()\n)[ibis._.cnt > 1]\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃ cnt   ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │ int64 │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┼───────┤\n│ 19:59:00 │   1022 │ EWR    │ IAH    │      167 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 23:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 20:00:00 │   1022 │ EWR    │ IAH    │      186 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 00:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 19:12:00 │   1023 │ LGA    │ ORD    │      112 │      733 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 23:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 21:16:00 │   1023 │ EWR    │ IAH    │      175 │     1400 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 01:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 21:22:00 │   1052 │ EWR    │ IAH    │      173 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 01:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 15:18:00 │   1052 │ EWR    │ IAH    │      174 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 19:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 19:27:00 │   1053 │ EWR    │ CLE    │       69 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 00:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 18:39:00 │   1053 │ EWR    │ CLE    │       72 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 23:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 20:16:00 │   1071 │ EWR    │ BQN    │      196 │     1585 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 01:00:00 │ UA,1071,2013-02-26 │     2 │\n│ 17:20:00 │   1071 │ EWR    │ PHX    │      281 │     2133 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 22:00:00 │ UA,1071,2013-02-26 │     2 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │     … │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┴───────┘\n\n\n\n\nimport random\n\n# Fix the random numbers by setting the seed\n# This enables the analysis to be reproducible when random numbers are used\nrandom.seed(222)\n\n# Put 3/4 of the data into the training set\nrandom_key = str(random.getrandbits(256))\ndata_split = flight_data_with_unique_key.mutate(\n    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3\n)\n\n# Create data frames for the two sets:\ntrain_data = data_split[data_split.train].drop(\"unique_key\", \"train\")\ntest_data = data_split[~data_split.train].drop(\"unique_key\", \"train\")"
+    "text": "Data splitting\nTo get started, let’s split this single dataset into two: a training set and a testing set. We’ll keep most of the rows in the original dataset (subset chosen randomly) in the training set. The training data will be used to fit the model, and the testing set will be used to measure model performance.\nBecause the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.\n\nflight_data_with_unique_key = flight_data.mutate(\n    unique_key=ibis.literal(\",\").join(\n        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]\n    )\n)\nflight_data_with_unique_key\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┤\n│ 05:17:00 │   1545 │ EWR    │ IAH    │      227 │     1400 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 10:00:00 │ UA,1545,2013-01-01 │\n│ 05:54:00 │    461 │ LGA    │ ATL    │      116 │      762 │ DL      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ DL,461,2013-01-01  │\n│ 05:54:00 │   1696 │ EWR    │ ORD    │      150 │      719 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 10:00:00 │ UA,1696,2013-01-01 │\n│ 05:55:00 │    507 │ EWR    │ FLL    │      158 │     1065 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ B6,507,2013-01-01  │\n│ 05:57:00 │   5708 │ LGA    │ IAD    │       53 │      229 │ EV      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ EV,5708,2013-01-01 │\n│ 05:57:00 │     79 │ JFK    │ MCO    │      140 │      944 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ B6,79,2013-01-01   │\n│ 05:58:00 │    301 │ LGA    │ ORD    │      138 │      733 │ AA      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ AA,301,2013-01-01  │\n│ 05:58:00 │     49 │ JFK    │ PBI    │      149 │     1028 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ B6,49,2013-01-01   │\n│ 05:58:00 │     71 │ JFK    │ TPA    │      158 │     1005 │ B6      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ B6,71,2013-01-01   │\n│ 05:58:00 │    194 │ JFK    │ LAX    │      345 │     2475 │ UA      │ 2013-01-01 │         0 │ 2013-01-01 11:00:00 │ UA,194,2013-01-01  │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┘\n\n\n\n\n# FIXME(deepyaman): Proposed key isn't unique for actual departure date.\nflight_data_with_unique_key.group_by(\"unique_key\").mutate(\n    cnt=flight_data_with_unique_key.count()\n)[ibis._.cnt > 1]\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃ cnt   ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │ int64 │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┼───────┤\n│ 19:59:00 │   1022 │ EWR    │ IAH    │      167 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 23:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 20:00:00 │   1022 │ EWR    │ IAH    │      186 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 00:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 19:12:00 │   1023 │ LGA    │ ORD    │      112 │      733 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 23:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 21:16:00 │   1023 │ EWR    │ IAH    │      175 │     1400 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 01:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 21:22:00 │   1052 │ EWR    │ IAH    │      173 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 01:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 15:18:00 │   1052 │ EWR    │ IAH    │      174 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 19:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 19:27:00 │   1053 │ EWR    │ CLE    │       69 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 00:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 18:39:00 │   1053 │ EWR    │ CLE    │       72 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 23:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 20:16:00 │   1071 │ EWR    │ BQN    │      196 │     1585 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 01:00:00 │ UA,1071,2013-02-26 │     2 │\n│ 17:20:00 │   1071 │ EWR    │ PHX    │      281 │     2133 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 22:00:00 │ UA,1071,2013-02-26 │     2 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │     … │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┴───────┘\n\n\n\n\nimport random\n\n# Fix the random numbers by setting the seed\n# This enables the analysis to be reproducible when random numbers are used\nrandom.seed(222)\n\n# Put 3/4 of the data into the training set\nrandom_key = str(random.getrandbits(256))\ndata_split = flight_data_with_unique_key.mutate(\n    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3\n)\n\n# Create data frames for the two sets:\ntrain_data = data_split[data_split.train].drop(\"unique_key\", \"train\")\ntest_data = data_split[~data_split.train].drop(\"unique_key\", \"train\")"
   },
   {
     "objectID": "tutorial/pytorch.html#create-features",
@@ -88,7 +88,7 @@
     "href": "tutorial/pytorch.html#fit-a-model-with-a-recipe",
     "title": "Preprocess your data with recipes",
     "section": "Fit a model with a recipe",
-    "text": "Fit a model with a recipe\nLet’s model the flight data. We can use any scikit-learn-compatible estimator.\nWe will want to use our recipe across several steps as we train and test our model. We will:\n\nProcess the recipe using the training set: This involves any estimation or calculations based on the training set. For our recipe, the training set will be used to determine which predictors should be converted to dummy variables and which predictors will have zero-variance in the training set, and should be slated for removal.\nApply the recipe to the training set: We create the final predictor set on the training set.\nApply the recipe to the test set: We create the final predictor set on the test set. Nothing is recomputed and no information from the test set is used here; the dummy variable and zero-variance results from the training set are applied to the test set.\n\nTo simplify this process, we can use a scikit-learn Pipeline.\n\nfrom sklearn.pipeline import Pipeline\nfrom skorch import NeuralNetClassifier\nfrom torch import nn\n\n\nclass MyModule(nn.Module):\n    def __init__(self, num_units=10, nonlin=nn.ReLU()):\n        super().__init__()\n\n        self.dense0 = nn.Linear(10, num_units)\n        self.nonlin = nonlin\n        self.dropout = nn.Dropout(0.5)\n        self.dense1 = nn.Linear(num_units, num_units)\n        self.output = nn.Linear(num_units, 2)\n        self.softmax = nn.Softmax(dim=-1)\n\n    def forward(self, X, **kwargs):\n        X = self.nonlin(self.dense0(X))\n        X = self.dropout(X)\n        X = self.nonlin(self.dense1(X))\n        X = self.softmax(self.output(X))\n        return X\n\n\nnet = NeuralNetClassifier(\n    MyModule,\n    max_epochs=10,\n    lr=0.1,\n    # Shuffle training data on each epoch\n    iterator_train__shuffle=True,\n)\n\npipe = Pipeline([(\"flights_rec\", flights_rec), (\"net\", net)])\n\nNow, there is a single function that can be used to prepare the recipe and train the model from the resulting predictors:\n\nX_train = train_data.drop(\"arr_delay\")\ny_train = train_data.arr_delay\npipe.fit(X_train, y_train)\n\n  epoch    train_loss    valid_acc    valid_loss     dur\n-------  ------------  -----------  ------------  ------\n      1       10.3153       0.1612       13.3726  2.2884\n      2        8.7295       0.1612       13.3726  2.2764\n      3        8.1396       0.1612       13.3726  2.2633\n      4        6.7964       0.8388        2.5698  2.2661\n      5        6.0716       0.8388        2.5698  2.2583\n      6        6.0462       0.8388        2.5698  2.2638\n      7        6.0914       0.8388        2.5698  2.2605\n      8        6.1668       0.8388        2.5698  2.2585\n      9        5.9999       0.8388        2.5698  2.2621\n     10        5.9069       0.8388        2.5698  2.2600\n\n\nPipeline(steps=[('flights_rec',\n                 Recipe(ExpandDate(cols(('date',)),\n                                   components=['dow', 'month']),\n                        Drop(cols(('date',))),\n                        TargetEncode(nominal(), smooth=0.0),\n                        DropZeroVariance(everything(), tolerance=0.0001),\n                        MutateAt(cols(('dep_time',)),\n                                 ((_.hour() * 60) + _.minute())),\n                        MutateAt(timestamp(), _.epoch_seconds()),\n                        Cast(numeric(), 'float32'))),\n                ('net',\n                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](\n  module_=MyModule(\n    (dense0): Linear(in_features=10, out_features=10, bias=True)\n    (nonlin): ReLU()\n    (dropout): Dropout(p=0.5, inplace=False)\n    (dense1): Linear(in_features=10, out_features=10, bias=True)\n    (output): Linear(in_features=10, out_features=2, bias=True)\n    (softmax): Softmax(dim=-1)\n  ),\n))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.  Pipeline?Documentation for PipelineiFittedPipeline(steps=[('flights_rec',\n                 Recipe(ExpandDate(cols(('date',)),\n                                   components=['dow', 'month']),\n                        Drop(cols(('date',))),\n                        TargetEncode(nominal(), smooth=0.0),\n                        DropZeroVariance(everything(), tolerance=0.0001),\n                        MutateAt(cols(('dep_time',)),\n                                 ((_.hour() * 60) + _.minute())),\n                        MutateAt(timestamp(), _.epoch_seconds()),\n                        Cast(numeric(), 'float32'))),\n                ('net',\n                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](\n  module_=MyModule(\n    (dense0): Linear(in_features=10, out_features=10, bias=True)\n    (nonlin): ReLU()\n    (dropout): Dropout(p=0.5, inplace=False)\n    (dense1): Linear(in_features=10, out_features=10, bias=True)\n    (output): Linear(in_features=10, out_features=2, bias=True)\n    (softmax): Softmax(dim=-1)\n  ),\n))]) flights_rec: RecipeRecipe(ExpandDate(cols(('date',)), components=['dow', 'month']),\n       Drop(cols(('date',))),\n       TargetEncode(nominal(), smooth=0.0),\n       DropZeroVariance(everything(), tolerance=0.0001),\n       MutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute())),\n       MutateAt(timestamp(), _.epoch_seconds()),\n       Cast(numeric(), 'float32')) ExpandDateExpandDate(cols(('date',)), components=['dow', 'month']) DropDrop(cols(('date',))) TargetEncodeTargetEncode(nominal(), smooth=0.0) DropZeroVarianceDropZeroVariance(everything(), tolerance=0.0001) MutateAtMutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute())) MutateAtMutateAt(timestamp(), _.epoch_seconds()) CastCast(numeric(), 'float32') NeuralNetClassifier<class 'skorch.classifier.NeuralNetClassifier'>[initialized](\n  module_=MyModule(\n    (dense0): Linear(in_features=10, out_features=10, bias=True)\n    (nonlin): ReLU()\n    (dropout): Dropout(p=0.5, inplace=False)\n    (dense1): Linear(in_features=10, out_features=10, bias=True)\n    (output): Linear(in_features=10, out_features=2, bias=True)\n    (softmax): Softmax(dim=-1)\n  ),\n)"
+    "text": "Fit a model with a recipe\nLet’s model the flight data. We can use any scikit-learn-compatible estimator.\nWe will want to use our recipe across several steps as we train and test our model. We will:\n\nProcess the recipe using the training set: This involves any estimation or calculations based on the training set. For our recipe, the training set will be used to determine which predictors should be converted to dummy variables and which predictors will have zero-variance in the training set, and should be slated for removal.\nApply the recipe to the training set: We create the final predictor set on the training set.\nApply the recipe to the test set: We create the final predictor set on the test set. Nothing is recomputed and no information from the test set is used here; the dummy variable and zero-variance results from the training set are applied to the test set.\n\nTo simplify this process, we can use a scikit-learn Pipeline.\n\nfrom sklearn.pipeline import Pipeline\nfrom skorch import NeuralNetClassifier\nfrom torch import nn\n\n\nclass MyModule(nn.Module):\n    def __init__(self, num_units=10, nonlin=nn.ReLU()):\n        super().__init__()\n\n        self.dense0 = nn.Linear(10, num_units)\n        self.nonlin = nonlin\n        self.dropout = nn.Dropout(0.5)\n        self.dense1 = nn.Linear(num_units, num_units)\n        self.output = nn.Linear(num_units, 2)\n        self.softmax = nn.Softmax(dim=-1)\n\n    def forward(self, X, **kwargs):\n        X = self.nonlin(self.dense0(X))\n        X = self.dropout(X)\n        X = self.nonlin(self.dense1(X))\n        X = self.softmax(self.output(X))\n        return X\n\n\nnet = NeuralNetClassifier(\n    MyModule,\n    max_epochs=10,\n    lr=0.1,\n    # Shuffle training data on each epoch\n    iterator_train__shuffle=True,\n)\n\npipe = Pipeline([(\"flights_rec\", flights_rec), (\"net\", net)])\n\nNow, there is a single function that can be used to prepare the recipe and train the model from the resulting predictors:\n\nX_train = train_data.drop(\"arr_delay\")\ny_train = train_data.arr_delay\npipe.fit(X_train, y_train)\n\n  epoch    train_loss    valid_acc    valid_loss     dur\n-------  ------------  -----------  ------------  ------\n      1        2.7189       0.8388        2.5698  2.2748\n      2        2.5384       0.8388        2.5698  2.2485\n      3        2.5381       0.8388        2.5698  2.2467\n      4        2.5382       0.8388        2.5698  2.2512\n      5        2.5344       0.8388        2.5698  2.2496\n      6        2.5337       0.8388        2.5698  2.2620\n      7        2.5370       0.8388        2.5698  2.2255\n      8        2.5384       0.8388        2.5698  2.2156\n      9        2.5375       0.8388        2.5698  2.2303\n     10        2.5389       0.8388        2.5698  2.2218\n\n\nPipeline(steps=[('flights_rec',\n                 Recipe(ExpandDate(cols(('date',)),\n                                   components=['dow', 'month']),\n                        Drop(cols(('date',))),\n                        TargetEncode(nominal(), smooth=0.0),\n                        DropZeroVariance(everything(), tolerance=0.0001),\n                        MutateAt(cols(('dep_time',)),\n                                 ((_.hour() * 60) + _.minute())),\n                        MutateAt(timestamp(), _.epoch_seconds()),\n                        Cast(numeric(), 'float32'))),\n                ('net',\n                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](\n  module_=MyModule(\n    (dense0): Linear(in_features=10, out_features=10, bias=True)\n    (nonlin): ReLU()\n    (dropout): Dropout(p=0.5, inplace=False)\n    (dense1): Linear(in_features=10, out_features=10, bias=True)\n    (output): Linear(in_features=10, out_features=2, bias=True)\n    (softmax): Softmax(dim=-1)\n  ),\n))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.  Pipeline?Documentation for PipelineiFittedPipeline(steps=[('flights_rec',\n                 Recipe(ExpandDate(cols(('date',)),\n                                   components=['dow', 'month']),\n                        Drop(cols(('date',))),\n                        TargetEncode(nominal(), smooth=0.0),\n                        DropZeroVariance(everything(), tolerance=0.0001),\n                        MutateAt(cols(('dep_time',)),\n                                 ((_.hour() * 60) + _.minute())),\n                        MutateAt(timestamp(), _.epoch_seconds()),\n                        Cast(numeric(), 'float32'))),\n                ('net',\n                 <class 'skorch.classifier.NeuralNetClassifier'>[initialized](\n  module_=MyModule(\n    (dense0): Linear(in_features=10, out_features=10, bias=True)\n    (nonlin): ReLU()\n    (dropout): Dropout(p=0.5, inplace=False)\n    (dense1): Linear(in_features=10, out_features=10, bias=True)\n    (output): Linear(in_features=10, out_features=2, bias=True)\n    (softmax): Softmax(dim=-1)\n  ),\n))]) flights_rec: RecipeRecipe(ExpandDate(cols(('date',)), components=['dow', 'month']),\n       Drop(cols(('date',))),\n       TargetEncode(nominal(), smooth=0.0),\n       DropZeroVariance(everything(), tolerance=0.0001),\n       MutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute())),\n       MutateAt(timestamp(), _.epoch_seconds()),\n       Cast(numeric(), 'float32')) ExpandDateExpandDate(cols(('date',)), components=['dow', 'month']) DropDrop(cols(('date',))) TargetEncodeTargetEncode(nominal(), smooth=0.0) DropZeroVarianceDropZeroVariance(everything(), tolerance=0.0001) MutateAtMutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute())) MutateAtMutateAt(timestamp(), _.epoch_seconds()) CastCast(numeric(), 'float32') NeuralNetClassifier<class 'skorch.classifier.NeuralNetClassifier'>[initialized](\n  module_=MyModule(\n    (dense0): Linear(in_features=10, out_features=10, bias=True)\n    (nonlin): ReLU()\n    (dropout): Dropout(p=0.5, inplace=False)\n    (dense1): Linear(in_features=10, out_features=10, bias=True)\n    (output): Linear(in_features=10, out_features=2, bias=True)\n    (softmax): Softmax(dim=-1)\n  ),\n)"
   },
   {
     "objectID": "tutorial/pytorch.html#use-a-trained-workflow-to-predict",
@@ -786,21 +786,21 @@
     "href": "tutorial/scikit-learn.html#introduction",
     "title": "Preprocess your data with recipes",
     "section": "Introduction",
-    "text": "Introduction\n…\n\nimport ibis\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.create_table(\n    \"flights\", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True\n)\ncon.create_table(\n    \"weather\", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True\n)\n\nYou can now see the example dataset copied over to the database:\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.list_tables()\n\n['flights', 'weather']\n\n\nWe’ll turn on interactive mode, which partially executes queries to give users a preview of the results.\n\nibis.options.interactive = True\n\n\nflights = con.table(\"flights\")\nflights = flights.mutate(\n    dep_time=(\n        flights.dep_time.lpad(4, \"0\").substr(0, 2)\n        + \":\"\n        + flights.dep_time.substr(-2, 2)\n        + \":00\"\n    ).try_cast(\"time\"),\n    arr_delay=flights.arr_delay.try_cast(int),\n    air_time=flights.air_time.try_cast(int),\n)\nflights\n\n┏━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ year  ┃ month ┃ day   ┃ dep_time ┃ sched_dep_time ┃ dep_delay ┃ arr_time ┃ sched_arr_time ┃ arr_delay ┃ carrier ┃ flight ┃ tailnum ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ hour  ┃ minute ┃ time_hour           ┃\n┡━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ int64 │ int64 │ int64 │ time     │ int64          │ string    │ string   │ int64          │ int64     │ string  │ int64  │ string  │ string │ string │ int64    │ int64    │ int64 │ int64  │ timestamp(6)        │\n├───────┼───────┼───────┼──────────┼────────────────┼───────────┼──────────┼────────────────┼───────────┼─────────┼────────┼─────────┼────────┼────────┼──────────┼──────────┼───────┼────────┼─────────────────────┤\n│  2013 │     1 │     1 │ 05:17:00 │            515 │ 2         │ 830      │            819 │        11 │ UA      │   1545 │ N14228  │ EWR    │ IAH    │      227 │     1400 │     5 │     15 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:33:00 │            529 │ 4         │ 850      │            830 │        20 │ UA      │   1714 │ N24211  │ LGA    │ IAH    │      227 │     1416 │     5 │     29 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:42:00 │            540 │ 2         │ 923      │            850 │        33 │ AA      │   1141 │ N619AA  │ JFK    │ MIA    │      160 │     1089 │     5 │     40 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:44:00 │            545 │ -1        │ 1004     │           1022 │       -18 │ B6      │    725 │ N804JB  │ JFK    │ BQN    │      183 │     1576 │     5 │     45 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            600 │ -6        │ 812      │            837 │       -25 │ DL      │    461 │ N668DN  │ LGA    │ ATL    │      116 │      762 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            558 │ -4        │ 740      │            728 │        12 │ UA      │   1696 │ N39463  │ EWR    │ ORD    │      150 │      719 │     5 │     58 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:55:00 │            600 │ -5        │ 913      │            854 │        19 │ B6      │    507 │ N516JB  │ EWR    │ FLL    │      158 │     1065 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 709      │            723 │       -14 │ EV      │   5708 │ N829AS  │ LGA    │ IAD    │       53 │      229 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 838      │            846 │        -8 │ B6      │     79 │ N593JB  │ JFK    │ MCO    │      140 │      944 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:58:00 │            600 │ -2        │ 753      │            745 │         8 │ AA      │    301 │ N3ALAA  │ LGA    │ ORD    │      138 │      733 │     6 │      0 │ 2013-01-01 11:00:00 │\n│     … │     … │     … │ …        │              … │ …         │ …        │              … │         … │ …       │      … │ …       │ …      │ …      │        … │        … │     … │      … │ …                   │\n└───────┴───────┴───────┴──────────┴────────────────┴───────────┴──────────┴────────────────┴───────────┴─────────┴────────┴─────────┴────────┴────────┴──────────┴──────────┴───────┴────────┴─────────────────────┘\n\n\n\n\nweather = con.table(\"weather\")\nweather\n\n┏━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ origin ┃ year  ┃ month ┃ day   ┃ hour  ┃ temp   ┃ dewp   ┃ humid  ┃ wind_dir ┃ wind_speed         ┃ wind_gust ┃ precip  ┃ pressure ┃ visib   ┃ time_hour           ┃\n┡━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ string │ int64 │ int64 │ int64 │ int64 │ string │ string │ string │ string   │ string             │ string    │ float64 │ string   │ float64 │ timestamp(6)        │\n├────────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼──────────┼────────────────────┼───────────┼─────────┼──────────┼─────────┼─────────────────────┤\n│ EWR    │  2013 │     1 │     1 │     1 │ 39.02  │ 26.06  │ 59.37  │ 270      │ 10.357019999999999 │ NA        │     0.0 │ 1012     │    10.0 │ 2013-01-01 06:00:00 │\n│ EWR    │  2013 │     1 │     1 │     2 │ 39.02  │ 26.96  │ 61.63  │ 250      │ 8.05546            │ NA        │     0.0 │ 1012.3   │    10.0 │ 2013-01-01 07:00:00 │\n│ EWR    │  2013 │     1 │     1 │     3 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.5   │    10.0 │ 2013-01-01 08:00:00 │\n│ EWR    │  2013 │     1 │     1 │     4 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 12.658579999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 09:00:00 │\n│ EWR    │  2013 │     1 │     1 │     5 │ 39.02  │ 28.04  │ 64.43  │ 260      │ 12.658579999999999 │ NA        │     0.0 │ 1011.9   │    10.0 │ 2013-01-01 10:00:00 │\n│ EWR    │  2013 │     1 │     1 │     6 │ 37.94  │ 28.04  │ 67.21  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 11:00:00 │\n│ EWR    │  2013 │     1 │     1 │     7 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 14.960139999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 12:00:00 │\n│ EWR    │  2013 │     1 │     1 │     8 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 10.357019999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 13:00:00 │\n│ EWR    │  2013 │     1 │     1 │     9 │ 39.92  │ 28.04  │ 62.21  │ 260      │ 14.960139999999999 │ NA        │     0.0 │ 1012.7   │    10.0 │ 2013-01-01 14:00:00 │\n│ EWR    │  2013 │     1 │     1 │    10 │ 41     │ 28.04  │ 59.65  │ 260      │ 13.809359999999998 │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 15:00:00 │\n│ …      │     … │     … │     … │     … │ …      │ …      │ …      │ …        │ …                  │ …         │       … │ …        │       … │ …                   │\n└────────┴───────┴───────┴───────┴───────┴────────┴────────┴────────┴──────────┴────────────────────┴───────────┴─────────┴──────────┴─────────┴─────────────────────┘"
+    "text": "Introduction\nIn this article, we’ll explore Recipes, which are designed to help you preprocess your data before training your model. Recipes are built as a series of preprocessing steps, such as:\n\nconverting qualitative predictors to indicator variables (also known as dummy variables),\ntransforming data to be on a different scale (e.g., taking the logarithm of a variable),\ntransforming whole groups of predictors together,\nextracting key features from raw variables (e.g., getting the day of the week out of a date variable),\n\nand so on. If you are familiar with scikit-learn’s dataset transformations, a lot of this might sound familiar and like what a transformer already does. Recipes can be used to do many of the same things, but they can scale your workloads on any Ibis-supported backend. This article shows how to use recipes for modeling.\nTo use code in this article, you will need to install the following packages: Ibis, IbisML, and scikit-learn.\npip install 'ibis-framework[duckdb,examples]' ibis-ml scikit-learn"
   },
   {
     "objectID": "tutorial/scikit-learn.html#the-new-york-city-flight-data",
     "href": "tutorial/scikit-learn.html#the-new-york-city-flight-data",
     "title": "Preprocess your data with recipes",
     "section": "The New York City flight data",
-    "text": "The New York City flight data\nLet’s use the nycflights13 data to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let’s start by loading the data and making a few changes to the variables:\n\nflight_data = (\n    flights.mutate(\n        # Convert the arrival delay to a factor\n        # By default, PyTorch expects the target to have a Long datatype\n        arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast(\"int64\"),\n        # We will use the date (not date-time) in the recipe below\n        date=flights.time_hour.date(),\n    )\n    # Include the weather data\n    .inner_join(weather, [\"origin\", \"time_hour\"])\n    # Only retain the specific columns we will use\n    .select(\n        \"dep_time\",\n        \"flight\",\n        \"origin\",\n        \"dest\",\n        \"air_time\",\n        \"distance\",\n        \"carrier\",\n        \"date\",\n        \"arr_delay\",\n        \"time_hour\",\n    )\n    # Exclude missing data\n    .dropna()\n)\nflight_data\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤\n│ 05:57:00 │    461 │ LGA    │ ATL    │      100 │      762 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 05:58:00 │   4424 │ EWR    │ RDU    │       63 │      416 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 05:58:00 │   6177 │ EWR    │ IAD    │       45 │      212 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:00:00 │    731 │ LGA    │ DTW    │       78 │      502 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:01:00 │    684 │ EWR    │ LAX    │      316 │     2454 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:01:00 │    301 │ LGA    │ ORD    │      164 │      733 │ AA      │ 2013-06-26 │         1 │ 2013-06-26 10:00:00 │\n│ 06:01:00 │   1837 │ LGA    │ MIA    │      148 │     1096 │ AA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:01:00 │   1279 │ LGA    │ MEM    │      128 │      963 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:02:00 │   1691 │ JFK    │ LAX    │      309 │     2475 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ 06:04:00 │   1447 │ JFK    │ CLT    │       75 │      541 │ US      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘\n\n\n\nWe can see that about 16% of the flights in this data set arrived more than 30 minutes late.\n\nflight_data.arr_delay.value_counts().rename(n=\"arr_delay_count\").mutate(\n    prop=ibis._.n / ibis._.n.sum()\n)\n\n┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓\n┃ arr_delay ┃ n      ┃ prop     ┃\n┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━┩\n│ int64     │ int64  │ float64  │\n├───────────┼────────┼──────────┤\n│         0 │ 273279 │ 0.838745 │\n│         1 │  52540 │ 0.161255 │\n└───────────┴────────┴──────────┘"
+    "text": "The New York City flight data\nLet’s use the nycflights13 data to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let’s start by loading the data and making a few changes to the variables:\n\nimport ibis\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.create_table(\n    \"flights\", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True\n)\ncon.create_table(\n    \"weather\", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True\n)\n\nYou can now see the example dataset copied over to the database:\n\ncon = ibis.connect(\"duckdb://nycflights13.ddb\")\ncon.list_tables()\n\n['flights', 'weather']\n\n\nWe’ll turn on interactive mode, which partially executes queries to give users a preview of the results.\n\nibis.options.interactive = True\n\n\nflights = con.table(\"flights\")\nflights = flights.mutate(\n    dep_time=(\n        flights.dep_time.lpad(4, \"0\").substr(0, 2)\n        + \":\"\n        + flights.dep_time.substr(-2, 2)\n        + \":00\"\n    ).try_cast(\"time\"),\n    arr_delay=flights.arr_delay.try_cast(int),\n    air_time=flights.air_time.try_cast(int),\n)\nflights\n\n┏━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ year  ┃ month ┃ day   ┃ dep_time ┃ sched_dep_time ┃ dep_delay ┃ arr_time ┃ sched_arr_time ┃ arr_delay ┃ carrier ┃ flight ┃ tailnum ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ hour  ┃ minute ┃ time_hour           ┃\n┡━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ int64 │ int64 │ int64 │ time     │ int64          │ string    │ string   │ int64          │ int64     │ string  │ int64  │ string  │ string │ string │ int64    │ int64    │ int64 │ int64  │ timestamp(6)        │\n├───────┼───────┼───────┼──────────┼────────────────┼───────────┼──────────┼────────────────┼───────────┼─────────┼────────┼─────────┼────────┼────────┼──────────┼──────────┼───────┼────────┼─────────────────────┤\n│  2013 │     1 │     1 │ 05:17:00 │            515 │ 2         │ 830      │            819 │        11 │ UA      │   1545 │ N14228  │ EWR    │ IAH    │      227 │     1400 │     5 │     15 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:33:00 │            529 │ 4         │ 850      │            830 │        20 │ UA      │   1714 │ N24211  │ LGA    │ IAH    │      227 │     1416 │     5 │     29 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:42:00 │            540 │ 2         │ 923      │            850 │        33 │ AA      │   1141 │ N619AA  │ JFK    │ MIA    │      160 │     1089 │     5 │     40 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:44:00 │            545 │ -1        │ 1004     │           1022 │       -18 │ B6      │    725 │ N804JB  │ JFK    │ BQN    │      183 │     1576 │     5 │     45 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            600 │ -6        │ 812      │            837 │       -25 │ DL      │    461 │ N668DN  │ LGA    │ ATL    │      116 │      762 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:54:00 │            558 │ -4        │ 740      │            728 │        12 │ UA      │   1696 │ N39463  │ EWR    │ ORD    │      150 │      719 │     5 │     58 │ 2013-01-01 10:00:00 │\n│  2013 │     1 │     1 │ 05:55:00 │            600 │ -5        │ 913      │            854 │        19 │ B6      │    507 │ N516JB  │ EWR    │ FLL    │      158 │     1065 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 709      │            723 │       -14 │ EV      │   5708 │ N829AS  │ LGA    │ IAD    │       53 │      229 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:57:00 │            600 │ -3        │ 838      │            846 │        -8 │ B6      │     79 │ N593JB  │ JFK    │ MCO    │      140 │      944 │     6 │      0 │ 2013-01-01 11:00:00 │\n│  2013 │     1 │     1 │ 05:58:00 │            600 │ -2        │ 753      │            745 │         8 │ AA      │    301 │ N3ALAA  │ LGA    │ ORD    │      138 │      733 │     6 │      0 │ 2013-01-01 11:00:00 │\n│     … │     … │     … │ …        │              … │ …         │ …        │              … │         … │ …       │      … │ …       │ …      │ …      │        … │        … │     … │      … │ …                   │\n└───────┴───────┴───────┴──────────┴────────────────┴───────────┴──────────┴────────────────┴───────────┴─────────┴────────┴─────────┴────────┴────────┴──────────┴──────────┴───────┴────────┴─────────────────────┘\n\n\n\n\nweather = con.table(\"weather\")\nweather\n\n┏━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ origin ┃ year  ┃ month ┃ day   ┃ hour  ┃ temp   ┃ dewp   ┃ humid  ┃ wind_dir ┃ wind_speed         ┃ wind_gust ┃ precip  ┃ pressure ┃ visib   ┃ time_hour           ┃\n┡━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ string │ int64 │ int64 │ int64 │ int64 │ string │ string │ string │ string   │ string             │ string    │ float64 │ string   │ float64 │ timestamp(6)        │\n├────────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼──────────┼────────────────────┼───────────┼─────────┼──────────┼─────────┼─────────────────────┤\n│ EWR    │  2013 │     1 │     1 │     1 │ 39.02  │ 26.06  │ 59.37  │ 270      │ 10.357019999999999 │ NA        │     0.0 │ 1012     │    10.0 │ 2013-01-01 06:00:00 │\n│ EWR    │  2013 │     1 │     1 │     2 │ 39.02  │ 26.96  │ 61.63  │ 250      │ 8.05546            │ NA        │     0.0 │ 1012.3   │    10.0 │ 2013-01-01 07:00:00 │\n│ EWR    │  2013 │     1 │     1 │     3 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.5   │    10.0 │ 2013-01-01 08:00:00 │\n│ EWR    │  2013 │     1 │     1 │     4 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 12.658579999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 09:00:00 │\n│ EWR    │  2013 │     1 │     1 │     5 │ 39.02  │ 28.04  │ 64.43  │ 260      │ 12.658579999999999 │ NA        │     0.0 │ 1011.9   │    10.0 │ 2013-01-01 10:00:00 │\n│ EWR    │  2013 │     1 │     1 │     6 │ 37.94  │ 28.04  │ 67.21  │ 240      │ 11.5078            │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 11:00:00 │\n│ EWR    │  2013 │     1 │     1 │     7 │ 39.02  │ 28.04  │ 64.43  │ 240      │ 14.960139999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 12:00:00 │\n│ EWR    │  2013 │     1 │     1 │     8 │ 39.92  │ 28.04  │ 62.21  │ 250      │ 10.357019999999999 │ NA        │     0.0 │ 1012.2   │    10.0 │ 2013-01-01 13:00:00 │\n│ EWR    │  2013 │     1 │     1 │     9 │ 39.92  │ 28.04  │ 62.21  │ 260      │ 14.960139999999999 │ NA        │     0.0 │ 1012.7   │    10.0 │ 2013-01-01 14:00:00 │\n│ EWR    │  2013 │     1 │     1 │    10 │ 41     │ 28.04  │ 59.65  │ 260      │ 13.809359999999998 │ NA        │     0.0 │ 1012.4   │    10.0 │ 2013-01-01 15:00:00 │\n│ …      │     … │     … │     … │     … │ …      │ …      │ …      │ …        │ …                  │ …         │       … │ …        │       … │ …                   │\n└────────┴───────┴───────┴───────┴───────┴────────┴────────┴────────┴──────────┴────────────────────┴───────────┴─────────┴──────────┴─────────┴─────────────────────┘\n\n\n\n\nflight_data = (\n    flights.mutate(\n        # Convert the arrival delay to a factor\n        # By default, PyTorch expects the target to have a Long datatype\n        arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast(\"int64\"),\n        # We will use the date (not date-time) in the recipe below\n        date=flights.time_hour.date(),\n    )\n    # Include the weather data\n    .inner_join(weather, [\"origin\", \"time_hour\"])\n    # Only retain the specific columns we will use\n    .select(\n        \"dep_time\",\n        \"flight\",\n        \"origin\",\n        \"dest\",\n        \"air_time\",\n        \"distance\",\n        \"carrier\",\n        \"date\",\n        \"arr_delay\",\n        \"time_hour\",\n    )\n    # Exclude missing data\n    .dropna()\n)\nflight_data\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤\n│ 10:45:00 │     67 │ EWR    │ ORD    │      120 │      719 │ UA      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:48:00 │    373 │ LGA    │ FLL    │      179 │     1076 │ B6      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:48:00 │    764 │ EWR    │ IAH    │      207 │     1400 │ UA      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:51:00 │   2044 │ LGA    │ MIA    │      171 │     1096 │ DL      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ 10:51:00 │   2171 │ LGA    │ DCA    │       40 │      214 │ US      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ 10:57:00 │   1275 │ JFK    │ SLC    │      286 │     1990 │ DL      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ 10:57:00 │    366 │ LGA    │ STL    │      135 │      888 │ WN      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ 10:57:00 │   1550 │ EWR    │ SFO    │      338 │     2565 │ UA      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:58:00 │   4694 │ EWR    │ MKE    │      113 │      725 │ EV      │ 2013-02-14 │         0 │ 2013-02-14 15:00:00 │\n│ 10:58:00 │   1647 │ LGA    │ ATL    │      117 │      762 │ DL      │ 2013-02-14 │         0 │ 2013-02-14 16:00:00 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘\n\n\n\nWe can see that about 16% of the flights in this data set arrived more than 30 minutes late.\n\nflight_data.arr_delay.value_counts().rename(n=\"arr_delay_count\").mutate(\n    prop=ibis._.n / ibis._.n.sum()\n)\n\n┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓\n┃ arr_delay ┃ n      ┃ prop     ┃\n┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━┩\n│ int64     │ int64  │ float64  │\n├───────────┼────────┼──────────┤\n│         0 │ 273279 │ 0.838745 │\n│         1 │  52540 │ 0.161255 │\n└───────────┴────────┴──────────┘"
   },
   {
     "objectID": "tutorial/scikit-learn.html#data-splitting",
     "href": "tutorial/scikit-learn.html#data-splitting",
     "title": "Preprocess your data with recipes",
     "section": "Data splitting",
-    "text": "Data splitting\nTo get started, let’s split this single dataset into two: a training set and a testing set. We’ll keep most of the rows in the original dataset (subset chosen randomly) in the training set. The training data will be used to fit the model, and the testing set will be used to measure model performance.\nBecause the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.\n\nflight_data_with_unique_key = flight_data.mutate(\n    unique_key=ibis.literal(\",\").join(\n        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]\n    )\n)\nflight_data_with_unique_key\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┤\n│ 05:57:00 │    461 │ LGA    │ ATL    │      100 │      762 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,461,2013-06-26  │\n│ 05:58:00 │   4424 │ EWR    │ RDU    │       63 │      416 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ EV,4424,2013-06-26 │\n│ 05:58:00 │   6177 │ EWR    │ IAD    │       45 │      212 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ EV,6177,2013-06-26 │\n│ 06:00:00 │    731 │ LGA    │ DTW    │       78 │      502 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,731,2013-06-26  │\n│ 06:01:00 │    684 │ EWR    │ LAX    │      316 │     2454 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ UA,684,2013-06-26  │\n│ 06:01:00 │    301 │ LGA    │ ORD    │      164 │      733 │ AA      │ 2013-06-26 │         1 │ 2013-06-26 10:00:00 │ AA,301,2013-06-26  │\n│ 06:01:00 │   1837 │ LGA    │ MIA    │      148 │     1096 │ AA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ AA,1837,2013-06-26 │\n│ 06:01:00 │   1279 │ LGA    │ MEM    │      128 │      963 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,1279,2013-06-26 │\n│ 06:02:00 │   1691 │ JFK    │ LAX    │      309 │     2475 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ UA,1691,2013-06-26 │\n│ 06:04:00 │   1447 │ JFK    │ CLT    │       75 │      541 │ US      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ US,1447,2013-06-26 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┘\n\n\n\n\n# FIXME(deepyaman): Proposed key isn't unique for actual departure date.\nflight_data_with_unique_key.group_by(\"unique_key\").mutate(\n    cnt=flight_data_with_unique_key.count()\n)[ibis._.cnt > 1]\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃ cnt   ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │ int64 │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┼───────┤\n│ 19:59:00 │   1022 │ EWR    │ IAH    │      167 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 23:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 20:00:00 │   1022 │ EWR    │ IAH    │      186 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 00:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 19:12:00 │   1023 │ LGA    │ ORD    │      112 │      733 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 23:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 21:16:00 │   1023 │ EWR    │ IAH    │      175 │     1400 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 01:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 21:22:00 │   1052 │ EWR    │ IAH    │      173 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 01:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 15:18:00 │   1052 │ EWR    │ IAH    │      174 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 19:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 19:27:00 │   1053 │ EWR    │ CLE    │       69 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 00:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 18:39:00 │   1053 │ EWR    │ CLE    │       72 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 23:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 20:16:00 │   1071 │ EWR    │ BQN    │      196 │     1585 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 01:00:00 │ UA,1071,2013-02-26 │     2 │\n│ 17:20:00 │   1071 │ EWR    │ PHX    │      281 │     2133 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 22:00:00 │ UA,1071,2013-02-26 │     2 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │     … │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┴───────┘\n\n\n\n\nimport random\n\n# Fix the random numbers by setting the seed\n# This enables the analysis to be reproducible when random numbers are used\nrandom.seed(222)\n\n# Put 3/4 of the data into the training set\nrandom_key = str(random.getrandbits(256))\ndata_split = flight_data_with_unique_key.mutate(\n    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3\n)\n\n# Create data frames for the two sets:\ntrain_data = data_split[data_split.train].drop(\"unique_key\", \"train\")\ntest_data = data_split[~data_split.train].drop(\"unique_key\", \"train\")"
+    "text": "Data splitting\nTo get started, let’s split this single dataset into two: a training set and a testing set. We’ll keep most of the rows in the original dataset (subset chosen randomly) in the training set. The training data will be used to fit the model, and the testing set will be used to measure model performance.\nBecause the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.\n\nflight_data_with_unique_key = flight_data.mutate(\n    unique_key=ibis.literal(\",\").join(\n        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]\n    )\n)\nflight_data_with_unique_key\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┤\n│ 05:57:00 │    461 │ LGA    │ ATL    │      100 │      762 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,461,2013-06-26  │\n│ 05:58:00 │   4424 │ EWR    │ RDU    │       63 │      416 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ EV,4424,2013-06-26 │\n│ 05:58:00 │   6177 │ EWR    │ IAD    │       45 │      212 │ EV      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ EV,6177,2013-06-26 │\n│ 06:00:00 │    731 │ LGA    │ DTW    │       78 │      502 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,731,2013-06-26  │\n│ 06:01:00 │    684 │ EWR    │ LAX    │      316 │     2454 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ UA,684,2013-06-26  │\n│ 06:01:00 │    301 │ LGA    │ ORD    │      164 │      733 │ AA      │ 2013-06-26 │         1 │ 2013-06-26 10:00:00 │ AA,301,2013-06-26  │\n│ 06:01:00 │   1837 │ LGA    │ MIA    │      148 │     1096 │ AA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ AA,1837,2013-06-26 │\n│ 06:01:00 │   1279 │ LGA    │ MEM    │      128 │      963 │ DL      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ DL,1279,2013-06-26 │\n│ 06:02:00 │   1691 │ JFK    │ LAX    │      309 │     2475 │ UA      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ UA,1691,2013-06-26 │\n│ 06:04:00 │   1447 │ JFK    │ CLT    │       75 │      541 │ US      │ 2013-06-26 │         0 │ 2013-06-26 10:00:00 │ US,1447,2013-06-26 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┘\n\n\n\n\n# FIXME(deepyaman): Proposed key isn't unique for actual departure date.\nflight_data_with_unique_key.group_by(\"unique_key\").mutate(\n    cnt=flight_data_with_unique_key.count()\n)[ibis._.cnt > 1]\n\n┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓\n┃ dep_time ┃ flight ┃ origin ┃ dest   ┃ air_time ┃ distance ┃ carrier ┃ date       ┃ arr_delay ┃ time_hour           ┃ unique_key         ┃ cnt   ┃\n┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩\n│ time     │ int64  │ string │ string │ int64    │ int64    │ string  │ date       │ int64     │ timestamp(6)        │ string             │ int64 │\n├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┼───────┤\n│ 20:00:00 │   1022 │ EWR    │ IAH    │      186 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 00:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 19:59:00 │   1022 │ EWR    │ IAH    │      167 │     1400 │ UA      │ 2013-09-14 │         0 │ 2013-09-14 23:00:00 │ UA,1022,2013-09-14 │     2 │\n│ 19:12:00 │   1023 │ LGA    │ ORD    │      112 │      733 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 23:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 21:16:00 │   1023 │ EWR    │ IAH    │      175 │     1400 │ UA      │ 2013-05-29 │         0 │ 2013-05-29 01:00:00 │ UA,1023,2013-05-29 │     2 │\n│ 21:22:00 │   1052 │ EWR    │ IAH    │      173 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 01:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 15:18:00 │   1052 │ EWR    │ IAH    │      174 │     1400 │ UA      │ 2013-08-27 │         0 │ 2013-08-27 19:00:00 │ UA,1052,2013-08-27 │     2 │\n│ 19:27:00 │   1053 │ EWR    │ CLE    │       69 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 00:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 18:39:00 │   1053 │ EWR    │ CLE    │       72 │      404 │ UA      │ 2013-12-20 │         0 │ 2013-12-20 23:00:00 │ UA,1053,2013-12-20 │     2 │\n│ 17:20:00 │   1071 │ EWR    │ PHX    │      281 │     2133 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 22:00:00 │ UA,1071,2013-02-26 │     2 │\n│ 20:16:00 │   1071 │ EWR    │ BQN    │      196 │     1585 │ UA      │ 2013-02-26 │         0 │ 2013-02-26 01:00:00 │ UA,1071,2013-02-26 │     2 │\n│ …        │      … │ …      │ …      │        … │        … │ …       │ …          │         … │ …                   │ …                  │     … │\n└──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┴───────┘\n\n\n\n\nimport random\n\n# Fix the random numbers by setting the seed\n# This enables the analysis to be reproducible when random numbers are used\nrandom.seed(222)\n\n# Put 3/4 of the data into the training set\nrandom_key = str(random.getrandbits(256))\ndata_split = flight_data_with_unique_key.mutate(\n    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3\n)\n\n# Create data frames for the two sets:\ntrain_data = data_split[data_split.train].drop(\"unique_key\", \"train\")\ntest_data = data_split[~data_split.train].drop(\"unique_key\", \"train\")"
   },
   {
     "objectID": "tutorial/scikit-learn.html#create-features",
diff --git a/support_matrix.html b/support_matrix.html
index a10c75a..1181125 100644
--- a/support_matrix.html
+++ b/support_matrix.html
@@ -291,7 +291,7 @@
 
- +
@@ -320,17 +320,17 @@ - + - + - - - + + - + + @@ -338,23 +338,23 @@ - - + + - + - + + - @@ -362,8 +362,8 @@ - - + + @@ -461,9 +461,9 @@

Introduction

-

-
-
import ibis
-
-con = ibis.connect("duckdb://nycflights13.ddb")
-con.create_table(
-    "flights", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True
-)
-con.create_table(
-    "weather", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True
-)
+

In this article, we’ll explore Recipes, which are designed to help you preprocess your data before training your model. Recipes are built as a series of preprocessing steps, such as:

+
    +
  • converting qualitative predictors to indicator variables (also known as dummy variables),

  • +
  • transforming data to be on a different scale (e.g., taking the logarithm of a variable),

  • +
  • transforming whole groups of predictors together,

  • +
  • extracting key features from raw variables (e.g., getting the day of the week out of a date variable),

  • +
+

and so on. If you are familiar with scikit-learn’s dataset transformations, a lot of this might sound familiar and like what a transformer already does. Recipes can be used to do many of the same things, but they can scale your workloads on any Ibis-supported backend. This article shows how to use recipes for modeling.

+

To use code in this article, you will need to install the following packages: Ibis, IbisML, and skorch, a high-level library for PyTorch that provides full scikit-learn compatibility.

+
pip install 'ibis-framework[duckdb,examples]' ibis-ml skorch torch
+
+
+

The New York City flight data

+

Let’s use the nycflights13 data to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let’s start by loading the data and making a few changes to the variables:

+
+
import ibis
+
+con = ibis.connect("duckdb://nycflights13.ddb")
+con.create_table(
+    "flights", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True
+)
+con.create_table(
+    "weather", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True
+)

You can now see the example dataset copied over to the database:

-
-
con = ibis.connect("duckdb://nycflights13.ddb")
-con.list_tables()
+
+
con = ibis.connect("duckdb://nycflights13.ddb")
+con.list_tables()
['flights', 'weather']

We’ll turn on interactive mode, which partially executes queries to give users a preview of the results.

-
-
ibis.options.interactive = True
+
+
ibis.options.interactive = True
-
-
flights = con.table("flights")
-flights = flights.mutate(
-    dep_time=(
-        flights.dep_time.lpad(4, "0").substr(0, 2)
-        + ":"
-        + flights.dep_time.substr(-2, 2)
-        + ":00"
-    ).try_cast("time"),
-    arr_delay=flights.arr_delay.try_cast(int),
-    air_time=flights.air_time.try_cast(int),
-)
-flights
+
+
flights = con.table("flights")
+flights = flights.mutate(
+    dep_time=(
+        flights.dep_time.lpad(4, "0").substr(0, 2)
+        + ":"
+        + flights.dep_time.substr(-2, 2)
+        + ":00"
+    ).try_cast("time"),
+    arr_delay=flights.arr_delay.try_cast(int),
+    air_time=flights.air_time.try_cast(int),
+)
+flights
┏━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
 ┃ year   month  day    dep_time  sched_dep_time  dep_delay  arr_time  sched_arr_time  arr_delay  carrier  flight  tailnum  origin  dest    air_time  distance  hour   minute  time_hour           ┃
@@ -366,9 +379,9 @@ 

Introduction

-
-
weather = con.table("weather")
-weather
+
+
weather = con.table("weather")
+weather
┏━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
 ┃ origin  year   month  day    hour   temp    dewp    humid   wind_dir  wind_speed          wind_gust  precip   pressure  visib    time_hour           ┃
@@ -390,64 +403,60 @@ 

Introduction

-
-
-

The New York City flight data

-

Let’s use the nycflights13 data to predict whether a plane arrives more than 30 minutes late. This data set contains information on 325,819 flights departing near New York City in 2013. Let’s start by loading the data and making a few changes to the variables:

-
-
flight_data = (
-    flights.mutate(
-        # Convert the arrival delay to a factor
-        # By default, PyTorch expects the target to have a Long datatype
-        arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast("int64"),
-        # We will use the date (not date-time) in the recipe below
-        date=flights.time_hour.date(),
-    )
-    # Include the weather data
-    .inner_join(weather, ["origin", "time_hour"])
-    # Only retain the specific columns we will use
-    .select(
-        "dep_time",
-        "flight",
-        "origin",
-        "dest",
-        "air_time",
-        "distance",
-        "carrier",
-        "date",
-        "arr_delay",
-        "time_hour",
-    )
-    # Exclude missing data
-    .dropna()
-)
-flight_data
+
+
flight_data = (
+    flights.mutate(
+        # Convert the arrival delay to a factor
+        # By default, PyTorch expects the target to have a Long datatype
+        arr_delay=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast("int64"),
+        # We will use the date (not date-time) in the recipe below
+        date=flights.time_hour.date(),
+    )
+    # Include the weather data
+    .inner_join(weather, ["origin", "time_hour"])
+    # Only retain the specific columns we will use
+    .select(
+        "dep_time",
+        "flight",
+        "origin",
+        "dest",
+        "air_time",
+        "distance",
+        "carrier",
+        "date",
+        "arr_delay",
+        "time_hour",
+    )
+    # Exclude missing data
+    .dropna()
+)
+flight_data
┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
 ┃ dep_time  flight  origin  dest    air_time  distance  carrier  date        arr_delay  time_hour           ┃
 ┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
 │ timeint64stringstringint64int64stringdateint64timestamp(6)        │
 ├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤
-│ 10:45:0067EWR   ORD   120719UA     2013-02-1402013-02-14 15:00:00 │
-│ 10:48:00373LGA   FLL   1791076B6     2013-02-1402013-02-14 15:00:00 │
-│ 10:48:00764EWR   IAH   2071400UA     2013-02-1402013-02-14 15:00:00 │
-│ 10:51:002044LGA   MIA   1711096DL     2013-02-1402013-02-14 16:00:00 │
-│ 10:51:002171LGA   DCA   40214US     2013-02-1402013-02-14 16:00:00 │
-│ 10:57:001275JFK   SLC   2861990DL     2013-02-1402013-02-14 16:00:00 │
-│ 10:57:00366LGA   STL   135888WN     2013-02-1402013-02-14 16:00:00 │
-│ 10:57:001550EWR   SFO   3382565UA     2013-02-1402013-02-14 15:00:00 │
-│ 10:58:004694EWR   MKE   113725EV     2013-02-1402013-02-14 15:00:00 │
-│ 10:58:001647LGA   ATL   117762DL     2013-02-1402013-02-14 16:00:00 │
+│ 05:17:001545EWR   IAH   2271400UA     2013-01-0102013-01-01 10:00:00 │
+│ 05:54:00461LGA   ATL   116762DL     2013-01-0102013-01-01 11:00:00 │
+│ 05:54:001696EWR   ORD   150719UA     2013-01-0102013-01-01 10:00:00 │
+│ 05:55:00507EWR   FLL   1581065B6     2013-01-0102013-01-01 11:00:00 │
+│ 05:57:005708LGA   IAD   53229EV     2013-01-0102013-01-01 11:00:00 │
+│ 05:57:0079JFK   MCO   140944B6     2013-01-0102013-01-01 11:00:00 │
+│ 05:58:00301LGA   ORD   138733AA     2013-01-0102013-01-01 11:00:00 │
+│ 05:58:0049JFK   PBI   1491028B6     2013-01-0102013-01-01 11:00:00 │
+│ 05:58:0071JFK   TPA   1581005B6     2013-01-0102013-01-01 11:00:00 │
+│ 05:58:00194JFK   LAX   3452475UA     2013-01-0102013-01-01 11:00:00 │
 │                    │
 └──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘
 

We can see that about 16% of the flights in this data set arrived more than 30 minutes late.

-
-
flight_data.arr_delay.value_counts().rename(n="arr_delay_count").mutate(
-    prop=ibis._.n / ibis._.n.sum()
-)
+
+
flight_data.arr_delay.value_counts().rename(n="arr_delay_count").mutate(
+    prop=ibis._.n / ibis._.n.sum()
+)
┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓
 ┃ arr_delay  n       prop     ┃
@@ -465,39 +474,39 @@ 

The New York

Data splitting

To get started, let’s split this single dataset into two: a training set and a testing set. We’ll keep most of the rows in the original dataset (subset chosen randomly) in the training set. The training data will be used to fit the model, and the testing set will be used to measure model performance.

Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.

-
-
flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
+
+
flight_data_with_unique_key = flight_data.mutate(
+    unique_key=ibis.literal(",").join(
+        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
+    )
+)
+flight_data_with_unique_key
┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓
 ┃ dep_time  flight  origin  dest    air_time  distance  carrier  date        arr_delay  time_hour            unique_key         ┃
 ┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩
 │ timeint64stringstringint64int64stringdateint64timestamp(6)string             │
 ├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┼────────────────────┤
-│ 05:57:00461LGA   ATL   100762DL     2013-06-2602013-06-26 10:00:00DL,461,2013-06-26  │
-│ 05:58:004424EWR   RDU   63416EV     2013-06-2602013-06-26 10:00:00EV,4424,2013-06-26 │
-│ 05:58:006177EWR   IAD   45212EV     2013-06-2602013-06-26 10:00:00EV,6177,2013-06-26 │
-│ 06:00:00731LGA   DTW   78502DL     2013-06-2602013-06-26 10:00:00DL,731,2013-06-26  │
-│ 06:01:00684EWR   LAX   3162454UA     2013-06-2602013-06-26 10:00:00UA,684,2013-06-26  │
-│ 06:01:00301LGA   ORD   164733AA     2013-06-2612013-06-26 10:00:00AA,301,2013-06-26  │
-│ 06:01:001837LGA   MIA   1481096AA     2013-06-2602013-06-26 10:00:00AA,1837,2013-06-26 │
-│ 06:01:001279LGA   MEM   128963DL     2013-06-2602013-06-26 10:00:00DL,1279,2013-06-26 │
-│ 06:02:001691JFK   LAX   3092475UA     2013-06-2602013-06-26 10:00:00UA,1691,2013-06-26 │
-│ 06:04:001447JFK   CLT   75541US     2013-06-2602013-06-26 10:00:00US,1447,2013-06-26 │
+│ 05:17:001545EWR   IAH   2271400UA     2013-01-0102013-01-01 10:00:00UA,1545,2013-01-01 │
+│ 05:54:00461LGA   ATL   116762DL     2013-01-0102013-01-01 11:00:00DL,461,2013-01-01  │
+│ 05:54:001696EWR   ORD   150719UA     2013-01-0102013-01-01 10:00:00UA,1696,2013-01-01 │
+│ 05:55:00507EWR   FLL   1581065B6     2013-01-0102013-01-01 11:00:00B6,507,2013-01-01  │
+│ 05:57:005708LGA   IAD   53229EV     2013-01-0102013-01-01 11:00:00EV,5708,2013-01-01 │
+│ 05:57:0079JFK   MCO   140944B6     2013-01-0102013-01-01 11:00:00B6,79,2013-01-01   │
+│ 05:58:00301LGA   ORD   138733AA     2013-01-0102013-01-01 11:00:00AA,301,2013-01-01  │
+│ 05:58:0049JFK   PBI   1491028B6     2013-01-0102013-01-01 11:00:00B6,49,2013-01-01   │
+│ 05:58:0071JFK   TPA   1581005B6     2013-01-0102013-01-01 11:00:00B6,71,2013-01-01   │
+│ 05:58:00194JFK   LAX   3452475UA     2013-01-0102013-01-01 11:00:00UA,194,2013-01-01  │
 │                   │
 └──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┴────────────────────┘
 
-
-
# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
+
+
# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
+flight_data_with_unique_key.group_by("unique_key").mutate(
+    cnt=flight_data_with_unique_key.count()
+)[ibis._.cnt > 1]
┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
 ┃ dep_time  flight  origin  dest    air_time  distance  carrier  date        arr_delay  time_hour            unique_key          cnt   ┃
@@ -519,40 +528,40 @@ 

Data splitting

-
-
import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
-
-# Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+
+
import random
+
+# Fix the random numbers by setting the seed
+# This enables the analysis to be reproducible when random numbers are used
+random.seed(222)
+
+# Put 3/4 of the data into the training set
+random_key = str(random.getrandbits(256))
+data_split = flight_data_with_unique_key.mutate(
+    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
+)
+
+# Create data frames for the two sets:
+train_data = data_split[data_split.train].drop("unique_key", "train")
+test_data = data_split[~data_split.train].drop("unique_key", "train")

Create features

-
-
import ibis_ml as ml
-
-flights_rec = ml.Recipe(
-    ml.ExpandDate("date", components=["dow", "month"]),
-    ml.Drop("date"),
-    ml.TargetEncode(ml.nominal()),
-    ml.DropZeroVariance(ml.everything()),
-    ml.MutateAt("dep_time", ibis._.hour() * 60 + ibis._.minute()),
-    ml.MutateAt(ml.timestamp(), ibis._.epoch_seconds()),
-    # By default, PyTorch requires that the type of `X` is `np.float32`.
-    # https://discuss.pytorch.org/t/mat1-and-mat2-must-have-the-same-dtype-but-got-double-and-float/197555/2
-    ml.Cast(ml.numeric(), "float32"),
-)
+
+
import ibis_ml as ml
+
+flights_rec = ml.Recipe(
+    ml.ExpandDate("date", components=["dow", "month"]),
+    ml.Drop("date"),
+    ml.TargetEncode(ml.nominal()),
+    ml.DropZeroVariance(ml.everything()),
+    ml.MutateAt("dep_time", ibis._.hour() * 60 + ibis._.minute()),
+    ml.MutateAt(ml.timestamp(), ibis._.epoch_seconds()),
+    # By default, PyTorch requires that the type of `X` is `np.float32`.
+    # https://discuss.pytorch.org/t/mat1-and-mat2-must-have-the-same-dtype-but-got-double-and-float/197555/2
+    ml.Cast(ml.numeric(), "float32"),
+)
@@ -565,59 +574,59 @@

Fit a model with
  • Apply the recipe to the test set: We create the final predictor set on the test set. Nothing is recomputed and no information from the test set is used here; the dummy variable and zero-variance results from the training set are applied to the test set.

  • To simplify this process, we can use a scikit-learn Pipeline.

    -
    -
    from sklearn.pipeline import Pipeline
    -from skorch import NeuralNetClassifier
    -from torch import nn
    -
    -
    -class MyModule(nn.Module):
    -    def __init__(self, num_units=10, nonlin=nn.ReLU()):
    -        super().__init__()
    -
    -        self.dense0 = nn.Linear(10, num_units)
    -        self.nonlin = nonlin
    -        self.dropout = nn.Dropout(0.5)
    -        self.dense1 = nn.Linear(num_units, num_units)
    -        self.output = nn.Linear(num_units, 2)
    -        self.softmax = nn.Softmax(dim=-1)
    -
    -    def forward(self, X, **kwargs):
    -        X = self.nonlin(self.dense0(X))
    -        X = self.dropout(X)
    -        X = self.nonlin(self.dense1(X))
    -        X = self.softmax(self.output(X))
    -        return X
    -
    -
    -net = NeuralNetClassifier(
    -    MyModule,
    -    max_epochs=10,
    -    lr=0.1,
    -    # Shuffle training data on each epoch
    -    iterator_train__shuffle=True,
    -)
    -
    -pipe = Pipeline([("flights_rec", flights_rec), ("net", net)])
    +
    +
    from sklearn.pipeline import Pipeline
    +from skorch import NeuralNetClassifier
    +from torch import nn
    +
    +
    +class MyModule(nn.Module):
    +    def __init__(self, num_units=10, nonlin=nn.ReLU()):
    +        super().__init__()
    +
    +        self.dense0 = nn.Linear(10, num_units)
    +        self.nonlin = nonlin
    +        self.dropout = nn.Dropout(0.5)
    +        self.dense1 = nn.Linear(num_units, num_units)
    +        self.output = nn.Linear(num_units, 2)
    +        self.softmax = nn.Softmax(dim=-1)
    +
    +    def forward(self, X, **kwargs):
    +        X = self.nonlin(self.dense0(X))
    +        X = self.dropout(X)
    +        X = self.nonlin(self.dense1(X))
    +        X = self.softmax(self.output(X))
    +        return X
    +
    +
    +net = NeuralNetClassifier(
    +    MyModule,
    +    max_epochs=10,
    +    lr=0.1,
    +    # Shuffle training data on each epoch
    +    iterator_train__shuffle=True,
    +)
    +
    +pipe = Pipeline([("flights_rec", flights_rec), ("net", net)])

    Now, there is a single function that can be used to prepare the recipe and train the model from the resulting predictors:

    -
    -
    X_train = train_data.drop("arr_delay")
    -y_train = train_data.arr_delay
    -pipe.fit(X_train, y_train)
    +
    +
    X_train = train_data.drop("arr_delay")
    +y_train = train_data.arr_delay
    +pipe.fit(X_train, y_train)
      epoch    train_loss    valid_acc    valid_loss     dur
     -------  ------------  -----------  ------------  ------
    -      1       10.3153       0.1612       13.3726  2.2884
    -      2        8.7295       0.1612       13.3726  2.2764
    -      3        8.1396       0.1612       13.3726  2.2633
    -      4        6.7964       0.8388        2.5698  2.2661
    -      5        6.0716       0.8388        2.5698  2.2583
    -      6        6.0462       0.8388        2.5698  2.2638
    -      7        6.0914       0.8388        2.5698  2.2605
    -      8        6.1668       0.8388        2.5698  2.2585
    -      9        5.9999       0.8388        2.5698  2.2621
    -     10        5.9069       0.8388        2.5698  2.2600
    + 1 2.7189 0.8388 2.5698 2.2748 + 2 2.5384 0.8388 2.5698 2.2485 + 3 2.5381 0.8388 2.5698 2.2467 + 4 2.5382 0.8388 2.5698 2.2512 + 5 2.5344 0.8388 2.5698 2.2496 + 6 2.5337 0.8388 2.5698 2.2620 + 7 2.5370 0.8388 2.5698 2.2255 + 8 2.5384 0.8388 2.5698 2.2156 + 9 2.5375 0.8388 2.5698 2.2303 + 10 2.5389 0.8388 2.5698 2.2218

    Full coverage18 (86%)17 (81%) 19 (90%) 17 (81%) 18 (86%)10 (48%)9 (43%) 21 (100%)11 (52%)15 (71%)16 (76%)10 (48%)14 (67%) 15 (71%)18 (86%)14 (67%)17 (81%) 17 (81%) 17 (81%) 20 (95%)20 (95%) 20 (95%) 21 (100%)15 (71%)18 (86%)14 (67%)17 (81%)
    Partial coverage1 (5%)2 (10%) 1 (5%) 2 (10%) 1 (5%) 5 (24%) 0 (0%) 5 (24%)2 (10%)3 (14%)4 (19%) 3 (14%) 2 (10%)1 (5%) 3 (14%) 2 (10%) 0 (0%)0 (0%) 0 (0%) 0 (0%)3 (14%)1 (5%)4 (19%)2 (10%)
    Category