Improve DatetimeEncoder (#784)
Co-authored-by: Jérôme Dockès <[email protected]>
Vincent-Maladiere and jeromedockes authored Nov 9, 2023
1 parent 77b1ccc commit 2bda119
Showing 9 changed files with 1,128 additions and 721 deletions.
9 changes: 9 additions & 0 deletions CHANGES.rst
@@ -15,6 +15,10 @@ development and backward compatibility is not ensured.
Major changes
-------------

* :func:`to_datetime` is now available to extend :func:`pandas.to_datetime`
  to dataframes and 2D arrays.
:pr:`784` by :user:`Vincent Maladiere <Vincent-Maladiere>`
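
  A minimal sketch of the new helper, based on the entry above (the toy
  dataframe and column names are illustrative, not from the source)::

    # Hedged sketch: skrub.to_datetime applied to a whole dataframe.
    import pandas as pd
    from skrub import to_datetime

    df = pd.DataFrame({
        "when": ["2023-11-09 10:00", "2023-11-10 11:30"],  # parsable as datetimes
        "city": ["Paris", "London"],  # non-datetime columns are left as-is
    })
    df = to_datetime(df)  # like pandas.to_datetime, applied column by column
    print(df.dtypes)  # "when" should now be datetime64[ns]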

* Some parameters of :class:`Joiner` have changed. The goal is to harmonize
  parameters across all estimators that perform join(-like) operations, as
  discussed in `#751 <https://github.com/skrub-data/skrub/discussions/751>`_.
@@ -57,6 +61,11 @@ Major changes

Minor changes
-------------
* :class:`DatetimeEncoder` doesn't remove constant features anymore.
  It also supports an ``errors`` argument to raise or coerce errors during
  transform, and an ``add_total_seconds`` argument to include the number of
  seconds since Epoch.
  :pr:`784` by :user:`Vincent Maladiere <Vincent-Maladiere>`
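
  A hedged sketch of the two new arguments described above (the toy data and
  column name are illustrative)::

    import pandas as pd
    from skrub import DatetimeEncoder

    X = pd.DataFrame(
        {"login": pd.to_datetime(["2023-11-09 10:00", "2023-11-10 11:30"])}
    )
    # errors="coerce" turns unparsable values into missing values instead of
    # raising at transform time; add_total_seconds=True appends the number of
    # seconds since Epoch as an extra feature.
    encoder = DatetimeEncoder(add_total_seconds=True, errors="coerce")
    X_out = encoder.fit_transform(X)
    print(encoder.get_feature_names_out())  # expect a "*_total_seconds" feature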

* Scaling of ``matching_score`` in :func:`fuzzy_join` is now between 0 and 1; it used to be between 0.5 and 1. Moreover, the division by 0 error that occurred when all rows had a perfect match has been fixed. :pr:`802` by :user:`Jérôme Dockès <jeromedockes>`.
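
  A hedged sketch of the rescaled score (the ``matching_score`` name and the
  [0, 1] range come from the entry above; ``return_score`` and the toy frames
  are assumptions for illustration)::

    import pandas as pd
    from skrub import fuzzy_join

    left = pd.DataFrame({"city": ["Londn", "Pari"]})
    right = pd.DataFrame({"city": ["London", "Paris"], "country": ["UK", "FR"]})

    # return_score is assumed to expose the matching_score column.
    joined = fuzzy_join(left, right, left_on="city", right_on="city", return_score=True)
    print(joined["matching_score"])  # values now lie in [0, 1]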

10 changes: 9 additions & 1 deletion doc/api.rst
@@ -79,7 +79,7 @@ This page lists all available functions and classes of `skrub`.

.. raw:: html

<h2>Other encoders</h2>
<h2>Dealing with dates</h2>

.. autosummary::
:toctree: generated/
@@ -89,6 +89,14 @@ This page lists all available functions and classes of `skrub`.

DatetimeEncoder

.. autosummary::
:toctree: generated/
:template: function.rst
:nosignatures:
:caption: Converting datetime columns in a table

to_datetime

.. raw:: html

<h2>Deduplication: merging variants of the same entry</h2>
1 change: 1 addition & 0 deletions doc/conf.py
@@ -504,6 +504,7 @@ def notebook_modification_function(notebook_content, notebook_filename):
"SimilarityEncoder": "skrub.SimilarityEncoder",
"DatetimeEncoder": "skrub.DatetimeEncoder",
"deduplicate": "skrub.deduplicate",
"to_datetime": "skrub.to_datetime",
"TableVectorizer": "skrub.TableVectorizer",
"DatasetInfoOnly": "skrub.datasets._fetching.DatasetInfoOnly",
"DatasetAll": "skrub.datasets._fetching.DatasetAll",
181 changes: 82 additions & 99 deletions examples/03_datetime_encoder.py
@@ -34,6 +34,9 @@
.. |HGBR| replace::
:class:`~sklearn.ensemble.HistGradientBoostingRegressor`
.. |to_datetime| replace::
:func:`~skrub.to_datetime`
"""


@@ -46,19 +49,26 @@
# on the location, date and time of measurement.

from pprint import pprint

import pandas as pd

data = pd.read_csv(
"https://raw.githubusercontent.com/pandas-dev/pandas"
"/main/doc/data/air_quality_no2_long.csv"
)
).sort_values("date.utc")
# Extract our input data (X) and the target column (y)
y = data["value"]
X = data[["city", "date.utc"]]

X

###############################################################################
# We convert the dataframe date columns using |to_datetime|. Notice how
# we don't need to specify the columns to convert.
from skrub import to_datetime

X = to_datetime(X)
X.dtypes

###############################################################################
# Encoding the features
# .....................
@@ -73,27 +83,22 @@
# lower units, as they are probably unimportant.

from sklearn.preprocessing import OneHotEncoder

from skrub import DatetimeEncoder

from sklearn.compose import make_column_transformer
from skrub import DatetimeEncoder

encoder = make_column_transformer(
(OneHotEncoder(handle_unknown="ignore"), ["city"]),
(DatetimeEncoder(add_day_of_the_week=True, extract_until="minute"), ["date.utc"]),
(DatetimeEncoder(add_day_of_the_week=True, resolution="minute"), ["date.utc"]),
remainder="drop",
)

X_enc = encoder.fit_transform(X)
pprint(encoder.get_feature_names_out())

###############################################################################
# We see that the encoder is working as expected: the "date.utc" column has
# been replaced by features extracting the month, day, hour, and day of the
# week information.
#
# Note the year and minute features are not present, this is because they
# have been removed by the encoder as they are constant the whole period.
# We see that the encoder is working as expected: the ``"date.utc"`` column has
# been replaced by features extracting the month, day, hour, minute, day of the
# week, and total seconds since Epoch information.

###############################################################################
# One-liner with the |TableVectorizer|
@@ -104,8 +109,7 @@

from skrub import TableVectorizer

table_vec = TableVectorizer()
table_vec.fit_transform(X)
table_vec = TableVectorizer().fit(X)
pprint(table_vec.get_feature_names_out())

###############################################################################
@@ -116,8 +120,7 @@

table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)
table_vec.fit_transform(X)
).fit(X)
pprint(table_vec.get_feature_names_out())

###############################################################################
Expand All @@ -144,14 +147,9 @@
# ```py
# from sklearn.experimental import enable_hist_gradient_boosting
# ```

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline

table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)
pipeline = make_pipeline(table_vec, HistGradientBoostingRegressor())

###############################################################################
@@ -164,11 +162,6 @@
#
# Instead, we can use the |TimeSeriesSplit|,
# which ensures that the test set is always in the future.

sorted_indices = np.argsort(X["date.utc"])
X = X.iloc[sorted_indices]
y = y.iloc[sorted_indices]

from sklearn.model_selection import TimeSeriesSplit, cross_val_score

cross_val_score(
@@ -185,82 +178,71 @@
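# The diff fold above hides the full ``cross_val_score`` call; as a hedged
# sketch (the argument values are illustrative, not necessarily the file's
# actual ones), a TimeSeriesSplit-based evaluation looks like:
#
# ```py
# scores = cross_val_score(
#     pipeline,  # the TableVectorizer + HistGradientBoostingRegressor pipeline
#     X,
#     y,
#     cv=TimeSeriesSplit(n_splits=5),  # each test fold is later than its train folds
#     scoring="neg_mean_squared_error",
# )
# ```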
#
# The mean squared error is not obvious to interpret, so we visually compare
# our model's predictions with the actual values.

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import AutoDateFormatter, AutoDateLocator

X_train = X[X["date.utc"] < "2019-06-01"]
X_test = X[X["date.utc"] >= "2019-06-01"]

y_train = y[X["date.utc"] < "2019-06-01"]
y_test = y[X["date.utc"] >= "2019-06-01"]
mask_train = X["date.utc"] < "2019-06-01"
X_train, X_test = X.loc[mask_train], X.loc[~mask_train]
y_train, y_test = y.loc[mask_train], y.loc[~mask_train]

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

all_cities = X_test["city"].unique()

fig, axs = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9))
fig.subplots_adjust(hspace=0.5)
fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9))
for ax, city in zip(axes, all_cities):
mask_prediction = X_test["city"] == city
date_prediction = X_test.loc[mask_prediction]["date.utc"]
y_prediction = y_pred[mask_prediction]

for i, city in enumerate(all_cities):
axs[i].plot(
X.loc[X.city == city, "date.utc"],
y.loc[X.city == city],
label="Actual",
)
axs[i].plot(
X_test.loc[X_test.city == city, "date.utc"],
pipeline.predict(X_test.loc[X_test.city == city]),
label="Predicted",
mask_reference = X["city"] == city
date_reference = X.loc[mask_reference]["date.utc"]
y_reference = y[mask_reference]

ax.plot(date_reference, y_reference, label="Actual")
ax.plot(date_prediction, y_prediction, label="Predicted")

ax.set(
ylabel="NO2",
title=city,
)
axs[i].set_title(city)
axs[i].set_ylabel("NO2")
xtick_locator = AutoDateLocator(maxticks=8)
xtick_formatter = AutoDateFormatter(xtick_locator)
axs[i].xaxis.set_major_locator(xtick_locator)
axs[i].xaxis.set_major_formatter(xtick_formatter)
axs[i].legend()
ax.legend()

fig.subplots_adjust(hspace=0.5)
plt.show()

###############################################################################
# Let's zoom in on a few days:

X_zoomed = X[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")]
y_zoomed = y[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")]

X_train_zoomed = X_zoomed[X_zoomed["date.utc"] < "2019-06-03"]
X_test_zoomed = X_zoomed[X_zoomed["date.utc"] >= "2019-06-03"]
mask_zoom_reference = (X["date.utc"] >= "2019-06-01") & (X["date.utc"] < "2019-06-04")
mask_zoom_prediction = (X_test["date.utc"] >= "2019-06-01") & (
X_test["date.utc"] < "2019-06-04"
)

y_train_zoomed = y[X["date.utc"] < "2019-06-03"]
y_test_zoomed = y[X["date.utc"] >= "2019-06-03"]
all_cities = ["Paris", "London"]
fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9))
for ax, city in zip(axes, all_cities):
mask_prediction = (X_test["city"] == city) & mask_zoom_prediction
date_prediction = X_test.loc[mask_prediction]["date.utc"]
y_prediction = y_pred[mask_prediction]

zoomed_cities = X_test_zoomed["city"].unique()
mask_reference = (X["city"] == city) & mask_zoom_reference
date_reference = X.loc[mask_reference]["date.utc"]
y_reference = y[mask_reference]

fig, axs = plt.subplots(nrows=len(zoomed_cities), ncols=1, figsize=(12, 9))
fig.subplots_adjust(hspace=0.5)
ax.plot(date_reference, y_reference, label="Actual")
ax.plot(date_prediction, y_prediction, label="Predicted")

for i, city in enumerate(zoomed_cities):
axs[i].plot(
X_zoomed.loc[X_zoomed["city"] == city, "date.utc"],
y_zoomed.loc[X_zoomed["city"] == city],
label="Actual",
)
axs[i].plot(
X_test_zoomed.loc[X_test_zoomed["city"] == city, "date.utc"],
pipeline.predict(X_test_zoomed.loc[X_test_zoomed["city"] == city]),
label="Predicted",
ax.set(
ylabel="NO2",
title=city,
)
axs[i].set_title(city)
axs[i].set_ylabel("NO2")

xtick_locator = AutoDateLocator(maxticks=8)
xtick_formatter = AutoDateFormatter(xtick_locator)
axs[i].xaxis.set_major_locator(xtick_locator)
axs[i].xaxis.set_major_formatter(xtick_formatter)
ax.legend()

axs[i].legend()
plt.show()


###############################################################################
# Feature importance
# ------------------
@@ -280,27 +262,28 @@

# In this case, we don't use a pipeline, because we want to compute the
# importance of the features created by the DatetimeEncoder
X_ = table_vec.fit_transform(X)
reg = HistGradientBoostingRegressor().fit(X_, y)
result = permutation_importance(reg, X_, y, n_repeats=10, random_state=0)
std = result.importances_std
importances = result.importances_mean
indices = np.argsort(importances)
# Sort from least to most
indices = list(reversed(indices))

plt.figure(figsize=(12, 9))
plt.title("Feature importances")
n = len(indices)
labels = np.array(table_vec.get_feature_names_out())[indices]
plt.barh(range(n), importances[indices], color="b", yerr=std[indices])
plt.yticks(range(n), labels, size=15)
plt.tight_layout(pad=1)
plt.show()
X_transform = table_vec.fit_transform(X)
feature_names = table_vec.get_feature_names_out()

model = HistGradientBoostingRegressor().fit(X_transform, y)
result = permutation_importance(model, X_transform, y, n_repeats=10, random_state=0)

result = pd.DataFrame(
dict(
feature_names=feature_names,
std=result.importances_std,
importances=result.importances_mean,
)
).sort_values("importances", ascending=False)

result.plot.barh(
y="importances", x="feature_names", title="Feature Importances", figsize=(12, 9)
)
plt.tight_layout()

###############################################################################
# We can see that the hour of the day is the most important feature,
# which seems reasonable.
# We can see that the total seconds since Epoch and the hour of the day
# are the most important features, which seems reasonable.
#
# Conclusion
# ----------
3 changes: 2 additions & 1 deletion skrub/__init__.py
@@ -5,7 +5,7 @@

from ._agg_joiner import AggJoiner, AggTarget
from ._check_dependencies import check_dependencies
from ._datetime_encoder import DatetimeEncoder
from ._datetime_encoder import DatetimeEncoder, to_datetime
from ._deduplicate import compute_ngram_distance, deduplicate
from ._fuzzy_join import fuzzy_join
from ._gap_encoder import GapEncoder
@@ -34,6 +34,7 @@
"TargetEncoder",
"deduplicate",
"compute_ngram_distance",
"to_datetime",
"AggJoiner",
"AggTarget",
"SelectCols",
