From c9069bc809c5168deac07f9439acab56f631b428 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 4 Oct 2023 16:50:31 +0200 Subject: [PATCH 01/30] estimator refacto --- skrub/_datetime_encoder.py | 284 ++++++++++++++++++------------------- 1 file changed, 135 insertions(+), 149 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index dccb39301..f8b4d23d0 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -1,14 +1,12 @@ -from typing import Literal +from collections import defaultdict import numpy as np import pandas as pd -from numpy.typing import ArrayLike, NDArray from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted -from skrub._utils import check_input - -WORD_TO_ALIAS: dict[str, str] = { +WORD_TO_ALIAS = { "year": "Y", "month": "M", "day": "D", @@ -18,22 +16,28 @@ "microsecond": "us", "nanosecond": "N", } -TIME_LEVELS: list[str] = list(WORD_TO_ALIAS.keys()) -AcceptedTimeValues = Literal[ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", -] - - -class DatetimeEncoder(BaseEstimator, TransformerMixin): - """Transform each datetime column into several numeric columns \ - for temporal features (e.g. "year", "month", "day"...). +TIME_LEVELS = list(WORD_TO_ALIAS) + + +def is_datetime_parsable(X): + """ + Parameters + ---------- + X : numpy ndarray + """ + np_dtypes_candidates = [np.object_, np.str_, np.datetime64] + if any(np.issubdtype(X.dtype, np_dtype) for np_dtype in np_dtypes_candidates): + try: + _ = pd.to_datetime(X) + return True + except (pd.errors.ParserError, ValueError): + pass + return False + + +class DatetimeEncoder(TransformerMixin, BaseEstimator): + """Transforms each datetime column into several numeric columns \ + for temporal features (e.g year, month, day...). Constant extracted features are dropped; for instance, if the year is always the same in a feature, the extracted "year" column won't be added. @@ -98,72 +102,21 @@ class DatetimeEncoder(BaseEstimator, TransformerMixin): [2019., 10., 15., 12.]]) """ - n_features_in_: int - n_features_out_: int - features_per_column_: dict[int, list[str]] - col_names_: list[str] | None - def __init__( self, *, - extract_until: AcceptedTimeValues | None = "hour", - add_day_of_the_week: bool = False, + extract_until="hour", + add_day_of_the_week=False, + add_total_second=False, + errors="coerce", ): self.extract_until = extract_until self.add_day_of_the_week = add_day_of_the_week + self.add_total_second = add_total_second # TODO doc + self.errors = errors # TODO doc - def _more_tags(self): - """ - Used internally by sklearn to ease the estimator checks. - """ - return { - "X_types": ["2darray", "categorical"], - "allow_nan": True, - "_xfail_checks": {"check_dtype_object": "Specific datetime error."}, - } - - def _validate_keywords(self): - if self.extract_until not in TIME_LEVELS and self.extract_until is not None: - raise ValueError( - f'"extract_until" should be one of {TIME_LEVELS}, ' - f"got {self.extract_until}. 
" - ) - - @staticmethod - def _extract_from_date(date_series: pd.Series, feature: str): - if feature == "year": - return pd.DatetimeIndex(date_series).year.to_numpy() - elif feature == "month": - return pd.DatetimeIndex(date_series).month.to_numpy() - elif feature == "day": - return pd.DatetimeIndex(date_series).day.to_numpy() - elif feature == "hour": - return pd.DatetimeIndex(date_series).hour.to_numpy() - elif feature == "minute": - return pd.DatetimeIndex(date_series).minute.to_numpy() - elif feature == "second": - return pd.DatetimeIndex(date_series).second.to_numpy() - elif feature == "microsecond": - return pd.DatetimeIndex(date_series).microsecond.to_numpy() - elif feature == "nanosecond": - return pd.DatetimeIndex(date_series).nanosecond.to_numpy() - elif feature == "dayofweek": - return pd.DatetimeIndex(date_series).dayofweek.to_numpy() - elif feature == "total_time": - tz = pd.DatetimeIndex(date_series).tz - # Compute the time in seconds from the epoch time UTC - if tz is None: - return ( - pd.to_datetime(date_series) - pd.Timestamp("1970-01-01") - ) // pd.Timedelta("1s") - else: - return ( - pd.DatetimeIndex(date_series).tz_convert("utc") - - pd.Timestamp("1970-01-01", tz="utc") - ) // pd.Timedelta("1s") - - def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder": - """Fit the instance to ``X``. + def fit(self, X, y=None): + """Fit the instance to X. In practice, just check keywords and input validity, and stores which extracted features are not constant. @@ -180,52 +133,69 @@ def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder": DatetimeEncoder Fitted DatetimeEncoder instance (self). """ - self._validate_keywords() - if isinstance(X, pd.DataFrame): - self.col_names_ = X.columns.to_list() - else: - self.col_names_ = None - X = check_input(X) - # Features to extract for each column, after removing constant features - self.features_per_column_ = {} - for i in range(X.shape[1]): - self.features_per_column_[i] = [] - # Check which columns are constant - for i in range(X.shape[1]): - if self.extract_until is None: - if np.nanstd(self._extract_from_date(X[:, i], "total_time")) > 0: - self.features_per_column_[i].append("total_time") - else: - for feature in TIME_LEVELS: - if np.nanstd(self._extract_from_date(X[:, i], feature)) > 0: - if TIME_LEVELS.index(feature) <= TIME_LEVELS.index( - self.extract_until - ): - self.features_per_column_[i].append(feature) - # we add a total_time feature, which contains the full - # time to epoch, if there is at least one - # feature that has not been extracted and is not constant - if TIME_LEVELS.index(feature) > TIME_LEVELS.index( - self.extract_until - ): - self.features_per_column_[i].append("total_time") - break - # Add day of the week feature if needed - if ( - self.add_day_of_the_week - and np.nanstd(self._extract_from_date(X[:, i], "dayofweek")) > 0 - ): - self.features_per_column_[i].append("dayofweek") - - self.n_features_in_ = X.shape[1] - self.n_features_out_ = len( - np.concatenate(list(self.features_per_column_.values())) - ) + if self.extract_until not in TIME_LEVELS and self.extract_until is not None: + raise ValueError( + f"'extract_until' options are {TIME_LEVELS}, " + f"got {self.extract_until!r}." + ) + + errors_options = ["coerce", "raise"] + if self.errors not in errors_options: + raise ValueError( + f"errors options are {errors_options!r}, got {self.errors!r}." 
+ ) + + self._check_feature_names(X, reset=True) + self._check_n_features(X, reset=True) + X = check_array(X, ensure_2d=True, force_all_finite=False) + + self._parse_datetime_cols(X) return self - def transform(self, X: ArrayLike, y=None) -> NDArray: - """Transform ``X`` by replacing each datetime column with \ + def _parse_datetime_cols(self, X): + """ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + # Features to extract for each column, after removing constant features + self.features_per_column_ = defaultdict(list) + self.format_per_column_ = dict() + self.n_features_out_ = 0 + + if self.extract_until is None: + levels = [] + require_total_second = False + else: + idx_level = TIME_LEVELS.index(self.extract_until) + levels = TIME_LEVELS[:idx_level] + require_total_second = TIME_LEVELS == levels + + self.add_total_second_ = self.add_total_second or require_total_second + + columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) + for col_idx, col in enumerate(columns): + X_col = X[:, col_idx] + + if is_datetime_parsable(X_col): + # Pandas use the first non-null item of the array to infer the format. + mask_notnull = X_col == X_col + self.format_per_column_[col] = X_col[mask_notnull][0] + + self.features_per_column_[col] += levels + self.n_features_out_ += len(levels) + + if self.add_total_second_: + self.features_per_column_[col].append("total_time") + self.n_features_out_ += 1 + + if self.add_day_of_the_week: + self.features_per_column_[col].append("day_of_week") + self.n_features_out_ += 1 + + def transform(self, X, y=None): + """Transform `X` by replacing each datetime column with \ corresponding numerical features. Parameters @@ -240,28 +210,35 @@ def transform(self, X: ArrayLike, y=None) -> NDArray: ndarray, shape (``n_samples``, ``n_features_out_``) Transformed input. """ - check_is_fitted( - self, - attributes=["n_features_in_", "n_features_out_", "features_per_column_"], - ) - X = check_input(X) - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"The number of features in the input data ({X.shape[1]}) " - "does not match the number of features " - f"seen during fit ({self.n_features_in_}). 
" - ) - # Create a new array with the extracted features, - # choosing only features that weren't constant during fit - X_ = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) - idx = 0 - for i in range(X.shape[1]): - for j, feature in enumerate(self.features_per_column_[i]): - X_[:, idx + j] = self._extract_from_date(X[:, i], feature) - idx += len(self.features_per_column_[i]) - return X_ - - def get_feature_names_out(self, input_features=None) -> list[str]: + check_is_fitted(self) + self._check_n_features(X, reset=False) + self._check_feature_names(X, reset=False) + X = check_array(X, ensure_2d=True, force_all_finite=False) + + columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) + X_out = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) + offset_idx = 0 + for col_idx, col in enumerate(columns): + if col in self.features_per_column_: + # X_j is a DatetimeIndex + X_j = pd.to_datetime(X[:, col_idx], errors=self.errors) + + features = self.features_per_column_[col] + for feat_idx, feature in enumerate(features): + if feature == "total_time": + if X_j.tz is not None: + X_j = X_j.tz_convert("utc") + # Total seconds since epoch + X_feature = (X_j.astype("int64") // 1e9).to_numpy() + else: + X_feature = getattr(X_j, feature).to_numpy() + + X_out[:, offset_idx + feat_idx] = X_feature + offset_idx += len(features) + + return X_out + + def get_feature_names_out(self, input_features=None): """Return clean feature names. Feature names are formatted like: "_" @@ -280,9 +257,18 @@ def get_feature_names_out(self, input_features=None) -> list[str]: list of str List of feature names. """ + check_is_fitted(self, "features_per_column_") feature_names = [] - for i in self.features_per_column_.keys(): - prefix = str(i) if self.col_names_ is None else self.col_names_[i] - for feature in self.features_per_column_[i]: - feature_names.append(f"{prefix}_{feature}") + for column, features in self.features_per_column_.items(): + feature_names += [f"{column}_{feat}" for feat in features] return feature_names + + def _more_tags(self): + """ + Used internally by sklearn to ease the estimator checks. + """ + return { + "X_types": ["2darray", "categorical"], + "allow_nan": True, + "_xfail_checks": {"check_dtype_object": "Specific datetime error."}, + } From da7d67880e962a54c00349c18df6c03cb85bc6bb Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 5 Oct 2023 14:46:55 +0200 Subject: [PATCH 02/30] revamp all tests from datetime_encoder --- skrub/_datetime_encoder.py | 59 ++- skrub/tests/test_datetime_encoder.py | 698 ++++++++++----------------- 2 files changed, 301 insertions(+), 456 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index f8b4d23d0..7a1a59ccf 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -23,7 +23,7 @@ def is_datetime_parsable(X): """ Parameters ---------- - X : numpy ndarray + X : np.ndarray of shape (n_sample,) """ np_dtypes_candidates = [np.object_, np.str_, np.datetime64] if any(np.issubdtype(X.dtype, np_dtype) for np_dtype in np_dtypes_candidates): @@ -35,6 +35,18 @@ def is_datetime_parsable(X): return False +def is_date_only(X): + """ + Parameters + ---------- + X : np.ndarray of shape (n_sample,) + """ + if is_datetime_parsable(X): + X_t = pd.to_datetime(X) + return np.all(X_t == X_t.normalize()) + return False + + class DatetimeEncoder(TransformerMixin, BaseEstimator): """Transforms each datetime column into several numeric columns \ for temporal features (e.g year, month, day...). 
@@ -107,7 +119,7 @@ def __init__( *, extract_until="hour", add_day_of_the_week=False, - add_total_second=False, + add_total_second=True, errors="coerce", ): self.extract_until = extract_until @@ -147,7 +159,7 @@ def fit(self, X, y=None): self._check_feature_names(X, reset=True) self._check_n_features(X, reset=True) - X = check_array(X, ensure_2d=True, force_all_finite=False) + X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) self._parse_datetime_cols(X) @@ -166,13 +178,9 @@ def _parse_datetime_cols(self, X): if self.extract_until is None: levels = [] - require_total_second = False else: idx_level = TIME_LEVELS.index(self.extract_until) - levels = TIME_LEVELS[:idx_level] - require_total_second = TIME_LEVELS == levels - - self.add_total_second_ = self.add_total_second or require_total_second + levels = TIME_LEVELS[: idx_level + 1] columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) for col_idx, col in enumerate(columns): @@ -180,14 +188,21 @@ def _parse_datetime_cols(self, X): if is_datetime_parsable(X_col): # Pandas use the first non-null item of the array to infer the format. - mask_notnull = X_col == X_col + X_dt = pd.to_datetime(X_col) + mask_notnull = X_dt == X_dt self.format_per_column_[col] = X_col[mask_notnull][0] + if is_date_only(X_col): + # Keep only date attributes + levels = [ + level for level in levels if level in ["year", "month", "day"] + ] + self.features_per_column_[col] += levels self.n_features_out_ += len(levels) - if self.add_total_second_: - self.features_per_column_[col].append("total_time") + if self.add_total_second: + self.features_per_column_[col].append("total_second") self.n_features_out_ += 1 if self.add_day_of_the_week: @@ -213,27 +228,33 @@ def transform(self, X, y=None): check_is_fitted(self) self._check_n_features(X, reset=False) self._check_feature_names(X, reset=False) - X = check_array(X, ensure_2d=True, force_all_finite=False) + X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) + # X_out must be of dtype float64 to handle np.nan X_out = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) offset_idx = 0 for col_idx, col in enumerate(columns): if col in self.features_per_column_: # X_j is a DatetimeIndex - X_j = pd.to_datetime(X[:, col_idx], errors=self.errors) + X_col = pd.to_datetime(X[:, col_idx], errors=self.errors) features = self.features_per_column_[col] for feat_idx, feature in enumerate(features): - if feature == "total_time": - if X_j.tz is not None: - X_j = X_j.tz_convert("utc") + if feature == "total_second": + if X_col.tz is not None: + X_col = X_col.tz_convert("utc") # Total seconds since epoch - X_feature = (X_j.astype("int64") // 1e9).to_numpy() + mask_notnull = X_col == X_col + X_feature = np.where( + mask_notnull, + X_col.astype("int64") // 1e9, + np.nan, + ) else: - X_feature = getattr(X_j, feature).to_numpy() - + X_feature = getattr(X_col, feature).to_numpy() X_out[:, offset_idx + feat_idx] = X_feature + offset_idx += len(features) return X_out diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index fa7e93a93..5ecdf383b 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -1,494 +1,318 @@ +from copy import deepcopy +from itertools import product + import numpy as np import pandas as pd import pytest -from sklearn.exceptions import NotFittedError +from numpy.testing import assert_allclose, assert_array_equal -from 
skrub._datetime_encoder import DatetimeEncoder +from skrub._datetime_encoder import TIME_LEVELS, DatetimeEncoder -def get_date_array() -> np.array: - return np.array( +def get_date(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"]), - pd.to_datetime(["2021-02-03", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2022-01-01", "2020-12-25", "2022-01-03"]), - pd.to_datetime(["2023-02-03", "2020-02-04", "2023-02-05"]), - ] + ["2020-01-01", "2020-01-02", "2020-01-03"], + ["2021-02-03", "2020-02-04", "2021-02-05"], + ["2022-01-01", "2020-12-25", "2022-01-03"], + ["2023-02-03", "2020-02-04", "2023-02-05"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_constant_date_array() -> np.array: - return np.array( +def get_constant_date(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - ] + ["2020-01-01", "2020-02-04", "2021-02-05"], + ["2020-01-01", "2020-02-04", "2021-02-05"], + ["2020-01-01", "2020-02-04", "2021-02-05"], + ["2020-01-01", "2020-02-04", "2021-02-05"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_datetime_array() -> np.array: - return np.array( +def get_datetime(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime( - [ - "2020-01-01 10:12:01", - "2020-01-02 10:23:00", - "2020-01-03 10:00:00", - ], - ), - pd.to_datetime( - [ - "2021-02-03 12:45:23", - "2020-02-04 22:12:00", - "2021-02-05 12:00:00", - ], - ), - pd.to_datetime( - [ - "2022-01-01 23:23:43", - "2020-12-25 11:12:00", - "2022-01-03 11:00:00", - ], - ), - pd.to_datetime( - [ - "2023-02-03 11:12:12", - "2020-02-04 08:32:00", - "2023-02-05 23:00:00", - ], - ), - ] + ["2020-01-01 10:12:01", "2020-01-02 10:23:00", "2020-01-03 10:00:00"], + ["2021-02-03 12:45:23", "2020-02-04 22:12:00", "2021-02-05 12:00:00"], + ["2022-01-01 23:23:43", "2020-12-25 11:12:00", "2022-01-03 11:00:00"], + ["2023-02-03 11:12:12", "2020-02-04 08:32:00", "2023-02-05 23:00:00"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_datetime_array_nanoseconds() -> np.array: - return np.array( +def get_nanoseconds(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime( - [ - # constant year and month - # for the first feature - "2020-08-24 15:55:30.123456789", - "2020-08-24 15:55:30.123456789", - ], - ), - pd.to_datetime( - [ - "2020-08-20 14:56:31.987654321", - "2021-07-20 14:56:31.987654321", - ], - ), - pd.to_datetime( - [ - "2020-08-20 14:57:32.123987654", - "2023-09-20 14:57:32.123987654", - ], - ), - pd.to_datetime( - [ - "2020-08-20 14:58:33.987123456", - "2023-09-20 14:58:33.987123456", - ], - ), - ] + ["2020-08-24 15:55:30.123456789", "2020-08-24 15:55:30.123456789"], + ["2020-08-20 14:56:31.987654321", "2021-07-20 14:56:31.987654321"], + ["2020-08-20 14:57:32.123987654", "2023-09-20 14:57:32.123987654"], + ["2020-08-20 14:58:33.987123456", "2023-09-20 14:58:33.987123456"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_dirty_datetime_array() -> np.array: - return np.array( +def get_nan_datetime(as_array=False): + df = pd.DataFrame( [ - np.array( - pd.to_datetime( - [ - "2020-01-01 10:12:01", - "2020-01-02 10:23:00", - "2020-01-03 10:00:00", - ] - ) - ), - np.array( - pd.to_datetime([np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"]) - ), - np.array( - pd.to_datetime(["2022-01-01 23:23:43", 
"2020-12-25 11:12:00", pd.NaT]) - ), - np.array( - pd.to_datetime( - [ - "2023-02-03 11:12:12", - "2020-02-04 08:32:00", - "2023-02-05 23:00:00", - ] - ) - ), - ] + ["2020-01-01 10:12:01", None, "2020-01-03 10:00:00"], + [np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"], + ["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NaT], + ["2023-02-03 11:12:12", "2020-02-04 08:32:00", "2023-02-05 23:00:00"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_datetime_with_TZ_array() -> pd.DataFrame: - res = pd.DataFrame( +def get_tz_datetime(as_array=False): + # The equivalent dtype is "datetime64[ns, Asia/Kolkata]" + df = pd.DataFrame( [ - pd.to_datetime(["2020-01-01 10:12:01"]), - pd.to_datetime(["2021-02-03 12:45:23"]), - pd.to_datetime(["2022-01-01 23:23:43"]), - pd.to_datetime(["2023-02-03 11:12:12"]), - ] - ) - for col in res.columns: - res[col] = pd.DatetimeIndex(res[col]).tz_localize("Asia/Kolkata") - return res - - -def test_fit() -> None: - # Dates - X = get_date_array() - enc = DatetimeEncoder() - expected_features_per_column_ = { - 0: ["year", "month", "day"], - 1: ["month", "day"], - 2: ["year", "month", "day"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - X = get_date_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_features_per_column_ = { - 0: ["year", "month", "day", "dayofweek"], - 1: ["month", "day", "dayofweek"], - 2: ["year", "month", "day", "dayofweek"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # Datetimes - X = get_datetime_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_features_per_column_ = { - 0: ["year", "month", "day", "hour", "total_time", "dayofweek"], - 1: ["month", "day", "hour", "total_time", "dayofweek"], - 2: ["year", "month", "day", "hour", "dayofweek"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # we check that the features are extracted until `extract_until` - # that constant feature are not extracted - # and that the total_time feature is extracted if needed - X = get_datetime_array() - enc = DatetimeEncoder(extract_until="minute") - expected_features_per_column_ = { - 0: ["year", "month", "day", "hour", "minute", "total_time"], - 1: ["month", "day", "hour", "minute"], - 2: ["year", "month", "day", "hour"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # extract_until="nanosecond" - X = get_datetime_array_nanoseconds() - enc = DatetimeEncoder(extract_until="nanosecond") - expected_features_per_column_ = { - # constant year and month - # for first feature - 0: [ - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - ], - 1: [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", + ["2020-01-01 10:12:01+05:30"], + ["2021-02-03 12:45:23+05:30"], + ["2022-01-01 23:23:43+05:30"], + ["2023-02-03 11:12:12+05:30"], ], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # Dirty Datetimes - X = get_dirty_datetime_array() - enc = DatetimeEncoder() - expected_features_per_column_ = { - 0: ["year", "month", "day", "hour", "total_time"], - 1: ["month", "day", "hour", "total_time"], - 2: ["year", "month", "day", "hour"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ + ) + if as_array: + return df.to_numpy() + return df - # Datetimes with TZ - X = get_datetime_with_TZ_array() - enc = DatetimeEncoder() - 
expected_features_per_column_ = {0: ["year", "month", "day", "hour", "total_time"]} - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - # Feature names - # Without column names - X = get_datetime_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_feature_names = [ - "0_year", - "0_month", - "0_day", - "0_hour", - "0_total_time", - "0_dayofweek", - "1_month", - "1_day", - "1_hour", - "1_total_time", - "1_dayofweek", - "2_year", - "2_month", - "2_day", - "2_hour", - "2_dayofweek", - ] +@pytest.mark.parametrize("as_array", [True, False]) +@pytest.mark.parametrize( + "get_data_func, features", + [ + (get_date, TIME_LEVELS[: TIME_LEVELS.index("day") + 1]), + (get_datetime, TIME_LEVELS), + (get_tz_datetime, TIME_LEVELS), + (get_nanoseconds, TIME_LEVELS), + ], +) +@pytest.mark.parametrize( + "add_total_second, add_day_of_the_week", + list(product([True, False], [True, False])), +) +@pytest.mark.parametrize("extract_until", TIME_LEVELS) +def test_fit( + as_array, + get_data_func, + features, + add_total_second, + add_day_of_the_week, + extract_until, +): + X = get_data_func(as_array=as_array) + enc = DatetimeEncoder( + add_day_of_the_week=add_day_of_the_week, + add_total_second=add_total_second, + extract_until=extract_until, + ) enc.fit(X) - assert enc.get_feature_names_out() == expected_feature_names - # With column names - X = get_datetime_array() - X = pd.DataFrame(X) - X.columns = ["col1", "col2", "col3"] - enc = DatetimeEncoder(add_day_of_the_week=True) + total_second = ["total_second"] if add_total_second else [] + day_of_week = ["day_of_week"] if add_day_of_the_week else [] + + if extract_until in features: + features_ = features[: features.index(extract_until) + 1] + else: + features_ = deepcopy(features) + + features_ += total_second + day_of_week + columns = range(X.shape[1]) + expected_features_per_column = {col: features_ for col in columns} + + expected_format_per_column = {col: np.asarray(X)[0, col] for col in columns} + + expected_n_features_out = sum( + len(val) for val in expected_features_per_column.values() + ) + expected_feature_names = [ - "col1_year", - "col1_month", - "col1_day", - "col1_hour", - "col1_total_time", - "col1_dayofweek", - "col2_month", - "col2_day", - "col2_hour", - "col2_total_time", - "col2_dayofweek", - "col3_year", - "col3_month", - "col3_day", - "col3_hour", - "col3_dayofweek", + f"{col}_{feature}" for col in columns for feature in features_ ] - enc.fit(X) + + assert enc.features_per_column_ == expected_features_per_column + assert enc.format_per_column_ == expected_format_per_column + assert enc.n_features_out_ == expected_n_features_out assert enc.get_feature_names_out() == expected_feature_names -def test_transform() -> None: - # Dates - X = get_date_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_result = np.array( - [ - [2020, 1, 1, 2, 1, 2, 3, 2020, 1, 3, 4], - [2021, 2, 3, 2, 2, 4, 1, 2021, 2, 5, 4], - [2022, 1, 1, 5, 12, 25, 4, 2022, 1, 3, 0], - [2023, 2, 3, 4, 2, 4, 1, 2023, 2, 5, 6], - ] +def test_format_nan(): + X = get_nan_datetime() + enc = DatetimeEncoder().fit(X) + expected_format_per_column = { + 0: "2020-01-01 10:12:01", + 1: "2020-02-04 22:12:00", + 2: "2020-01-03 10:00:00", + } + assert enc.format_per_column_ == expected_format_per_column + + +def test_format_nz(): + X = get_tz_datetime() + enc = DatetimeEncoder().fit(X) + assert enc.format_per_column_ == {0: "2020-01-01 10:12:01+05:30"} + + +def test_extract_until_none(): + X = get_datetime() + enc = DatetimeEncoder( 
+ extract_until=None, + add_total_second=False, ) enc.fit(X) - assert np.allclose(enc.transform(X), expected_result, equal_nan=True) - enc = DatetimeEncoder(add_day_of_the_week=False) - expected_result = np.array( - [ - [2020, 1, 1, 1, 2, 2020, 1, 3], - [2021, 2, 3, 2, 4, 2021, 2, 5], - [2022, 1, 1, 12, 25, 2022, 1, 3], - [2023, 2, 3, 2, 4, 2023, 2, 5], - ] + assert enc.features_per_column_ == {0: [], 1: [], 2: []} + assert enc.n_features_out_ == 0 + assert enc.get_feature_names_out() == [] + + +def test_transform_date(): + X = get_date() + enc = DatetimeEncoder( + add_total_second=False, ) - enc.fit(X) - assert np.allclose(enc.transform(X), expected_result, equal_nan=True) + X_trans = enc.fit_transform(X) - enc = DatetimeEncoder(add_day_of_the_week=True) expected_result = np.array( [ - [2020, 1, 1, 2, 1, 2, 3, 2020, 1, 3, 4], - [2021, 2, 3, 2, 2, 4, 1, 2021, 2, 5, 4], - [2022, 1, 1, 5, 12, 25, 4, 2022, 1, 3, 0], - [2023, 2, 3, 4, 2, 4, 1, 2023, 2, 5, 6], + [2020, 1, 1, 2020, 1, 2, 2020, 1, 3], + [2021, 2, 3, 2020, 2, 4, 2021, 2, 5], + [2022, 1, 1, 2020, 12, 25, 2022, 1, 3], + [2023, 2, 3, 2020, 2, 4, 2023, 2, 5], ] ) - enc.fit(X) - assert np.allclose(enc.transform(X), expected_result, equal_nan=True) + X_trans = enc.transform(X) + assert_array_equal(X_trans, expected_result) - # Datetimes - X = get_datetime_array()[:, 0].reshape(-1, 1) - enc = DatetimeEncoder(add_day_of_the_week=True) - # Check that the "total_time" feature is working - expected_result = np.array( - [ - [2020, 1, 1, 10, 0, 2], - [2021, 2, 3, 12, 0, 2], - [2022, 1, 1, 23, 0, 5], - [2023, 2, 3, 11, 0, 4], - ] - ).astype(np.float64) - # Time from epochs in seconds - expected_result[:, 4] = (X.astype("int64") // 1e9).astype(np.float64).reshape(-1) - enc.fit(X) - X_trans = enc.transform(X) - assert np.allclose(X_trans, expected_result, equal_nan=True) - - # Check if we find back the date from the time to epoch - assert ( - ( - pd.to_datetime(X_trans[:, 4], unit="s") - pd.to_datetime(X.reshape(-1)) - ).total_seconds() - == 0 - ).all() - - # Dirty datetimes - X = get_dirty_datetime_array()[:, 0].reshape(-1, 1) - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_result = np.array( - [ - [2020, 1, 1, 10, 0, 2], - [np.nan] * 6, - [2022, 1, 1, 23, 0, 5], - [2023, 2, 3, 11, 0, 4], - ] +def test_transform_datetime(): + X = get_datetime() + enc = DatetimeEncoder( + extract_until="second", + add_total_second=False, ) - # Time from epochs in seconds - expected_result[:, 4] = (X.astype("int64") // 1e9).astype(np.float64).reshape(-1) - expected_result[1, 4] = np.nan - enc.fit(X) - X_trans = enc.transform(X) - assert np.allclose(X_trans, expected_result, equal_nan=True) - - # Datetimes with TZ - # If the dates are timezone-aware, all the feature extractions should - # be done in the provided timezone. - # But the full time to epoch should correspond to the true number of - # seconds between epoch time and the time of the date. 
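
For reviewers checking the epoch-seconds values asserted in the timezone tests around here: they can be re-derived directly with pandas. A minimal sketch, not part of the patch:

```py
import pandas as pd

# tz-aware timestamps are converted to UTC before taking epoch seconds,
# matching the total-seconds feature in this series.
ts = pd.DatetimeIndex(["2020-01-01 10:12:01+05:30"])
epoch_s = ts.tz_convert("utc").astype("int64") / 1e9
# array([1.577853721e+09]) -> 2020-01-01 04:42:01 UTC, i.e. ~1.57785372e09
```
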
- X = get_datetime_with_TZ_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_result = np.array( + X_trans = enc.fit_transform(X) + X_trans_expected = np.array( [ - [2020, 1, 1, 10, 0, 2], - [2021, 2, 3, 12, 0, 2], - [2022, 1, 1, 23, 0, 5], - [2023, 2, 3, 11, 0, 4], + [2020, 1, 1, 10, 12, 1, 2020, 1, 2, 10, 23, 0, 2020, 1, 3, 10, 0, 0], + [2021, 2, 3, 12, 45, 23, 2020, 2, 4, 22, 12, 0, 2021, 2, 5, 12, 0, 0], + [2022, 1, 1, 23, 23, 43, 2020, 12, 25, 11, 12, 0, 2022, 1, 3, 11, 0, 0], + [2023, 2, 3, 11, 12, 12, 2020, 2, 4, 8, 32, 0, 2023, 2, 5, 23, 0, 0], ] - ).astype(np.float64) - # Time from epochs in seconds - expected_result[:, 4] = ( - (X.iloc[:, 0].view(dtype="int64") // 1e9) - .astype(np.float64) - .to_numpy() - .reshape(-1) ) - enc.fit(X) - X_trans = enc.transform(X) - assert np.allclose(X_trans, expected_result, equal_nan=True) - - # Check if we find back the date from the time to epoch - assert ( - ( - pd.to_datetime(X_trans[:, 4], unit="s") - .tz_localize("utc") - .tz_convert(X.iloc[:, 0][0].tz) - - pd.DatetimeIndex(X.iloc[:, 0]) - ).total_seconds() - == 0 - ).all() + assert_array_equal(X_trans, X_trans_expected) - # Check if it's working when the date is constant - X = get_constant_date_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - assert enc.fit_transform(X).shape[1] == 0 - -@pytest.mark.parametrize( - "extract_until", - ["year", "month", "day", "hour", "minute", "second", "microsecond", "nanosecond"], -) -def test_extract_until(extract_until) -> None: - time_levels = [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - ] - X = get_datetime_array() - enc = DatetimeEncoder(extract_until=extract_until) - expected_features_per_column_ = { - # all features after seconds are constant - # we want total_time if we have not extracted all non-constant features - 0: time_levels[ - : min(time_levels.index(extract_until), time_levels.index("second")) + 1 - ] - + ( - ["total_time"] - if extract_until in ["year", "month", "day", "hour", "minute"] - else [] - ), - # constant after minute + year constant - 1: time_levels[ - 1 : min(time_levels.index(extract_until), time_levels.index("minute")) + 1 - ] - + (["total_time"] if extract_until in ["year", "month", "day", "hour"] else []), - # constant after hour - 2: time_levels[ - : min(time_levels.index(extract_until), time_levels.index("hour")) + 1 +def test_transform_tz(): + X = get_tz_datetime() + enc = DatetimeEncoder( + add_total_second=True, + ) + X_trans = enc.fit_transform(X) + X_trans_expected = np.array( + [ + [2020, 1, 1, 10, 1.57785372e09], + [2021, 2, 3, 12, 1.61233652e09], + [2022, 1, 1, 23, 1.64105962e09], + [2023, 2, 3, 11, 1.67540293e09], ] - + (["total_time"] if extract_until in ["year", "month", "day"] else []), - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - -def test_extract_until_none() -> None: - X = get_dirty_datetime_array() - enc = DatetimeEncoder(extract_until=None) - expected_features_per_column_ = { - # all features after seconds are constant - # we want total_time if we have not extracted all non-constant features - 0: ["total_time"], - 1: ["total_time"], - 2: ["total_time"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # check get_names_out - expected_feature_names = [ - "0_total_time", - "1_total_time", - "2_total_time", - ] - assert enc.get_feature_names_out() == expected_feature_names - - # check with constant datetimes - X = get_constant_date_array() - 
enc = DatetimeEncoder(extract_until=None) - assert enc.fit_transform(X).shape[1] == 0 - + ) + assert_allclose(X_trans, X_trans_expected) -def test_check_fitted_datetime_encoder() -> None: - """Test that calling transform before fit raises an error""" - X = get_datetime_array()[:, 0].reshape(-1, 1) - enc = DatetimeEncoder(add_day_of_the_week=True) - with pytest.raises(NotFittedError): - enc.transform(X) - # Check that it works after fit - enc.fit(X) - enc.transform(X) +def test_transform_nan(): + X = get_nan_datetime() + enc = DatetimeEncoder( + add_total_second=True, + ) + X_trans = enc.fit_transform(X) + X_trans_expected = np.array( + [ + [ + 2020, + 1, + 1, + 10, + 1.57787352e09, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2020, + 1, + 3, + 10, + 1.57804560e09, + ], + [ + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2020, + 2, + 4, + 22, + 1.58085432e09, + 2021, + 2, + 5, + 12, + 1.61252640e09, + ], + [ + 2022, + 1, + 1, + 23, + 1.64107942e09, + 2020, + 12, + 25, + 11, + 1.60889472e09, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + [ + 2023, + 2, + 3, + 11, + 1.67542273e09, + 2020, + 2, + 4, + 8, + 1.58080512e09, + 2023, + 2, + 5, + 23, + 1.67563800e09, + ], + ] + ) + assert_allclose(X_trans, X_trans_expected) From 0c08aadee8db9ed348a3c418acc4896c64710f3b Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 5 Oct 2023 15:23:46 +0200 Subject: [PATCH 03/30] update docstrings --- skrub/_datetime_encoder.py | 83 +++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 7a1a59ccf..44ac160dd 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -20,11 +20,19 @@ def is_datetime_parsable(X): - """ + """Check whether a 1d vector can be converted into a \ + :class:`~pandas.core.indexes.datetimes.DatetimeIndex`. + Parameters ---------- - X : np.ndarray of shape (n_sample,) + X : array-like of shape ``(n_sample,)`` + + Returns + ------- + is_dt_parsable : bool """ + if len(X.shape) > 1: + raise ValueError(f"X must be 1d, got shape: {X.shape}.") np_dtypes_candidates = [np.object_, np.str_, np.datetime64] if any(np.issubdtype(X.dtype, np_dtype) for np_dtype in np_dtypes_candidates): try: @@ -36,10 +44,18 @@ def is_datetime_parsable(X): def is_date_only(X): - """ + """Check whether a 1d vector only contains dates. + + Note that ``is_date_only`` being True implies ``is_datetime_parsable`` is True, + but not the contrary. + Parameters ---------- - X : np.ndarray of shape (n_sample,) + X : array-like of shape ``(n_sample,)`` + + Returns + ------- + is_date : bool """ if is_datetime_parsable(X): X_t = pd.to_datetime(X) @@ -61,35 +77,44 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): extract_until : {"year", "month", "day", "hour", "minute", "second", "microsecond", "nanosecond", None}, default="hour" Extract up to this granularity. - If all non-constant features have not been extracted, - add the "total_time" feature, which contains the time to epoch (in seconds). For instance, if you specify "day", only "year", "month", "day" and - "total_time" features will be created. - If None, only the "total_time" feature will be created. + features will be created. + If ``None``, no feature will be created. + add_day_of_the_week : bool, default=False Add day of the week feature (if day is extracted). This is a numerical feature from 0 (Monday) to 6 (Sunday). + add_total_second : bool, default=True + Add the total number of seconds since Epoch. 
+ + errors: {"coerce", "raise"}, default="coerce" + During transform: + - If ``"coerce"``, then invalid parsing will be set as ``NaT``. + - If ``"raise"``, then invalid parsing will raise an exception + Attributes ---------- - n_features_in_ : int - Number of features in the data seen during fit. n_features_out_ : int Number of features of the transformed data. - features_per_column_ : mapping of int to list of str - Dictionary mapping the index of the original columns - to the list of features extracted for each column. - col_names_ : None or list of str - List of the names of the features of the input data, - if input data was a pandas DataFrame, otherwise None. + + features_per_column_ : dict[str, list[str]] or dict[int, list[str]] + Dictionary mapping the column names to the list of features extracted + for each column. + + format_per_column_ : dict[str, str] or dict[int, str] + Dictionary mapping the column names to the first non-null example. + This is how Pandas infer the datetime format. See Also -------- GapEncoder : Encode dirty categories (strings) by constructing latent topics with continuous encoding. + MinHashEncoder : Encode string columns as a numeric array with the minhash method. + SimilarityEncoder : Encode string columns as a numeric array with n-gram string similarity. @@ -124,8 +149,8 @@ def __init__( ): self.extract_until = extract_until self.add_day_of_the_week = add_day_of_the_week - self.add_total_second = add_total_second # TODO doc - self.errors = errors # TODO doc + self.add_total_second = add_total_second + self.errors = errors def fit(self, X, y=None): """Fit the instance to X. @@ -135,7 +160,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape (``n_samples``, ``n_features``) + X : array-like, shape ``(n_samples, n_features)`` Data where each column is a datetime feature. y : None Unused, only here for compatibility. @@ -161,12 +186,16 @@ def fit(self, X, y=None): self._check_n_features(X, reset=True) X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) - self._parse_datetime_cols(X) + self._select_datetime_cols(X) return self - def _parse_datetime_cols(self, X): - """ + def _select_datetime_cols(self, X): + """Select datetime-like columns and infer features to be parsed. + + If the input only contains dates (and no datetimes), only the features + ["year", "month", "day"] will be filtered with extract_until. + Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -215,14 +244,14 @@ def transform(self, X, y=None): Parameters ---------- - X : array-like, shape (``n_samples``, ``n_features``) + X : array-like of shape ``(n_samples, n_features)`` The data to transform, where each column is a datetime feature. y : None Unused, only here for compatibility. Returns ------- - ndarray, shape (``n_samples``, ``n_features_out_``) + X_out : ndarray of shape ``(n_samples, n_features_out_)`` Transformed input. """ check_is_fitted(self) @@ -260,13 +289,13 @@ def transform(self, X, y=None): return X_out def get_feature_names_out(self, input_features=None): - """Return clean feature names. + """Get output feature names for transformation. Feature names are formatted like: "_" if the original data has column names, otherwise with format "_" where `` is one of {"year", "month", "day", "hour", "minute", "second", - "microsecond", "nanosecond", "dayofweek"}. + "microsecond", "nanosecond", "day_of_week"}. 
        Parameters
        ----------
        input_features : None
            Unused, only here for compatibility.

        Returns
        -------
-       list of str
+       feature_names : list of str
            List of feature names.
        """
+       check_is_fitted(self, "features_per_column_")
        feature_names = []
        for column, features in self.features_per_column_.items():
            feature_names += [f"{column}_{feat}" for feat in features]
        return feature_names

From d57691c3552be0db31c59c9ff05b284c6fa34af1 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 5 Oct 2023 16:06:46 +0200
Subject: [PATCH 04/30] update example

---
 examples/03_datetime_encoder.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/examples/03_datetime_encoder.py b/examples/03_datetime_encoder.py
index bf328addf..27c90bc37 100644
--- a/examples/03_datetime_encoder.py
+++ b/examples/03_datetime_encoder.py
@@ -90,11 +90,8 @@
 ###############################################################################
 # We see that the encoder is working as expected: the "date.utc" column has
-# been replaced by features extracting the month, day, hour, and day of the
-# week information.
-#
-# Note the year and minute features are not present, this is because they
-# have been removed by the encoder as they are constant the whole period.
+# been replaced by features extracting the month, day, hour, minute, day of the
+# week, and total seconds since Epoch information.

 ###############################################################################
 # One-liner with the |TableVectorizer|
@@ -148,14 +145,9 @@
 # ```py
 # from sklearn.experimental import enable_hist_gradient_boosting
 # ```
-
-import numpy as np
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.pipeline import make_pipeline

-table_vec = TableVectorizer(
-    datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
-)
 pipeline = make_pipeline(table_vec, HistGradientBoostingRegressor())

 ###############################################################################
@@ -168,6 +160,7 @@
 #
 # Instead, we can use the |TimeSeriesSplit|,
 # which ensures that the test set is always in the future.
+import numpy as np

 X["date.utc"] = pd.to_datetime(X["date.utc"])
 sorted_indices = np.argsort(X["date.utc"])

From b39c2dac65cdfd46be04143f401e1ff7a35a6d7c Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 5 Oct 2023 21:48:12 +0200
Subject: [PATCH 05/30] split the transform method with _parse_datetime_cols

---
 skrub/_datetime_encoder.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 44ac160dd..5ae4b8911 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -259,6 +259,19 @@ def transform(self, X, y=None):
         self._check_feature_names(X, reset=False)
         X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None)

+        return self._parse_datetime_cols(X)
+
+    def _parse_datetime_cols(self, X):
+        """Extract datetime features from the selected columns.
+ + Parameters + ---------- + X : ndarray of shape ``(n_samples, n_features)`` + + Returns + ------- + X_out : ndarray of shape ``(n_samples, n_features_out_)`` + """ columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) # X_out must be of dtype float64 to handle np.nan X_out = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) From edf11dd7ab098356ff661d08f5e0be5ce6ef9393 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 5 Oct 2023 21:50:36 +0200 Subject: [PATCH 06/30] small typo in a comment --- skrub/_datetime_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 5ae4b8911..25e2ef69c 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -278,7 +278,7 @@ def _parse_datetime_cols(self, X): offset_idx = 0 for col_idx, col in enumerate(columns): if col in self.features_per_column_: - # X_j is a DatetimeIndex + # X_col is a DatetimeIndex X_col = pd.to_datetime(X[:, col_idx], errors=self.errors) features = self.features_per_column_[col] From 367d207ae22a26cf3aca02a3a08e788fbdaec869 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:19:13 +0200 Subject: [PATCH 07/30] add to_datetime and rework the backend --- CHANGES.rst | 9 + examples/03_datetime_encoder.py | 2 +- skrub/__init__.py | 3 +- skrub/_datetime_encoder.py | 629 ++++++++++++++++++++++----- skrub/tests/test_datetime_encoder.py | 16 +- 5 files changed, 533 insertions(+), 126 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 548d304d7..ae2e64b3d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -15,6 +15,10 @@ development and backward compatibility is not ensured. Major changes ------------- +* :func:`to_datetime` is now available to support pandas.to_datetime + over dataframes and 2d arrays. + :pr:`784` by :user:`Vincent Maladiere ` + * :func:`dataframe.pd_join`, :func:`dataframe.pd_aggregate`, :func:`dataframe.pl_join` and :func:`dataframe.pl_aggregate` are now available in the dataframe submodule. @@ -40,6 +44,11 @@ Major changes Minor changes ------------- +* :class:`DatetimeEncoder` doesn't remove constant features anymore. + It also supports an 'errors' argument to raise or coerce errors during + transform, and a 'add_total_seconds' argument to include the number of + seconds since Epoch. + :pr:`784` by :user:`Vincent Maladiere ` * :class:`TableVectorizer` is now able to apply parallelism at the column level rather than the transformer level. This is the default for univariate transformers, like :class:`MinHashEncoder`, and :class:`GapEncoder`. 
:pr:`592` by :user:`Leo Grinsztajn `

diff --git a/examples/03_datetime_encoder.py b/examples/03_datetime_encoder.py
index f8cc5756f..6d571ca65 100644
--- a/examples/03_datetime_encoder.py
+++ b/examples/03_datetime_encoder.py
@@ -80,7 +80,7 @@
 encoder = make_column_transformer(
     (OneHotEncoder(handle_unknown="ignore"), ["city"]),
-    (DatetimeEncoder(add_day_of_the_week=True, extract_until="minute"), ["date.utc"]),
+    (DatetimeEncoder(add_day_of_the_week=True, resolution="minute"), ["date.utc"]),
     remainder="drop",
 )

diff --git a/skrub/__init__.py b/skrub/__init__.py
index 2618dd421..bc73b0182 100644
--- a/skrub/__init__.py
+++ b/skrub/__init__.py
@@ -4,7 +4,7 @@
 from pathlib import Path as _Path

 from ._check_dependencies import check_dependencies
-from ._datetime_encoder import DatetimeEncoder
+from ._datetime_encoder import DatetimeEncoder, to_datetime
 from ._deduplicate import compute_ngram_distance, deduplicate
 from ._fuzzy_join import fuzzy_join
 from ._gap_encoder import GapEncoder
@@ -32,4 +32,5 @@
     "TargetEncoder",
     "deduplicate",
     "compute_ngram_distance",
+    "to_datetime",
 ]

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 25e2ef69c..b80cb5382 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -1,11 +1,16 @@
+import warnings
 from collections import defaultdict
+from typing import Iterable

 import numpy as np
 import pandas as pd
+from pandas._libs.tslibs.parsing import guess_datetime_format
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted

+from .dataframe._namespace import get_df_namespace

 WORD_TO_ALIAS = {
     "year": "Y",
     "month": "M",
     "day": "D",
@@ -19,92 +24,480 @@
 TIME_LEVELS = list(WORD_TO_ALIAS)


+def to_datetime(
+    X,
+    errors="coerce",
+    **kwargs,
+):
+    """
+    Convert argument to datetime.
+
+    Augment :func:`pandas.to_datetime` by supporting dataframe
+    and 2d array inputs. It converts compatible columns to datetime and
+    passes incompatible columns unchanged.
+
+    With 2d arrays, numerical columns will also be passed unchanged.
+
+    int, float, str, datetime, list, tuple, 1d array, and Series are deferred to
+    pandas.to_datetime directly.
+
+    Parameters
+    ----------
+    X : int, float, str, datetime, list, tuple, nd array, Series, DataFrame/dict-like
+        The object to convert to a datetime.
+    errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
+        - If :const:`'raise'`, then invalid parsing will raise an exception.
+        - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`.
+        - If :const:`'ignore'`, then invalid parsing will return the input.
+    dayfirst : bool, default False
+        Specify a date parse order if `X` is str or is list-like.
+        If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"`
+        is parsed as :const:`2012-11-10`.
+
+        .. warning::
+
+            ``dayfirst=True`` is not strict, but will prefer to parse
+            with day first.
+
+    yearfirst : bool, default False
+        Specify a date parse order if `X` is str or is list-like.
+
+        - If :const:`True` parses dates with the year first, e.g.
+          :const:`"10/11/12"` is parsed as :const:`2010-11-12`.
+        - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is
+          preceded (same as :mod:`dateutil`).
+
+        .. warning::
+
+            ``yearfirst=True`` is not strict, but will prefer to parse
+            with year first.
+ + utc : bool, default False + Control timezone-related parsing, localization and conversion. + + - If :const:`True`, the function *always* returns a timezone-aware + UTC-localized :class:`Timestamp`, :class:`Series` or + :class:`DatetimeIndex`. To do this, timezone-naive inputs are + *localized* as UTC, while timezone-aware inputs are *converted* to UTC. + + - If :const:`False` (default), inputs will not be coerced to UTC. + Timezone-naive inputs will remain naive, while timezone-aware ones + will keep their time offsets. Limitations exist for mixed + offsets (typically, daylight savings), see :ref:`Examples + ` section for details. + + See also: pandas general documentation about `timezone conversion and + localization + `_. + + format : str, default None + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See + `strftime documentation + `_ for more information on choices, though + note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass: + + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. + + exact : bool, default True + Control how `format` is used: + + - If :const:`True`, require an exact `format` match. + - If :const:`False`, allow the `format` to match anywhere in the target + string. + + Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. + unit : str, default 'ns' + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an + integer or float number. This will be based off the origin. + Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate + the number of milliseconds to the unix epoch start. + origin : scalar, default 'unix' + Define the reference date. The numeric values would be parsed as number + of units (defined by `unit`) since this reference date. + + - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01. + - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to + beginning of Julian Calendar. Julian day number :const:`0` is assigned + to the day starting at noon on January 1, 4713 BC. + - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date + string), origin is set to Timestamp identified by origin. + - If a float or integer, origin is the millisecond difference + relative to 1970-01-01. + cache : bool, default True + If :const:`True`, use a cache of unique, converted dates to apply the + datetime conversion. May produce significant speed-up when parsing + duplicate date strings, especially ones with timezone offsets. The cache + is only used when there are at least 50 values. The presence of + out-of-bounds values will render the cache unusable and may slow down + parsing. + + Returns + ------- + datetime + If parsing succeeded. 
+        Return type depends on input (types in parentheses correspond to
+        fallback in case of unsuccessful timezone or out-of-range timestamp
+        parsing):
+
+        - scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
+        - array-like: :class:`DatetimeIndex` (or :class:`Series` with
+          :class:`object` dtype containing :class:`datetime.datetime`)
+        - Series: :class:`Series` of :class:`datetime64` dtype (or
+          :class:`Series` of :class:`object` dtype containing
+          :class:`datetime.datetime`)
+        - DataFrame: :class:`Series` of :class:`datetime64` dtype (or
+          :class:`Series` of :class:`object` dtype containing
+          :class:`datetime.datetime`)
+
+    Raises
+    ------
+    ParserError
+        When parsing a date from string fails.
+    ValueError
+        When another datetime conversion error happens. For example when one
+        of 'year', 'month', 'day' columns is missing in a :class:`DataFrame`, or
+        when a Timezone-aware :class:`datetime.datetime` is found in an array-like
+        of mixed time offsets, and ``utc=False``.
+
+    See Also
+    --------
+    :func:`pandas.to_datetime`
+    """
+    kwargs["errors"] = errors
+
+    # dataframe
+    if hasattr(X, "__dataframe__"):
+        return _to_datetime_dataframe(X, **kwargs)
+
+    # series, this attribute is available since Pandas 2.1.0
+    elif hasattr(X, "__column_consortium_standard__"):
+        return _to_datetime_series(X, **kwargs)
+
+    # 2d array
+    elif isinstance(X, Iterable) and np.asarray(X).ndim == 2:
+        X = _to_datetime_2d_array(np.asarray(X), **kwargs)
+        return np.vstack(X).T
+
+    # scalar or unknown type
+    return pd.to_datetime(X, **kwargs)
+
+
+def _to_datetime_dataframe(X, **kwargs):
+    """Dataframe specialization of ``_to_datetime_2d``.
+
+    Parameters
+    ----------
+    X : Pandas or Polars dataframe
+
+    Returns
+    -------
+    X : Pandas or Polars dataframe
+    """
+    _, px = get_df_namespace(X)
+    index = getattr(X, "index", None)
+    X_split = [X[col].to_numpy() for col in X.columns]
+    X_split = _to_datetime_2d(X_split, **kwargs)
+    X_split = {col: X_split[col_idx] for col_idx, col in enumerate(X.columns)}
+    X = pd.DataFrame(X_split, index=index)
+    # conversion if px is Polars, no-op if Pandas
+    return px.DataFrame(X)
+
+
+def _to_datetime_series(X, **kwargs):
+    """Series specialization of :func:`pandas.to_datetime`.
+
+    Parameters
+    ----------
+    X : Pandas or Polars series
+
+    Returns
+    -------
+    X : Pandas or Polars series
+    """
+    _, px = get_df_namespace(X.to_frame())
+    index = getattr(X, "index", None)
+    name = X.name
+    X = pd.to_datetime(X, **kwargs)
+    X = pd.Series(X, index=index, name=name)
+    # conversion if px is Polars, no-op if Pandas
+    return px.Series(X)
+
+
+def _to_datetime_2d_array(X, **kwargs):
+    """2d array specialization of ``_to_datetime_2d``.
+
+    Parameters
+    ----------
+    X : ndarray of shape ``(n_samples, n_features)``
+
+    Returns
+    -------
+    X_split : list of arrays, of length ``n_features``
+    """
+    X_split = np.hsplit(X, X.shape[1])
+    X_split = [X_col.ravel() for X_col in X_split]
+    return _to_datetime_2d(X_split, **kwargs)
+
+
+def _to_datetime_2d(
+    X_split,
+    indices=None,
+    indice_to_format=None,
+    format=None,
+    **kwargs,
+):
+    """Convert datetime parsable columns from a 2d array or dataframe \
+    to datetime format.
+
+    The conversion is done inplace.
+
+    Parameters
+    ----------
+    X_split : list of 1d arrays of length n_features
+        The 2d input, chunked into a list of arrays. This format allows us
+        to treat each column individually and preserve its dtype, because
+        dataframe.to_numpy() casts all columns to object if any column dtype
+        is object.
+
+    indices : list of int, default=None
+        Indices of the parsable columns to convert.
+        If None, indices are computed using the current input X.
+
+    indice_to_format : mapping of int to str, default=None
+        Dictionary mapping column indices to their datetime format.
+        It defines the format parameter for each column when calling
+        pd.to_datetime.
+
+        If indices is None, indice_to_format is computed using the current
+        input X.
+        If format is not None, it overrides every value of indice_to_format.
+
+    format : str, default=None
+        Here for compatibility with the ``pandas.to_datetime`` API.
+        When format is not None, it overwrites the values in indice_to_format.
+
+    Returns
+    -------
+    X_split : list of 1d arrays of length n_features
+    """
+    if indices is None:
+        indices, indice_to_format = _get_datetime_column_indices(X_split)
+
+    # format overwrites indice_to_format
+    if format is not None or indice_to_format is None:
+        indice_to_format = {col_idx: format for col_idx in indices}
+
+    for col_idx in indices:
+        X_split[col_idx] = pd.to_datetime(
+            X_split[col_idx], format=indice_to_format[col_idx], **kwargs
+        )
+
+    return X_split
+
+
+def _get_datetime_column_indices(X_split):
+    """Select the datetime parsable columns by their indices \
+    and return their datetime format.
+
+    Parameters
+    ----------
+    X_split : list of 1d arrays of length n_features
+
+    Returns
+    -------
+    datetime_indices : list of int
+        List of parsable columns, identified by their indices.
+
+    indice_to_format : mapping of int to str
+        Dictionary mapping parsable column indices to their datetime format.
+    """
+    indices = []
+    indice_to_format = {}
+
+    for col_idx, X_col in enumerate(X_split):
+        X_col = X_col[pd.notnull(X_col)]
+        if _is_column_datetime_parsable(X_col):
+            indices.append(col_idx)
+            indice_to_format[col_idx] = _guess_datetime_format(X_col)
+
+    return indices, indice_to_format
+
+
+def _is_column_datetime_parsable(X_col):
+    """Check whether a 1d array can be converted into a \
+    :class:`pandas.DatetimeIndex`.
+
+    Parameters
+    ----------
+    X_col : array-like of shape ``(n_samples,)``
+
+    Returns
+    -------
+    is_dt_parsable : bool
+    """
+    # Exclude columns of int, float or bool cast as object.
+    try:
+        if np.array_equal(X_col, X_col.astype(np.float64)):
+            return False
+    except ValueError:
+        pass
+
+    np_dtypes_candidates = [np.object_, np.str_, np.datetime64]
+    is_type_datetime_compatible = any(
+        np.issubdtype(X_col.dtype, np_dtype) for np_dtype in np_dtypes_candidates
+    )
+    if is_type_datetime_compatible:
+        try:
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", category=UserWarning)
+                # format="mixed" parses entries individually,
+                # avoiding a ValueError when both date and datetime formats
+                # are present.
+                # At this stage, the format itself doesn't matter.
+                _ = pd.to_datetime(X_col, format="mixed")
+            return True
+        except (pd.errors.ParserError, ValueError):
+            pass
+    return False
+
+
+def _guess_datetime_format(X_col, require_dayfirst=True):
+    """Infer the datetime format shared by the entries of a column.
+
+    Parameters
+    ----------
+    X_col : ndarray of shape ``(n_samples,)``
+
+    require_dayfirst : bool, default True
+        Whether to return the dayfirst format when both dayfirst
+        and monthfirst are valid.
+ + Returns + ------- + format : str + """ + X_col = X_col.astype(np.object_) + vfunc = np.vectorize(guess_datetime_format) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + month_first_formats = np.unique(vfunc(X_col, dayfirst=False)) + day_first_formats = np.unique(vfunc(X_col, dayfirst=True)) + + if pd.isnull(month_first_formats).any() or pd.isnull(day_first_formats).any(): + return None + + elif ( + len(month_first_formats) == 1 + and len(day_first_formats) == 1 + and month_first_formats[0] != day_first_formats[0] + ): + if require_dayfirst: + return str(day_first_formats[0]) + else: + return str(month_first_formats[0]) + + elif len(month_first_formats) == 1: + return str(month_first_formats[0]) + + elif len(day_first_formats) == 1: + return str(day_first_formats[0]) + + # special heuristic: when both date and datetime formats are + # present, allow the format to be mixed. + elif ( + len(month_first_formats) == 2 + and len(day_first_formats) == 2 + and len(month_first_formats[0]) != len(month_first_formats[1]) + ): + return "mixed" + + else: + return None - Note that ``is_date_only`` being True implies ``is_datetime_parsable`` is True, - but not the contrary. + +def _is_column_date_only(X_col): + """Check whether a :obj:`pandas.DatetimeIndex` only contains dates. Parameters ---------- - X : array-like of shape ``(n_sample,)`` + X_col : pandas.DatetimeIndex of shape ``(n_samples,)`` Returns ------- is_date : bool """ - if is_datetime_parsable(X): - X_t = pd.to_datetime(X) - return np.all(X_t == X_t.normalize()) - return False + return np.array_equal(X_col, X_col.normalize()) + + +def _datetime_to_total_seconds(X_col): + """ + Parameters + ---------- + X_col : DatetimeIndex of shape (n_samples,) + + Returns + ------- + X_col : ndarray of shape (n_samples) + """ + if X_col.tz is not None: + X_col = X_col.tz_convert("utc") + + # Total seconds since epoch + mask_notnull = X_col == X_col + + return np.where( + mask_notnull, + X_col.astype("int64") / 1e9, + np.nan, + ) class DatetimeEncoder(TransformerMixin, BaseEstimator): """Transforms each datetime column into several numeric columns \ for temporal features (e.g year, month, day...). - Constant extracted features are dropped; for instance, if the year is - always the same in a feature, the extracted "year" column won't be added. If the dates are timezone aware, all the features extracted will correspond to the provided timezone. Parameters ---------- - extract_until : {"year", "month", "day", "hour", "minute", "second", + resolution : {"year", "month", "day", "hour", "minute", "second", "microsecond", "nanosecond", None}, default="hour" - Extract up to this granularity. - For instance, if you specify "day", only "year", "month", "day" and - features will be created. + Extract up to this resolution. + E.g., ``resolution="day"`` generates the features "year", "month", + "day" only. If ``None``, no feature will be created. add_day_of_the_week : bool, default=False - Add day of the week feature (if day is extracted). - This is a numerical feature from 0 (Monday) to 6 (Sunday). + Add day of the week feature as a numerical feature + from 0 (Monday) to 6 (Sunday). - add_total_second : bool, default=True + add_total_seconds : bool, default=True Add the total number of seconds since Epoch. errors: {"coerce", "raise"}, default="coerce" During transform: - - If ``"coerce"``, then invalid parsing will be set as ``NaT``. 
- - If ``"raise"``, then invalid parsing will raise an exception + - If ``"coerce"``, then invalid parsing will be set as ``pd.NaT``. + - If ``"raise"``, then invalid parsing will raise an exception. Attributes ---------- - n_features_out_ : int - Number of features of the transformed data. + column_indices_ : list of int + Indices of the datetime-parsable columns. + + indice_to_format_ : dict[int, str] + Mapping from column indices to their datetime formats. - features_per_column_ : dict[str, list[str]] or dict[int, list[str]] - Dictionary mapping the column names to the list of features extracted - for each column. + indice_to_features_ : dict[int, list[str]] + Dictionary mapping the column names to the list of datetime + features extracted for each column. - format_per_column_ : dict[str, str] or dict[int, str] - Dictionary mapping the column names to the first non-null example. - This is how Pandas infer the datetime format. + n_features_out_ : int + Number of features of the transformed data. See Also -------- @@ -130,38 +523,40 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): DatetimeEncoder() The encoder will output a transformed array - with four columns ("year", "month", "day" and "hour"): + with five columns ("year", "month", "day", "hour" and "total_seconds"): >>> enc.transform(X) - array([[2022., 10., 15., 0.], - [2021., 12., 25., 0.], - [2020., 5., 18., 0.], - [2019., 10., 15., 12.]]) + array([[2022., 10., 15., 0., 1.6657920e+09], + [2021., 12., 25., 0., 1.6403904e+09], + [2020., 5., 18., 0., 1.5897600e+09], + [2019., 10., 15., 12., 1.5711408e+09]]) """ def __init__( self, *, - extract_until="hour", + resolution="hour", add_day_of_the_week=False, - add_total_second=True, + add_total_seconds=True, errors="coerce", ): - self.extract_until = extract_until + self.resolution = resolution self.add_day_of_the_week = add_day_of_the_week - self.add_total_second = add_total_second + self.add_total_seconds = add_total_seconds self.errors = errors def fit(self, X, y=None): """Fit the instance to X. - In practice, just check keywords and input validity, - and stores which extracted features are not constant. + Select datetime-parsable columns and generate the list of + datetime feature to extract. Parameters ---------- X : array-like, shape ``(n_samples, n_features)`` - Data where each column is a datetime feature. + Input data. Columns that can't be converted into + `pandas.DatetimeIndex` and numerical values will + be dropped. y : None Unused, only here for compatibility. @@ -170,10 +565,9 @@ def fit(self, X, y=None): DatetimeEncoder Fitted DatetimeEncoder instance (self). """ - if self.extract_until not in TIME_LEVELS and self.extract_until is not None: + if self.resolution not in TIME_LEVELS and self.resolution is not None: raise ValueError( - f"'extract_until' options are {TIME_LEVELS}, " - f"got {self.extract_until!r}." + f"'resolution' options are {TIME_LEVELS}, got {self.resolution!r}." ) errors_options = ["coerce", "raise"] @@ -184,62 +578,61 @@ def fit(self, X, y=None): self._check_feature_names(X, reset=True) self._check_n_features(X, reset=True) - X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) + X = check_array( + X, ensure_2d=True, force_all_finite=False, dtype=None, copy=False + ) self._select_datetime_cols(X) return self def _select_datetime_cols(self, X): - """Select datetime-like columns and infer features to be parsed. + """Select datetime-parsable columns and generate the list of + datetime feature to extract. 
If the input only contains dates (and no datetimes), only the features - ["year", "month", "day"] will be filtered with extract_until. + ["year", "month", "day"] will be filtered with resolution. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : array-like of shape ``(n_samples, n_features)`` """ - # Features to extract for each column, after removing constant features - self.features_per_column_ = defaultdict(list) - self.format_per_column_ = dict() - self.n_features_out_ = 0 - - if self.extract_until is None: + if self.resolution is None: levels = [] else: - idx_level = TIME_LEVELS.index(self.extract_until) + idx_level = TIME_LEVELS.index(self.resolution) levels = TIME_LEVELS[: idx_level + 1] - columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) - for col_idx, col in enumerate(columns): - X_col = X[:, col_idx] + X_split = np.hsplit(X, X.shape[1]) + self.column_indices_, self.indice_to_format_ = _get_datetime_column_indices( + X_split + ) + del X_split - if is_datetime_parsable(X_col): - # Pandas use the first non-null item of the array to infer the format. - X_dt = pd.to_datetime(X_col) - mask_notnull = X_dt == X_dt - self.format_per_column_[col] = X_col[mask_notnull][0] + self.indice_to_features_ = defaultdict(list) + self.n_features_out_ = 0 - if is_date_only(X_col): - # Keep only date attributes - levels = [ - level for level in levels if level in ["year", "month", "day"] - ] + for col_idx in self.column_indices_: + X_col = pd.DatetimeIndex(X[:, col_idx]) + if _is_column_date_only(X_col): + # Keep only date attributes + levels = [ + level for level in levels if level in ["year", "month", "day"] + ] - self.features_per_column_[col] += levels - self.n_features_out_ += len(levels) + self.indice_to_features_[col_idx] += levels + self.n_features_out_ += len(levels) - if self.add_total_second: - self.features_per_column_[col].append("total_second") - self.n_features_out_ += 1 + if self.add_total_seconds: + self.indice_to_features_[col_idx].append("total_seconds") + self.n_features_out_ += 1 - if self.add_day_of_the_week: - self.features_per_column_[col].append("day_of_week") - self.n_features_out_ += 1 + if self.add_day_of_the_week: + self.indice_to_features_[col_idx].append("day_of_week") + self.n_features_out_ += 1 def transform(self, X, y=None): - """Transform `X` by replacing each datetime column with \ + """Transform ``X`` by replacing each datetime column with \ corresponding numerical features. Parameters @@ -257,47 +650,49 @@ def transform(self, X, y=None): check_is_fitted(self) self._check_n_features(X, reset=False) self._check_feature_names(X, reset=False) - X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) - - return self._parse_datetime_cols(X) - def _parse_datetime_cols(self, X): + X = check_array( + X, + ensure_2d=True, + force_all_finite=False, + dtype=None, + copy=False, + ) + X_split = _to_datetime_2d_array( + X, + indices=self.column_indices_, + indice_to_format=self.indice_to_format_, + errors=self.errors, + ) + + return self._extract_features(X_split) + + def _extract_features(self, X_split): """Extract datetime features from the selected columns. 
Parameters ---------- - X : ndarray of shape ``(n_samples, n_features)`` + X_split : list of 1d array of length n_features Returns ------- X_out : ndarray of shape ``(n_samples, n_features_out_)`` """ - columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) - # X_out must be of dtype float64 to handle np.nan - X_out = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) + # X_out must be of dtype float64 otherwise np.nan will overflow + # to large negative numbers. + X_out = np.empty((X_split[0].shape[0], self.n_features_out_), dtype=np.float64) offset_idx = 0 - for col_idx, col in enumerate(columns): - if col in self.features_per_column_: - # X_col is a DatetimeIndex - X_col = pd.to_datetime(X[:, col_idx], errors=self.errors) - - features = self.features_per_column_[col] - for feat_idx, feature in enumerate(features): - if feature == "total_second": - if X_col.tz is not None: - X_col = X_col.tz_convert("utc") - # Total seconds since epoch - mask_notnull = X_col == X_col - X_feature = np.where( - mask_notnull, - X_col.astype("int64") // 1e9, - np.nan, - ) - else: - X_feature = getattr(X_col, feature).to_numpy() - X_out[:, offset_idx + feat_idx] = X_feature - - offset_idx += len(features) + for col_idx in self.column_indices_: + X_col = X_split[col_idx] + features = self.indice_to_features_[col_idx] + for feat_idx, feature in enumerate(features): + if feature == "total_seconds": + X_feature = _datetime_to_total_seconds(X_col) + else: + X_feature = getattr(X_col, feature).to_numpy() + X_out[:, offset_idx + feat_idx] = X_feature + + offset_idx += len(features) return X_out @@ -320,9 +715,11 @@ def get_feature_names_out(self, input_features=None): feature_names : list of str List of feature names. """ - check_is_fitted(self, "features_per_column_") + check_is_fitted(self, "indice_to_features_") feature_names = [] - for column, features in self.features_per_column_.items(): + columns = getattr(self, "feature_names_in_", list(range(self.n_features_in_))) + for col_idx, features in self.indice_to_features_.items(): + column = columns[col_idx] feature_names += [f"{column}_{feat}" for feat in features] return feature_names diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index 5ecdf383b..4a9539419 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -108,28 +108,28 @@ def get_tz_datetime(as_array=False): "add_total_second, add_day_of_the_week", list(product([True, False], [True, False])), ) -@pytest.mark.parametrize("extract_until", TIME_LEVELS) +@pytest.mark.parametrize("resolution", TIME_LEVELS) def test_fit( as_array, get_data_func, features, add_total_second, add_day_of_the_week, - extract_until, + resolution, ): X = get_data_func(as_array=as_array) enc = DatetimeEncoder( add_day_of_the_week=add_day_of_the_week, add_total_second=add_total_second, - extract_until=extract_until, + resolution=resolution, ) enc.fit(X) total_second = ["total_second"] if add_total_second else [] day_of_week = ["day_of_week"] if add_day_of_the_week else [] - if extract_until in features: - features_ = features[: features.index(extract_until) + 1] + if resolution in features: + features_ = features[: features.index(resolution) + 1] else: features_ = deepcopy(features) @@ -170,10 +170,10 @@ def test_format_nz(): assert enc.format_per_column_ == {0: "2020-01-01 10:12:01+05:30"} -def test_extract_until_none(): +def test_resolution_none(): X = get_datetime() enc = DatetimeEncoder( - extract_until=None, + 
resolution=None, add_total_second=False, ) enc.fit(X) @@ -205,7 +205,7 @@ def test_transform_date(): def test_transform_datetime(): X = get_datetime() enc = DatetimeEncoder( - extract_until="second", + resolution="second", add_total_second=False, ) X_trans = enc.fit_transform(X) From 65657c3736c7742c96916e7e679d2a100240fafd Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:25:02 +0200 Subject: [PATCH 08/30] docstring typo --- skrub/_datetime_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index b80cb5382..6c41e91a1 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -259,7 +259,7 @@ def _to_datetime_2d( Parameters ---------- - X : list of 1d array of length n_features + X_split : list of 1d array of length n_features The 2d input, chunked into a list of array. This format allows us to treat each column individually and preserve their dtype, because dataframe.to_numpy() casts all columns to object is any column dtype From 53a04d2d617be1e276cd67083823cca1f6536290 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:26:00 +0200 Subject: [PATCH 09/30] docstring typo 2 --- skrub/_datetime_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 6c41e91a1..fddb2d766 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -262,8 +262,8 @@ def _to_datetime_2d( X_split : list of 1d array of length n_features The 2d input, chunked into a list of array. This format allows us to treat each column individually and preserve their dtype, because - dataframe.to_numpy() casts all columns to object is any column dtype - is object. + dataframe.to_numpy() casts all columns to object when at least one + column dtype is object. indices : list of int, default=None Indices of the parsable columns to convert. From 998859cad1c42d85f233868b8f26796ae00aab96 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:26:50 +0200 Subject: [PATCH 10/30] docstring typo 3 --- skrub/_datetime_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index fddb2d766..681ae4f40 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -275,7 +275,7 @@ def _to_datetime_2d( pd.to_datetime. If indices is None, indices_to_format is computed using the current input X. - If format is not None, all values of indices_to_format are format + If format is not None, all values of indices_to_format are format. format : str, default=None Here for compatibility with ``pandas.to_datetime`` API. 
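The three docstring commits above all polish `_to_datetime_2d`, whose underlying idea deserves a standalone illustration: the 2d input is chunked into per-column 1d arrays, so each column keeps its own dtype and can be parsed with its own datetime format. The sketch below reproduces that pattern in isolation — it is illustrative only, not skrub's actual code, and the column values are invented.

import numpy as np
import pandas as pd

# A 2d object array mixing a datetime-like column and a numeric one.
X = np.array(
    [["2021-01-01", 1.5],
     ["2021-02-02", 2.5]],
    dtype=object,
)

# Chunk into per-column 1d arrays (the same np.hsplit + ravel trick as in
# _to_datetime_2d_array), so each column is handled individually instead
# of forcing a single dtype on the whole array.
X_split = [X_col.ravel() for X_col in np.hsplit(X, X.shape[1])]

# Convert only the datetime-parsable column; the numeric one is untouched.
X_split[0] = pd.to_datetime(X_split[0], format="%Y-%m-%d")

# Reassemble into the original (n_samples, n_features) orientation.
X_out = np.vstack([np.asarray(col, dtype=object) for col in X_split]).T
print(X_out)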
From 6bec3e61597805dbe8e0447a478de7aa194eaaac Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:29:37 +0200 Subject: [PATCH 11/30] add TODO --- skrub/_datetime_encoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 681ae4f40..7efb2b3e3 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -323,6 +323,7 @@ def _get_datetime_column_indices(X_split): X_col = X_col[pd.notnull(X_col)] if _is_column_datetime_parsable(X_col): indices.append(col_idx) + # TODO: pass require_dayfirst to _guess_datetime_format indice_to_format[col_idx] = _guess_datetime_format(X_col) return indices, indice_to_format From d4b9cbc53bd1d42153e8345f7a9be21b16745c39 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 13 Oct 2023 14:29:20 +0200 Subject: [PATCH 12/30] enhance tests --- skrub/_datetime_encoder.py | 152 +++++----------------- skrub/tests/test_datetime_encoder.py | 187 +++++++++++++++++---------- 2 files changed, 147 insertions(+), 192 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 7efb2b3e3..0c98e1196 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -36,138 +36,35 @@ def to_datetime( and 2d arrays inputs. It converts compatible columns to datetime, and pass incompatible columns unchanged. - With 2d arrays, numerical columns will also be passed unchanged. - int, float, str, datetime, list, tuple, 1d array, and Series are defered to - pandas.to_datetime directly. + :func:`pandas.to_datetime` directly. Parameters ---------- - arg : int, float, str, datetime, list, tuple, nd array, Series, DataFrame/dict-like + X : int, float, str, datetime, list, tuple, nd array, Series, DataFrame/dict-like The object to convert to a datetime. + errors : {'ignore', 'raise', 'coerce'}, default 'coerce' - - If :const:`'raise'`, then invalid parsing will raise an exception. - - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. - - If :const:`'ignore'`, then invalid parsing will return the input. - dayfirst : bool, default False - Specify a date parse order if `arg` is str or is list-like. - If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"` - is parsed as :const:`2012-11-10`. - - .. warning:: - - ``dayfirst=True`` is not strict, but will prefer to parse - with day first. - - yearfirst : bool, default False - Specify a date parse order if `arg` is str or is list-like. - - - If :const:`True` parses dates with the year first, e.g. - :const:`"10/11/12"` is parsed as :const:`2010-11-12`. - - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is - preceded (same as :mod:`dateutil`). - - .. warning:: - - ``yearfirst=True`` is not strict, but will prefer to parse - with year first. - - utc : bool, default False - Control timezone-related parsing, localization and conversion. - - - If :const:`True`, the function *always* returns a timezone-aware - UTC-localized :class:`Timestamp`, :class:`Series` or - :class:`DatetimeIndex`. To do this, timezone-naive inputs are - *localized* as UTC, while timezone-aware inputs are *converted* to UTC. - - - If :const:`False` (default), inputs will not be coerced to UTC. - Timezone-naive inputs will remain naive, while timezone-aware ones - will keep their time offsets. Limitations exist for mixed - offsets (typically, daylight savings), see :ref:`Examples - ` section for details. - - See also: pandas general documentation about `timezone conversion and - localization - `_. 
- - format : str, default None - The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See - `strftime documentation - `_ for more information on choices, though - note that :const:`"%f"` will parse all the way up to nanoseconds. - You can also pass: - - - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. - - exact : bool, default True - Control how `format` is used: - - - If :const:`True`, require an exact `format` match. - - If :const:`False`, allow the `format` to match anywhere in the target - string. - - Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. - unit : str, default 'ns' - The unit of the arg (D,s,ms,us,ns) denote the unit, which is an - integer or float number. This will be based off the origin. - Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate - the number of milliseconds to the unix epoch start. - origin : scalar, default 'unix' - Define the reference date. The numeric values would be parsed as number - of units (defined by `unit`) since this reference date. - - - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01. - - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to - beginning of Julian Calendar. Julian day number :const:`0` is assigned - to the day starting at noon on January 1, 4713 BC. - - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date - string), origin is set to Timestamp identified by origin. - - If a float or integer, origin is the millisecond difference - relative to 1970-01-01. - cache : bool, default True - If :const:`True`, use a cache of unique, converted dates to apply the - datetime conversion. May produce significant speed-up when parsing - duplicate date strings, especially ones with timezone offsets. The cache - is only used when there are at least 50 values. The presence of - out-of-bounds values will render the cache unusable and may slow down - parsing. + - If ``'raise'``, then invalid parsing will raise an exception. + - If ``'coerce'``, then invalid parsing will be set as ``NaT``. + Note that ``'ignore'`` is not used for dataframes, 2d arrays, + and series, and is used otherwise as in ``pd.to_datetime``. + + **kwargs : key, value mappings + Other keyword arguments are passed down to + :func:`pandas.to_datetime`. Returns ------- datetime - If parsing succeeded. - Return type depends on input (types in parenthesis correspond to - fallback in case of unsuccessful timezone or out-of-range timestamp - parsing): - - - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) - - array-like: :class:`DatetimeIndex` (or :class:`Series` with - :class:`object` dtype containing :class:`datetime.datetime`) - - Series: :class:`Series` of :class:`datetime64` dtype (or - :class:`Series` of :class:`object` dtype containing - :class:`datetime.datetime`) - - DataFrame: :class:`Series` of :class:`datetime64` dtype (or - :class:`Series` of :class:`object` dtype containing - :class:`datetime.datetime`) - - Raises - ------ - ParserError - When parsing a date from string fails. - ValueError - When another datetime conversion error happens. For example when one - of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or - when a Timezone-aware :class:`datetime.datetime` is found in an array-like - of mixed time offsets, and ``utc=False``. + Return type depends on input. 
+ - dataframes, series and 2d arrays return the same type + - otherwise return the same output as :func:`pandas.to_datetime`. See Also -------- :func:`pandas.to_datetime` + Convert argument to datetime. """ kwargs["errors"] = errors @@ -223,8 +120,9 @@ def _to_datetime_series(X, **kwargs): _, px = get_df_namespace(X.to_frame()) index = getattr(X, "index", None) name = X.name - X = pd.to_datetime(X, **kwargs) - X = pd.Series(X, index=index, name=name) + X_split = [X.to_numpy()] + X_split = _to_datetime_2d(X_split) + X = pd.Series(X_split[0], index=index, name=name) # conversion is px is Polars, no-op if Pandas return px.Series(X) @@ -321,6 +219,11 @@ def _get_datetime_column_indices(X_split): for col_idx, X_col in enumerate(X_split): X_col = X_col[pd.notnull(X_col)] + + # convert pd.TimeStamp to np.datetime64 + if all(isinstance(val, pd.Timestamp) for val in X_col): + X_col = X_col.astype("datetime64") + if _is_column_datetime_parsable(X_col): indices.append(col_idx) # TODO: pass require_dayfirst to _guess_datetime_format @@ -367,7 +270,7 @@ def _is_column_datetime_parsable(X_col): return False -def _guess_datetime_format(X_col, require_dayfirst=True): +def _guess_datetime_format(X_col, require_dayfirst=False): """ Parameters ---------- @@ -381,6 +284,11 @@ def _guess_datetime_format(X_col, require_dayfirst=True): ------- format : str """ + if np.issubdtype(X_col.dtype, np.datetime64): + # We don't need to specify a parsing format + # for columns that are already of type datetime64. + return None + X_col = X_col.astype(np.object_) vfunc = np.vectorize(guess_datetime_format) with warnings.catch_warnings(): @@ -480,7 +388,7 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): add_total_seconds : bool, default=True Add the total number of seconds since Epoch. - errors: {"coerce", "raise"}, default="coerce" + errors : {'coerce', 'raise'}, default="coerce" During transform: - If ``"coerce"``, then invalid parsing will be set as ``pd.NaT``. - If ``"raise"``, then invalid parsing will raise an exception. 
diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index 4a9539419..77d94ff0d 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -5,8 +5,9 @@ import pandas as pd import pytest from numpy.testing import assert_allclose, assert_array_equal +from pandas.testing import assert_frame_equal, assert_series_equal -from skrub._datetime_encoder import TIME_LEVELS, DatetimeEncoder +from skrub._datetime_encoder import TIME_LEVELS, DatetimeEncoder, to_datetime def get_date(as_array=False): @@ -23,20 +24,6 @@ def get_date(as_array=False): return df -def get_constant_date(as_array=False): - df = pd.DataFrame( - [ - ["2020-01-01", "2020-02-04", "2021-02-05"], - ["2020-01-01", "2020-02-04", "2021-02-05"], - ["2020-01-01", "2020-02-04", "2021-02-05"], - ["2020-01-01", "2020-02-04", "2021-02-05"], - ], - ) - if as_array: - return df.to_numpy() - return df - - def get_datetime(as_array=False): df = pd.DataFrame( [ @@ -71,7 +58,6 @@ def get_nan_datetime(as_array=False): ["2020-01-01 10:12:01", None, "2020-01-03 10:00:00"], [np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"], ["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NaT], - ["2023-02-03 11:12:12", "2020-02-04 08:32:00", "2023-02-05 23:00:00"], ], ) if as_array: @@ -94,18 +80,47 @@ def get_tz_datetime(as_array=False): return df +def get_mixed_type_dataframe(): + return pd.DataFrame( + dict( + a=["2020-01-01", "2020-02-04", "2021-02-05"], + b=["yo", "ya", "yu"], + c=[1, 2, 3], + d=["1", "2", "3"], + e=["01/01/2023", "03/01/2023", "14/01/2023"], + f=[True, False, True], + ) + ) + + +def get_mixed_datetime_format(as_array=False): + df = pd.DataFrame( + dict( + a=[ + "2022-10-15", + "2021-12-25", + "2020-05-18", + "2019-10-15 12:00:00", + ] + ) + ) + if as_array: + return df.to_numpy() + return df + + @pytest.mark.parametrize("as_array", [True, False]) @pytest.mark.parametrize( - "get_data_func, features", + "get_data_func, features, format", [ - (get_date, TIME_LEVELS[: TIME_LEVELS.index("day") + 1]), - (get_datetime, TIME_LEVELS), - (get_tz_datetime, TIME_LEVELS), - (get_nanoseconds, TIME_LEVELS), + (get_date, TIME_LEVELS[: TIME_LEVELS.index("day") + 1], "%Y-%m-%d"), + (get_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S"), + (get_tz_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S%z"), + (get_nanoseconds, TIME_LEVELS, "%Y-%m-%d %H:%M:%S.%f"), ], ) @pytest.mark.parametrize( - "add_total_second, add_day_of_the_week", + "add_total_seconds, add_day_of_the_week", list(product([True, False], [True, False])), ) @pytest.mark.parametrize("resolution", TIME_LEVELS) @@ -113,19 +128,20 @@ def test_fit( as_array, get_data_func, features, - add_total_second, + format, + add_total_seconds, add_day_of_the_week, resolution, ): X = get_data_func(as_array=as_array) enc = DatetimeEncoder( add_day_of_the_week=add_day_of_the_week, - add_total_second=add_total_second, + add_total_seconds=add_total_seconds, resolution=resolution, ) enc.fit(X) - total_second = ["total_second"] if add_total_second else [] + total_seconds = ["total_seconds"] if add_total_seconds else [] day_of_week = ["day_of_week"] if add_day_of_the_week else [] if resolution in features: @@ -133,22 +149,18 @@ def test_fit( else: features_ = deepcopy(features) - features_ += total_second + day_of_week + features_ += total_seconds + day_of_week columns = range(X.shape[1]) - expected_features_per_column = {col: features_ for col in columns} - - expected_format_per_column = {col: np.asarray(X)[0, col] for col in columns} - - 
expected_n_features_out = sum( - len(val) for val in expected_features_per_column.values() - ) + expected_indice_to_features = {col: features_ for col in columns} + expected_indice_to_format = {col: format for col in columns} + expected_n_features_out = len(features_) * X.shape[1] expected_feature_names = [ f"{col}_{feature}" for col in columns for feature in features_ ] - assert enc.features_per_column_ == expected_features_per_column - assert enc.format_per_column_ == expected_format_per_column + assert enc.indice_to_features_ == expected_indice_to_features + assert enc.indice_to_format_ == expected_indice_to_format assert enc.n_features_out_ == expected_n_features_out assert enc.get_feature_names_out() == expected_feature_names @@ -156,29 +168,29 @@ def test_fit( def test_format_nan(): X = get_nan_datetime() enc = DatetimeEncoder().fit(X) - expected_format_per_column = { - 0: "2020-01-01 10:12:01", - 1: "2020-02-04 22:12:00", - 2: "2020-01-03 10:00:00", + expected_indice_to_format = { + 0: "%Y-%m-%d %H:%M:%S", + 1: "%Y-%m-%d %H:%M:%S", + 2: "%Y-%m-%d %H:%M:%S", } - assert enc.format_per_column_ == expected_format_per_column + assert enc.indice_to_format_ == expected_indice_to_format def test_format_nz(): X = get_tz_datetime() enc = DatetimeEncoder().fit(X) - assert enc.format_per_column_ == {0: "2020-01-01 10:12:01+05:30"} + assert enc.indice_to_format_ == {0: "%Y-%m-%d %H:%M:%S%z"} def test_resolution_none(): X = get_datetime() enc = DatetimeEncoder( resolution=None, - add_total_second=False, + add_total_seconds=False, ) enc.fit(X) - assert enc.features_per_column_ == {0: [], 1: [], 2: []} + assert enc.indice_to_features_ == {0: [], 1: [], 2: []} assert enc.n_features_out_ == 0 assert enc.get_feature_names_out() == [] @@ -186,7 +198,7 @@ def test_resolution_none(): def test_transform_date(): X = get_date() enc = DatetimeEncoder( - add_total_second=False, + add_total_seconds=False, ) X_trans = enc.fit_transform(X) @@ -206,10 +218,10 @@ def test_transform_datetime(): X = get_datetime() enc = DatetimeEncoder( resolution="second", - add_total_second=False, + add_total_seconds=False, ) X_trans = enc.fit_transform(X) - X_trans_expected = np.array( + expected_X_trans = np.array( [ [2020, 1, 1, 10, 12, 1, 2020, 1, 2, 10, 23, 0, 2020, 1, 3, 10, 0, 0], [2021, 2, 3, 12, 45, 23, 2020, 2, 4, 22, 12, 0, 2021, 2, 5, 12, 0, 0], @@ -217,16 +229,16 @@ def test_transform_datetime(): [2023, 2, 3, 11, 12, 12, 2020, 2, 4, 8, 32, 0, 2023, 2, 5, 23, 0, 0], ] ) - assert_array_equal(X_trans, X_trans_expected) + assert_array_equal(X_trans, expected_X_trans) def test_transform_tz(): X = get_tz_datetime() enc = DatetimeEncoder( - add_total_second=True, + add_total_seconds=True, ) X_trans = enc.fit_transform(X) - X_trans_expected = np.array( + expected_X_trans = np.array( [ [2020, 1, 1, 10, 1.57785372e09], [2021, 2, 3, 12, 1.61233652e09], @@ -234,16 +246,16 @@ def test_transform_tz(): [2023, 2, 3, 11, 1.67540293e09], ] ) - assert_allclose(X_trans, X_trans_expected) + assert_allclose(X_trans, expected_X_trans) def test_transform_nan(): X = get_nan_datetime() enc = DatetimeEncoder( - add_total_second=True, + add_total_seconds=True, ) X_trans = enc.fit_transform(X) - X_trans_expected = np.array( + expected_X_trans = np.array( [ [ 2020, @@ -296,23 +308,58 @@ def test_transform_nan(): np.nan, np.nan, ], - [ - 2023, - 2, - 3, - 11, - 1.67542273e09, - 2020, - 2, - 4, - 8, - 1.58080512e09, - 2023, - 2, - 5, - 23, - 1.67563800e09, - ], ] ) - assert_allclose(X_trans, X_trans_expected) + assert_allclose(X_trans, 
expected_X_trans) + + +def test_mixed_type_dataframe(): + X = get_mixed_type_dataframe() + enc = DatetimeEncoder().fit(X) + assert enc.indice_to_format_ == {0: "%Y-%m-%d", 4: "%d/%m/%Y"} + + X_dt = to_datetime(X) + expected_dtypes = [ + np.dtype(" Date: Fri, 13 Oct 2023 15:26:00 +0200 Subject: [PATCH 13/30] apply Jerome's suggestions --- skrub/_datetime_encoder.py | 45 ++++++++++++++-------------- skrub/tests/test_datetime_encoder.py | 18 +++++------ 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 0c98e1196..0164c04c5 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -138,15 +138,14 @@ def _to_datetime_2d_array(X, **kwargs): ------- X_split : list of array, of shape ``n_features`` """ - X_split = np.hsplit(X, X.shape[1]) - X_split = [X_col.ravel() for X_col in X_split] + X_split = list(X.T) return _to_datetime_2d(X_split, **kwargs) def _to_datetime_2d( X_split, indices=None, - indice_to_format=None, + index_to_format=None, format=None, **kwargs, ): @@ -167,7 +166,7 @@ def _to_datetime_2d( Indices of the parsable columns to convert. If None, indices are computed using the current input X. - indice_to_format : mapping of int to str, default=None + index_to_format : mapping of int to str, default=None Dictionary mapping column indices to their datetime format. It defines the format parameter for each column when calling pd.to_datetime. @@ -184,15 +183,15 @@ def _to_datetime_2d( X_split : list of 1d array of length n_features """ if indices is None: - indices, indice_to_format = _get_datetime_column_indices(X_split) + indices, index_to_format = _get_datetime_column_indices(X_split) # format overwrite indices_to_format - if format is not None or indice_to_format is None: - indice_to_format = {col_idx: format for col_idx in indices} + if format is not None: + index_to_format = {col_idx: format for col_idx in indices} for col_idx in indices: X_split[col_idx] = pd.to_datetime( - X_split[col_idx], format=indice_to_format[col_idx], **kwargs + X_split[col_idx], format=index_to_format[col_idx], **kwargs ) return X_split @@ -211,11 +210,11 @@ def _get_datetime_column_indices(X_split): datetime_indices : list of int List of parsable column, identified by their indices. - indice_to_format: mapping of int to str + index_to_format: mapping of int to str Dictionary mapping parsable column indices to their datetime format. """ indices = [] - indice_to_format = {} + index_to_format = {} for col_idx, X_col in enumerate(X_split): X_col = X_col[pd.notnull(X_col)] @@ -227,9 +226,9 @@ def _get_datetime_column_indices(X_split): if _is_column_datetime_parsable(X_col): indices.append(col_idx) # TODO: pass require_dayfirst to _guess_datetime_format - indice_to_format[col_idx] = _guess_datetime_format(X_col) + index_to_format[col_idx] = _guess_datetime_format(X_col) - return indices, indice_to_format + return indices, index_to_format def _is_column_datetime_parsable(X_col): @@ -398,10 +397,10 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): column_indices_ : list of int Indices of the datetime-parsable columns. - indice_to_format_ : dict[int, str] + index_to_format_ : dict[int, str] Mapping from column indices to their datetime formats. - indice_to_features_ : dict[int, list[str]] + index_to_features_ : dict[int, list[str]] Dictionary mapping the column names to the list of datetime features extracted for each column. 
@@ -513,12 +512,12 @@ def _select_datetime_cols(self, X): levels = TIME_LEVELS[: idx_level + 1] X_split = np.hsplit(X, X.shape[1]) - self.column_indices_, self.indice_to_format_ = _get_datetime_column_indices( + self.column_indices_, self.index_to_format_ = _get_datetime_column_indices( X_split ) del X_split - self.indice_to_features_ = defaultdict(list) + self.index_to_features_ = defaultdict(list) self.n_features_out_ = 0 for col_idx in self.column_indices_: @@ -529,15 +528,15 @@ def _select_datetime_cols(self, X): level for level in levels if level in ["year", "month", "day"] ] - self.indice_to_features_[col_idx] += levels + self.index_to_features_[col_idx] += levels self.n_features_out_ += len(levels) if self.add_total_seconds: - self.indice_to_features_[col_idx].append("total_seconds") + self.index_to_features_[col_idx].append("total_seconds") self.n_features_out_ += 1 if self.add_day_of_the_week: - self.indice_to_features_[col_idx].append("day_of_week") + self.index_to_features_[col_idx].append("day_of_week") self.n_features_out_ += 1 def transform(self, X, y=None): @@ -570,7 +569,7 @@ def transform(self, X, y=None): X_split = _to_datetime_2d_array( X, indices=self.column_indices_, - indice_to_format=self.indice_to_format_, + index_to_format=self.index_to_format_, errors=self.errors, ) @@ -593,7 +592,7 @@ def _extract_features(self, X_split): offset_idx = 0 for col_idx in self.column_indices_: X_col = X_split[col_idx] - features = self.indice_to_features_[col_idx] + features = self.index_to_features_[col_idx] for feat_idx, feature in enumerate(features): if feature == "total_seconds": X_feature = _datetime_to_total_seconds(X_col) @@ -624,10 +623,10 @@ def get_feature_names_out(self, input_features=None): feature_names : list of str List of feature names. 
""" - check_is_fitted(self, "indice_to_features_") + check_is_fitted(self, "index_to_features_") feature_names = [] columns = getattr(self, "feature_names_in_", list(range(self.n_features_in_))) - for col_idx, features in self.indice_to_features_.items(): + for col_idx, features in self.index_to_features_.items(): column = columns[col_idx] feature_names += [f"{column}_{feat}" for feat in features] return feature_names diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index 77d94ff0d..9f75818ed 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -152,15 +152,15 @@ def test_fit( features_ += total_seconds + day_of_week columns = range(X.shape[1]) - expected_indice_to_features = {col: features_ for col in columns} - expected_indice_to_format = {col: format for col in columns} + expected_index_to_features = {col: features_ for col in columns} + expected_index_to_format = {col: format for col in columns} expected_n_features_out = len(features_) * X.shape[1] expected_feature_names = [ f"{col}_{feature}" for col in columns for feature in features_ ] - assert enc.indice_to_features_ == expected_indice_to_features - assert enc.indice_to_format_ == expected_indice_to_format + assert enc.index_to_features_ == expected_index_to_features + assert enc.index_to_format_ == expected_index_to_format assert enc.n_features_out_ == expected_n_features_out assert enc.get_feature_names_out() == expected_feature_names @@ -168,18 +168,18 @@ def test_fit( def test_format_nan(): X = get_nan_datetime() enc = DatetimeEncoder().fit(X) - expected_indice_to_format = { + expected_index_to_format = { 0: "%Y-%m-%d %H:%M:%S", 1: "%Y-%m-%d %H:%M:%S", 2: "%Y-%m-%d %H:%M:%S", } - assert enc.indice_to_format_ == expected_indice_to_format + assert enc.index_to_format_ == expected_index_to_format def test_format_nz(): X = get_tz_datetime() enc = DatetimeEncoder().fit(X) - assert enc.indice_to_format_ == {0: "%Y-%m-%d %H:%M:%S%z"} + assert enc.index_to_format_ == {0: "%Y-%m-%d %H:%M:%S%z"} def test_resolution_none(): @@ -190,7 +190,7 @@ def test_resolution_none(): ) enc.fit(X) - assert enc.indice_to_features_ == {0: [], 1: [], 2: []} + assert enc.index_to_features_ == {0: [], 1: [], 2: []} assert enc.n_features_out_ == 0 assert enc.get_feature_names_out() == [] @@ -316,7 +316,7 @@ def test_transform_nan(): def test_mixed_type_dataframe(): X = get_mixed_type_dataframe() enc = DatetimeEncoder().fit(X) - assert enc.indice_to_format_ == {0: "%Y-%m-%d", 4: "%d/%m/%Y"} + assert enc.index_to_format_ == {0: "%Y-%m-%d", 4: "%d/%m/%Y"} X_dt = to_datetime(X) expected_dtypes = [ From dff7b22a0f4ad8287028771173d14d4036b1ffc8 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 13 Oct 2023 16:53:36 +0200 Subject: [PATCH 14/30] fix old pandas version errors --- skrub/_datetime_encoder.py | 21 ++++++++++++++++----- skrub/tests/test_datetime_encoder.py | 22 +++++++++++++++++++--- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 0164c04c5..a94c89e1a 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -7,6 +7,7 @@ from pandas._libs.tslibs.parsing import guess_datetime_format from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import check_array +from sklearn.utils.fixes import parse_version from sklearn.utils.validation import check_is_fitted from .dataframe._namespace import get_df_namespace @@ -24,6 +25,15 @@ TIME_LEVELS = list(WORD_TO_ALIAS) 
+def _is_pandas_format_mixed_available(): + pandas_version = pd.__version__ + min_pandas_version = "2.0.0" + return parse_version(min_pandas_version) < parse_version(pandas_version) + + +MIXED_FORMAT = "mixed" if _is_pandas_format_mixed_available() else None + + def to_datetime( X, errors="coerce", @@ -262,7 +272,7 @@ def _is_column_datetime_parsable(X_col): # avoiding ValueError when both date and datetime formats # are present. # At this stage, the format itself doesn't matter. - _ = pd.to_datetime(X_col, format="mixed") + _ = pd.to_datetime(X_col, format=MIXED_FORMAT) return True except (pd.errors.ParserError, ValueError): pass @@ -292,10 +302,11 @@ def _guess_datetime_format(X_col, require_dayfirst=False): vfunc = np.vectorize(guess_datetime_format) with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) - month_first_formats = np.unique(vfunc(X_col, dayfirst=False)) - day_first_formats = np.unique(vfunc(X_col, dayfirst=True)) + # pd.unique handles None + month_first_formats = pd.unique(vfunc(X_col, dayfirst=False)) + day_first_formats = pd.unique(vfunc(X_col, dayfirst=True)) - if pd.isnull(month_first_formats).any() or pd.isnull(day_first_formats).any(): + if None in month_first_formats or None in day_first_formats: return None elif ( @@ -321,7 +332,7 @@ def _guess_datetime_format(X_col, require_dayfirst=False): and len(day_first_formats) == 2 and len(month_first_formats[0]) != len(month_first_formats[1]) ): - return "mixed" + return MIXED_FORMAT else: return None diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index 9f75818ed..47910852f 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -7,7 +7,16 @@ from numpy.testing import assert_allclose, assert_array_equal from pandas.testing import assert_frame_equal, assert_series_equal -from skrub._datetime_encoder import TIME_LEVELS, DatetimeEncoder, to_datetime +from skrub._datetime_encoder import ( + TIME_LEVELS, + DatetimeEncoder, + _is_pandas_format_mixed_available, + to_datetime, +) + +NANOSECONDS_FORMAT = ( + "%Y-%m-%d %H:%M:%S.%f" if _is_pandas_format_mixed_available() else None +) def get_date(as_array=False): @@ -57,7 +66,7 @@ def get_nan_datetime(as_array=False): [ ["2020-01-01 10:12:01", None, "2020-01-03 10:00:00"], [np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"], - ["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NaT], + ["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NA], ], ) if as_array: @@ -116,7 +125,7 @@ def get_mixed_datetime_format(as_array=False): (get_date, TIME_LEVELS[: TIME_LEVELS.index("day") + 1], "%Y-%m-%d"), (get_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S"), (get_tz_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S%z"), - (get_nanoseconds, TIME_LEVELS, "%Y-%m-%d %H:%M:%S.%f"), + (get_nanoseconds, TIME_LEVELS, NANOSECONDS_FORMAT), ], ) @pytest.mark.parametrize( @@ -354,6 +363,13 @@ def test_mixed_datetime_format(): assert_series_equal(ser_dt, expected_ser_dt) +@pytest.mark.skipif( + not _is_pandas_format_mixed_available(), + reason=( + "DeprecationWarning is already handled as a ValueError in the latest" + " pandas version." 
+ ), +) def test_indempotency(): df = get_mixed_datetime_format() df_dt = to_datetime(df) From ff5b5751a0e43cd5671fcb5655a5a588861c8e02 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 16 Oct 2023 15:25:50 +0200 Subject: [PATCH 15/30] fix doctest --- skrub/_datetime_encoder.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index a94c89e1a..290b94e0c 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -432,23 +432,19 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): Examples -------- - >>> enc = DatetimeEncoder() - - Let's encode the following dates: - + >>> enc = DatetimeEncoder(add_total_seconds=False) >>> X = [['2022-10-15'], ['2021-12-25'], ['2020-05-18'], ['2019-10-15 12:00:00']] - >>> enc.fit(X) - DatetimeEncoder() + DatetimeEncoder(add_total_seconds=False) The encoder will output a transformed array - with five columns ("year", "month", "day", "hour" and "total_seconds"): + with four columns ("year", "month", "day", "hour"): >>> enc.transform(X) - array([[2022., 10., 15., 0., 1.6657920e+09], - [2021., 12., 25., 0., 1.6403904e+09], - [2020., 5., 18., 0., 1.5897600e+09], - [2019., 10., 15., 12., 1.5711408e+09]]) + array([[2022., 10., 15., 0.], + [2021., 12., 25., 0.], + [2020., 5., 18., 0.], + [2019., 10., 15., 12.]]) """ def __init__( From 7f463bc980c1faa470fefb2b7ef45b0e97dbbefa Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 16 Oct 2023 17:20:38 +0200 Subject: [PATCH 16/30] add scalar and 1d array support for to_datetime --- skrub/_datetime_encoder.py | 42 +++++++++++++++++++++++++--- skrub/tests/test_datetime_encoder.py | 18 ++++++++++-- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 290b94e0c..d0198f566 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -37,10 +37,11 @@ def _is_pandas_format_mixed_available(): def to_datetime( X, errors="coerce", + unit=None, **kwargs, ): """ - Convert argument to datetime. + Convert argument to datetime. Return the input if not datetime-parsable. Augment :func:`pandas.to_datetime` by supporting dataframes and 2d arrays inputs. It converts compatible columns to datetime, and @@ -60,6 +61,9 @@ def to_datetime( Note that ``'ignore'`` is not used for dataframes, 2d arrays, and series, and is used otherwise as in ``pd.to_datetime``. + unit : str, default None + Unused. Here for compatibility with :func:`pandas.to_datetime`. + **kwargs : key, value mappings Other keyword arguments are passed down to :func:`pandas.to_datetime`. @@ -75,6 +79,13 @@ def to_datetime( -------- :func:`pandas.to_datetime` Convert argument to datetime. 
+ + Examples + -------- + >>> X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"])) + >>> X = to_datetime(X) + >>> X.dtypes.to_list() + [dtype('int64'), dtype(' Date: Mon, 16 Oct 2023 18:04:33 +0200 Subject: [PATCH 17/30] fix test on py310-min --- skrub/tests/test_datetime_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index d6bbd9a1b..89d6358be 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -5,7 +5,7 @@ import pandas as pd import pytest from numpy.testing import assert_allclose, assert_array_equal -from pandas.testing import assert_frame_equal, assert_series_equal +from pandas.testing import assert_frame_equal from skrub._datetime_encoder import ( TIME_LEVELS, @@ -360,7 +360,7 @@ def test_mixed_datetime_format(): ser_dt = to_datetime(df["a"]) expected_ser_dt = expected_df_dt["a"] - assert_series_equal(ser_dt, expected_ser_dt) + assert_array_equal(ser_dt, expected_ser_dt) @pytest.mark.skipif( From 4311a5eaef786fc9e5b409bde01b4981bf58956b Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 31 Oct 2023 15:47:29 +0100 Subject: [PATCH 18/30] update the example --- doc/api.rst | 10 +- doc/conf.py | 1 + examples/03_datetime_encoder.py | 168 +++++++++++++++----------------- 3 files changed, 89 insertions(+), 90 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 6d7469b6d..fee46d1c1 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -79,7 +79,7 @@ This page lists all available functions and classes of `skrub`. .. raw:: html -

-   <h2>Other encoders</h2>
+   <h2>Dealing with dates</h2>
.. autosummary:: :toctree: generated/ @@ -89,6 +89,14 @@ This page lists all available functions and classes of `skrub`. DatetimeEncoder +.. autosummary:: + :toctree: generated/ + :template: function.rst + :nosignatures: + :caption: Converting datetime columns in a table + + to_datetime + .. raw:: html

   <h2>Deduplication: merging variants of the same entry</h2>
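For context on the API entry added above, usage of the documented function looks roughly like this — a sketch adapted from the docstring example introduced earlier in this series, and it assumes a skrub build with these patches applied:

import pandas as pd
from skrub import to_datetime

X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"]))

# Object/string columns that parse as datetimes are converted;
# numeric columns such as "a" pass through unchanged.
print(to_datetime(X).dtypes)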
diff --git a/doc/conf.py b/doc/conf.py index 710f4d69a..b1bccad12 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -504,6 +504,7 @@ def notebook_modification_function(notebook_content, notebook_filename): "SimilarityEncoder": "skrub.SimilarityEncoder", "DatetimeEncoder": "skrub.DatetimeEncoder", "deduplicate": "skrub.deduplicate", + "to_datetime": "skrub.to_datetime", "TableVectorizer": "skrub.TableVectorizer", "DatasetInfoOnly": "skrub.datasets._fetching.DatasetInfoOnly", "DatasetAll": "skrub.datasets._fetching.DatasetAll", diff --git a/examples/03_datetime_encoder.py b/examples/03_datetime_encoder.py index 12376b881..d89dff662 100644 --- a/examples/03_datetime_encoder.py +++ b/examples/03_datetime_encoder.py @@ -34,6 +34,9 @@ .. |HGBR| replace:: :class:`~sklearn.ensemble.HistGradientBoostingRegressor` + +.. |to_datetime| replace:: + :func:`~skrub.to_datetime` """ @@ -46,19 +49,26 @@ # on the location, date and time of measurement. from pprint import pprint - import pandas as pd data = pd.read_csv( "https://raw.githubusercontent.com/pandas-dev/pandas" "/main/doc/data/air_quality_no2_long.csv" -) +).sort_values("date.utc") # Extract our input data (X) and the target column (y) y = data["value"] X = data[["city", "date.utc"]] X +############################################################################### +# We convert the dataframe date columns using |to_datetime|. Notice how +# we don't need to specify the columns to convert. +from skrub import to_datetime + +X = to_datetime(X) +X.dtypes + ############################################################################### # Encoding the features # ..................... @@ -73,10 +83,8 @@ # lower units, as they are probably unimportant. from sklearn.preprocessing import OneHotEncoder - -from skrub import DatetimeEncoder - from sklearn.compose import make_column_transformer +from skrub import DatetimeEncoder encoder = make_column_transformer( (OneHotEncoder(handle_unknown="ignore"), ["city"]), @@ -88,7 +96,7 @@ pprint(encoder.get_feature_names_out()) ############################################################################### -# We see that the encoder is working as expected: the "date.utc" column has +# We see that the encoder is working as expected: the ``"date.utc"`` column has # been replaced by features extracting the month, day, hour, minute, day of the # week and total second since Epoch information. @@ -101,8 +109,7 @@ from skrub import TableVectorizer -table_vec = TableVectorizer() -table_vec.fit_transform(X) +table_vec = TableVectorizer().fit(X) pprint(table_vec.get_feature_names_out()) ############################################################################### @@ -113,8 +120,7 @@ table_vec = TableVectorizer( datetime_transformer=DatetimeEncoder(add_day_of_the_week=True), -) -table_vec.fit_transform(X) +).fit(X) pprint(table_vec.get_feature_names_out()) ############################################################################### @@ -156,12 +162,6 @@ # # Instead, we can use the |TimeSeriesSplit|, # which ensures that the test set is always in the future. -import numpy as np - -sorted_indices = np.argsort(X["date.utc"]) -X = X.iloc[sorted_indices] -y = y.iloc[sorted_indices] - from sklearn.model_selection import TimeSeriesSplit, cross_val_score cross_val_score( @@ -178,82 +178,71 @@ # # The mean squared error is not obvious to interpret, so we compare # visually the prediction of our model with the actual values. 
- +import numpy as np import matplotlib.pyplot as plt -from matplotlib.dates import AutoDateFormatter, AutoDateLocator -X_train = X[X["date.utc"] < "2019-06-01"] -X_test = X[X["date.utc"] >= "2019-06-01"] - -y_train = y[X["date.utc"] < "2019-06-01"] -y_test = y[X["date.utc"] >= "2019-06-01"] +mask_train = X["date.utc"] < "2019-06-01" +X_train, X_test = X.loc[mask_train], X.loc[~mask_train] +y_train, y_test = y.loc[mask_train], y.loc[~mask_train] pipeline.fit(X_train, y_train) +y_pred = pipeline.predict(X_test) all_cities = X_test["city"].unique() -fig, axs = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9)) -fig.subplots_adjust(hspace=0.5) +fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9)) +for ax, city in zip(axes, all_cities): + mask_prediction = X_test["city"] == city + date_prediction = X_test.loc[mask_prediction]["date.utc"] + y_prediction = y_pred[mask_prediction] -for i, city in enumerate(all_cities): - axs[i].plot( - X.loc[X.city == city, "date.utc"], - y.loc[X.city == city], - label="Actual", - ) - axs[i].plot( - X_test.loc[X_test.city == city, "date.utc"], - pipeline.predict(X_test.loc[X_test.city == city]), - label="Predicted", + mask_reference = X["city"] == city + date_reference = X.loc[mask_reference]["date.utc"] + y_reference = y[mask_reference] + + ax.plot(date_reference, y_reference, label="Actual") + ax.plot(date_prediction, y_prediction, label="Predicted") + + ax.set( + ylabel="NO2", + title=city, ) - axs[i].set_title(city) - axs[i].set_ylabel("NO2") - xtick_locator = AutoDateLocator(maxticks=8) - xtick_formatter = AutoDateFormatter(xtick_locator) - axs[i].xaxis.set_major_locator(xtick_locator) - axs[i].xaxis.set_major_formatter(xtick_formatter) - axs[i].legend() + ax.legend() + +fig.subplots_adjust(hspace=0.5) plt.show() ############################################################################### # Let's zoom on a few days: -X_zoomed = X[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")] -y_zoomed = y[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")] - -X_train_zoomed = X_zoomed[X_zoomed["date.utc"] < "2019-06-03"] -X_test_zoomed = X_zoomed[X_zoomed["date.utc"] >= "2019-06-03"] +mask_zoom_reference = (X["date.utc"] >= "2019-06-01") & (X["date.utc"] < "2019-06-04") +mask_zoom_prediction = (X_test["date.utc"] >= "2019-06-01") & ( + X_test["date.utc"] < "2019-06-04" +) -y_train_zoomed = y[X["date.utc"] < "2019-06-03"] -y_test_zoomed = y[X["date.utc"] >= "2019-06-03"] +all_cities = ["Paris", "London"] +fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9)) +for ax, city in zip(axes, all_cities): + mask_prediction = (X_test["city"] == city) & mask_zoom_prediction + date_prediction = X_test.loc[mask_prediction]["date.utc"] + y_prediction = y_pred[mask_prediction] -zoomed_cities = X_test_zoomed["city"].unique() + mask_reference = (X["city"] == city) & mask_zoom_reference + date_reference = X.loc[mask_reference]["date.utc"] + y_reference = y[mask_reference] -fig, axs = plt.subplots(nrows=len(zoomed_cities), ncols=1, figsize=(12, 9)) -fig.subplots_adjust(hspace=0.5) + ax.plot(date_reference, y_reference, label="Actual") + ax.plot(date_prediction, y_prediction, label="Predicted") -for i, city in enumerate(zoomed_cities): - axs[i].plot( - X_zoomed.loc[X_zoomed["city"] == city, "date.utc"], - y_zoomed.loc[X_zoomed["city"] == city], - label="Actual", - ) - axs[i].plot( - X_test_zoomed.loc[X_test_zoomed["city"] == city, "date.utc"], - pipeline.predict(X_test_zoomed.loc[X_test_zoomed["city"] == 
city]), - label="Predicted", + ax.set( + ylabel="NO2", + title=city, ) - axs[i].set_title(city) - axs[i].set_ylabel("NO2") + ax.legend() - xtick_locator = AutoDateLocator(maxticks=8) - xtick_formatter = AutoDateFormatter(xtick_locator) - axs[i].xaxis.set_major_locator(xtick_locator) - axs[i].xaxis.set_major_formatter(xtick_formatter) - - axs[i].legend() plt.show() + ############################################################################### # Features importance # ------------------- @@ -273,27 +262,28 @@ # In this case, we don't use a pipeline, because we want to compute the # importance of the features created by the DatetimeEncoder -X_ = table_vec.fit_transform(X) -reg = HistGradientBoostingRegressor().fit(X_, y) -result = permutation_importance(reg, X_, y, n_repeats=10, random_state=0) -std = result.importances_std -importances = result.importances_mean -indices = np.argsort(importances) -# Sort from least to most -indices = list(reversed(indices)) - -plt.figure(figsize=(12, 9)) -plt.title("Feature importances") -n = len(indices) -labels = np.array(table_vec.get_feature_names_out())[indices] -plt.barh(range(n), importances[indices], color="b", yerr=std[indices]) -plt.yticks(range(n), labels, size=15) -plt.tight_layout(pad=1) -plt.show() +X_transform = table_vec.fit_transform(X) +feature_names = table_vec.get_feature_names_out() + +model = HistGradientBoostingRegressor().fit(X_transform, y) +result = permutation_importance(model, X_transform, y, n_repeats=10, random_state=0) + +result = pd.DataFrame( + dict( + feature_names=feature_names, + std=result.importances_std, + importances=result.importances_mean, + ) +).sort_values("importances", ascending=False) + +result.plot.barh( + y="importances", x="feature_names", title="Feature Importances", figsize=(12, 9) +) +plt.tight_layout() ############################################################################### -# We can see that the hour of the day is the most important feature, -# which seems reasonable. +# We can see that the total seconds since Epoch and the hour of the day +# are the most important feature, which seems reasonable. # # Conclusion # ---------- From 77771f798d9ecf47020b94c71c8c6c93a713b7eb Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 2 Nov 2023 18:05:45 +0100 Subject: [PATCH 19/30] improve to_datetime docstring and parameters validation --- skrub/_datetime_encoder.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index d0198f566..2e3c25ddb 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -37,7 +37,6 @@ def _is_pandas_format_mixed_available(): def to_datetime( X, errors="coerce", - unit=None, **kwargs, ): """ @@ -58,15 +57,13 @@ def to_datetime( errors : {'ignore', 'raise', 'coerce'}, default 'coerce' - If ``'raise'``, then invalid parsing will raise an exception. - If ``'coerce'``, then invalid parsing will be set as ``NaT``. - Note that ``'ignore'`` is not used for dataframes, 2d arrays, - and series, and is used otherwise as in ``pd.to_datetime``. - - unit : str, default None - Unused. Here for compatibility with :func:`pandas.to_datetime`. + Note that ``'ignore'`` is not used and will raise an error. **kwargs : key, value mappings - Other keyword arguments are passed down to - :func:`pandas.to_datetime`. + Other keyword arguments are passed down to :func:`pandas.to_datetime`. + Raise an error if 'unit' is set to any value. 
From 77771f798d9ecf47020b94c71c8c6c93a713b7eb Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 2 Nov 2023 18:05:45 +0100
Subject: [PATCH 19/30] improve to_datetime docstring and parameters validation

---
 skrub/_datetime_encoder.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index d0198f566..2e3c25ddb 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -37,7 +37,6 @@ def _is_pandas_format_mixed_available():
 def to_datetime(
     X,
     errors="coerce",
-    unit=None,
     **kwargs,
 ):
     """
@@ -58,15 +57,13 @@ def to_datetime(
     errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
         - If ``'raise'``, then invalid parsing will raise an exception.
         - If ``'coerce'``, then invalid parsing will be set as ``NaT``.
-        Note that ``'ignore'`` is not used for dataframes, 2d arrays,
-        and series, and is used otherwise as in ``pd.to_datetime``.
-
-    unit : str, default None
-        Unused. Here for compatibility with :func:`pandas.to_datetime`.
+        Note that ``'ignore'`` is not used and will raise an error.
 
     **kwargs : key, value mappings
-        Other keyword arguments are passed down to
-        :func:`pandas.to_datetime`.
+        Other keyword arguments are passed down to :func:`pandas.to_datetime`.
+        Raise an error if 'unit' is set to any value. This is because, in
+        `pandas.to_datetime`, unit is specific to timestamps, whereas in
+        `skru`.to_datetime` we don't attempt to parse numeric columns.
 
     Returns
     -------
@@ -87,8 +84,14 @@ def to_datetime(
     >>> X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"]))
     >>> X = to_datetime(X)
     >>> X.dtypes.to_list()
     [dtype('int64'), dtype('<M8[ns]')]

From: Vincent M
Date: Thu, 2 Nov 2023 18:07:11 +0100
Subject: [PATCH 20/30] fix _dataframe import path

---
 skrub/_datetime_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 2e3c25ddb..e4715f0ff 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -10,7 +10,7 @@
 from sklearn.utils.fixes import parse_version
 from sklearn.utils.validation import check_is_fitted
 
-from .dataframe._namespace import get_df_namespace
+from ._dataframe._namespace import get_df_namespace
 
 WORD_TO_ALIAS = {
     "year": "Y",
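The parameter validation introduced in PATCH 19 is easy to exercise directly. A minimal sketch of the documented behavior (the exact error messages depend on the skrub version)::

    from skrub import to_datetime

    # `unit` only makes sense for numeric timestamps, which skrub's
    # to_datetime does not try to parse, so passing it raises.
    try:
        to_datetime(2020, unit="second")
    except ValueError as e:
        print(e)

    # 'ignore' is likewise rejected; only 'coerce' and 'raise' are valid.
    try:
        to_datetime(["2021-01-01"], errors="ignore")
    except ValueError as e:
        print(e)

Both calls mirror cases covered by the test suite later in this series (``test_to_datetime_invalid_params``).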
From cd6672de399f84c33248e23daeb0c481848cdd5d Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Fri, 3 Nov 2023 19:30:49 +0100
Subject: [PATCH 21/30] improve doc and add some tests

---
 skrub/_datetime_encoder.py           |  91 ++++++++++++----------
 skrub/tests/test_datetime_encoder.py | 112 +++++++++++++++++++++------
 2 files changed, 141 insertions(+), 62 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index e4715f0ff..9b8364321 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -39,31 +39,44 @@ def to_datetime(
     errors="coerce",
     **kwargs,
 ):
-    """
-    Convert argument to datetime. Return the input if not datetime-parsable.
-
-    Augment :func:`pandas.to_datetime` by supporting dataframes
-    and 2d arrays inputs. It converts compatible columns to datetime, and
-    pass incompatible columns unchanged.
+    """Convert the columns of a dataframe or 2d array into a datetime representation.
 
-    int, float, str, datetime, list, tuple, 1d array, and Series are defered to
-    :func:`pandas.to_datetime` directly.
+    This function augments :func:`pandas.to_datetime` by supporting dataframes
+    and 2d array inputs. It only attempts to convert columns whose dtype are
+    object or string. Numeric columns are skip and preserved in the output.
 
     Parameters
     ----------
-    X : int, float, str, datetime, list, tuple, nd array, Series, DataFrame/dict-like
+    X : Pandas or Polars dataframe, 2d-array or any input accepted \
+        by ``pd.to_datetime``.
         The object to convert to a datetime.
 
-    errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
-        - If ``'raise'``, then invalid parsing will raise an exception.
-        - If ``'coerce'``, then invalid parsing will be set as ``NaT``.
-        Note that ``'ignore'`` is not used and will raise an error.
+    errors : {'coerce', 'raise'}, default 'coerce'
+        When set to 'raise', errors will be raised only when the following conditions
+        are satisfied, for each column ``X_col``:
+        - After converting to numpy, the column dtype is np.object_ or np.str_
+        - Each entry of the column is datetime-parsable, i.e.
+          ``pd.to_datetime(X_col, format="mixed")`` doesn't raise an error.
+          This step is conservative, because e.g.
+          ``["2020-01-01", "hello", "2020-01-01"]``
+          is not considered datetime-parsable (so we won't attempt to convert it).
+        - The column as a whole is not datetime-parsable, due to a clash of datetime
+          format, e.g. '2020/01/01' and '2020-01-01'.
+
+        When set to ``'coerce'``, the entries of ``X_col`` that should have raised
+        an error are set to ``NaT`` instead.
+        You can choose which format to use with the keyword argument ``format``, as with
+        ``pd.to_datetime``, e.g. ``to_datetime(X_col, format='%Y/%m/%d')``.
+        Combined with ``error='coerce'``, this will convert all entries that don't
+        match this format to ``NaT``.
+
+        Note that the ``'ignore'`` option is not used and will raise an error.
 
     **kwargs : key, value mappings
         Other keyword arguments are passed down to :func:`pandas.to_datetime`.
         Raise an error if 'unit' is set to any value. This is because, in
         `pandas.to_datetime`, unit is specific to timestamps, whereas in
-        `skru`.to_datetime` we don't attempt to parse numeric columns.
+        `skrub`.to_datetime` we don't attempt to parse numeric columns.
 
     Returns
     -------
@@ -80,7 +93,14 @@ def to_datetime(
     Examples
     --------
     >>> X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"]))
-    >>> X = to_datetime(X)
+    >>> X
+       a          b
+    0  1 31/01/2021
+    1  2 01/02/2022
+    >>> to_datetime(X)
+       a          b
+    0  1 2021-01-31
+    1  2 2022-02-01
     >>> X.dtypes.to_list()
     [dtype('int64'), dtype('<M8[ns]')]

From: Vincent M
Date: Sat, 4 Nov 2023 11:48:17 +0100
Subject: [PATCH 22/30] fix docstring format

---
 skrub/_datetime_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 9b8364321..42a2ba963 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -48,7 +48,7 @@ def to_datetime(
     Parameters
     ----------
     X : Pandas or Polars dataframe, 2d-array or any input accepted \
-        by ``pd.to_datetime``.
+        by ``pd.to_datetime``
         The object to convert to a datetime.
 
     errors : {'coerce', 'raise'}, default 'coerce'
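The contract documented in PATCH 21 can be sketched as follows. With the default ``errors='coerce'`` a format clash turns non-matching entries into ``NaT``, and an explicit ``format`` overrides the guessing; this is a sketch of the documented behavior, not output verified against a specific release::

    from skrub import to_datetime

    # Two valid dates whose guessed formats clash ('%Y-%m-%d' vs '%Y/%m/%d'):
    X_col = ["2021-01-01", "2021/01/01"]

    # errors="raise" should complain about the clash; errors="coerce"
    # (the default) keeps one format and emits NaT for the other entry.
    print(to_datetime(X_col))

    # Forcing a format resolves the ambiguity explicitly: entries that
    # don't match '%Y/%m/%d' are coerced to NaT instead.
    print(to_datetime(X_col, format="%Y/%m/%d", errors="coerce"))

The same two-date column reappears in ``test_to_datetime_format_param`` later in this series.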
From 581fd8815644a183497048c0f31bca115072b777 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Sat, 4 Nov 2023 12:10:57 +0100
Subject: [PATCH 23/30] make doctest happy

---
 skrub/_agg_joiner.py       | 19 ++++++++++---------
 skrub/_datetime_encoder.py | 14 ++++++--------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py
index ed369964b..0e461c3bf 100644
--- a/skrub/_agg_joiner.py
+++ b/skrub/_agg_joiner.py
@@ -155,7 +155,7 @@ class AggJoiner(BaseEstimator, TransformerMixin):
        airportId airportName company_mode_1 total_passengers_mean_1
     0          1   Paris CDG             AF               103.33...
     1          2      NY JFK             DL                80.00...
-    """  # noqa: E501
+    """
 
     def __init__(
         self,
@@ -416,18 +416,19 @@ class AggTarget(BaseEstimator, TransformerMixin):
     ...     "company": ["DL", "AF", "AF", "DL", "DL", "TR"],
     ... })
     >>> y = np.array([1, 1, 0, 0, 1, 1])
-    >>> join_agg = AggTarget(
+    >>> agg_target = AggTarget(
     ...     main_key="company",
     ...     operation=["mean", "max"],
     ... )
-    >>> join_agg.fit_transform(X, y)
+    >>> agg_target.fit_transform(X, y)
        flightId  from_airport  ...  y_0_max_target  y_0_mean_target
-    0  1            1  ...              1          0.66...
-    1  2            1  ...              1          0.50...
-    2  3            1  ...              1          0.50...
-    3  4            2  ...              1          0.66...
-    4  5            2  ...              1          0.66...
-    5  6            2  ...              1          1.00...
+    0  1            1  ...              1         0.666667
+    1  2            1  ...              1         0.500000
+    2  3            1  ...              1         0.500000
+    3  4            2  ...              1         0.666667
+    4  5            2  ...              1         0.666667
+    5  6            2  ...              1         1.000000
+
+    [6 rows x 6 columns]
     """
 
     def __init__(

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 42a2ba963..5a5134c06 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -94,15 +94,13 @@ def to_datetime(
     --------
     >>> X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"]))
     >>> X
-       a          b
-    0  1 31/01/2021
-    1  2 01/02/2022
+       a           b
+    0  1  2021-01-01
+    1  2  2021-02-02
     >>> to_datetime(X)
-       a          b
-    0  1 2021-01-31
-    1  2 2022-02-01
+       a           b
+    0  1  2021-01-01
+    1  2  2021-02-02
-    >>> X.dtypes.to_list()
-    [dtype('int64'), dtype('<M8[ns]')]

From: Vincent M
Date: Sat, 4 Nov 2023 12:51:15 +0100
Subject: [PATCH 24/30] fix min pandas tests

---
 skrub/_datetime_encoder.py           |  4 ++--
 skrub/tests/test_datetime_encoder.py | 17 ++++++++++++-----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 5a5134c06..a8ef43646 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -361,10 +361,10 @@ def _guess_datetime_format(X_col):
     month_first_formats = pd.unique(vfunc(X_col, dayfirst=False))
     day_first_formats = pd.unique(vfunc(X_col, dayfirst=True))
 
-    if len(month_first_formats) == 1:
+    if len(month_first_formats) == 1 and month_first_formats[0] is not None:
         return str(month_first_formats[0])
 
-    elif len(day_first_formats) == 1:
+    elif len(day_first_formats) == 1 and day_first_formats[0] is not None:
         return str(day_first_formats[0])
 
     # special heuristic: when both date and datetime formats are

diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py
index 52d2eabf9..29fc03001 100644
--- a/skrub/tests/test_datetime_encoder.py
+++ b/skrub/tests/test_datetime_encoder.py
@@ -17,6 +17,7 @@
 NANOSECONDS_FORMAT = (
     "%Y-%m-%d %H:%M:%S.%f" if _is_pandas_format_mixed_available() else None
 )
+MSG_MIN_PANDAS_SKIP = "Pandas format=mixed is not available"
 
 
 def get_date(as_array=False):
@@ -344,10 +345,7 @@ def test_mixed_type_dataframe():
 
 @pytest.mark.skipif(
     not _is_pandas_format_mixed_available(),
-    reason=(
-        "DeprecationWarning is already handled as a ValueError "
-        "in the latest pandas version."
-    ),
+    reason=MSG_MIN_PANDAS_SKIP,
 )
 def test_indempotency():
     df = get_mixed_datetime_format()
@@ -382,7 +380,12 @@ def test_datetime_encoder_invalid_params():
         1,
         [1, 2],
         np.array([1, 2]),
-        pd.Timestamp(2020, 1, 1),
+        pytest.param(
+            pd.Timestamp(2020, 1, 1),
+            marks=pytest.mark.skipif(
+                not _is_pandas_format_mixed_available(), reason=MSG_MIN_PANDAS_SKIP
+            ),
+        ),
         np.array(["2020-01-01", "hello", "2020-01-02"]),
     ],
 )
@@ -405,6 +408,10 @@ def test_to_datetime_invalid_params():
         to_datetime(2020, unit="second")
 
 
+@pytest.mark.skipif(
+    not _is_pandas_format_mixed_available(),
+    reason=MSG_MIN_PANDAS_SKIP,
+)
 def test_to_datetime_format_param():
     X_col = ["2021-01-01", "2021/01/01"]
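The ``is not None`` guards added in PATCH 24 matter because pandas' format guesser returns ``None`` whenever it cannot infer a format, and a unique-but-``None`` "format" must not win. A small illustration; note that pandas only exposes ``guess_datetime_format`` publicly under ``pandas.tseries.api`` from 2.0 on, older versions keep it private::

    from pandas.tseries.api import guess_datetime_format

    # An ambiguous day/month string guesses differently per dayfirst:
    print(guess_datetime_format("01/02/2021", dayfirst=False))  # '%m/%d/%Y'
    print(guess_datetime_format("01/02/2021", dayfirst=True))   # '%d/%m/%Y'

    # Unparsable input yields None, hence the new guards.
    print(guess_datetime_format("hello"))  # None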
From bc72e81df96bbf04cb1b21232a067d8fb88ae641 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Sat, 4 Nov 2023 17:21:25 +0100
Subject: [PATCH 25/30] fix tests for min pandas

---
 skrub/_agg_joiner.py                 |  2 --
 skrub/_datetime_encoder.py           | 19 ++++++++++++++-----
 skrub/tests/test_datetime_encoder.py | 11 +----------
 skrub/tests/test_table_vectorizer.py |  9 ++++++++-
 4 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py
index 0e461c3bf..f84f68f98 100644
--- a/skrub/_agg_joiner.py
+++ b/skrub/_agg_joiner.py
@@ -428,8 +428,6 @@ class AggTarget(BaseEstimator, TransformerMixin):
     3  4            2  ...              1         0.666667
     4  5            2  ...              1         0.666667
     5  6            2  ...              1         1.000000
-
-    [6 rows x 6 columns]
     """
 
     def __init__(

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index a8ef43646..f7055b3de 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -1,5 +1,6 @@
 import warnings
 from collections import defaultdict
+from contextlib import nullcontext
 from typing import Iterable
 
 import numpy as np
@@ -309,11 +310,19 @@ def _is_column_datetime_parsable(X_col):
     is_dt_parsable : bool
     """
     # Remove columns of int, float or bool casted as object.
-    try:
-        if np.array_equal(X_col, X_col.astype(np.float64)):
-            return False
-    except ValueError:
-        pass
+    # Pandas < 2.0.0 raise a deprecation warning instead of an error.
+    with (
+        warnings.catch_warnings()
+        if not _is_pandas_format_mixed_available()
+        else nullcontext()
+    ):
+        if not _is_pandas_format_mixed_available():
+            warnings.simplefilter("ignore", category=DeprecationWarning)
+        try:
+            if np.array_equal(X_col, X_col.astype(np.float64)):
+                return False
+        except ValueError:
+            pass
 
     np_dtypes_candidates = [np.object_, np.str_, np.datetime64]
     is_type_datetime_compatible = any(

diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py
index 29fc03001..a4a536353 100644
--- a/skrub/tests/test_datetime_encoder.py
+++ b/skrub/tests/test_datetime_encoder.py
@@ -343,10 +343,6 @@ def test_mixed_type_dataframe():
     assert X_dt.dtype == np.object_
 
 
-@pytest.mark.skipif(
-    not _is_pandas_format_mixed_available(),
-    reason=MSG_MIN_PANDAS_SKIP,
-)
 def test_indempotency():
     df = get_mixed_datetime_format()
     df_dt = to_datetime(df)
@@ -380,12 +376,7 @@ def test_datetime_encoder_invalid_params():
         1,
         [1, 2],
         np.array([1, 2]),
-        pytest.param(
-            pd.Timestamp(2020, 1, 1),
-            marks=pytest.mark.skipif(
-                not _is_pandas_format_mixed_available(), reason=MSG_MIN_PANDAS_SKIP
-            ),
-        ),
+        pd.Timestamp(2020, 1, 1),
         np.array(["2020-01-01", "hello", "2020-01-02"]),
     ],
 )

diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py
index ffe41155b..333b4d7f9 100644
--- a/skrub/tests/test_table_vectorizer.py
+++ b/skrub/tests/test_table_vectorizer.py
@@ -8,9 +8,12 @@
 from sklearn.utils.validation import check_is_fitted
 
 from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer
+from skrub._datetime_encoder import _is_pandas_format_mixed_available
 from skrub._table_vectorizer import _infer_date_format
 from skrub.tests.utils import transformers_list_equal
 
+MSG_PANDAS_DEPRECATED_WARNING = "Skip deprecation warning"
+
 
 def check_same_transformers(
     expected_transformers: dict, actual_transformers: list
@@ -788,7 +791,7 @@ def test_mixed_types() -> None:
             pd.DataFrame({"col1": [1.0, 2.0, np.nan]}),
         ),
         # All datetimes during fit, 1 category during transform
-        (
+        pytest.param(
             pd.DataFrame(
                 {
                     "col1": [
                     ]
                 }
            ),
+            marks=pytest.mark.skipif(
+                not _is_pandas_format_mixed_available(),
+                reason=MSG_PANDAS_DEPRECATED_WARNING,
+            ),
        ),
    ],
)
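The float64 round-trip at the heart of ``_is_column_datetime_parsable`` is worth seeing on its own. Stripped of the pandas-version plumbing, the idea is the following sketch; ``looks_numeric`` is a hypothetical name for illustration, not a skrub helper::

    import warnings

    import numpy as np


    def looks_numeric(col):
        # int/float/bool values stored under object dtype survive a cast
        # to float64 unchanged; date strings raise ValueError instead.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)
            try:
                return np.array_equal(col, col.astype(np.float64))
            except ValueError:
                return False


    print(looks_numeric(np.array([1, 2, 3], dtype=object)))  # True
    print(looks_numeric(np.array(["2020-01-01", "2020-01-02"], dtype=object)))  # False

Columns flagged this way are skipped before any datetime parsing is attempted.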
From 1f52d1e8a49b32d336513d64c14f8c37c2a6c6ca Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Sat, 4 Nov 2023 17:25:47 +0100
Subject: [PATCH 26/30] make doctest happy

---
 skrub/_agg_joiner.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py
index f84f68f98..0e461c3bf 100644
--- a/skrub/_agg_joiner.py
+++ b/skrub/_agg_joiner.py
@@ -428,6 +428,8 @@ class AggTarget(BaseEstimator, TransformerMixin):
     3  4            2  ...              1         0.666667
     4  5            2  ...              1         0.666667
     5  6            2  ...              1         1.000000
+
+    [6 rows x 6 columns]
     """
 
     def __init__(

From 0875958d1d7dde882d199b7fb3abc4ccfe4c2ad0 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Mon, 6 Nov 2023 20:17:24 +0100
Subject: [PATCH 27/30] Update skrub/_datetime_encoder.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Jérôme Dockès

---
 skrub/_datetime_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index f7055b3de..6abbf0fa4 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -440,7 +440,7 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator):
         Extract up to this resolution. E.g., ``resolution="day"`` generates the
         features "year", "month", "day" only.
-        If ``None``, no feature will be created.
+        If ``None``, no such feature will be created (but day of the week and total seconds may still be extracted, see below).
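In other words, ``resolution`` only controls the year/month/.../nanosecond ladder, while the two extra features have their own flags. A sketch of the intended usage, assuming the refactored encoder exposes ``resolution``, ``add_day_of_the_week`` and ``add_total_seconds`` as the diffs in this series suggest (the exact names may differ in the released API)::

    import pandas as pd
    from skrub import DatetimeEncoder

    X = pd.DataFrame(
        {"login": pd.to_datetime(["2021-01-01 08:00", "2021-01-02 21:30"])}
    )
    # resolution=None: no year/month/day/... columns, but the day of the
    # week and the total seconds since Epoch can still be requested.
    encoder = DatetimeEncoder(
        resolution=None, add_day_of_the_week=True, add_total_seconds=True
    )
    print(encoder.fit_transform(X))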
From 4137f8984b955322172fcdb5bc40ccaabe3f0292 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Wed, 8 Nov 2023 11:50:05 +0100
Subject: [PATCH 28/30] apply suggestions

---
 skrub/_datetime_encoder.py | 68 +++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 6abbf0fa4..8626a56bc 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -1,6 +1,5 @@
 import warnings
 from collections import defaultdict
-from contextlib import nullcontext
 from typing import Iterable
 
 import numpy as np
@@ -46,6 +45,9 @@ def to_datetime(
     and 2d array inputs. It only attempts to convert columns whose dtype are
     object or string. Numeric columns are skip and preserved in the output.
 
+    Use the 'format' keyword to force a specific datetime format. See more details in
+    the parameters section.
+
     Parameters
     ----------
     X : Pandas or Polars dataframe, 2d-array or any input accepted \
         by ``pd.to_datetime``
         The object to convert to a datetime.
 
     errors : {'coerce', 'raise'}, default 'coerce'
         When set to 'raise', errors will be raised only when the following conditions
         are satisfied, for each column ``X_col``:
         - After converting to numpy, the column dtype is np.object_ or np.str_
         - Each entry of the column is datetime-parsable, i.e.
           ``pd.to_datetime(X_col, format="mixed")`` doesn't raise an error.
           This step is conservative, because e.g.
           ``["2020-01-01", "hello", "2020-01-01"]``
-          is not considered datetime-parsable (so we won't attempt to convert it).
+          is not considered datetime-parsable, so we won't attempt to convert it).
         - The column as a whole is not datetime-parsable, due to a clash of datetime
           format, e.g. '2020/01/01' and '2020-01-01'.
 
         When set to ``'coerce'``, the entries of ``X_col`` that should have raised
         an error are set to ``NaT`` instead.
         You can choose which format to use with the keyword argument ``format``, as with
         ``pd.to_datetime``, e.g. ``to_datetime(X_col, format='%Y/%m/%d')``.
         Combined with ``error='coerce'``, this will convert all entries that don't
         match this format to ``NaT``.
 
         Note that the ``'ignore'`` option is not used and will raise an error.
 
     **kwargs : key, value mappings
         Other keyword arguments are passed down to :func:`pandas.to_datetime`.
+
+        One notable argument is 'format'. Setting a format overwrites
+        the datetime format guessing behavior of this function for all columns.
+
+        Note that we don't encourage you to use dayfirst or monthfirst argument, since
+        their behavior is ambiguous and might not be applied at all.
+
+        Moreover, this function raises an error if 'unit' is set to any value.
+        This is because, in ``pandas.to_datetime``, 'unit' is specific to timestamps,
+        whereas in ``skrub.to_datetime`` we don't attempt to parse numeric columns.
 
     Returns
     -------
@@ -291,8 +300,15 @@ def _get_datetime_column_indices(X_split, dayfirst=True):
 
         if _is_column_datetime_parsable(X_col):
             indices.append(col_idx)
-            # TODO: pass require_dayfirst to _guess_datetime_format
-            index_to_format[col_idx] = _guess_datetime_format(X_col)
+
+            if np.issubdtype(X_col.dtype, np.datetime64):
+                # We don't need to specify a parsing format
+                # for columns that are already of type datetime64.
+                datetime_format = None
+            else:
+                datetime_format = _guess_datetime_format(X_col)
+
+            index_to_format[col_idx] = datetime_format
 
     return indices, index_to_format
 
@@ -311,13 +327,8 @@ def _is_column_datetime_parsable(X_col):
     """
     # Remove columns of int, float or bool casted as object.
     # Pandas < 2.0.0 raise a deprecation warning instead of an error.
-    with (
-        warnings.catch_warnings()
-        if not _is_pandas_format_mixed_available()
-        else nullcontext()
-    ):
-        if not _is_pandas_format_mixed_available():
-            warnings.simplefilter("ignore", category=DeprecationWarning)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=DeprecationWarning)
         try:
             if np.array_equal(X_col, X_col.astype(np.float64)):
                 return False
         except ValueError:
             pass
@@ -344,24 +355,27 @@ def _is_column_datetime_parsable(X_col):
 
 def _guess_datetime_format(X_col):
-    """
+    """Infer the format of a 1d array.
+
+    This functions uses Pandas ``guess_datetime_format`` routine for both
+    dayfirst and monthfirst case, and select either format when using one
+    give a unify format on the array.
+
+    When both dayfirst and monthfirst format are possible, we select
+    monthfirst by default.
+
+    You can overwrite this behaviour by setting a format of the caller function.
+    Setting a format always take precedence over infering it using
+    ``_guess_datetime_format``.
+
     Parameters
     ----------
     X_col : ndarray of shape ``(n_samples,)``
 
-    require_dayfirst : bool, default False
-        Whether to return the dayfirst format when both dayfirst
-        and monthfirst are valid.
-
     Returns
     -------
-    format : str
+    datetime_format : str or None
     """
-    if np.issubdtype(X_col.dtype, np.datetime64):
-        # We don't need to specify a parsing format
-        # for columns that are already of type datetime64.
-        return None
-
     X_col = X_col.astype(np.object_)
     vfunc = np.vectorize(guess_datetime_format)
     with warnings.catch_warnings():
@@ -440,7 +454,8 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator):
         Extract up to this resolution. E.g., ``resolution="day"`` generates the
         features "year", "month", "day" only.
-        If ``None``, no such feature will be created (but day of the week and total seconds may still be extracted, see below).
+        If ``None``, no such feature will be created (but day of the week and \
+        total seconds may still be extracted, see below).
     add_day_of_the_week : bool, default=False
         Add day of the week feature as a numerical feature
@@ -573,7 +588,6 @@ def _select_datetime_cols(self, X):
         self.column_indices_, self.index_to_format_ = _get_datetime_column_indices(
             X_split
         )
-        del X_split
 
         self.index_to_features_ = defaultdict(list)
         self.n_features_out_ = 0

From 0bf489686e47784bb2eda9108cce187631224421 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 9 Nov 2023 10:34:29 +0100
Subject: [PATCH 29/30] missing remarks

---
 skrub/_datetime_encoder.py           |  1 +
 skrub/tests/test_datetime_encoder.py | 10 ++++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 8626a56bc..21839c41d 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -371,6 +371,7 @@ def _guess_datetime_format(X_col):
     Parameters
     ----------
     X_col : ndarray of shape ``(n_samples,)``
+        X_col must only contains string objects without any missing value.
 
     Returns
     -------

diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py
index a4a536353..a6d144528 100644
--- a/skrub/tests/test_datetime_encoder.py
+++ b/skrub/tests/test_datetime_encoder.py
@@ -439,11 +439,13 @@ def test_mixed_datetime_format():
 
 
 def test_mix_of_unambiguous():
-    X_col = ["2021/10/15", "2021/13/01"]
-
-    # no format (default), no-op
+    X_col = ["2021/10/15", "01/14/2021"]
     out = to_datetime(X_col)
-    assert_array_equal(out, X_col)
+    expected_out = np.array(
+        [np.datetime64("2021-10-15"), np.datetime64("NaT")],
+        dtype="datetime64[ns]",
+    )
+    assert_array_equal(out, expected_out)
 
 
 def test_only_ambiguous():

From d5a4091f8a6737a26a9eca2dac82f135650a020c Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 9 Nov 2023 10:40:28 +0100
Subject: [PATCH 30/30] fix min pandas version test

---
 skrub/tests/test_datetime_encoder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py
index a6d144528..3881eac67 100644
--- a/skrub/tests/test_datetime_encoder.py
+++ b/skrub/tests/test_datetime_encoder.py
@@ -438,6 +438,7 @@ def test_mixed_datetime_format():
     assert_array_equal(series_dt, expected_series_dt)
 
 
+@pytest.mark.skipif(not _is_pandas_format_mixed_available(), reason=MSG_MIN_PANDAS_SKIP)
 def test_mix_of_unambiguous():
     X_col = ["2021/10/15", "01/14/2021"]
     out = to_datetime(X_col)
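Taken together, the last two patches pin down the behavior this series has been circling: entries that are individually unambiguous but whose guessed formats clash resolve in favor of one format, and the leftover entry is coerced to ``NaT``. Outside of pytest, the updated test amounts to::

    import numpy as np
    from numpy.testing import assert_array_equal
    from skrub import to_datetime

    # "2021/10/15" fixes the column format to '%Y/%m/%d';
    # "01/14/2021" cannot match it and is coerced to NaT.
    out = to_datetime(["2021/10/15", "01/14/2021"])
    expected = np.array(
        [np.datetime64("2021-10-15"), np.datetime64("NaT")],
        dtype="datetime64[ns]",
    )
    assert_array_equal(out, expected)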