From c9069bc809c5168deac07f9439acab56f631b428 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 4 Oct 2023 16:50:31 +0200 Subject: [PATCH 01/30] estimator refacto --- skrub/_datetime_encoder.py | 284 ++++++++++++++++++------------------- 1 file changed, 135 insertions(+), 149 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index dccb39301..f8b4d23d0 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -1,14 +1,12 @@ -from typing import Literal +from collections import defaultdict import numpy as np import pandas as pd -from numpy.typing import ArrayLike, NDArray from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted -from skrub._utils import check_input - -WORD_TO_ALIAS: dict[str, str] = { +WORD_TO_ALIAS = { "year": "Y", "month": "M", "day": "D", @@ -18,22 +16,28 @@ "microsecond": "us", "nanosecond": "N", } -TIME_LEVELS: list[str] = list(WORD_TO_ALIAS.keys()) -AcceptedTimeValues = Literal[ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", -] - - -class DatetimeEncoder(BaseEstimator, TransformerMixin): - """Transform each datetime column into several numeric columns \ - for temporal features (e.g. "year", "month", "day"...). +TIME_LEVELS = list(WORD_TO_ALIAS) + + +def is_datetime_parsable(X): + """ + Parameters + ---------- + X : numpy ndarray + """ + np_dtypes_candidates = [np.object_, np.str_, np.datetime64] + if any(np.issubdtype(X.dtype, np_dtype) for np_dtype in np_dtypes_candidates): + try: + _ = pd.to_datetime(X) + return True + except (pd.errors.ParserError, ValueError): + pass + return False + + +class DatetimeEncoder(TransformerMixin, BaseEstimator): + """Transforms each datetime column into several numeric columns \ + for temporal features (e.g year, month, day...). Constant extracted features are dropped; for instance, if the year is always the same in a feature, the extracted "year" column won't be added. @@ -98,72 +102,21 @@ class DatetimeEncoder(BaseEstimator, TransformerMixin): [2019., 10., 15., 12.]]) """ - n_features_in_: int - n_features_out_: int - features_per_column_: dict[int, list[str]] - col_names_: list[str] | None - def __init__( self, *, - extract_until: AcceptedTimeValues | None = "hour", - add_day_of_the_week: bool = False, + extract_until="hour", + add_day_of_the_week=False, + add_total_second=False, + errors="coerce", ): self.extract_until = extract_until self.add_day_of_the_week = add_day_of_the_week + self.add_total_second = add_total_second # TODO doc + self.errors = errors # TODO doc - def _more_tags(self): - """ - Used internally by sklearn to ease the estimator checks. - """ - return { - "X_types": ["2darray", "categorical"], - "allow_nan": True, - "_xfail_checks": {"check_dtype_object": "Specific datetime error."}, - } - - def _validate_keywords(self): - if self.extract_until not in TIME_LEVELS and self.extract_until is not None: - raise ValueError( - f'"extract_until" should be one of {TIME_LEVELS}, ' - f"got {self.extract_until}. 
" - ) - - @staticmethod - def _extract_from_date(date_series: pd.Series, feature: str): - if feature == "year": - return pd.DatetimeIndex(date_series).year.to_numpy() - elif feature == "month": - return pd.DatetimeIndex(date_series).month.to_numpy() - elif feature == "day": - return pd.DatetimeIndex(date_series).day.to_numpy() - elif feature == "hour": - return pd.DatetimeIndex(date_series).hour.to_numpy() - elif feature == "minute": - return pd.DatetimeIndex(date_series).minute.to_numpy() - elif feature == "second": - return pd.DatetimeIndex(date_series).second.to_numpy() - elif feature == "microsecond": - return pd.DatetimeIndex(date_series).microsecond.to_numpy() - elif feature == "nanosecond": - return pd.DatetimeIndex(date_series).nanosecond.to_numpy() - elif feature == "dayofweek": - return pd.DatetimeIndex(date_series).dayofweek.to_numpy() - elif feature == "total_time": - tz = pd.DatetimeIndex(date_series).tz - # Compute the time in seconds from the epoch time UTC - if tz is None: - return ( - pd.to_datetime(date_series) - pd.Timestamp("1970-01-01") - ) // pd.Timedelta("1s") - else: - return ( - pd.DatetimeIndex(date_series).tz_convert("utc") - - pd.Timestamp("1970-01-01", tz="utc") - ) // pd.Timedelta("1s") - - def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder": - """Fit the instance to ``X``. + def fit(self, X, y=None): + """Fit the instance to X. In practice, just check keywords and input validity, and stores which extracted features are not constant. @@ -180,52 +133,69 @@ def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder": DatetimeEncoder Fitted DatetimeEncoder instance (self). """ - self._validate_keywords() - if isinstance(X, pd.DataFrame): - self.col_names_ = X.columns.to_list() - else: - self.col_names_ = None - X = check_input(X) - # Features to extract for each column, after removing constant features - self.features_per_column_ = {} - for i in range(X.shape[1]): - self.features_per_column_[i] = [] - # Check which columns are constant - for i in range(X.shape[1]): - if self.extract_until is None: - if np.nanstd(self._extract_from_date(X[:, i], "total_time")) > 0: - self.features_per_column_[i].append("total_time") - else: - for feature in TIME_LEVELS: - if np.nanstd(self._extract_from_date(X[:, i], feature)) > 0: - if TIME_LEVELS.index(feature) <= TIME_LEVELS.index( - self.extract_until - ): - self.features_per_column_[i].append(feature) - # we add a total_time feature, which contains the full - # time to epoch, if there is at least one - # feature that has not been extracted and is not constant - if TIME_LEVELS.index(feature) > TIME_LEVELS.index( - self.extract_until - ): - self.features_per_column_[i].append("total_time") - break - # Add day of the week feature if needed - if ( - self.add_day_of_the_week - and np.nanstd(self._extract_from_date(X[:, i], "dayofweek")) > 0 - ): - self.features_per_column_[i].append("dayofweek") - - self.n_features_in_ = X.shape[1] - self.n_features_out_ = len( - np.concatenate(list(self.features_per_column_.values())) - ) + if self.extract_until not in TIME_LEVELS and self.extract_until is not None: + raise ValueError( + f"'extract_until' options are {TIME_LEVELS}, " + f"got {self.extract_until!r}." + ) + + errors_options = ["coerce", "raise"] + if self.errors not in errors_options: + raise ValueError( + f"errors options are {errors_options!r}, got {self.errors!r}." 
+ ) + + self._check_feature_names(X, reset=True) + self._check_n_features(X, reset=True) + X = check_array(X, ensure_2d=True, force_all_finite=False) + + self._parse_datetime_cols(X) return self - def transform(self, X: ArrayLike, y=None) -> NDArray: - """Transform ``X`` by replacing each datetime column with \ + def _parse_datetime_cols(self, X): + """ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + # Features to extract for each column, after removing constant features + self.features_per_column_ = defaultdict(list) + self.format_per_column_ = dict() + self.n_features_out_ = 0 + + if self.extract_until is None: + levels = [] + require_total_second = False + else: + idx_level = TIME_LEVELS.index(self.extract_until) + levels = TIME_LEVELS[:idx_level] + require_total_second = TIME_LEVELS == levels + + self.add_total_second_ = self.add_total_second or require_total_second + + columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) + for col_idx, col in enumerate(columns): + X_col = X[:, col_idx] + + if is_datetime_parsable(X_col): + # Pandas use the first non-null item of the array to infer the format. + mask_notnull = X_col == X_col + self.format_per_column_[col] = X_col[mask_notnull][0] + + self.features_per_column_[col] += levels + self.n_features_out_ += len(levels) + + if self.add_total_second_: + self.features_per_column_[col].append("total_time") + self.n_features_out_ += 1 + + if self.add_day_of_the_week: + self.features_per_column_[col].append("day_of_week") + self.n_features_out_ += 1 + + def transform(self, X, y=None): + """Transform `X` by replacing each datetime column with \ corresponding numerical features. Parameters @@ -240,28 +210,35 @@ def transform(self, X: ArrayLike, y=None) -> NDArray: ndarray, shape (``n_samples``, ``n_features_out_``) Transformed input. """ - check_is_fitted( - self, - attributes=["n_features_in_", "n_features_out_", "features_per_column_"], - ) - X = check_input(X) - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"The number of features in the input data ({X.shape[1]}) " - "does not match the number of features " - f"seen during fit ({self.n_features_in_}). 
" - ) - # Create a new array with the extracted features, - # choosing only features that weren't constant during fit - X_ = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) - idx = 0 - for i in range(X.shape[1]): - for j, feature in enumerate(self.features_per_column_[i]): - X_[:, idx + j] = self._extract_from_date(X[:, i], feature) - idx += len(self.features_per_column_[i]) - return X_ - - def get_feature_names_out(self, input_features=None) -> list[str]: + check_is_fitted(self) + self._check_n_features(X, reset=False) + self._check_feature_names(X, reset=False) + X = check_array(X, ensure_2d=True, force_all_finite=False) + + columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) + X_out = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) + offset_idx = 0 + for col_idx, col in enumerate(columns): + if col in self.features_per_column_: + # X_j is a DatetimeIndex + X_j = pd.to_datetime(X[:, col_idx], errors=self.errors) + + features = self.features_per_column_[col] + for feat_idx, feature in enumerate(features): + if feature == "total_time": + if X_j.tz is not None: + X_j = X_j.tz_convert("utc") + # Total seconds since epoch + X_feature = (X_j.astype("int64") // 1e9).to_numpy() + else: + X_feature = getattr(X_j, feature).to_numpy() + + X_out[:, offset_idx + feat_idx] = X_feature + offset_idx += len(features) + + return X_out + + def get_feature_names_out(self, input_features=None): """Return clean feature names. Feature names are formatted like: "_" @@ -280,9 +257,18 @@ def get_feature_names_out(self, input_features=None) -> list[str]: list of str List of feature names. """ + check_is_fitted(self, "features_per_column_") feature_names = [] - for i in self.features_per_column_.keys(): - prefix = str(i) if self.col_names_ is None else self.col_names_[i] - for feature in self.features_per_column_[i]: - feature_names.append(f"{prefix}_{feature}") + for column, features in self.features_per_column_.items(): + feature_names += [f"{column}_{feat}" for feat in features] return feature_names + + def _more_tags(self): + """ + Used internally by sklearn to ease the estimator checks. + """ + return { + "X_types": ["2darray", "categorical"], + "allow_nan": True, + "_xfail_checks": {"check_dtype_object": "Specific datetime error."}, + } From da7d67880e962a54c00349c18df6c03cb85bc6bb Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 5 Oct 2023 14:46:55 +0200 Subject: [PATCH 02/30] revamp all tests from datetime_encoder --- skrub/_datetime_encoder.py | 59 ++- skrub/tests/test_datetime_encoder.py | 698 ++++++++++----------------- 2 files changed, 301 insertions(+), 456 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index f8b4d23d0..7a1a59ccf 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -23,7 +23,7 @@ def is_datetime_parsable(X): """ Parameters ---------- - X : numpy ndarray + X : np.ndarray of shape (n_sample,) """ np_dtypes_candidates = [np.object_, np.str_, np.datetime64] if any(np.issubdtype(X.dtype, np_dtype) for np_dtype in np_dtypes_candidates): @@ -35,6 +35,18 @@ def is_datetime_parsable(X): return False +def is_date_only(X): + """ + Parameters + ---------- + X : np.ndarray of shape (n_sample,) + """ + if is_datetime_parsable(X): + X_t = pd.to_datetime(X) + return np.all(X_t == X_t.normalize()) + return False + + class DatetimeEncoder(TransformerMixin, BaseEstimator): """Transforms each datetime column into several numeric columns \ for temporal features (e.g year, month, day...). 
@@ -107,7 +119,7 @@ def __init__( *, extract_until="hour", add_day_of_the_week=False, - add_total_second=False, + add_total_second=True, errors="coerce", ): self.extract_until = extract_until @@ -147,7 +159,7 @@ def fit(self, X, y=None): self._check_feature_names(X, reset=True) self._check_n_features(X, reset=True) - X = check_array(X, ensure_2d=True, force_all_finite=False) + X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) self._parse_datetime_cols(X) @@ -166,13 +178,9 @@ def _parse_datetime_cols(self, X): if self.extract_until is None: levels = [] - require_total_second = False else: idx_level = TIME_LEVELS.index(self.extract_until) - levels = TIME_LEVELS[:idx_level] - require_total_second = TIME_LEVELS == levels - - self.add_total_second_ = self.add_total_second or require_total_second + levels = TIME_LEVELS[: idx_level + 1] columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) for col_idx, col in enumerate(columns): @@ -180,14 +188,21 @@ def _parse_datetime_cols(self, X): if is_datetime_parsable(X_col): # Pandas use the first non-null item of the array to infer the format. - mask_notnull = X_col == X_col + X_dt = pd.to_datetime(X_col) + mask_notnull = X_dt == X_dt self.format_per_column_[col] = X_col[mask_notnull][0] + if is_date_only(X_col): + # Keep only date attributes + levels = [ + level for level in levels if level in ["year", "month", "day"] + ] + self.features_per_column_[col] += levels self.n_features_out_ += len(levels) - if self.add_total_second_: - self.features_per_column_[col].append("total_time") + if self.add_total_second: + self.features_per_column_[col].append("total_second") self.n_features_out_ += 1 if self.add_day_of_the_week: @@ -213,27 +228,33 @@ def transform(self, X, y=None): check_is_fitted(self) self._check_n_features(X, reset=False) self._check_feature_names(X, reset=False) - X = check_array(X, ensure_2d=True, force_all_finite=False) + X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) + # X_out must be of dtype float64 to handle np.nan X_out = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) offset_idx = 0 for col_idx, col in enumerate(columns): if col in self.features_per_column_: # X_j is a DatetimeIndex - X_j = pd.to_datetime(X[:, col_idx], errors=self.errors) + X_col = pd.to_datetime(X[:, col_idx], errors=self.errors) features = self.features_per_column_[col] for feat_idx, feature in enumerate(features): - if feature == "total_time": - if X_j.tz is not None: - X_j = X_j.tz_convert("utc") + if feature == "total_second": + if X_col.tz is not None: + X_col = X_col.tz_convert("utc") # Total seconds since epoch - X_feature = (X_j.astype("int64") // 1e9).to_numpy() + mask_notnull = X_col == X_col + X_feature = np.where( + mask_notnull, + X_col.astype("int64") // 1e9, + np.nan, + ) else: - X_feature = getattr(X_j, feature).to_numpy() - + X_feature = getattr(X_col, feature).to_numpy() X_out[:, offset_idx + feat_idx] = X_feature + offset_idx += len(features) return X_out diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index fa7e93a93..5ecdf383b 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -1,494 +1,318 @@ +from copy import deepcopy +from itertools import product + import numpy as np import pandas as pd import pytest -from sklearn.exceptions import NotFittedError +from numpy.testing import assert_allclose, assert_array_equal -from 
skrub._datetime_encoder import DatetimeEncoder +from skrub._datetime_encoder import TIME_LEVELS, DatetimeEncoder -def get_date_array() -> np.array: - return np.array( +def get_date(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"]), - pd.to_datetime(["2021-02-03", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2022-01-01", "2020-12-25", "2022-01-03"]), - pd.to_datetime(["2023-02-03", "2020-02-04", "2023-02-05"]), - ] + ["2020-01-01", "2020-01-02", "2020-01-03"], + ["2021-02-03", "2020-02-04", "2021-02-05"], + ["2022-01-01", "2020-12-25", "2022-01-03"], + ["2023-02-03", "2020-02-04", "2023-02-05"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_constant_date_array() -> np.array: - return np.array( +def get_constant_date(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - ] + ["2020-01-01", "2020-02-04", "2021-02-05"], + ["2020-01-01", "2020-02-04", "2021-02-05"], + ["2020-01-01", "2020-02-04", "2021-02-05"], + ["2020-01-01", "2020-02-04", "2021-02-05"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_datetime_array() -> np.array: - return np.array( +def get_datetime(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime( - [ - "2020-01-01 10:12:01", - "2020-01-02 10:23:00", - "2020-01-03 10:00:00", - ], - ), - pd.to_datetime( - [ - "2021-02-03 12:45:23", - "2020-02-04 22:12:00", - "2021-02-05 12:00:00", - ], - ), - pd.to_datetime( - [ - "2022-01-01 23:23:43", - "2020-12-25 11:12:00", - "2022-01-03 11:00:00", - ], - ), - pd.to_datetime( - [ - "2023-02-03 11:12:12", - "2020-02-04 08:32:00", - "2023-02-05 23:00:00", - ], - ), - ] + ["2020-01-01 10:12:01", "2020-01-02 10:23:00", "2020-01-03 10:00:00"], + ["2021-02-03 12:45:23", "2020-02-04 22:12:00", "2021-02-05 12:00:00"], + ["2022-01-01 23:23:43", "2020-12-25 11:12:00", "2022-01-03 11:00:00"], + ["2023-02-03 11:12:12", "2020-02-04 08:32:00", "2023-02-05 23:00:00"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_datetime_array_nanoseconds() -> np.array: - return np.array( +def get_nanoseconds(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime( - [ - # constant year and month - # for the first feature - "2020-08-24 15:55:30.123456789", - "2020-08-24 15:55:30.123456789", - ], - ), - pd.to_datetime( - [ - "2020-08-20 14:56:31.987654321", - "2021-07-20 14:56:31.987654321", - ], - ), - pd.to_datetime( - [ - "2020-08-20 14:57:32.123987654", - "2023-09-20 14:57:32.123987654", - ], - ), - pd.to_datetime( - [ - "2020-08-20 14:58:33.987123456", - "2023-09-20 14:58:33.987123456", - ], - ), - ] + ["2020-08-24 15:55:30.123456789", "2020-08-24 15:55:30.123456789"], + ["2020-08-20 14:56:31.987654321", "2021-07-20 14:56:31.987654321"], + ["2020-08-20 14:57:32.123987654", "2023-09-20 14:57:32.123987654"], + ["2020-08-20 14:58:33.987123456", "2023-09-20 14:58:33.987123456"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_dirty_datetime_array() -> np.array: - return np.array( +def get_nan_datetime(as_array=False): + df = pd.DataFrame( [ - np.array( - pd.to_datetime( - [ - "2020-01-01 10:12:01", - "2020-01-02 10:23:00", - "2020-01-03 10:00:00", - ] - ) - ), - np.array( - pd.to_datetime([np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"]) - ), - np.array( - pd.to_datetime(["2022-01-01 23:23:43", 
"2020-12-25 11:12:00", pd.NaT]) - ), - np.array( - pd.to_datetime( - [ - "2023-02-03 11:12:12", - "2020-02-04 08:32:00", - "2023-02-05 23:00:00", - ] - ) - ), - ] + ["2020-01-01 10:12:01", None, "2020-01-03 10:00:00"], + [np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"], + ["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NaT], + ["2023-02-03 11:12:12", "2020-02-04 08:32:00", "2023-02-05 23:00:00"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_datetime_with_TZ_array() -> pd.DataFrame: - res = pd.DataFrame( +def get_tz_datetime(as_array=False): + # The equivalent dtype is "datetime64[ns, Asia/Kolkata]" + df = pd.DataFrame( [ - pd.to_datetime(["2020-01-01 10:12:01"]), - pd.to_datetime(["2021-02-03 12:45:23"]), - pd.to_datetime(["2022-01-01 23:23:43"]), - pd.to_datetime(["2023-02-03 11:12:12"]), - ] - ) - for col in res.columns: - res[col] = pd.DatetimeIndex(res[col]).tz_localize("Asia/Kolkata") - return res - - -def test_fit() -> None: - # Dates - X = get_date_array() - enc = DatetimeEncoder() - expected_features_per_column_ = { - 0: ["year", "month", "day"], - 1: ["month", "day"], - 2: ["year", "month", "day"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - X = get_date_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_features_per_column_ = { - 0: ["year", "month", "day", "dayofweek"], - 1: ["month", "day", "dayofweek"], - 2: ["year", "month", "day", "dayofweek"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # Datetimes - X = get_datetime_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_features_per_column_ = { - 0: ["year", "month", "day", "hour", "total_time", "dayofweek"], - 1: ["month", "day", "hour", "total_time", "dayofweek"], - 2: ["year", "month", "day", "hour", "dayofweek"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # we check that the features are extracted until `extract_until` - # that constant feature are not extracted - # and that the total_time feature is extracted if needed - X = get_datetime_array() - enc = DatetimeEncoder(extract_until="minute") - expected_features_per_column_ = { - 0: ["year", "month", "day", "hour", "minute", "total_time"], - 1: ["month", "day", "hour", "minute"], - 2: ["year", "month", "day", "hour"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # extract_until="nanosecond" - X = get_datetime_array_nanoseconds() - enc = DatetimeEncoder(extract_until="nanosecond") - expected_features_per_column_ = { - # constant year and month - # for first feature - 0: [ - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - ], - 1: [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", + ["2020-01-01 10:12:01+05:30"], + ["2021-02-03 12:45:23+05:30"], + ["2022-01-01 23:23:43+05:30"], + ["2023-02-03 11:12:12+05:30"], ], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # Dirty Datetimes - X = get_dirty_datetime_array() - enc = DatetimeEncoder() - expected_features_per_column_ = { - 0: ["year", "month", "day", "hour", "total_time"], - 1: ["month", "day", "hour", "total_time"], - 2: ["year", "month", "day", "hour"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ + ) + if as_array: + return df.to_numpy() + return df - # Datetimes with TZ - X = get_datetime_with_TZ_array() - enc = DatetimeEncoder() - 
expected_features_per_column_ = {0: ["year", "month", "day", "hour", "total_time"]} - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - # Feature names - # Without column names - X = get_datetime_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_feature_names = [ - "0_year", - "0_month", - "0_day", - "0_hour", - "0_total_time", - "0_dayofweek", - "1_month", - "1_day", - "1_hour", - "1_total_time", - "1_dayofweek", - "2_year", - "2_month", - "2_day", - "2_hour", - "2_dayofweek", - ] +@pytest.mark.parametrize("as_array", [True, False]) +@pytest.mark.parametrize( + "get_data_func, features", + [ + (get_date, TIME_LEVELS[: TIME_LEVELS.index("day") + 1]), + (get_datetime, TIME_LEVELS), + (get_tz_datetime, TIME_LEVELS), + (get_nanoseconds, TIME_LEVELS), + ], +) +@pytest.mark.parametrize( + "add_total_second, add_day_of_the_week", + list(product([True, False], [True, False])), +) +@pytest.mark.parametrize("extract_until", TIME_LEVELS) +def test_fit( + as_array, + get_data_func, + features, + add_total_second, + add_day_of_the_week, + extract_until, +): + X = get_data_func(as_array=as_array) + enc = DatetimeEncoder( + add_day_of_the_week=add_day_of_the_week, + add_total_second=add_total_second, + extract_until=extract_until, + ) enc.fit(X) - assert enc.get_feature_names_out() == expected_feature_names - # With column names - X = get_datetime_array() - X = pd.DataFrame(X) - X.columns = ["col1", "col2", "col3"] - enc = DatetimeEncoder(add_day_of_the_week=True) + total_second = ["total_second"] if add_total_second else [] + day_of_week = ["day_of_week"] if add_day_of_the_week else [] + + if extract_until in features: + features_ = features[: features.index(extract_until) + 1] + else: + features_ = deepcopy(features) + + features_ += total_second + day_of_week + columns = range(X.shape[1]) + expected_features_per_column = {col: features_ for col in columns} + + expected_format_per_column = {col: np.asarray(X)[0, col] for col in columns} + + expected_n_features_out = sum( + len(val) for val in expected_features_per_column.values() + ) + expected_feature_names = [ - "col1_year", - "col1_month", - "col1_day", - "col1_hour", - "col1_total_time", - "col1_dayofweek", - "col2_month", - "col2_day", - "col2_hour", - "col2_total_time", - "col2_dayofweek", - "col3_year", - "col3_month", - "col3_day", - "col3_hour", - "col3_dayofweek", + f"{col}_{feature}" for col in columns for feature in features_ ] - enc.fit(X) + + assert enc.features_per_column_ == expected_features_per_column + assert enc.format_per_column_ == expected_format_per_column + assert enc.n_features_out_ == expected_n_features_out assert enc.get_feature_names_out() == expected_feature_names -def test_transform() -> None: - # Dates - X = get_date_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_result = np.array( - [ - [2020, 1, 1, 2, 1, 2, 3, 2020, 1, 3, 4], - [2021, 2, 3, 2, 2, 4, 1, 2021, 2, 5, 4], - [2022, 1, 1, 5, 12, 25, 4, 2022, 1, 3, 0], - [2023, 2, 3, 4, 2, 4, 1, 2023, 2, 5, 6], - ] +def test_format_nan(): + X = get_nan_datetime() + enc = DatetimeEncoder().fit(X) + expected_format_per_column = { + 0: "2020-01-01 10:12:01", + 1: "2020-02-04 22:12:00", + 2: "2020-01-03 10:00:00", + } + assert enc.format_per_column_ == expected_format_per_column + + +def test_format_nz(): + X = get_tz_datetime() + enc = DatetimeEncoder().fit(X) + assert enc.format_per_column_ == {0: "2020-01-01 10:12:01+05:30"} + + +def test_extract_until_none(): + X = get_datetime() + enc = DatetimeEncoder( 
+ extract_until=None, + add_total_second=False, ) enc.fit(X) - assert np.allclose(enc.transform(X), expected_result, equal_nan=True) - enc = DatetimeEncoder(add_day_of_the_week=False) - expected_result = np.array( - [ - [2020, 1, 1, 1, 2, 2020, 1, 3], - [2021, 2, 3, 2, 4, 2021, 2, 5], - [2022, 1, 1, 12, 25, 2022, 1, 3], - [2023, 2, 3, 2, 4, 2023, 2, 5], - ] + assert enc.features_per_column_ == {0: [], 1: [], 2: []} + assert enc.n_features_out_ == 0 + assert enc.get_feature_names_out() == [] + + +def test_transform_date(): + X = get_date() + enc = DatetimeEncoder( + add_total_second=False, ) - enc.fit(X) - assert np.allclose(enc.transform(X), expected_result, equal_nan=True) + X_trans = enc.fit_transform(X) - enc = DatetimeEncoder(add_day_of_the_week=True) expected_result = np.array( [ - [2020, 1, 1, 2, 1, 2, 3, 2020, 1, 3, 4], - [2021, 2, 3, 2, 2, 4, 1, 2021, 2, 5, 4], - [2022, 1, 1, 5, 12, 25, 4, 2022, 1, 3, 0], - [2023, 2, 3, 4, 2, 4, 1, 2023, 2, 5, 6], + [2020, 1, 1, 2020, 1, 2, 2020, 1, 3], + [2021, 2, 3, 2020, 2, 4, 2021, 2, 5], + [2022, 1, 1, 2020, 12, 25, 2022, 1, 3], + [2023, 2, 3, 2020, 2, 4, 2023, 2, 5], ] ) - enc.fit(X) - assert np.allclose(enc.transform(X), expected_result, equal_nan=True) + X_trans = enc.transform(X) + assert_array_equal(X_trans, expected_result) - # Datetimes - X = get_datetime_array()[:, 0].reshape(-1, 1) - enc = DatetimeEncoder(add_day_of_the_week=True) - # Check that the "total_time" feature is working - expected_result = np.array( - [ - [2020, 1, 1, 10, 0, 2], - [2021, 2, 3, 12, 0, 2], - [2022, 1, 1, 23, 0, 5], - [2023, 2, 3, 11, 0, 4], - ] - ).astype(np.float64) - # Time from epochs in seconds - expected_result[:, 4] = (X.astype("int64") // 1e9).astype(np.float64).reshape(-1) - enc.fit(X) - X_trans = enc.transform(X) - assert np.allclose(X_trans, expected_result, equal_nan=True) - - # Check if we find back the date from the time to epoch - assert ( - ( - pd.to_datetime(X_trans[:, 4], unit="s") - pd.to_datetime(X.reshape(-1)) - ).total_seconds() - == 0 - ).all() - - # Dirty datetimes - X = get_dirty_datetime_array()[:, 0].reshape(-1, 1) - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_result = np.array( - [ - [2020, 1, 1, 10, 0, 2], - [np.nan] * 6, - [2022, 1, 1, 23, 0, 5], - [2023, 2, 3, 11, 0, 4], - ] +def test_transform_datetime(): + X = get_datetime() + enc = DatetimeEncoder( + extract_until="second", + add_total_second=False, ) - # Time from epochs in seconds - expected_result[:, 4] = (X.astype("int64") // 1e9).astype(np.float64).reshape(-1) - expected_result[1, 4] = np.nan - enc.fit(X) - X_trans = enc.transform(X) - assert np.allclose(X_trans, expected_result, equal_nan=True) - - # Datetimes with TZ - # If the dates are timezone-aware, all the feature extractions should - # be done in the provided timezone. - # But the full time to epoch should correspond to the true number of - # seconds between epoch time and the time of the date. 
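
For reviewers checking the epoch-seconds values asserted in the timezone tests around here: they can be re-derived directly with pandas. A minimal sketch, not part of the patch:

```py
import pandas as pd

# tz-aware timestamps are converted to UTC before taking epoch seconds,
# matching the total-seconds feature in this series.
ts = pd.DatetimeIndex(["2020-01-01 10:12:01+05:30"])
epoch_s = ts.tz_convert("utc").astype("int64") / 1e9
# array([1.577853721e+09]) -> 2020-01-01 04:42:01 UTC, i.e. ~1.57785372e09
```
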
- X = get_datetime_with_TZ_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_result = np.array( + X_trans = enc.fit_transform(X) + X_trans_expected = np.array( [ - [2020, 1, 1, 10, 0, 2], - [2021, 2, 3, 12, 0, 2], - [2022, 1, 1, 23, 0, 5], - [2023, 2, 3, 11, 0, 4], + [2020, 1, 1, 10, 12, 1, 2020, 1, 2, 10, 23, 0, 2020, 1, 3, 10, 0, 0], + [2021, 2, 3, 12, 45, 23, 2020, 2, 4, 22, 12, 0, 2021, 2, 5, 12, 0, 0], + [2022, 1, 1, 23, 23, 43, 2020, 12, 25, 11, 12, 0, 2022, 1, 3, 11, 0, 0], + [2023, 2, 3, 11, 12, 12, 2020, 2, 4, 8, 32, 0, 2023, 2, 5, 23, 0, 0], ] - ).astype(np.float64) - # Time from epochs in seconds - expected_result[:, 4] = ( - (X.iloc[:, 0].view(dtype="int64") // 1e9) - .astype(np.float64) - .to_numpy() - .reshape(-1) ) - enc.fit(X) - X_trans = enc.transform(X) - assert np.allclose(X_trans, expected_result, equal_nan=True) - - # Check if we find back the date from the time to epoch - assert ( - ( - pd.to_datetime(X_trans[:, 4], unit="s") - .tz_localize("utc") - .tz_convert(X.iloc[:, 0][0].tz) - - pd.DatetimeIndex(X.iloc[:, 0]) - ).total_seconds() - == 0 - ).all() + assert_array_equal(X_trans, X_trans_expected) - # Check if it's working when the date is constant - X = get_constant_date_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - assert enc.fit_transform(X).shape[1] == 0 - -@pytest.mark.parametrize( - "extract_until", - ["year", "month", "day", "hour", "minute", "second", "microsecond", "nanosecond"], -) -def test_extract_until(extract_until) -> None: - time_levels = [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - ] - X = get_datetime_array() - enc = DatetimeEncoder(extract_until=extract_until) - expected_features_per_column_ = { - # all features after seconds are constant - # we want total_time if we have not extracted all non-constant features - 0: time_levels[ - : min(time_levels.index(extract_until), time_levels.index("second")) + 1 - ] - + ( - ["total_time"] - if extract_until in ["year", "month", "day", "hour", "minute"] - else [] - ), - # constant after minute + year constant - 1: time_levels[ - 1 : min(time_levels.index(extract_until), time_levels.index("minute")) + 1 - ] - + (["total_time"] if extract_until in ["year", "month", "day", "hour"] else []), - # constant after hour - 2: time_levels[ - : min(time_levels.index(extract_until), time_levels.index("hour")) + 1 +def test_transform_tz(): + X = get_tz_datetime() + enc = DatetimeEncoder( + add_total_second=True, + ) + X_trans = enc.fit_transform(X) + X_trans_expected = np.array( + [ + [2020, 1, 1, 10, 1.57785372e09], + [2021, 2, 3, 12, 1.61233652e09], + [2022, 1, 1, 23, 1.64105962e09], + [2023, 2, 3, 11, 1.67540293e09], ] - + (["total_time"] if extract_until in ["year", "month", "day"] else []), - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - -def test_extract_until_none() -> None: - X = get_dirty_datetime_array() - enc = DatetimeEncoder(extract_until=None) - expected_features_per_column_ = { - # all features after seconds are constant - # we want total_time if we have not extracted all non-constant features - 0: ["total_time"], - 1: ["total_time"], - 2: ["total_time"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # check get_names_out - expected_feature_names = [ - "0_total_time", - "1_total_time", - "2_total_time", - ] - assert enc.get_feature_names_out() == expected_feature_names - - # check with constant datetimes - X = get_constant_date_array() - 
enc = DatetimeEncoder(extract_until=None) - assert enc.fit_transform(X).shape[1] == 0 - + ) + assert_allclose(X_trans, X_trans_expected) -def test_check_fitted_datetime_encoder() -> None: - """Test that calling transform before fit raises an error""" - X = get_datetime_array()[:, 0].reshape(-1, 1) - enc = DatetimeEncoder(add_day_of_the_week=True) - with pytest.raises(NotFittedError): - enc.transform(X) - # Check that it works after fit - enc.fit(X) - enc.transform(X) +def test_transform_nan(): + X = get_nan_datetime() + enc = DatetimeEncoder( + add_total_second=True, + ) + X_trans = enc.fit_transform(X) + X_trans_expected = np.array( + [ + [ + 2020, + 1, + 1, + 10, + 1.57787352e09, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2020, + 1, + 3, + 10, + 1.57804560e09, + ], + [ + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2020, + 2, + 4, + 22, + 1.58085432e09, + 2021, + 2, + 5, + 12, + 1.61252640e09, + ], + [ + 2022, + 1, + 1, + 23, + 1.64107942e09, + 2020, + 12, + 25, + 11, + 1.60889472e09, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + [ + 2023, + 2, + 3, + 11, + 1.67542273e09, + 2020, + 2, + 4, + 8, + 1.58080512e09, + 2023, + 2, + 5, + 23, + 1.67563800e09, + ], + ] + ) + assert_allclose(X_trans, X_trans_expected) From 0c08aadee8db9ed348a3c418acc4896c64710f3b Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 5 Oct 2023 15:23:46 +0200 Subject: [PATCH 03/30] update docstrings --- skrub/_datetime_encoder.py | 83 +++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 7a1a59ccf..44ac160dd 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -20,11 +20,19 @@ def is_datetime_parsable(X): - """ + """Check whether a 1d vector can be converted into a \ + :class:`~pandas.core.indexes.datetimes.DatetimeIndex`. + Parameters ---------- - X : np.ndarray of shape (n_sample,) + X : array-like of shape ``(n_sample,)`` + + Returns + ------- + is_dt_parsable : bool """ + if len(X.shape) > 1: + raise ValueError(f"X must be 1d, got shape: {X.shape}.") np_dtypes_candidates = [np.object_, np.str_, np.datetime64] if any(np.issubdtype(X.dtype, np_dtype) for np_dtype in np_dtypes_candidates): try: @@ -36,10 +44,18 @@ def is_datetime_parsable(X): def is_date_only(X): - """ + """Check whether a 1d vector only contains dates. + + Note that ``is_date_only`` being True implies ``is_datetime_parsable`` is True, + but not the contrary. + Parameters ---------- - X : np.ndarray of shape (n_sample,) + X : array-like of shape ``(n_sample,)`` + + Returns + ------- + is_date : bool """ if is_datetime_parsable(X): X_t = pd.to_datetime(X) @@ -61,35 +77,44 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): extract_until : {"year", "month", "day", "hour", "minute", "second", "microsecond", "nanosecond", None}, default="hour" Extract up to this granularity. - If all non-constant features have not been extracted, - add the "total_time" feature, which contains the time to epoch (in seconds). For instance, if you specify "day", only "year", "month", "day" and - "total_time" features will be created. - If None, only the "total_time" feature will be created. + features will be created. + If ``None``, no feature will be created. + add_day_of_the_week : bool, default=False Add day of the week feature (if day is extracted). This is a numerical feature from 0 (Monday) to 6 (Sunday). + add_total_second : bool, default=True + Add the total number of seconds since Epoch. 
+ + errors: {"coerce", "raise"}, default="coerce" + During transform: + - If ``"coerce"``, then invalid parsing will be set as ``NaT``. + - If ``"raise"``, then invalid parsing will raise an exception + Attributes ---------- - n_features_in_ : int - Number of features in the data seen during fit. n_features_out_ : int Number of features of the transformed data. - features_per_column_ : mapping of int to list of str - Dictionary mapping the index of the original columns - to the list of features extracted for each column. - col_names_ : None or list of str - List of the names of the features of the input data, - if input data was a pandas DataFrame, otherwise None. + + features_per_column_ : dict[str, list[str]] or dict[int, list[str]] + Dictionary mapping the column names to the list of features extracted + for each column. + + format_per_column_ : dict[str, str] or dict[int, str] + Dictionary mapping the column names to the first non-null example. + This is how Pandas infer the datetime format. See Also -------- GapEncoder : Encode dirty categories (strings) by constructing latent topics with continuous encoding. + MinHashEncoder : Encode string columns as a numeric array with the minhash method. + SimilarityEncoder : Encode string columns as a numeric array with n-gram string similarity. @@ -124,8 +149,8 @@ def __init__( ): self.extract_until = extract_until self.add_day_of_the_week = add_day_of_the_week - self.add_total_second = add_total_second # TODO doc - self.errors = errors # TODO doc + self.add_total_second = add_total_second + self.errors = errors def fit(self, X, y=None): """Fit the instance to X. @@ -135,7 +160,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape (``n_samples``, ``n_features``) + X : array-like, shape ``(n_samples, n_features)`` Data where each column is a datetime feature. y : None Unused, only here for compatibility. @@ -161,12 +186,16 @@ def fit(self, X, y=None): self._check_n_features(X, reset=True) X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) - self._parse_datetime_cols(X) + self._select_datetime_cols(X) return self - def _parse_datetime_cols(self, X): - """ + def _select_datetime_cols(self, X): + """Select datetime-like columns and infer features to be parsed. + + If the input only contains dates (and no datetimes), only the features + ["year", "month", "day"] will be filtered with extract_until. + Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -215,14 +244,14 @@ def transform(self, X, y=None): Parameters ---------- - X : array-like, shape (``n_samples``, ``n_features``) + X : array-like of shape ``(n_samples, n_features)`` The data to transform, where each column is a datetime feature. y : None Unused, only here for compatibility. Returns ------- - ndarray, shape (``n_samples``, ``n_features_out_``) + X_out : ndarray of shape ``(n_samples, n_features_out_)`` Transformed input. """ check_is_fitted(self) @@ -260,13 +289,13 @@ def transform(self, X, y=None): return X_out def get_feature_names_out(self, input_features=None): - """Return clean feature names. + """Get output feature names for transformation. Feature names are formatted like: "_" if the original data has column names, otherwise with format "_" where `` is one of {"year", "month", "day", "hour", "minute", "second", - "microsecond", "nanosecond", "dayofweek"}. + "microsecond", "nanosecond", "day_of_week"}. 
        Parameters
        ----------
        input_features : None
            Unused, only here for compatibility.

        Returns
        -------
-       list of str
+       feature_names : list of str
            List of feature names.
        """
+       check_is_fitted(self, "features_per_column_")
        feature_names = []
        for column, features in self.features_per_column_.items():
            feature_names += [f"{column}_{feat}" for feat in features]
        return feature_names

From d57691c3552be0db31c59c9ff05b284c6fa34af1 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 5 Oct 2023 16:06:46 +0200
Subject: [PATCH 04/30] update example

---
 examples/03_datetime_encoder.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/examples/03_datetime_encoder.py b/examples/03_datetime_encoder.py
index bf328addf..27c90bc37 100644
--- a/examples/03_datetime_encoder.py
+++ b/examples/03_datetime_encoder.py
@@ -90,11 +90,8 @@
 ###############################################################################
 # We see that the encoder is working as expected: the "date.utc" column has
-# been replaced by features extracting the month, day, hour, and day of the
-# week information.
-#
-# Note the year and minute features are not present, this is because they
-# have been removed by the encoder as they are constant the whole period.
+# been replaced by features extracting the month, day, hour, minute, day of the
+# week, and total seconds since Epoch information.

 ###############################################################################
 # One-liner with the |TableVectorizer|
@@ -148,14 +145,9 @@
 # ```py
 # from sklearn.experimental import enable_hist_gradient_boosting
 # ```
-
-import numpy as np
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.pipeline import make_pipeline

-table_vec = TableVectorizer(
-    datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
-)
 pipeline = make_pipeline(table_vec, HistGradientBoostingRegressor())

 ###############################################################################
@@ -168,6 +160,7 @@
 #
 # Instead, we can use the |TimeSeriesSplit|,
 # which ensures that the test set is always in the future.
+import numpy as np

 X["date.utc"] = pd.to_datetime(X["date.utc"])
 sorted_indices = np.argsort(X["date.utc"])

From b39c2dac65cdfd46be04143f401e1ff7a35a6d7c Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 5 Oct 2023 21:48:12 +0200
Subject: [PATCH 05/30] split the transform method with _parse_datetime_cols

---
 skrub/_datetime_encoder.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 44ac160dd..5ae4b8911 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -259,6 +259,19 @@ def transform(self, X, y=None):
         self._check_feature_names(X, reset=False)
         X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None)

+        return self._parse_datetime_cols(X)
+
+    def _parse_datetime_cols(self, X):
+        """Extract datetime features from the selected columns.
+ + Parameters + ---------- + X : ndarray of shape ``(n_samples, n_features)`` + + Returns + ------- + X_out : ndarray of shape ``(n_samples, n_features_out_)`` + """ columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) # X_out must be of dtype float64 to handle np.nan X_out = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) From edf11dd7ab098356ff661d08f5e0be5ce6ef9393 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 5 Oct 2023 21:50:36 +0200 Subject: [PATCH 06/30] small typo in a comment --- skrub/_datetime_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 5ae4b8911..25e2ef69c 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -278,7 +278,7 @@ def _parse_datetime_cols(self, X): offset_idx = 0 for col_idx, col in enumerate(columns): if col in self.features_per_column_: - # X_j is a DatetimeIndex + # X_col is a DatetimeIndex X_col = pd.to_datetime(X[:, col_idx], errors=self.errors) features = self.features_per_column_[col] From 367d207ae22a26cf3aca02a3a08e788fbdaec869 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:19:13 +0200 Subject: [PATCH 07/30] add to_datetime and rework the backend --- CHANGES.rst | 9 + examples/03_datetime_encoder.py | 2 +- skrub/__init__.py | 3 +- skrub/_datetime_encoder.py | 629 ++++++++++++++++++++++----- skrub/tests/test_datetime_encoder.py | 16 +- 5 files changed, 533 insertions(+), 126 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 548d304d7..ae2e64b3d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -15,6 +15,10 @@ development and backward compatibility is not ensured. Major changes ------------- +* :func:`to_datetime` is now available to support pandas.to_datetime + over dataframes and 2d arrays. + :pr:`784` by :user:`Vincent Maladiere ` + * :func:`dataframe.pd_join`, :func:`dataframe.pd_aggregate`, :func:`dataframe.pl_join` and :func:`dataframe.pl_aggregate` are now available in the dataframe submodule. @@ -40,6 +44,11 @@ Major changes Minor changes ------------- +* :class:`DatetimeEncoder` doesn't remove constant features anymore. + It also supports an 'errors' argument to raise or coerce errors during + transform, and a 'add_total_seconds' argument to include the number of + seconds since Epoch. + :pr:`784` by :user:`Vincent Maladiere ` * :class:`TableVectorizer` is now able to apply parallelism at the column level rather than the transformer level. This is the default for univariate transformers, like :class:`MinHashEncoder`, and :class:`GapEncoder`. 
:pr:`592` by :user:`Leo Grinsztajn `

diff --git a/examples/03_datetime_encoder.py b/examples/03_datetime_encoder.py
index f8cc5756f..6d571ca65 100644
--- a/examples/03_datetime_encoder.py
+++ b/examples/03_datetime_encoder.py
@@ -80,7 +80,7 @@
 encoder = make_column_transformer(
     (OneHotEncoder(handle_unknown="ignore"), ["city"]),
-    (DatetimeEncoder(add_day_of_the_week=True, extract_until="minute"), ["date.utc"]),
+    (DatetimeEncoder(add_day_of_the_week=True, resolution="minute"), ["date.utc"]),
     remainder="drop",
 )

diff --git a/skrub/__init__.py b/skrub/__init__.py
index 2618dd421..bc73b0182 100644
--- a/skrub/__init__.py
+++ b/skrub/__init__.py
@@ -4,7 +4,7 @@
 from pathlib import Path as _Path

 from ._check_dependencies import check_dependencies
-from ._datetime_encoder import DatetimeEncoder
+from ._datetime_encoder import DatetimeEncoder, to_datetime
 from ._deduplicate import compute_ngram_distance, deduplicate
 from ._fuzzy_join import fuzzy_join
 from ._gap_encoder import GapEncoder
@@ -32,4 +32,5 @@
     "TargetEncoder",
     "deduplicate",
     "compute_ngram_distance",
+    "to_datetime",
 ]

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 25e2ef69c..b80cb5382 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -1,11 +1,16 @@
+import warnings
 from collections import defaultdict
+from typing import Iterable

 import numpy as np
 import pandas as pd
+from pandas._libs.tslibs.parsing import guess_datetime_format
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted

+from .dataframe._namespace import get_df_namespace

 WORD_TO_ALIAS = {
     "year": "Y",
     "month": "M",
     "day": "D",
@@ -19,92 +24,480 @@
 TIME_LEVELS = list(WORD_TO_ALIAS)


+def to_datetime(
+    X,
+    errors="coerce",
+    **kwargs,
+):
+    """
+    Convert argument to datetime.
+
+    Augment :func:`pandas.to_datetime` by supporting dataframe
+    and 2d array inputs. It converts compatible columns to datetime and
+    passes incompatible columns unchanged.
+
+    With 2d arrays, numerical columns will also be passed unchanged.
+
+    int, float, str, datetime, list, tuple, 1d array, and Series are deferred to
+    pandas.to_datetime directly.
+
+    Parameters
+    ----------
+    X : int, float, str, datetime, list, tuple, nd array, Series, DataFrame/dict-like
+        The object to convert to a datetime.
+    errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
+        - If :const:`'raise'`, then invalid parsing will raise an exception.
+        - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`.
+        - If :const:`'ignore'`, then invalid parsing will return the input.
+    dayfirst : bool, default False
+        Specify a date parse order if `X` is str or is list-like.
+        If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"`
+        is parsed as :const:`2012-11-10`.
+
+        .. warning::
+
+            ``dayfirst=True`` is not strict, but will prefer to parse
+            with day first.
+
+    yearfirst : bool, default False
+        Specify a date parse order if `X` is str or is list-like.
+
+        - If :const:`True` parses dates with the year first, e.g.
+          :const:`"10/11/12"` is parsed as :const:`2010-11-12`.
+        - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is
+          preceded (same as :mod:`dateutil`).
+
+        .. warning::
+
+            ``yearfirst=True`` is not strict, but will prefer to parse
+            with year first.
+ + utc : bool, default False + Control timezone-related parsing, localization and conversion. + + - If :const:`True`, the function *always* returns a timezone-aware + UTC-localized :class:`Timestamp`, :class:`Series` or + :class:`DatetimeIndex`. To do this, timezone-naive inputs are + *localized* as UTC, while timezone-aware inputs are *converted* to UTC. + + - If :const:`False` (default), inputs will not be coerced to UTC. + Timezone-naive inputs will remain naive, while timezone-aware ones + will keep their time offsets. Limitations exist for mixed + offsets (typically, daylight savings), see :ref:`Examples + ` section for details. + + See also: pandas general documentation about `timezone conversion and + localization + `_. + + format : str, default None + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See + `strftime documentation + `_ for more information on choices, though + note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass: + + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. + + exact : bool, default True + Control how `format` is used: + + - If :const:`True`, require an exact `format` match. + - If :const:`False`, allow the `format` to match anywhere in the target + string. + + Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. + unit : str, default 'ns' + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an + integer or float number. This will be based off the origin. + Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate + the number of milliseconds to the unix epoch start. + origin : scalar, default 'unix' + Define the reference date. The numeric values would be parsed as number + of units (defined by `unit`) since this reference date. + + - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01. + - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to + beginning of Julian Calendar. Julian day number :const:`0` is assigned + to the day starting at noon on January 1, 4713 BC. + - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date + string), origin is set to Timestamp identified by origin. + - If a float or integer, origin is the millisecond difference + relative to 1970-01-01. + cache : bool, default True + If :const:`True`, use a cache of unique, converted dates to apply the + datetime conversion. May produce significant speed-up when parsing + duplicate date strings, especially ones with timezone offsets. The cache + is only used when there are at least 50 values. The presence of + out-of-bounds values will render the cache unusable and may slow down + parsing. + + Returns + ------- + datetime + If parsing succeeded. 
+        Return type depends on input (types in parentheses correspond to
+        fallback in case of unsuccessful timezone or out-of-range timestamp
+        parsing):
+
+        - scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
+        - array-like: :class:`DatetimeIndex` (or :class:`Series` with
+          :class:`object` dtype containing :class:`datetime.datetime`)
+        - Series: :class:`Series` of :class:`datetime64` dtype (or
+          :class:`Series` of :class:`object` dtype containing
+          :class:`datetime.datetime`)
+        - DataFrame: :class:`Series` of :class:`datetime64` dtype (or
+          :class:`Series` of :class:`object` dtype containing
+          :class:`datetime.datetime`)
+
+    Raises
+    ------
+    ParserError
+        When parsing a date from string fails.
+    ValueError
+        When another datetime conversion error happens. For example when one
+        of 'year', 'month', 'day' columns is missing in a :class:`DataFrame`, or
+        when a Timezone-aware :class:`datetime.datetime` is found in an array-like
+        of mixed time offsets, and ``utc=False``.
+
+    See Also
+    --------
+    :func:`pandas.to_datetime`
+    """
+    kwargs["errors"] = errors
+
+    # dataframe
+    if hasattr(X, "__dataframe__"):
+        return _to_datetime_dataframe(X, **kwargs)
+
+    # series, this attribute is available since Pandas 2.1.0
+    elif hasattr(X, "__column_consortium_standard__"):
+        return _to_datetime_series(X, **kwargs)
+
+    # 2d array
+    elif isinstance(X, Iterable) and np.asarray(X).ndim == 2:
+        X = _to_datetime_2d_array(np.asarray(X), **kwargs)
+        return np.vstack(X).T
+
+    # scalar or unknown type
+    return pd.to_datetime(X, **kwargs)
+
+
+def _to_datetime_dataframe(X, **kwargs):
+    """Dataframe specialization of ``_to_datetime_2d``.
+
+    Parameters
+    ----------
+    X : Pandas or Polars dataframe
+
+    Returns
+    -------
+    X : Pandas or Polars dataframe
+    """
+    _, px = get_df_namespace(X)
+    index = getattr(X, "index", None)
+    X_split = [X[col].to_numpy() for col in X.columns]
+    X_split = _to_datetime_2d(X_split, **kwargs)
+    X_split = {col: X_split[col_idx] for col_idx, col in enumerate(X.columns)}
+    X = pd.DataFrame(X_split, index=index)
+    # conversion if px is Polars, no-op if Pandas
+    return px.DataFrame(X)
+
+
+def _to_datetime_series(X, **kwargs):
+    """Series specialization of :func:`pandas.to_datetime`.
+
+    Parameters
+    ----------
+    X : Pandas or Polars series
+
+    Returns
+    -------
+    X : Pandas or Polars series
+    """
+    _, px = get_df_namespace(X.to_frame())
+    index = getattr(X, "index", None)
+    name = X.name
+    X = pd.to_datetime(X, **kwargs)
+    X = pd.Series(X, index=index, name=name)
+    # conversion if px is Polars, no-op if Pandas
+    return px.Series(X)
+
+
+def _to_datetime_2d_array(X, **kwargs):
+    """2d array specialization of ``_to_datetime_2d``.
+
+    Parameters
+    ----------
+    X : ndarray of shape ``(n_samples, n_features)``
+
+    Returns
+    -------
+    X_split : list of arrays, of length ``n_features``
+    """
+    X_split = np.hsplit(X, X.shape[1])
+    X_split = [X_col.ravel() for X_col in X_split]
+    return _to_datetime_2d(X_split, **kwargs)
+
+
+def _to_datetime_2d(
+    X_split,
+    indices=None,
+    indice_to_format=None,
+    format=None,
+    **kwargs,
+):
+    """Convert datetime parsable columns from a 2d array or dataframe \
+    to datetime format.
+
+    The conversion is done inplace.
+
+    Parameters
+    ----------
+    X_split : list of 1d arrays of length n_features
+        The 2d input, chunked into a list of arrays. This format allows us
+        to treat each column individually and preserve its dtype, because
+        dataframe.to_numpy() casts all columns to object if any column dtype
+        is object.
+
+    indices : list of int, default=None
+        Indices of the parsable columns to convert.
+        If None, indices are computed using the current input X.
+
+    indice_to_format : mapping of int to str, default=None
+        Dictionary mapping column indices to their datetime format.
+        It defines the format parameter for each column when calling
+        pd.to_datetime.
+
+        If indices is None, indice_to_format is computed using the current
+        input X.
+        If format is not None, it overrides every value of indice_to_format.
+
+    format : str, default=None
+        Here for compatibility with the ``pandas.to_datetime`` API.
+        When format is not None, it overwrites the values in indice_to_format.
+
+    Returns
+    -------
+    X_split : list of 1d arrays of length n_features
+    """
+    if indices is None:
+        indices, indice_to_format = _get_datetime_column_indices(X_split)
+
+    # format overwrites indice_to_format
+    if format is not None or indice_to_format is None:
+        indice_to_format = {col_idx: format for col_idx in indices}
+
+    for col_idx in indices:
+        X_split[col_idx] = pd.to_datetime(
+            X_split[col_idx], format=indice_to_format[col_idx], **kwargs
+        )
+
+    return X_split
+
+
+def _get_datetime_column_indices(X_split):
+    """Select the datetime parsable columns by their indices \
+    and return their datetime format.
+
+    Parameters
+    ----------
+    X_split : list of 1d arrays of length n_features
+
+    Returns
+    -------
+    datetime_indices : list of int
+        List of parsable columns, identified by their indices.
+
+    indice_to_format : mapping of int to str
+        Dictionary mapping parsable column indices to their datetime format.
+    """
+    indices = []
+    indice_to_format = {}
+
+    for col_idx, X_col in enumerate(X_split):
+        X_col = X_col[pd.notnull(X_col)]
+        if _is_column_datetime_parsable(X_col):
+            indices.append(col_idx)
+            indice_to_format[col_idx] = _guess_datetime_format(X_col)
+
+    return indices, indice_to_format
+
+
+def _is_column_datetime_parsable(X_col):
+    """Check whether a 1d array can be converted into a \
+    :class:`pandas.DatetimeIndex`.
+
+    Parameters
+    ----------
+    X_col : array-like of shape ``(n_samples,)``
+
+    Returns
+    -------
+    is_dt_parsable : bool
+    """
+    # Exclude columns of int, float or bool cast as object.
+    try:
+        if np.array_equal(X_col, X_col.astype(np.float64)):
+            return False
+    except ValueError:
+        pass
+
+    np_dtypes_candidates = [np.object_, np.str_, np.datetime64]
+    is_type_datetime_compatible = any(
+        np.issubdtype(X_col.dtype, np_dtype) for np_dtype in np_dtypes_candidates
+    )
+    if is_type_datetime_compatible:
+        try:
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", category=UserWarning)
+                # format="mixed" parses entries individually,
+                # avoiding a ValueError when both date and datetime formats
+                # are present.
+                # At this stage, the format itself doesn't matter.
+                _ = pd.to_datetime(X_col, format="mixed")
+            return True
+        except (pd.errors.ParserError, ValueError):
+            pass
+    return False
+
+
+def _guess_datetime_format(X_col, require_dayfirst=True):
+    """Infer the datetime format shared by the entries of a column.
+
+    Parameters
+    ----------
+    X_col : ndarray of shape ``(n_samples,)``
+
+    require_dayfirst : bool, default True
+        Whether to return the dayfirst format when both dayfirst
+        and monthfirst are valid.
+ + Returns + ------- + format : str + """ + X_col = X_col.astype(np.object_) + vfunc = np.vectorize(guess_datetime_format) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + month_first_formats = np.unique(vfunc(X_col, dayfirst=False)) + day_first_formats = np.unique(vfunc(X_col, dayfirst=True)) + + if pd.isnull(month_first_formats).any() or pd.isnull(day_first_formats).any(): + return None + + elif ( + len(month_first_formats) == 1 + and len(day_first_formats) == 1 + and month_first_formats[0] != day_first_formats[0] + ): + if require_dayfirst: + return str(day_first_formats[0]) + else: + return str(month_first_formats[0]) + + elif len(month_first_formats) == 1: + return str(month_first_formats[0]) + + elif len(day_first_formats) == 1: + return str(day_first_formats[0]) + + # special heuristic: when both date and datetime formats are + # present, allow the format to be mixed. + elif ( + len(month_first_formats) == 2 + and len(day_first_formats) == 2 + and len(month_first_formats[0]) != len(month_first_formats[1]) + ): + return "mixed" + + else: + return None - Note that ``is_date_only`` being True implies ``is_datetime_parsable`` is True, - but not the contrary. + +def _is_column_date_only(X_col): + """Check whether a :obj:`pandas.DatetimeIndex` only contains dates. Parameters ---------- - X : array-like of shape ``(n_sample,)`` + X_col : pandas.DatetimeIndex of shape ``(n_samples,)`` Returns ------- is_date : bool """ - if is_datetime_parsable(X): - X_t = pd.to_datetime(X) - return np.all(X_t == X_t.normalize()) - return False + return np.array_equal(X_col, X_col.normalize()) + + +def _datetime_to_total_seconds(X_col): + """ + Parameters + ---------- + X_col : DatetimeIndex of shape (n_samples,) + + Returns + ------- + X_col : ndarray of shape (n_samples) + """ + if X_col.tz is not None: + X_col = X_col.tz_convert("utc") + + # Total seconds since epoch + mask_notnull = X_col == X_col + + return np.where( + mask_notnull, + X_col.astype("int64") / 1e9, + np.nan, + ) class DatetimeEncoder(TransformerMixin, BaseEstimator): """Transforms each datetime column into several numeric columns \ for temporal features (e.g year, month, day...). - Constant extracted features are dropped; for instance, if the year is - always the same in a feature, the extracted "year" column won't be added. If the dates are timezone aware, all the features extracted will correspond to the provided timezone. Parameters ---------- - extract_until : {"year", "month", "day", "hour", "minute", "second", + resolution : {"year", "month", "day", "hour", "minute", "second", "microsecond", "nanosecond", None}, default="hour" - Extract up to this granularity. - For instance, if you specify "day", only "year", "month", "day" and - features will be created. + Extract up to this resolution. + E.g., ``resolution="day"`` generates the features "year", "month", + "day" only. If ``None``, no feature will be created. add_day_of_the_week : bool, default=False - Add day of the week feature (if day is extracted). - This is a numerical feature from 0 (Monday) to 6 (Sunday). + Add day of the week feature as a numerical feature + from 0 (Monday) to 6 (Sunday). - add_total_second : bool, default=True + add_total_seconds : bool, default=True Add the total number of seconds since Epoch. errors: {"coerce", "raise"}, default="coerce" During transform: - - If ``"coerce"``, then invalid parsing will be set as ``NaT``. 
- - If ``"raise"``, then invalid parsing will raise an exception + - If ``"coerce"``, then invalid parsing will be set as ``pd.NaT``. + - If ``"raise"``, then invalid parsing will raise an exception. Attributes ---------- - n_features_out_ : int - Number of features of the transformed data. + column_indices_ : list of int + Indices of the datetime-parsable columns. + + indice_to_format_ : dict[int, str] + Mapping from column indices to their datetime formats. - features_per_column_ : dict[str, list[str]] or dict[int, list[str]] - Dictionary mapping the column names to the list of features extracted - for each column. + indice_to_features_ : dict[int, list[str]] + Dictionary mapping the column names to the list of datetime + features extracted for each column. - format_per_column_ : dict[str, str] or dict[int, str] - Dictionary mapping the column names to the first non-null example. - This is how Pandas infer the datetime format. + n_features_out_ : int + Number of features of the transformed data. See Also -------- @@ -130,38 +523,40 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): DatetimeEncoder() The encoder will output a transformed array - with four columns ("year", "month", "day" and "hour"): + with five columns ("year", "month", "day", "hour" and "total_seconds"): >>> enc.transform(X) - array([[2022., 10., 15., 0.], - [2021., 12., 25., 0.], - [2020., 5., 18., 0.], - [2019., 10., 15., 12.]]) + array([[2022., 10., 15., 0., 1.6657920e+09], + [2021., 12., 25., 0., 1.6403904e+09], + [2020., 5., 18., 0., 1.5897600e+09], + [2019., 10., 15., 12., 1.5711408e+09]]) """ def __init__( self, *, - extract_until="hour", + resolution="hour", add_day_of_the_week=False, - add_total_second=True, + add_total_seconds=True, errors="coerce", ): - self.extract_until = extract_until + self.resolution = resolution self.add_day_of_the_week = add_day_of_the_week - self.add_total_second = add_total_second + self.add_total_seconds = add_total_seconds self.errors = errors def fit(self, X, y=None): """Fit the instance to X. - In practice, just check keywords and input validity, - and stores which extracted features are not constant. + Select datetime-parsable columns and generate the list of + datetime feature to extract. Parameters ---------- X : array-like, shape ``(n_samples, n_features)`` - Data where each column is a datetime feature. + Input data. Columns that can't be converted into + `pandas.DatetimeIndex` and numerical values will + be dropped. y : None Unused, only here for compatibility. @@ -170,10 +565,9 @@ def fit(self, X, y=None): DatetimeEncoder Fitted DatetimeEncoder instance (self). """ - if self.extract_until not in TIME_LEVELS and self.extract_until is not None: + if self.resolution not in TIME_LEVELS and self.resolution is not None: raise ValueError( - f"'extract_until' options are {TIME_LEVELS}, " - f"got {self.extract_until!r}." + f"'resolution' options are {TIME_LEVELS}, got {self.resolution!r}." ) errors_options = ["coerce", "raise"] @@ -184,62 +578,61 @@ def fit(self, X, y=None): self._check_feature_names(X, reset=True) self._check_n_features(X, reset=True) - X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) + X = check_array( + X, ensure_2d=True, force_all_finite=False, dtype=None, copy=False + ) self._select_datetime_cols(X) return self def _select_datetime_cols(self, X): - """Select datetime-like columns and infer features to be parsed. + """Select datetime-parsable columns and generate the list of + datetime feature to extract. 
If the input only contains dates (and no datetimes), only the features - ["year", "month", "day"] will be filtered with extract_until. + ["year", "month", "day"] will be filtered with resolution. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : array-like of shape ``(n_samples, n_features)`` """ - # Features to extract for each column, after removing constant features - self.features_per_column_ = defaultdict(list) - self.format_per_column_ = dict() - self.n_features_out_ = 0 - - if self.extract_until is None: + if self.resolution is None: levels = [] else: - idx_level = TIME_LEVELS.index(self.extract_until) + idx_level = TIME_LEVELS.index(self.resolution) levels = TIME_LEVELS[: idx_level + 1] - columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) - for col_idx, col in enumerate(columns): - X_col = X[:, col_idx] + X_split = np.hsplit(X, X.shape[1]) + self.column_indices_, self.indice_to_format_ = _get_datetime_column_indices( + X_split + ) + del X_split - if is_datetime_parsable(X_col): - # Pandas use the first non-null item of the array to infer the format. - X_dt = pd.to_datetime(X_col) - mask_notnull = X_dt == X_dt - self.format_per_column_[col] = X_col[mask_notnull][0] + self.indice_to_features_ = defaultdict(list) + self.n_features_out_ = 0 - if is_date_only(X_col): - # Keep only date attributes - levels = [ - level for level in levels if level in ["year", "month", "day"] - ] + for col_idx in self.column_indices_: + X_col = pd.DatetimeIndex(X[:, col_idx]) + if _is_column_date_only(X_col): + # Keep only date attributes + levels = [ + level for level in levels if level in ["year", "month", "day"] + ] - self.features_per_column_[col] += levels - self.n_features_out_ += len(levels) + self.indice_to_features_[col_idx] += levels + self.n_features_out_ += len(levels) - if self.add_total_second: - self.features_per_column_[col].append("total_second") - self.n_features_out_ += 1 + if self.add_total_seconds: + self.indice_to_features_[col_idx].append("total_seconds") + self.n_features_out_ += 1 - if self.add_day_of_the_week: - self.features_per_column_[col].append("day_of_week") - self.n_features_out_ += 1 + if self.add_day_of_the_week: + self.indice_to_features_[col_idx].append("day_of_week") + self.n_features_out_ += 1 def transform(self, X, y=None): - """Transform `X` by replacing each datetime column with \ + """Transform ``X`` by replacing each datetime column with \ corresponding numerical features. Parameters @@ -257,47 +650,49 @@ def transform(self, X, y=None): check_is_fitted(self) self._check_n_features(X, reset=False) self._check_feature_names(X, reset=False) - X = check_array(X, ensure_2d=True, force_all_finite=False, dtype=None) - - return self._parse_datetime_cols(X) - def _parse_datetime_cols(self, X): + X = check_array( + X, + ensure_2d=True, + force_all_finite=False, + dtype=None, + copy=False, + ) + X_split = _to_datetime_2d_array( + X, + indices=self.column_indices_, + indice_to_format=self.indice_to_format_, + errors=self.errors, + ) + + return self._extract_features(X_split) + + def _extract_features(self, X_split): """Extract datetime features from the selected columns. 
Parameters ---------- - X : ndarray of shape ``(n_samples, n_features)`` + X_split : list of 1d array of length n_features Returns ------- X_out : ndarray of shape ``(n_samples, n_features_out_)`` """ - columns = getattr(self, "feature_names_in_", list(range(X.shape[1]))) - # X_out must be of dtype float64 to handle np.nan - X_out = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) + # X_out must be of dtype float64 otherwise np.nan will overflow + # to large negative numbers. + X_out = np.empty((X_split[0].shape[0], self.n_features_out_), dtype=np.float64) offset_idx = 0 - for col_idx, col in enumerate(columns): - if col in self.features_per_column_: - # X_col is a DatetimeIndex - X_col = pd.to_datetime(X[:, col_idx], errors=self.errors) - - features = self.features_per_column_[col] - for feat_idx, feature in enumerate(features): - if feature == "total_second": - if X_col.tz is not None: - X_col = X_col.tz_convert("utc") - # Total seconds since epoch - mask_notnull = X_col == X_col - X_feature = np.where( - mask_notnull, - X_col.astype("int64") // 1e9, - np.nan, - ) - else: - X_feature = getattr(X_col, feature).to_numpy() - X_out[:, offset_idx + feat_idx] = X_feature - - offset_idx += len(features) + for col_idx in self.column_indices_: + X_col = X_split[col_idx] + features = self.indice_to_features_[col_idx] + for feat_idx, feature in enumerate(features): + if feature == "total_seconds": + X_feature = _datetime_to_total_seconds(X_col) + else: + X_feature = getattr(X_col, feature).to_numpy() + X_out[:, offset_idx + feat_idx] = X_feature + + offset_idx += len(features) return X_out @@ -320,9 +715,11 @@ def get_feature_names_out(self, input_features=None): feature_names : list of str List of feature names. """ - check_is_fitted(self, "features_per_column_") + check_is_fitted(self, "indice_to_features_") feature_names = [] - for column, features in self.features_per_column_.items(): + columns = getattr(self, "feature_names_in_", list(range(self.n_features_in_))) + for col_idx, features in self.indice_to_features_.items(): + column = columns[col_idx] feature_names += [f"{column}_{feat}" for feat in features] return feature_names diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index 5ecdf383b..4a9539419 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -108,28 +108,28 @@ def get_tz_datetime(as_array=False): "add_total_second, add_day_of_the_week", list(product([True, False], [True, False])), ) -@pytest.mark.parametrize("extract_until", TIME_LEVELS) +@pytest.mark.parametrize("resolution", TIME_LEVELS) def test_fit( as_array, get_data_func, features, add_total_second, add_day_of_the_week, - extract_until, + resolution, ): X = get_data_func(as_array=as_array) enc = DatetimeEncoder( add_day_of_the_week=add_day_of_the_week, add_total_second=add_total_second, - extract_until=extract_until, + resolution=resolution, ) enc.fit(X) total_second = ["total_second"] if add_total_second else [] day_of_week = ["day_of_week"] if add_day_of_the_week else [] - if extract_until in features: - features_ = features[: features.index(extract_until) + 1] + if resolution in features: + features_ = features[: features.index(resolution) + 1] else: features_ = deepcopy(features) @@ -170,10 +170,10 @@ def test_format_nz(): assert enc.format_per_column_ == {0: "2020-01-01 10:12:01+05:30"} -def test_extract_until_none(): +def test_resolution_none(): X = get_datetime() enc = DatetimeEncoder( - extract_until=None, + 
resolution=None, add_total_second=False, ) enc.fit(X) @@ -205,7 +205,7 @@ def test_transform_date(): def test_transform_datetime(): X = get_datetime() enc = DatetimeEncoder( - extract_until="second", + resolution="second", add_total_second=False, ) X_trans = enc.fit_transform(X) From 65657c3736c7742c96916e7e679d2a100240fafd Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:25:02 +0200 Subject: [PATCH 08/30] docstring typo --- skrub/_datetime_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index b80cb5382..6c41e91a1 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -259,7 +259,7 @@ def _to_datetime_2d( Parameters ---------- - X : list of 1d array of length n_features + X_split : list of 1d array of length n_features The 2d input, chunked into a list of array. This format allows us to treat each column individually and preserve their dtype, because dataframe.to_numpy() casts all columns to object is any column dtype From 53a04d2d617be1e276cd67083823cca1f6536290 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:26:00 +0200 Subject: [PATCH 09/30] docstring typo 2 --- skrub/_datetime_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 6c41e91a1..fddb2d766 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -262,8 +262,8 @@ def _to_datetime_2d( X_split : list of 1d array of length n_features The 2d input, chunked into a list of array. This format allows us to treat each column individually and preserve their dtype, because - dataframe.to_numpy() casts all columns to object is any column dtype - is object. + dataframe.to_numpy() casts all columns to object when at least one + column dtype is object. indices : list of int, default=None Indices of the parsable columns to convert. From 998859cad1c42d85f233868b8f26796ae00aab96 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:26:50 +0200 Subject: [PATCH 10/30] docstring typo 3 --- skrub/_datetime_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index fddb2d766..681ae4f40 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -275,7 +275,7 @@ def _to_datetime_2d( pd.to_datetime. If indices is None, indices_to_format is computed using the current input X. - If format is not None, all values of indices_to_format are format + If format is not None, all values of indices_to_format are format. format : str, default=None Here for compatibility with ``pandas.to_datetime`` API. 
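The three docstring commits above all polish `_to_datetime_2d`, whose underlying idea deserves a standalone illustration: the 2d input is chunked into per-column 1d arrays, so each column keeps its own dtype and can be parsed with its own datetime format. The sketch below reproduces that pattern in isolation — it is illustrative only, not skrub's actual code, and the column values are invented.

import numpy as np
import pandas as pd

# A 2d object array mixing a datetime-like column and a numeric one.
X = np.array(
    [["2021-01-01", 1.5],
     ["2021-02-02", 2.5]],
    dtype=object,
)

# Chunk into per-column 1d arrays (the same np.hsplit + ravel trick as in
# _to_datetime_2d_array), so each column is handled individually instead
# of forcing a single dtype on the whole array.
X_split = [X_col.ravel() for X_col in np.hsplit(X, X.shape[1])]

# Convert only the datetime-parsable column; the numeric one is untouched.
X_split[0] = pd.to_datetime(X_split[0], format="%Y-%m-%d")

# Reassemble into the original (n_samples, n_features) orientation.
X_out = np.vstack([np.asarray(col, dtype=object) for col in X_split]).T
print(X_out)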
From 6bec3e61597805dbe8e0447a478de7aa194eaaac Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 12 Oct 2023 17:29:37 +0200 Subject: [PATCH 11/30] add TODO --- skrub/_datetime_encoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 681ae4f40..7efb2b3e3 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -323,6 +323,7 @@ def _get_datetime_column_indices(X_split): X_col = X_col[pd.notnull(X_col)] if _is_column_datetime_parsable(X_col): indices.append(col_idx) + # TODO: pass require_dayfirst to _guess_datetime_format indice_to_format[col_idx] = _guess_datetime_format(X_col) return indices, indice_to_format From d4b9cbc53bd1d42153e8345f7a9be21b16745c39 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 13 Oct 2023 14:29:20 +0200 Subject: [PATCH 12/30] enhance tests --- skrub/_datetime_encoder.py | 152 +++++----------------- skrub/tests/test_datetime_encoder.py | 187 +++++++++++++++++---------- 2 files changed, 147 insertions(+), 192 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 7efb2b3e3..0c98e1196 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -36,138 +36,35 @@ def to_datetime( and 2d arrays inputs. It converts compatible columns to datetime, and pass incompatible columns unchanged. - With 2d arrays, numerical columns will also be passed unchanged. - int, float, str, datetime, list, tuple, 1d array, and Series are defered to - pandas.to_datetime directly. + :func:`pandas.to_datetime` directly. Parameters ---------- - arg : int, float, str, datetime, list, tuple, nd array, Series, DataFrame/dict-like + X : int, float, str, datetime, list, tuple, nd array, Series, DataFrame/dict-like The object to convert to a datetime. + errors : {'ignore', 'raise', 'coerce'}, default 'coerce' - - If :const:`'raise'`, then invalid parsing will raise an exception. - - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. - - If :const:`'ignore'`, then invalid parsing will return the input. - dayfirst : bool, default False - Specify a date parse order if `arg` is str or is list-like. - If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"` - is parsed as :const:`2012-11-10`. - - .. warning:: - - ``dayfirst=True`` is not strict, but will prefer to parse - with day first. - - yearfirst : bool, default False - Specify a date parse order if `arg` is str or is list-like. - - - If :const:`True` parses dates with the year first, e.g. - :const:`"10/11/12"` is parsed as :const:`2010-11-12`. - - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is - preceded (same as :mod:`dateutil`). - - .. warning:: - - ``yearfirst=True`` is not strict, but will prefer to parse - with year first. - - utc : bool, default False - Control timezone-related parsing, localization and conversion. - - - If :const:`True`, the function *always* returns a timezone-aware - UTC-localized :class:`Timestamp`, :class:`Series` or - :class:`DatetimeIndex`. To do this, timezone-naive inputs are - *localized* as UTC, while timezone-aware inputs are *converted* to UTC. - - - If :const:`False` (default), inputs will not be coerced to UTC. - Timezone-naive inputs will remain naive, while timezone-aware ones - will keep their time offsets. Limitations exist for mixed - offsets (typically, daylight savings), see :ref:`Examples - ` section for details. - - See also: pandas general documentation about `timezone conversion and - localization - `_. 
- - format : str, default None - The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See - `strftime documentation - `_ for more information on choices, though - note that :const:`"%f"` will parse all the way up to nanoseconds. - You can also pass: - - - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. - - exact : bool, default True - Control how `format` is used: - - - If :const:`True`, require an exact `format` match. - - If :const:`False`, allow the `format` to match anywhere in the target - string. - - Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. - unit : str, default 'ns' - The unit of the arg (D,s,ms,us,ns) denote the unit, which is an - integer or float number. This will be based off the origin. - Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate - the number of milliseconds to the unix epoch start. - origin : scalar, default 'unix' - Define the reference date. The numeric values would be parsed as number - of units (defined by `unit`) since this reference date. - - - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01. - - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to - beginning of Julian Calendar. Julian day number :const:`0` is assigned - to the day starting at noon on January 1, 4713 BC. - - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date - string), origin is set to Timestamp identified by origin. - - If a float or integer, origin is the millisecond difference - relative to 1970-01-01. - cache : bool, default True - If :const:`True`, use a cache of unique, converted dates to apply the - datetime conversion. May produce significant speed-up when parsing - duplicate date strings, especially ones with timezone offsets. The cache - is only used when there are at least 50 values. The presence of - out-of-bounds values will render the cache unusable and may slow down - parsing. + - If ``'raise'``, then invalid parsing will raise an exception. + - If ``'coerce'``, then invalid parsing will be set as ``NaT``. + Note that ``'ignore'`` is not used for dataframes, 2d arrays, + and series, and is used otherwise as in ``pd.to_datetime``. + + **kwargs : key, value mappings + Other keyword arguments are passed down to + :func:`pandas.to_datetime`. Returns ------- datetime - If parsing succeeded. - Return type depends on input (types in parenthesis correspond to - fallback in case of unsuccessful timezone or out-of-range timestamp - parsing): - - - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) - - array-like: :class:`DatetimeIndex` (or :class:`Series` with - :class:`object` dtype containing :class:`datetime.datetime`) - - Series: :class:`Series` of :class:`datetime64` dtype (or - :class:`Series` of :class:`object` dtype containing - :class:`datetime.datetime`) - - DataFrame: :class:`Series` of :class:`datetime64` dtype (or - :class:`Series` of :class:`object` dtype containing - :class:`datetime.datetime`) - - Raises - ------ - ParserError - When parsing a date from string fails. - ValueError - When another datetime conversion error happens. For example when one - of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or - when a Timezone-aware :class:`datetime.datetime` is found in an array-like - of mixed time offsets, and ``utc=False``. + Return type depends on input. 
+ - dataframes, series and 2d arrays return the same type + - otherwise return the same output as :func:`pandas.to_datetime`. See Also -------- :func:`pandas.to_datetime` + Convert argument to datetime. """ kwargs["errors"] = errors @@ -223,8 +120,9 @@ def _to_datetime_series(X, **kwargs): _, px = get_df_namespace(X.to_frame()) index = getattr(X, "index", None) name = X.name - X = pd.to_datetime(X, **kwargs) - X = pd.Series(X, index=index, name=name) + X_split = [X.to_numpy()] + X_split = _to_datetime_2d(X_split) + X = pd.Series(X_split[0], index=index, name=name) # conversion is px is Polars, no-op if Pandas return px.Series(X) @@ -321,6 +219,11 @@ def _get_datetime_column_indices(X_split): for col_idx, X_col in enumerate(X_split): X_col = X_col[pd.notnull(X_col)] + + # convert pd.TimeStamp to np.datetime64 + if all(isinstance(val, pd.Timestamp) for val in X_col): + X_col = X_col.astype("datetime64") + if _is_column_datetime_parsable(X_col): indices.append(col_idx) # TODO: pass require_dayfirst to _guess_datetime_format @@ -367,7 +270,7 @@ def _is_column_datetime_parsable(X_col): return False -def _guess_datetime_format(X_col, require_dayfirst=True): +def _guess_datetime_format(X_col, require_dayfirst=False): """ Parameters ---------- @@ -381,6 +284,11 @@ def _guess_datetime_format(X_col, require_dayfirst=True): ------- format : str """ + if np.issubdtype(X_col.dtype, np.datetime64): + # We don't need to specify a parsing format + # for columns that are already of type datetime64. + return None + X_col = X_col.astype(np.object_) vfunc = np.vectorize(guess_datetime_format) with warnings.catch_warnings(): @@ -480,7 +388,7 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): add_total_seconds : bool, default=True Add the total number of seconds since Epoch. - errors: {"coerce", "raise"}, default="coerce" + errors : {'coerce', 'raise'}, default="coerce" During transform: - If ``"coerce"``, then invalid parsing will be set as ``pd.NaT``. - If ``"raise"``, then invalid parsing will raise an exception. 
diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index 4a9539419..77d94ff0d 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -5,8 +5,9 @@ import pandas as pd import pytest from numpy.testing import assert_allclose, assert_array_equal +from pandas.testing import assert_frame_equal, assert_series_equal -from skrub._datetime_encoder import TIME_LEVELS, DatetimeEncoder +from skrub._datetime_encoder import TIME_LEVELS, DatetimeEncoder, to_datetime def get_date(as_array=False): @@ -23,20 +24,6 @@ def get_date(as_array=False): return df -def get_constant_date(as_array=False): - df = pd.DataFrame( - [ - ["2020-01-01", "2020-02-04", "2021-02-05"], - ["2020-01-01", "2020-02-04", "2021-02-05"], - ["2020-01-01", "2020-02-04", "2021-02-05"], - ["2020-01-01", "2020-02-04", "2021-02-05"], - ], - ) - if as_array: - return df.to_numpy() - return df - - def get_datetime(as_array=False): df = pd.DataFrame( [ @@ -71,7 +58,6 @@ def get_nan_datetime(as_array=False): ["2020-01-01 10:12:01", None, "2020-01-03 10:00:00"], [np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"], ["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NaT], - ["2023-02-03 11:12:12", "2020-02-04 08:32:00", "2023-02-05 23:00:00"], ], ) if as_array: @@ -94,18 +80,47 @@ def get_tz_datetime(as_array=False): return df +def get_mixed_type_dataframe(): + return pd.DataFrame( + dict( + a=["2020-01-01", "2020-02-04", "2021-02-05"], + b=["yo", "ya", "yu"], + c=[1, 2, 3], + d=["1", "2", "3"], + e=["01/01/2023", "03/01/2023", "14/01/2023"], + f=[True, False, True], + ) + ) + + +def get_mixed_datetime_format(as_array=False): + df = pd.DataFrame( + dict( + a=[ + "2022-10-15", + "2021-12-25", + "2020-05-18", + "2019-10-15 12:00:00", + ] + ) + ) + if as_array: + return df.to_numpy() + return df + + @pytest.mark.parametrize("as_array", [True, False]) @pytest.mark.parametrize( - "get_data_func, features", + "get_data_func, features, format", [ - (get_date, TIME_LEVELS[: TIME_LEVELS.index("day") + 1]), - (get_datetime, TIME_LEVELS), - (get_tz_datetime, TIME_LEVELS), - (get_nanoseconds, TIME_LEVELS), + (get_date, TIME_LEVELS[: TIME_LEVELS.index("day") + 1], "%Y-%m-%d"), + (get_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S"), + (get_tz_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S%z"), + (get_nanoseconds, TIME_LEVELS, "%Y-%m-%d %H:%M:%S.%f"), ], ) @pytest.mark.parametrize( - "add_total_second, add_day_of_the_week", + "add_total_seconds, add_day_of_the_week", list(product([True, False], [True, False])), ) @pytest.mark.parametrize("resolution", TIME_LEVELS) @@ -113,19 +128,20 @@ def test_fit( as_array, get_data_func, features, - add_total_second, + format, + add_total_seconds, add_day_of_the_week, resolution, ): X = get_data_func(as_array=as_array) enc = DatetimeEncoder( add_day_of_the_week=add_day_of_the_week, - add_total_second=add_total_second, + add_total_seconds=add_total_seconds, resolution=resolution, ) enc.fit(X) - total_second = ["total_second"] if add_total_second else [] + total_seconds = ["total_seconds"] if add_total_seconds else [] day_of_week = ["day_of_week"] if add_day_of_the_week else [] if resolution in features: @@ -133,22 +149,18 @@ def test_fit( else: features_ = deepcopy(features) - features_ += total_second + day_of_week + features_ += total_seconds + day_of_week columns = range(X.shape[1]) - expected_features_per_column = {col: features_ for col in columns} - - expected_format_per_column = {col: np.asarray(X)[0, col] for col in columns} - - 
expected_n_features_out = sum( - len(val) for val in expected_features_per_column.values() - ) + expected_indice_to_features = {col: features_ for col in columns} + expected_indice_to_format = {col: format for col in columns} + expected_n_features_out = len(features_) * X.shape[1] expected_feature_names = [ f"{col}_{feature}" for col in columns for feature in features_ ] - assert enc.features_per_column_ == expected_features_per_column - assert enc.format_per_column_ == expected_format_per_column + assert enc.indice_to_features_ == expected_indice_to_features + assert enc.indice_to_format_ == expected_indice_to_format assert enc.n_features_out_ == expected_n_features_out assert enc.get_feature_names_out() == expected_feature_names @@ -156,29 +168,29 @@ def test_fit( def test_format_nan(): X = get_nan_datetime() enc = DatetimeEncoder().fit(X) - expected_format_per_column = { - 0: "2020-01-01 10:12:01", - 1: "2020-02-04 22:12:00", - 2: "2020-01-03 10:00:00", + expected_indice_to_format = { + 0: "%Y-%m-%d %H:%M:%S", + 1: "%Y-%m-%d %H:%M:%S", + 2: "%Y-%m-%d %H:%M:%S", } - assert enc.format_per_column_ == expected_format_per_column + assert enc.indice_to_format_ == expected_indice_to_format def test_format_nz(): X = get_tz_datetime() enc = DatetimeEncoder().fit(X) - assert enc.format_per_column_ == {0: "2020-01-01 10:12:01+05:30"} + assert enc.indice_to_format_ == {0: "%Y-%m-%d %H:%M:%S%z"} def test_resolution_none(): X = get_datetime() enc = DatetimeEncoder( resolution=None, - add_total_second=False, + add_total_seconds=False, ) enc.fit(X) - assert enc.features_per_column_ == {0: [], 1: [], 2: []} + assert enc.indice_to_features_ == {0: [], 1: [], 2: []} assert enc.n_features_out_ == 0 assert enc.get_feature_names_out() == [] @@ -186,7 +198,7 @@ def test_resolution_none(): def test_transform_date(): X = get_date() enc = DatetimeEncoder( - add_total_second=False, + add_total_seconds=False, ) X_trans = enc.fit_transform(X) @@ -206,10 +218,10 @@ def test_transform_datetime(): X = get_datetime() enc = DatetimeEncoder( resolution="second", - add_total_second=False, + add_total_seconds=False, ) X_trans = enc.fit_transform(X) - X_trans_expected = np.array( + expected_X_trans = np.array( [ [2020, 1, 1, 10, 12, 1, 2020, 1, 2, 10, 23, 0, 2020, 1, 3, 10, 0, 0], [2021, 2, 3, 12, 45, 23, 2020, 2, 4, 22, 12, 0, 2021, 2, 5, 12, 0, 0], @@ -217,16 +229,16 @@ def test_transform_datetime(): [2023, 2, 3, 11, 12, 12, 2020, 2, 4, 8, 32, 0, 2023, 2, 5, 23, 0, 0], ] ) - assert_array_equal(X_trans, X_trans_expected) + assert_array_equal(X_trans, expected_X_trans) def test_transform_tz(): X = get_tz_datetime() enc = DatetimeEncoder( - add_total_second=True, + add_total_seconds=True, ) X_trans = enc.fit_transform(X) - X_trans_expected = np.array( + expected_X_trans = np.array( [ [2020, 1, 1, 10, 1.57785372e09], [2021, 2, 3, 12, 1.61233652e09], @@ -234,16 +246,16 @@ def test_transform_tz(): [2023, 2, 3, 11, 1.67540293e09], ] ) - assert_allclose(X_trans, X_trans_expected) + assert_allclose(X_trans, expected_X_trans) def test_transform_nan(): X = get_nan_datetime() enc = DatetimeEncoder( - add_total_second=True, + add_total_seconds=True, ) X_trans = enc.fit_transform(X) - X_trans_expected = np.array( + expected_X_trans = np.array( [ [ 2020, @@ -296,23 +308,58 @@ def test_transform_nan(): np.nan, np.nan, ], - [ - 2023, - 2, - 3, - 11, - 1.67542273e09, - 2020, - 2, - 4, - 8, - 1.58080512e09, - 2023, - 2, - 5, - 23, - 1.67563800e09, - ], ] ) - assert_allclose(X_trans, X_trans_expected) + assert_allclose(X_trans, 
expected_X_trans) + + +def test_mixed_type_dataframe(): + X = get_mixed_type_dataframe() + enc = DatetimeEncoder().fit(X) + assert enc.indice_to_format_ == {0: "%Y-%m-%d", 4: "%d/%m/%Y"} + + X_dt = to_datetime(X) + expected_dtypes = [ + np.dtype(" Date: Fri, 13 Oct 2023 15:26:00 +0200 Subject: [PATCH 13/30] apply Jerome's suggestions --- skrub/_datetime_encoder.py | 45 ++++++++++++++-------------- skrub/tests/test_datetime_encoder.py | 18 +++++------ 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 0c98e1196..0164c04c5 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -138,15 +138,14 @@ def _to_datetime_2d_array(X, **kwargs): ------- X_split : list of array, of shape ``n_features`` """ - X_split = np.hsplit(X, X.shape[1]) - X_split = [X_col.ravel() for X_col in X_split] + X_split = list(X.T) return _to_datetime_2d(X_split, **kwargs) def _to_datetime_2d( X_split, indices=None, - indice_to_format=None, + index_to_format=None, format=None, **kwargs, ): @@ -167,7 +166,7 @@ def _to_datetime_2d( Indices of the parsable columns to convert. If None, indices are computed using the current input X. - indice_to_format : mapping of int to str, default=None + index_to_format : mapping of int to str, default=None Dictionary mapping column indices to their datetime format. It defines the format parameter for each column when calling pd.to_datetime. @@ -184,15 +183,15 @@ def _to_datetime_2d( X_split : list of 1d array of length n_features """ if indices is None: - indices, indice_to_format = _get_datetime_column_indices(X_split) + indices, index_to_format = _get_datetime_column_indices(X_split) # format overwrite indices_to_format - if format is not None or indice_to_format is None: - indice_to_format = {col_idx: format for col_idx in indices} + if format is not None: + index_to_format = {col_idx: format for col_idx in indices} for col_idx in indices: X_split[col_idx] = pd.to_datetime( - X_split[col_idx], format=indice_to_format[col_idx], **kwargs + X_split[col_idx], format=index_to_format[col_idx], **kwargs ) return X_split @@ -211,11 +210,11 @@ def _get_datetime_column_indices(X_split): datetime_indices : list of int List of parsable column, identified by their indices. - indice_to_format: mapping of int to str + index_to_format: mapping of int to str Dictionary mapping parsable column indices to their datetime format. """ indices = [] - indice_to_format = {} + index_to_format = {} for col_idx, X_col in enumerate(X_split): X_col = X_col[pd.notnull(X_col)] @@ -227,9 +226,9 @@ def _get_datetime_column_indices(X_split): if _is_column_datetime_parsable(X_col): indices.append(col_idx) # TODO: pass require_dayfirst to _guess_datetime_format - indice_to_format[col_idx] = _guess_datetime_format(X_col) + index_to_format[col_idx] = _guess_datetime_format(X_col) - return indices, indice_to_format + return indices, index_to_format def _is_column_datetime_parsable(X_col): @@ -398,10 +397,10 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): column_indices_ : list of int Indices of the datetime-parsable columns. - indice_to_format_ : dict[int, str] + index_to_format_ : dict[int, str] Mapping from column indices to their datetime formats. - indice_to_features_ : dict[int, list[str]] + index_to_features_ : dict[int, list[str]] Dictionary mapping the column names to the list of datetime features extracted for each column. 
@@ -513,12 +512,12 @@ def _select_datetime_cols(self, X): levels = TIME_LEVELS[: idx_level + 1] X_split = np.hsplit(X, X.shape[1]) - self.column_indices_, self.indice_to_format_ = _get_datetime_column_indices( + self.column_indices_, self.index_to_format_ = _get_datetime_column_indices( X_split ) del X_split - self.indice_to_features_ = defaultdict(list) + self.index_to_features_ = defaultdict(list) self.n_features_out_ = 0 for col_idx in self.column_indices_: @@ -529,15 +528,15 @@ def _select_datetime_cols(self, X): level for level in levels if level in ["year", "month", "day"] ] - self.indice_to_features_[col_idx] += levels + self.index_to_features_[col_idx] += levels self.n_features_out_ += len(levels) if self.add_total_seconds: - self.indice_to_features_[col_idx].append("total_seconds") + self.index_to_features_[col_idx].append("total_seconds") self.n_features_out_ += 1 if self.add_day_of_the_week: - self.indice_to_features_[col_idx].append("day_of_week") + self.index_to_features_[col_idx].append("day_of_week") self.n_features_out_ += 1 def transform(self, X, y=None): @@ -570,7 +569,7 @@ def transform(self, X, y=None): X_split = _to_datetime_2d_array( X, indices=self.column_indices_, - indice_to_format=self.indice_to_format_, + index_to_format=self.index_to_format_, errors=self.errors, ) @@ -593,7 +592,7 @@ def _extract_features(self, X_split): offset_idx = 0 for col_idx in self.column_indices_: X_col = X_split[col_idx] - features = self.indice_to_features_[col_idx] + features = self.index_to_features_[col_idx] for feat_idx, feature in enumerate(features): if feature == "total_seconds": X_feature = _datetime_to_total_seconds(X_col) @@ -624,10 +623,10 @@ def get_feature_names_out(self, input_features=None): feature_names : list of str List of feature names. 
""" - check_is_fitted(self, "indice_to_features_") + check_is_fitted(self, "index_to_features_") feature_names = [] columns = getattr(self, "feature_names_in_", list(range(self.n_features_in_))) - for col_idx, features in self.indice_to_features_.items(): + for col_idx, features in self.index_to_features_.items(): column = columns[col_idx] feature_names += [f"{column}_{feat}" for feat in features] return feature_names diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index 77d94ff0d..9f75818ed 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -152,15 +152,15 @@ def test_fit( features_ += total_seconds + day_of_week columns = range(X.shape[1]) - expected_indice_to_features = {col: features_ for col in columns} - expected_indice_to_format = {col: format for col in columns} + expected_index_to_features = {col: features_ for col in columns} + expected_index_to_format = {col: format for col in columns} expected_n_features_out = len(features_) * X.shape[1] expected_feature_names = [ f"{col}_{feature}" for col in columns for feature in features_ ] - assert enc.indice_to_features_ == expected_indice_to_features - assert enc.indice_to_format_ == expected_indice_to_format + assert enc.index_to_features_ == expected_index_to_features + assert enc.index_to_format_ == expected_index_to_format assert enc.n_features_out_ == expected_n_features_out assert enc.get_feature_names_out() == expected_feature_names @@ -168,18 +168,18 @@ def test_fit( def test_format_nan(): X = get_nan_datetime() enc = DatetimeEncoder().fit(X) - expected_indice_to_format = { + expected_index_to_format = { 0: "%Y-%m-%d %H:%M:%S", 1: "%Y-%m-%d %H:%M:%S", 2: "%Y-%m-%d %H:%M:%S", } - assert enc.indice_to_format_ == expected_indice_to_format + assert enc.index_to_format_ == expected_index_to_format def test_format_nz(): X = get_tz_datetime() enc = DatetimeEncoder().fit(X) - assert enc.indice_to_format_ == {0: "%Y-%m-%d %H:%M:%S%z"} + assert enc.index_to_format_ == {0: "%Y-%m-%d %H:%M:%S%z"} def test_resolution_none(): @@ -190,7 +190,7 @@ def test_resolution_none(): ) enc.fit(X) - assert enc.indice_to_features_ == {0: [], 1: [], 2: []} + assert enc.index_to_features_ == {0: [], 1: [], 2: []} assert enc.n_features_out_ == 0 assert enc.get_feature_names_out() == [] @@ -316,7 +316,7 @@ def test_transform_nan(): def test_mixed_type_dataframe(): X = get_mixed_type_dataframe() enc = DatetimeEncoder().fit(X) - assert enc.indice_to_format_ == {0: "%Y-%m-%d", 4: "%d/%m/%Y"} + assert enc.index_to_format_ == {0: "%Y-%m-%d", 4: "%d/%m/%Y"} X_dt = to_datetime(X) expected_dtypes = [ From dff7b22a0f4ad8287028771173d14d4036b1ffc8 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Fri, 13 Oct 2023 16:53:36 +0200 Subject: [PATCH 14/30] fix old pandas version errors --- skrub/_datetime_encoder.py | 21 ++++++++++++++++----- skrub/tests/test_datetime_encoder.py | 22 +++++++++++++++++++--- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 0164c04c5..a94c89e1a 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -7,6 +7,7 @@ from pandas._libs.tslibs.parsing import guess_datetime_format from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import check_array +from sklearn.utils.fixes import parse_version from sklearn.utils.validation import check_is_fitted from .dataframe._namespace import get_df_namespace @@ -24,6 +25,15 @@ TIME_LEVELS = list(WORD_TO_ALIAS) 
+def _is_pandas_format_mixed_available(): + pandas_version = pd.__version__ + min_pandas_version = "2.0.0" + return parse_version(min_pandas_version) < parse_version(pandas_version) + + +MIXED_FORMAT = "mixed" if _is_pandas_format_mixed_available() else None + + def to_datetime( X, errors="coerce", @@ -262,7 +272,7 @@ def _is_column_datetime_parsable(X_col): # avoiding ValueError when both date and datetime formats # are present. # At this stage, the format itself doesn't matter. - _ = pd.to_datetime(X_col, format="mixed") + _ = pd.to_datetime(X_col, format=MIXED_FORMAT) return True except (pd.errors.ParserError, ValueError): pass @@ -292,10 +302,11 @@ def _guess_datetime_format(X_col, require_dayfirst=False): vfunc = np.vectorize(guess_datetime_format) with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) - month_first_formats = np.unique(vfunc(X_col, dayfirst=False)) - day_first_formats = np.unique(vfunc(X_col, dayfirst=True)) + # pd.unique handles None + month_first_formats = pd.unique(vfunc(X_col, dayfirst=False)) + day_first_formats = pd.unique(vfunc(X_col, dayfirst=True)) - if pd.isnull(month_first_formats).any() or pd.isnull(day_first_formats).any(): + if None in month_first_formats or None in day_first_formats: return None elif ( @@ -321,7 +332,7 @@ def _guess_datetime_format(X_col, require_dayfirst=False): and len(day_first_formats) == 2 and len(month_first_formats[0]) != len(month_first_formats[1]) ): - return "mixed" + return MIXED_FORMAT else: return None diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index 9f75818ed..47910852f 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -7,7 +7,16 @@ from numpy.testing import assert_allclose, assert_array_equal from pandas.testing import assert_frame_equal, assert_series_equal -from skrub._datetime_encoder import TIME_LEVELS, DatetimeEncoder, to_datetime +from skrub._datetime_encoder import ( + TIME_LEVELS, + DatetimeEncoder, + _is_pandas_format_mixed_available, + to_datetime, +) + +NANOSECONDS_FORMAT = ( + "%Y-%m-%d %H:%M:%S.%f" if _is_pandas_format_mixed_available() else None +) def get_date(as_array=False): @@ -57,7 +66,7 @@ def get_nan_datetime(as_array=False): [ ["2020-01-01 10:12:01", None, "2020-01-03 10:00:00"], [np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"], - ["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NaT], + ["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NA], ], ) if as_array: @@ -116,7 +125,7 @@ def get_mixed_datetime_format(as_array=False): (get_date, TIME_LEVELS[: TIME_LEVELS.index("day") + 1], "%Y-%m-%d"), (get_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S"), (get_tz_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S%z"), - (get_nanoseconds, TIME_LEVELS, "%Y-%m-%d %H:%M:%S.%f"), + (get_nanoseconds, TIME_LEVELS, NANOSECONDS_FORMAT), ], ) @pytest.mark.parametrize( @@ -354,6 +363,13 @@ def test_mixed_datetime_format(): assert_series_equal(ser_dt, expected_ser_dt) +@pytest.mark.skipif( + not _is_pandas_format_mixed_available(), + reason=( + "DeprecationWarning is already handled as a ValueError in the latest" + " pandas version." 
+ ), +) def test_indempotency(): df = get_mixed_datetime_format() df_dt = to_datetime(df) From ff5b5751a0e43cd5671fcb5655a5a588861c8e02 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 16 Oct 2023 15:25:50 +0200 Subject: [PATCH 15/30] fix doctest --- skrub/_datetime_encoder.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index a94c89e1a..290b94e0c 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -432,23 +432,19 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator): Examples -------- - >>> enc = DatetimeEncoder() - - Let's encode the following dates: - + >>> enc = DatetimeEncoder(add_total_seconds=False) >>> X = [['2022-10-15'], ['2021-12-25'], ['2020-05-18'], ['2019-10-15 12:00:00']] - >>> enc.fit(X) - DatetimeEncoder() + DatetimeEncoder(add_total_seconds=False) The encoder will output a transformed array - with five columns ("year", "month", "day", "hour" and "total_seconds"): + with four columns ("year", "month", "day", "hour"): >>> enc.transform(X) - array([[2022., 10., 15., 0., 1.6657920e+09], - [2021., 12., 25., 0., 1.6403904e+09], - [2020., 5., 18., 0., 1.5897600e+09], - [2019., 10., 15., 12., 1.5711408e+09]]) + array([[2022., 10., 15., 0.], + [2021., 12., 25., 0.], + [2020., 5., 18., 0.], + [2019., 10., 15., 12.]]) """ def __init__( From 7f463bc980c1faa470fefb2b7ef45b0e97dbbefa Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 16 Oct 2023 17:20:38 +0200 Subject: [PATCH 16/30] add scalar and 1d array support for to_datetime --- skrub/_datetime_encoder.py | 42 +++++++++++++++++++++++++--- skrub/tests/test_datetime_encoder.py | 18 ++++++++++-- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 290b94e0c..d0198f566 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -37,10 +37,11 @@ def _is_pandas_format_mixed_available(): def to_datetime( X, errors="coerce", + unit=None, **kwargs, ): """ - Convert argument to datetime. + Convert argument to datetime. Return the input if not datetime-parsable. Augment :func:`pandas.to_datetime` by supporting dataframes and 2d arrays inputs. It converts compatible columns to datetime, and @@ -60,6 +61,9 @@ def to_datetime( Note that ``'ignore'`` is not used for dataframes, 2d arrays, and series, and is used otherwise as in ``pd.to_datetime``. + unit : str, default None + Unused. Here for compatibility with :func:`pandas.to_datetime`. + **kwargs : key, value mappings Other keyword arguments are passed down to :func:`pandas.to_datetime`. @@ -75,6 +79,13 @@ def to_datetime( -------- :func:`pandas.to_datetime` Convert argument to datetime. 
+ + Examples + -------- + >>> X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"])) + >>> X = to_datetime(X) + >>> X.dtypes.to_list() + [dtype('int64'), dtype(' Date: Mon, 16 Oct 2023 18:04:33 +0200 Subject: [PATCH 17/30] fix test on py310-min --- skrub/tests/test_datetime_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index d6bbd9a1b..89d6358be 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -5,7 +5,7 @@ import pandas as pd import pytest from numpy.testing import assert_allclose, assert_array_equal -from pandas.testing import assert_frame_equal, assert_series_equal +from pandas.testing import assert_frame_equal from skrub._datetime_encoder import ( TIME_LEVELS, @@ -360,7 +360,7 @@ def test_mixed_datetime_format(): ser_dt = to_datetime(df["a"]) expected_ser_dt = expected_df_dt["a"] - assert_series_equal(ser_dt, expected_ser_dt) + assert_array_equal(ser_dt, expected_ser_dt) @pytest.mark.skipif( From 4311a5eaef786fc9e5b409bde01b4981bf58956b Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 31 Oct 2023 15:47:29 +0100 Subject: [PATCH 18/30] update the example --- doc/api.rst | 10 +- doc/conf.py | 1 + examples/03_datetime_encoder.py | 168 +++++++++++++++----------------- 3 files changed, 89 insertions(+), 90 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 6d7469b6d..fee46d1c1 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -79,7 +79,7 @@ This page lists all available functions and classes of `skrub`. .. raw:: html -

-   <h2>Other encoders</h2>
+   <h2>Dealing with dates</h2>
.. autosummary:: :toctree: generated/ @@ -89,6 +89,14 @@ This page lists all available functions and classes of `skrub`. DatetimeEncoder +.. autosummary:: + :toctree: generated/ + :template: function.rst + :nosignatures: + :caption: Converting datetime columns in a table + + to_datetime + .. raw:: html

   <h2>Deduplication: merging variants of the same entry</h2>
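For context on the API entry added above, usage of the documented function looks roughly like this — a sketch adapted from the docstring example introduced earlier in this series, and it assumes a skrub build with these patches applied:

import pandas as pd
from skrub import to_datetime

X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"]))

# Object/string columns that parse as datetimes are converted;
# numeric columns such as "a" pass through unchanged.
print(to_datetime(X).dtypes)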
diff --git a/doc/conf.py b/doc/conf.py index 710f4d69a..b1bccad12 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -504,6 +504,7 @@ def notebook_modification_function(notebook_content, notebook_filename): "SimilarityEncoder": "skrub.SimilarityEncoder", "DatetimeEncoder": "skrub.DatetimeEncoder", "deduplicate": "skrub.deduplicate", + "to_datetime": "skrub.to_datetime", "TableVectorizer": "skrub.TableVectorizer", "DatasetInfoOnly": "skrub.datasets._fetching.DatasetInfoOnly", "DatasetAll": "skrub.datasets._fetching.DatasetAll", diff --git a/examples/03_datetime_encoder.py b/examples/03_datetime_encoder.py index 12376b881..d89dff662 100644 --- a/examples/03_datetime_encoder.py +++ b/examples/03_datetime_encoder.py @@ -34,6 +34,9 @@ .. |HGBR| replace:: :class:`~sklearn.ensemble.HistGradientBoostingRegressor` + +.. |to_datetime| replace:: + :func:`~skrub.to_datetime` """ @@ -46,19 +49,26 @@ # on the location, date and time of measurement. from pprint import pprint - import pandas as pd data = pd.read_csv( "https://raw.githubusercontent.com/pandas-dev/pandas" "/main/doc/data/air_quality_no2_long.csv" -) +).sort_values("date.utc") # Extract our input data (X) and the target column (y) y = data["value"] X = data[["city", "date.utc"]] X +############################################################################### +# We convert the dataframe date columns using |to_datetime|. Notice how +# we don't need to specify the columns to convert. +from skrub import to_datetime + +X = to_datetime(X) +X.dtypes + ############################################################################### # Encoding the features # ..................... @@ -73,10 +83,8 @@ # lower units, as they are probably unimportant. from sklearn.preprocessing import OneHotEncoder - -from skrub import DatetimeEncoder - from sklearn.compose import make_column_transformer +from skrub import DatetimeEncoder encoder = make_column_transformer( (OneHotEncoder(handle_unknown="ignore"), ["city"]), @@ -88,7 +96,7 @@ pprint(encoder.get_feature_names_out()) ############################################################################### -# We see that the encoder is working as expected: the "date.utc" column has +# We see that the encoder is working as expected: the ``"date.utc"`` column has # been replaced by features extracting the month, day, hour, minute, day of the # week and total second since Epoch information. @@ -101,8 +109,7 @@ from skrub import TableVectorizer -table_vec = TableVectorizer() -table_vec.fit_transform(X) +table_vec = TableVectorizer().fit(X) pprint(table_vec.get_feature_names_out()) ############################################################################### @@ -113,8 +120,7 @@ table_vec = TableVectorizer( datetime_transformer=DatetimeEncoder(add_day_of_the_week=True), -) -table_vec.fit_transform(X) +).fit(X) pprint(table_vec.get_feature_names_out()) ############################################################################### @@ -156,12 +162,6 @@ # # Instead, we can use the |TimeSeriesSplit|, # which ensures that the test set is always in the future. -import numpy as np - -sorted_indices = np.argsort(X["date.utc"]) -X = X.iloc[sorted_indices] -y = y.iloc[sorted_indices] - from sklearn.model_selection import TimeSeriesSplit, cross_val_score cross_val_score( @@ -178,82 +178,71 @@ # # The mean squared error is not obvious to interpret, so we compare # visually the prediction of our model with the actual values. 
- +import numpy as np import matplotlib.pyplot as plt -from matplotlib.dates import AutoDateFormatter, AutoDateLocator -X_train = X[X["date.utc"] < "2019-06-01"] -X_test = X[X["date.utc"] >= "2019-06-01"] - -y_train = y[X["date.utc"] < "2019-06-01"] -y_test = y[X["date.utc"] >= "2019-06-01"] +mask_train = X["date.utc"] < "2019-06-01" +X_train, X_test = X.loc[mask_train], X.loc[~mask_train] +y_train, y_test = y.loc[mask_train], y.loc[~mask_train] pipeline.fit(X_train, y_train) +y_pred = pipeline.predict(X_test) all_cities = X_test["city"].unique() -fig, axs = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9)) -fig.subplots_adjust(hspace=0.5) +fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9)) +for ax, city in zip(axes, all_cities): + mask_prediction = X_test["city"] == city + date_prediction = X_test.loc[mask_prediction]["date.utc"] + y_prediction = y_pred[mask_prediction] -for i, city in enumerate(all_cities): - axs[i].plot( - X.loc[X.city == city, "date.utc"], - y.loc[X.city == city], - label="Actual", - ) - axs[i].plot( - X_test.loc[X_test.city == city, "date.utc"], - pipeline.predict(X_test.loc[X_test.city == city]), - label="Predicted", + mask_reference = X["city"] == city + date_reference = X.loc[mask_reference]["date.utc"] + y_reference = y[mask_reference] + + ax.plot(date_reference, y_reference, label="Actual") + ax.plot(date_prediction, y_prediction, label="Predicted") + + ax.set( + ylabel="NO2", + title=city, ) - axs[i].set_title(city) - axs[i].set_ylabel("NO2") - xtick_locator = AutoDateLocator(maxticks=8) - xtick_formatter = AutoDateFormatter(xtick_locator) - axs[i].xaxis.set_major_locator(xtick_locator) - axs[i].xaxis.set_major_formatter(xtick_formatter) - axs[i].legend() + ax.legend() + +fig.subplots_adjust(hspace=0.5) plt.show() ############################################################################### # Let's zoom on a few days: -X_zoomed = X[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")] -y_zoomed = y[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")] - -X_train_zoomed = X_zoomed[X_zoomed["date.utc"] < "2019-06-03"] -X_test_zoomed = X_zoomed[X_zoomed["date.utc"] >= "2019-06-03"] +mask_zoom_reference = (X["date.utc"] >= "2019-06-01") & (X["date.utc"] < "2019-06-04") +mask_zoom_prediction = (X_test["date.utc"] >= "2019-06-01") & ( + X_test["date.utc"] < "2019-06-04" +) -y_train_zoomed = y[X["date.utc"] < "2019-06-03"] -y_test_zoomed = y[X["date.utc"] >= "2019-06-03"] +all_cities = ["Paris", "London"] +fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9)) +for ax, city in zip(axes, all_cities): + mask_prediction = (X_test["city"] == city) & mask_zoom_prediction + date_prediction = X_test.loc[mask_prediction]["date.utc"] + y_prediction = y_pred[mask_prediction] -zoomed_cities = X_test_zoomed["city"].unique() + mask_reference = (X["city"] == city) & mask_zoom_reference + date_reference = X.loc[mask_reference]["date.utc"] + y_reference = y[mask_reference] -fig, axs = plt.subplots(nrows=len(zoomed_cities), ncols=1, figsize=(12, 9)) -fig.subplots_adjust(hspace=0.5) + ax.plot(date_reference, y_reference, label="Actual") + ax.plot(date_prediction, y_prediction, label="Predicted") -for i, city in enumerate(zoomed_cities): - axs[i].plot( - X_zoomed.loc[X_zoomed["city"] == city, "date.utc"], - y_zoomed.loc[X_zoomed["city"] == city], - label="Actual", - ) - axs[i].plot( - X_test_zoomed.loc[X_test_zoomed["city"] == city, "date.utc"], - pipeline.predict(X_test_zoomed.loc[X_test_zoomed["city"] == 
city]), - label="Predicted", + ax.set( + ylabel="NO2", + title=city, ) - axs[i].set_title(city) - axs[i].set_ylabel("NO2") + ax.legend() - xtick_locator = AutoDateLocator(maxticks=8) - xtick_formatter = AutoDateFormatter(xtick_locator) - axs[i].xaxis.set_major_locator(xtick_locator) - axs[i].xaxis.set_major_formatter(xtick_formatter) - - axs[i].legend() plt.show() + ############################################################################### # Features importance # ------------------- @@ -273,27 +262,28 @@ # In this case, we don't use a pipeline, because we want to compute the # importance of the features created by the DatetimeEncoder -X_ = table_vec.fit_transform(X) -reg = HistGradientBoostingRegressor().fit(X_, y) -result = permutation_importance(reg, X_, y, n_repeats=10, random_state=0) -std = result.importances_std -importances = result.importances_mean -indices = np.argsort(importances) -# Sort from least to most -indices = list(reversed(indices)) - -plt.figure(figsize=(12, 9)) -plt.title("Feature importances") -n = len(indices) -labels = np.array(table_vec.get_feature_names_out())[indices] -plt.barh(range(n), importances[indices], color="b", yerr=std[indices]) -plt.yticks(range(n), labels, size=15) -plt.tight_layout(pad=1) -plt.show() +X_transform = table_vec.fit_transform(X) +feature_names = table_vec.get_feature_names_out() + +model = HistGradientBoostingRegressor().fit(X_transform, y) +result = permutation_importance(model, X_transform, y, n_repeats=10, random_state=0) + +result = pd.DataFrame( + dict( + feature_names=feature_names, + std=result.importances_std, + importances=result.importances_mean, + ) +).sort_values("importances", ascending=False) + +result.plot.barh( + y="importances", x="feature_names", title="Feature Importances", figsize=(12, 9) +) +plt.tight_layout() ############################################################################### -# We can see that the hour of the day is the most important feature, -# which seems reasonable. +# We can see that the total seconds since Epoch and the hour of the day +# are the most important feature, which seems reasonable. # # Conclusion # ---------- From 77771f798d9ecf47020b94c71c8c6c93a713b7eb Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 2 Nov 2023 18:05:45 +0100 Subject: [PATCH 19/30] improve to_datetime docstring and parameters validation --- skrub/_datetime_encoder.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index d0198f566..2e3c25ddb 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -37,7 +37,6 @@ def _is_pandas_format_mixed_available(): def to_datetime( X, errors="coerce", - unit=None, **kwargs, ): """ @@ -58,15 +57,13 @@ def to_datetime( errors : {'ignore', 'raise', 'coerce'}, default 'coerce' - If ``'raise'``, then invalid parsing will raise an exception. - If ``'coerce'``, then invalid parsing will be set as ``NaT``. - Note that ``'ignore'`` is not used for dataframes, 2d arrays, - and series, and is used otherwise as in ``pd.to_datetime``. - - unit : str, default None - Unused. Here for compatibility with :func:`pandas.to_datetime`. + Note that ``'ignore'`` is not used and will raise an error. **kwargs : key, value mappings - Other keyword arguments are passed down to - :func:`pandas.to_datetime`. + Other keyword arguments are passed down to :func:`pandas.to_datetime`. + Raise an error if 'unit' is set to any value. 
From 77771f798d9ecf47020b94c71c8c6c93a713b7eb Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 2 Nov 2023 18:05:45 +0100
Subject: [PATCH 19/30] improve to_datetime docstring and parameters validation

---
 skrub/_datetime_encoder.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index d0198f566..2e3c25ddb 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -37,7 +37,6 @@ def _is_pandas_format_mixed_available():
 def to_datetime(
     X,
     errors="coerce",
-    unit=None,
     **kwargs,
 ):
     """
@@ -58,15 +57,13 @@ def to_datetime(
     errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
         - If ``'raise'``, then invalid parsing will raise an exception.
         - If ``'coerce'``, then invalid parsing will be set as ``NaT``.
-        Note that ``'ignore'`` is not used for dataframes, 2d arrays,
-        and series, and is used otherwise as in ``pd.to_datetime``.
-
-    unit : str, default None
-        Unused. Here for compatibility with :func:`pandas.to_datetime`.
+        Note that ``'ignore'`` is not used and will raise an error.
 
     **kwargs : key, value mappings
-        Other keyword arguments are passed down to
-        :func:`pandas.to_datetime`.
+        Other keyword arguments are passed down to :func:`pandas.to_datetime`.
+        Raise an error if 'unit' is set to any value. This is because, in
+        `pandas.to_datetime`, unit is specific to timestamps, whereas in
+        `skru`.to_datetime` we don't attempt to parse numeric columns.
 
     Returns
     -------
@@ -87,8 +84,14 @@ def to_datetime(
     >>> X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"]))
     >>> X = to_datetime(X)
     >>> X.dtypes.to_list()
     [dtype('int64'), dtype('<M8[ns]')]

From: Vincent M
Date: Thu, 2 Nov 2023 18:07:11 +0100
Subject: [PATCH 20/30] fix _dataframe import path

---
 skrub/_datetime_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 2e3c25ddb..e4715f0ff 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -10,7 +10,7 @@
 from sklearn.utils.fixes import parse_version
 from sklearn.utils.validation import check_is_fitted
 
-from .dataframe._namespace import get_df_namespace
+from ._dataframe._namespace import get_df_namespace
 
 WORD_TO_ALIAS = {
     "year": "Y",
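The parameter validation introduced in PATCH 19 is easy to exercise directly. A minimal sketch of the documented behavior (the exact error messages depend on the skrub version)::

    from skrub import to_datetime

    # `unit` only makes sense for numeric timestamps, which skrub's
    # to_datetime does not try to parse, so passing it raises.
    try:
        to_datetime(2020, unit="second")
    except ValueError as e:
        print(e)

    # 'ignore' is likewise rejected; only 'coerce' and 'raise' are valid.
    try:
        to_datetime(["2021-01-01"], errors="ignore")
    except ValueError as e:
        print(e)

Both calls mirror cases covered by the test suite later in this series (``test_to_datetime_invalid_params``).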
From cd6672de399f84c33248e23daeb0c481848cdd5d Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Fri, 3 Nov 2023 19:30:49 +0100
Subject: [PATCH 21/30] improve doc and add some tests

---
 skrub/_datetime_encoder.py           |  91 ++++++++++++----------
 skrub/tests/test_datetime_encoder.py | 112 +++++++++++++++++++++------
 2 files changed, 141 insertions(+), 62 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index e4715f0ff..9b8364321 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -39,31 +39,44 @@ def to_datetime(
     errors="coerce",
     **kwargs,
 ):
-    """
-    Convert argument to datetime. Return the input if not datetime-parsable.
-
-    Augment :func:`pandas.to_datetime` by supporting dataframes
-    and 2d arrays inputs. It converts compatible columns to datetime, and
-    pass incompatible columns unchanged.
+    """Convert the columns of a dataframe or 2d array into a datetime representation.
 
-    int, float, str, datetime, list, tuple, 1d array, and Series are defered to
-    :func:`pandas.to_datetime` directly.
+    This function augments :func:`pandas.to_datetime` by supporting dataframes
+    and 2d array inputs. It only attempts to convert columns whose dtype are
+    object or string. Numeric columns are skip and preserved in the output.
 
     Parameters
     ----------
-    X : int, float, str, datetime, list, tuple, nd array, Series, DataFrame/dict-like
+    X : Pandas or Polars dataframe, 2d-array or any input accepted \
+        by ``pd.to_datetime``.
         The object to convert to a datetime.
 
-    errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
-        - If ``'raise'``, then invalid parsing will raise an exception.
-        - If ``'coerce'``, then invalid parsing will be set as ``NaT``.
-        Note that ``'ignore'`` is not used and will raise an error.
+    errors : {'coerce', 'raise'}, default 'coerce'
+        When set to 'raise', errors will be raised only when the following conditions
+        are satisfied, for each column ``X_col``:
+        - After converting to numpy, the column dtype is np.object_ or np.str_
+        - Each entry of the column is datetime-parsable, i.e.
+          ``pd.to_datetime(X_col, format="mixed")`` doesn't raise an error.
+          This step is conservative, because e.g.
+          ``["2020-01-01", "hello", "2020-01-01"]``
+          is not considered datetime-parsable (so we won't attempt to convert it).
+        - The column as a whole is not datetime-parsable, due to a clash of datetime
+          format, e.g. '2020/01/01' and '2020-01-01'.
+
+        When set to ``'coerce'``, the entries of ``X_col`` that should have raised
+        an error are set to ``NaT`` instead.
+        You can choose which format to use with the keyword argument ``format``, as with
+        ``pd.to_datetime``, e.g. ``to_datetime(X_col, format='%Y/%m/%d')``.
+        Combined with ``error='coerce'``, this will convert all entries that don't
+        match this format to ``NaT``.
+
+        Note that the ``'ignore'`` option is not used and will raise an error.
 
     **kwargs : key, value mappings
         Other keyword arguments are passed down to :func:`pandas.to_datetime`.
         Raise an error if 'unit' is set to any value. This is because, in
         `pandas.to_datetime`, unit is specific to timestamps, whereas in
-        `skru`.to_datetime` we don't attempt to parse numeric columns.
+        `skrub`.to_datetime` we don't attempt to parse numeric columns.
 
     Returns
     -------
@@ -80,7 +93,14 @@ def to_datetime(
     Examples
     --------
     >>> X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"]))
-    >>> X = to_datetime(X)
+    >>> X
+       a          b
+    0  1 31/01/2021
+    1  2 01/02/2022
+    >>> to_datetime(X)
+       a          b
+    0  1 2021-01-31
+    1  2 2022-02-01
     >>> X.dtypes.to_list()
     [dtype('int64'), dtype('<M8[ns]')]

From: Vincent M
Date: Sat, 4 Nov 2023 11:48:17 +0100
Subject: [PATCH 22/30] fix docstring format

---
 skrub/_datetime_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 9b8364321..42a2ba963 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -48,7 +48,7 @@ def to_datetime(
     Parameters
     ----------
     X : Pandas or Polars dataframe, 2d-array or any input accepted \
-        by ``pd.to_datetime``.
+        by ``pd.to_datetime``
         The object to convert to a datetime.
 
     errors : {'coerce', 'raise'}, default 'coerce'
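The contract documented in PATCH 21 can be sketched as follows. With the default ``errors='coerce'`` a format clash turns non-matching entries into ``NaT``, and an explicit ``format`` overrides the guessing; this is a sketch of the documented behavior, not output verified against a specific release::

    from skrub import to_datetime

    # Two valid dates whose guessed formats clash ('%Y-%m-%d' vs '%Y/%m/%d'):
    X_col = ["2021-01-01", "2021/01/01"]

    # errors="raise" should complain about the clash; errors="coerce"
    # (the default) keeps one format and emits NaT for the other entry.
    print(to_datetime(X_col))

    # Forcing a format resolves the ambiguity explicitly: entries that
    # don't match '%Y/%m/%d' are coerced to NaT instead.
    print(to_datetime(X_col, format="%Y/%m/%d", errors="coerce"))

The same two-date column reappears in ``test_to_datetime_format_param`` later in this series.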
From 581fd8815644a183497048c0f31bca115072b777 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Sat, 4 Nov 2023 12:10:57 +0100
Subject: [PATCH 23/30] make doctest happy

---
 skrub/_agg_joiner.py       | 19 ++++++++++---------
 skrub/_datetime_encoder.py | 14 ++++++--------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py
index ed369964b..0e461c3bf 100644
--- a/skrub/_agg_joiner.py
+++ b/skrub/_agg_joiner.py
@@ -155,7 +155,7 @@ class AggJoiner(BaseEstimator, TransformerMixin):
        airportId airportName company_mode_1 total_passengers_mean_1
     0          1   Paris CDG             AF               103.33...
     1          2      NY JFK             DL                80.00...
-    """  # noqa: E501
+    """
 
     def __init__(
         self,
@@ -416,18 +416,19 @@ class AggTarget(BaseEstimator, TransformerMixin):
     ...     "company": ["DL", "AF", "AF", "DL", "DL", "TR"],
     ... })
     >>> y = np.array([1, 1, 0, 0, 1, 1])
-    >>> join_agg = AggTarget(
+    >>> agg_target = AggTarget(
     ...     main_key="company",
     ...     operation=["mean", "max"],
     ... )
-    >>> join_agg.fit_transform(X, y)
+    >>> agg_target.fit_transform(X, y)
        flightId  from_airport  ...  y_0_max_target  y_0_mean_target
-    0  1            1  ...              1          0.66...
-    1  2            1  ...              1          0.50...
-    2  3            1  ...              1          0.50...
-    3  4            2  ...              1          0.66...
-    4  5            2  ...              1          0.66...
-    5  6            2  ...              1          1.00...
+    0  1            1  ...              1         0.666667
+    1  2            1  ...              1         0.500000
+    2  3            1  ...              1         0.500000
+    3  4            2  ...              1         0.666667
+    4  5            2  ...              1         0.666667
+    5  6            2  ...              1         1.000000
+
+    [6 rows x 6 columns]
     """
 
     def __init__(

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 42a2ba963..5a5134c06 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -94,15 +94,13 @@ def to_datetime(
     --------
     >>> X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"]))
     >>> X
-       a          b
-    0  1 31/01/2021
-    1  2 01/02/2022
+       a           b
+    0  1  2021-01-01
+    1  2  2021-02-02
     >>> to_datetime(X)
-       a          b
-    0  1 2021-01-31
-    1  2 2022-02-01
+       a           b
+    0  1  2021-01-01
+    1  2  2021-02-02
-    >>> X.dtypes.to_list()
-    [dtype('int64'), dtype('<M8[ns]')]

From: Vincent M
Date: Sat, 4 Nov 2023 12:51:15 +0100
Subject: [PATCH 24/30] fix min pandas tests

---
 skrub/_datetime_encoder.py           |  4 ++--
 skrub/tests/test_datetime_encoder.py | 17 ++++++++++++-----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 5a5134c06..a8ef43646 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -361,10 +361,10 @@ def _guess_datetime_format(X_col):
     month_first_formats = pd.unique(vfunc(X_col, dayfirst=False))
     day_first_formats = pd.unique(vfunc(X_col, dayfirst=True))
 
-    if len(month_first_formats) == 1:
+    if len(month_first_formats) == 1 and month_first_formats[0] is not None:
         return str(month_first_formats[0])
 
-    elif len(day_first_formats) == 1:
+    elif len(day_first_formats) == 1 and day_first_formats[0] is not None:
         return str(day_first_formats[0])
 
     # special heuristic: when both date and datetime formats are

diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py
index 52d2eabf9..29fc03001 100644
--- a/skrub/tests/test_datetime_encoder.py
+++ b/skrub/tests/test_datetime_encoder.py
@@ -17,6 +17,7 @@
 NANOSECONDS_FORMAT = (
     "%Y-%m-%d %H:%M:%S.%f" if _is_pandas_format_mixed_available() else None
 )
+MSG_MIN_PANDAS_SKIP = "Pandas format=mixed is not available"
 
 
 def get_date(as_array=False):
@@ -344,10 +345,7 @@ def test_mixed_type_dataframe():
 
 @pytest.mark.skipif(
     not _is_pandas_format_mixed_available(),
-    reason=(
-        "DeprecationWarning is already handled as a ValueError "
-        "in the latest pandas version."
-    ),
+    reason=MSG_MIN_PANDAS_SKIP,
 )
 def test_indempotency():
     df = get_mixed_datetime_format()
@@ -382,7 +380,12 @@ def test_datetime_encoder_invalid_params():
         1,
         [1, 2],
         np.array([1, 2]),
-        pd.Timestamp(2020, 1, 1),
+        pytest.param(
+            pd.Timestamp(2020, 1, 1),
+            marks=pytest.mark.skipif(
+                not _is_pandas_format_mixed_available(), reason=MSG_MIN_PANDAS_SKIP
+            ),
+        ),
         np.array(["2020-01-01", "hello", "2020-01-02"]),
     ],
 )
@@ -405,6 +408,10 @@ def test_to_datetime_invalid_params():
         to_datetime(2020, unit="second")
 
 
+@pytest.mark.skipif(
+    not _is_pandas_format_mixed_available(),
+    reason=MSG_MIN_PANDAS_SKIP,
+)
 def test_to_datetime_format_param():
     X_col = ["2021-01-01", "2021/01/01"]
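The ``is not None`` guards added in PATCH 24 matter because pandas' format guesser returns ``None`` whenever it cannot infer a format, and a unique-but-``None`` "format" must not win. A small illustration; note that pandas only exposes ``guess_datetime_format`` publicly under ``pandas.tseries.api`` from 2.0 on, older versions keep it private::

    from pandas.tseries.api import guess_datetime_format

    # An ambiguous day/month string guesses differently per dayfirst:
    print(guess_datetime_format("01/02/2021", dayfirst=False))  # '%m/%d/%Y'
    print(guess_datetime_format("01/02/2021", dayfirst=True))   # '%d/%m/%Y'

    # Unparsable input yields None, hence the new guards.
    print(guess_datetime_format("hello"))  # None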
From bc72e81df96bbf04cb1b21232a067d8fb88ae641 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Sat, 4 Nov 2023 17:21:25 +0100
Subject: [PATCH 25/30] fix tests for min pandas

---
 skrub/_agg_joiner.py                 |  2 --
 skrub/_datetime_encoder.py           | 19 ++++++++++++++-----
 skrub/tests/test_datetime_encoder.py | 11 +----------
 skrub/tests/test_table_vectorizer.py |  9 ++++++++-
 4 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py
index 0e461c3bf..f84f68f98 100644
--- a/skrub/_agg_joiner.py
+++ b/skrub/_agg_joiner.py
@@ -428,8 +428,6 @@ class AggTarget(BaseEstimator, TransformerMixin):
     3  4            2  ...              1         0.666667
     4  5            2  ...              1         0.666667
     5  6            2  ...              1         1.000000
-
-    [6 rows x 6 columns]
     """
 
     def __init__(

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index a8ef43646..f7055b3de 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -1,5 +1,6 @@
 import warnings
 from collections import defaultdict
+from contextlib import nullcontext
 from typing import Iterable
 
 import numpy as np
@@ -309,11 +310,19 @@ def _is_column_datetime_parsable(X_col):
     is_dt_parsable : bool
     """
     # Remove columns of int, float or bool casted as object.
-    try:
-        if np.array_equal(X_col, X_col.astype(np.float64)):
-            return False
-    except ValueError:
-        pass
+    # Pandas < 2.0.0 raise a deprecation warning instead of an error.
+    with (
+        warnings.catch_warnings()
+        if not _is_pandas_format_mixed_available()
+        else nullcontext()
+    ):
+        if not _is_pandas_format_mixed_available():
+            warnings.simplefilter("ignore", category=DeprecationWarning)
+        try:
+            if np.array_equal(X_col, X_col.astype(np.float64)):
+                return False
+        except ValueError:
+            pass
 
     np_dtypes_candidates = [np.object_, np.str_, np.datetime64]
     is_type_datetime_compatible = any(

diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py
index 29fc03001..a4a536353 100644
--- a/skrub/tests/test_datetime_encoder.py
+++ b/skrub/tests/test_datetime_encoder.py
@@ -343,10 +343,6 @@ def test_mixed_type_dataframe():
     assert X_dt.dtype == np.object_
 
 
-@pytest.mark.skipif(
-    not _is_pandas_format_mixed_available(),
-    reason=MSG_MIN_PANDAS_SKIP,
-)
 def test_indempotency():
     df = get_mixed_datetime_format()
     df_dt = to_datetime(df)
@@ -380,12 +376,7 @@ def test_datetime_encoder_invalid_params():
         1,
         [1, 2],
         np.array([1, 2]),
-        pytest.param(
-            pd.Timestamp(2020, 1, 1),
-            marks=pytest.mark.skipif(
-                not _is_pandas_format_mixed_available(), reason=MSG_MIN_PANDAS_SKIP
-            ),
-        ),
+        pd.Timestamp(2020, 1, 1),
         np.array(["2020-01-01", "hello", "2020-01-02"]),
     ],
 )

diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py
index ffe41155b..333b4d7f9 100644
--- a/skrub/tests/test_table_vectorizer.py
+++ b/skrub/tests/test_table_vectorizer.py
@@ -8,9 +8,12 @@
 from sklearn.utils.validation import check_is_fitted
 
 from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer
+from skrub._datetime_encoder import _is_pandas_format_mixed_available
 from skrub._table_vectorizer import _infer_date_format
 from skrub.tests.utils import transformers_list_equal
 
+MSG_PANDAS_DEPRECATED_WARNING = "Skip deprecation warning"
+
 
 def check_same_transformers(
     expected_transformers: dict, actual_transformers: list
@@ -788,7 +791,7 @@ def test_mixed_types() -> None:
             pd.DataFrame({"col1": [1.0, 2.0, np.nan]}),
         ),
         # All datetimes during fit, 1 category during transform
-        (
+        pytest.param(
             pd.DataFrame(
                 {
                     "col1": [
                     ]
                 }
            ),
+            marks=pytest.mark.skipif(
+                not _is_pandas_format_mixed_available(),
+                reason=MSG_PANDAS_DEPRECATED_WARNING,
+            ),
        ),
    ],
)
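The float64 round-trip at the heart of ``_is_column_datetime_parsable`` is worth seeing on its own. Stripped of the pandas-version plumbing, the idea is the following sketch; ``looks_numeric`` is a hypothetical name for illustration, not a skrub helper::

    import warnings

    import numpy as np


    def looks_numeric(col):
        # int/float/bool values stored under object dtype survive a cast
        # to float64 unchanged; date strings raise ValueError instead.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)
            try:
                return np.array_equal(col, col.astype(np.float64))
            except ValueError:
                return False


    print(looks_numeric(np.array([1, 2, 3], dtype=object)))  # True
    print(looks_numeric(np.array(["2020-01-01", "2020-01-02"], dtype=object)))  # False

Columns flagged this way are skipped before any datetime parsing is attempted.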
From 1f52d1e8a49b32d336513d64c14f8c37c2a6c6ca Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Sat, 4 Nov 2023 17:25:47 +0100
Subject: [PATCH 26/30] make doctest happy

---
 skrub/_agg_joiner.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py
index f84f68f98..0e461c3bf 100644
--- a/skrub/_agg_joiner.py
+++ b/skrub/_agg_joiner.py
@@ -428,6 +428,8 @@ class AggTarget(BaseEstimator, TransformerMixin):
     3  4            2  ...              1         0.666667
     4  5            2  ...              1         0.666667
     5  6            2  ...              1         1.000000
+
+    [6 rows x 6 columns]
     """
 
     def __init__(

From 0875958d1d7dde882d199b7fb3abc4ccfe4c2ad0 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Mon, 6 Nov 2023 20:17:24 +0100
Subject: [PATCH 27/30] Update skrub/_datetime_encoder.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Jérôme Dockès

---
 skrub/_datetime_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index f7055b3de..6abbf0fa4 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -440,7 +440,7 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator):
         Extract up to this resolution. E.g., ``resolution="day"`` generates the
         features "year", "month", "day" only.
-        If ``None``, no feature will be created.
+        If ``None``, no such feature will be created (but day of the week and total seconds may still be extracted, see below).
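In other words, ``resolution`` only controls the year/month/.../nanosecond ladder, while the two extra features have their own flags. A sketch of the intended usage, assuming the refactored encoder exposes ``resolution``, ``add_day_of_the_week`` and ``add_total_seconds`` as the diffs in this series suggest (the exact names may differ in the released API)::

    import pandas as pd
    from skrub import DatetimeEncoder

    X = pd.DataFrame(
        {"login": pd.to_datetime(["2021-01-01 08:00", "2021-01-02 21:30"])}
    )
    # resolution=None: no year/month/day/... columns, but the day of the
    # week and the total seconds since Epoch can still be requested.
    encoder = DatetimeEncoder(
        resolution=None, add_day_of_the_week=True, add_total_seconds=True
    )
    print(encoder.fit_transform(X))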
From 4137f8984b955322172fcdb5bc40ccaabe3f0292 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Wed, 8 Nov 2023 11:50:05 +0100
Subject: [PATCH 28/30] apply suggestions

---
 skrub/_datetime_encoder.py | 68 +++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 6abbf0fa4..8626a56bc 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -1,6 +1,5 @@
 import warnings
 from collections import defaultdict
-from contextlib import nullcontext
 from typing import Iterable
 
 import numpy as np
@@ -46,6 +45,9 @@ def to_datetime(
     and 2d array inputs. It only attempts to convert columns whose dtype are
     object or string. Numeric columns are skip and preserved in the output.
 
+    Use the 'format' keyword to force a specific datetime format. See more details in
+    the parameters section.
+
     Parameters
     ----------
     X : Pandas or Polars dataframe, 2d-array or any input accepted \
         by ``pd.to_datetime``
         The object to convert to a datetime.
 
     errors : {'coerce', 'raise'}, default 'coerce'
         When set to 'raise', errors will be raised only when the following conditions
         are satisfied, for each column ``X_col``:
         - After converting to numpy, the column dtype is np.object_ or np.str_
         - Each entry of the column is datetime-parsable, i.e.
           ``pd.to_datetime(X_col, format="mixed")`` doesn't raise an error.
           This step is conservative, because e.g.
           ``["2020-01-01", "hello", "2020-01-01"]``
-          is not considered datetime-parsable (so we won't attempt to convert it).
+          is not considered datetime-parsable, so we won't attempt to convert it).
         - The column as a whole is not datetime-parsable, due to a clash of datetime
           format, e.g. '2020/01/01' and '2020-01-01'.
 
         When set to ``'coerce'``, the entries of ``X_col`` that should have raised
         an error are set to ``NaT`` instead.
         You can choose which format to use with the keyword argument ``format``, as with
         ``pd.to_datetime``, e.g. ``to_datetime(X_col, format='%Y/%m/%d')``.
         Combined with ``error='coerce'``, this will convert all entries that don't
         match this format to ``NaT``.
 
         Note that the ``'ignore'`` option is not used and will raise an error.
 
     **kwargs : key, value mappings
         Other keyword arguments are passed down to :func:`pandas.to_datetime`.
+
+        One notable argument is 'format'. Setting a format overwrites
+        the datetime format guessing behavior of this function for all columns.
+
+        Note that we don't encourage you to use dayfirst or monthfirst argument, since
+        their behavior is ambiguous and might not be applied at all.
+
+        Moreover, this function raises an error if 'unit' is set to any value.
+        This is because, in ``pandas.to_datetime``, 'unit' is specific to timestamps,
+        whereas in ``skrub.to_datetime`` we don't attempt to parse numeric columns.
 
     Returns
     -------
@@ -291,8 +300,15 @@ def _get_datetime_column_indices(X_split, dayfirst=True):
 
         if _is_column_datetime_parsable(X_col):
             indices.append(col_idx)
-            # TODO: pass require_dayfirst to _guess_datetime_format
-            index_to_format[col_idx] = _guess_datetime_format(X_col)
+
+            if np.issubdtype(X_col.dtype, np.datetime64):
+                # We don't need to specify a parsing format
+                # for columns that are already of type datetime64.
+                datetime_format = None
+            else:
+                datetime_format = _guess_datetime_format(X_col)
+
+            index_to_format[col_idx] = datetime_format
 
     return indices, index_to_format
 
@@ -311,13 +327,8 @@ def _is_column_datetime_parsable(X_col):
     """
     # Remove columns of int, float or bool casted as object.
     # Pandas < 2.0.0 raise a deprecation warning instead of an error.
-    with (
-        warnings.catch_warnings()
-        if not _is_pandas_format_mixed_available()
-        else nullcontext()
-    ):
-        if not _is_pandas_format_mixed_available():
-            warnings.simplefilter("ignore", category=DeprecationWarning)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=DeprecationWarning)
         try:
             if np.array_equal(X_col, X_col.astype(np.float64)):
                 return False
         except ValueError:
             pass
@@ -344,24 +355,27 @@ def _is_column_datetime_parsable(X_col):
 
 def _guess_datetime_format(X_col):
-    """
+    """Infer the format of a 1d array.
+
+    This functions uses Pandas ``guess_datetime_format`` routine for both
+    dayfirst and monthfirst case, and select either format when using one
+    give a unify format on the array.
+
+    When both dayfirst and monthfirst format are possible, we select
+    monthfirst by default.
+
+    You can overwrite this behaviour by setting a format of the caller function.
+    Setting a format always take precedence over infering it using
+    ``_guess_datetime_format``.
+
     Parameters
     ----------
     X_col : ndarray of shape ``(n_samples,)``
 
-    require_dayfirst : bool, default False
-        Whether to return the dayfirst format when both dayfirst
-        and monthfirst are valid.
-
     Returns
     -------
-    format : str
+    datetime_format : str or None
     """
-    if np.issubdtype(X_col.dtype, np.datetime64):
-        # We don't need to specify a parsing format
-        # for columns that are already of type datetime64.
-        return None
-
     X_col = X_col.astype(np.object_)
     vfunc = np.vectorize(guess_datetime_format)
     with warnings.catch_warnings():
@@ -440,7 +454,8 @@ class DatetimeEncoder(TransformerMixin, BaseEstimator):
         Extract up to this resolution. E.g., ``resolution="day"`` generates the
         features "year", "month", "day" only.
-        If ``None``, no such feature will be created (but day of the week and total seconds may still be extracted, see below).
+        If ``None``, no such feature will be created (but day of the week and \
+        total seconds may still be extracted, see below).
     add_day_of_the_week : bool, default=False
         Add day of the week feature as a numerical feature
@@ -573,7 +588,6 @@ def _select_datetime_cols(self, X):
         self.column_indices_, self.index_to_format_ = _get_datetime_column_indices(
             X_split
         )
-        del X_split
 
         self.index_to_features_ = defaultdict(list)
         self.n_features_out_ = 0

From 0bf489686e47784bb2eda9108cce187631224421 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 9 Nov 2023 10:34:29 +0100
Subject: [PATCH 29/30] missing remarks

---
 skrub/_datetime_encoder.py           |  1 +
 skrub/tests/test_datetime_encoder.py | 10 ++++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index 8626a56bc..21839c41d 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -371,6 +371,7 @@ def _guess_datetime_format(X_col):
     Parameters
     ----------
     X_col : ndarray of shape ``(n_samples,)``
+        X_col must only contains string objects without any missing value.
 
     Returns
     -------

diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py
index a4a536353..a6d144528 100644
--- a/skrub/tests/test_datetime_encoder.py
+++ b/skrub/tests/test_datetime_encoder.py
@@ -439,11 +439,13 @@ def test_mixed_datetime_format():
 
 
 def test_mix_of_unambiguous():
-    X_col = ["2021/10/15", "2021/13/01"]
-
-    # no format (default), no-op
+    X_col = ["2021/10/15", "01/14/2021"]
     out = to_datetime(X_col)
-    assert_array_equal(out, X_col)
+    expected_out = np.array(
+        [np.datetime64("2021-10-15"), np.datetime64("NaT")],
+        dtype="datetime64[ns]",
+    )
+    assert_array_equal(out, expected_out)
 
 
 def test_only_ambiguous():

From d5a4091f8a6737a26a9eca2dac82f135650a020c Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 9 Nov 2023 10:40:28 +0100
Subject: [PATCH 30/30] fix min pandas version test

---
 skrub/tests/test_datetime_encoder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py
index a6d144528..3881eac67 100644
--- a/skrub/tests/test_datetime_encoder.py
+++ b/skrub/tests/test_datetime_encoder.py
@@ -438,6 +438,7 @@ def test_mixed_datetime_format():
     assert_array_equal(series_dt, expected_series_dt)
 
 
+@pytest.mark.skipif(not _is_pandas_format_mixed_available(), reason=MSG_MIN_PANDAS_SKIP)
 def test_mix_of_unambiguous():
     X_col = ["2021/10/15", "01/14/2021"]
     out = to_datetime(X_col)
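Taken together, the last two patches pin down the behavior this series has been circling: entries that are individually unambiguous but whose guessed formats clash resolve in favor of one format, and the leftover entry is coerced to ``NaT``. Outside of pytest, the updated test amounts to::

    import numpy as np
    from numpy.testing import assert_array_equal
    from skrub import to_datetime

    # "2021/10/15" fixes the column format to '%Y/%m/%d';
    # "01/14/2021" cannot match it and is coerced to NaT.
    out = to_datetime(["2021/10/15", "01/14/2021"])
    expected = np.array(
        [np.datetime64("2021-10-15"), np.datetime64("NaT")],
        dtype="datetime64[ns]",
    )
    assert_array_equal(out, expected)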