diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..57c49d5 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,354 @@ +import importlib.resources as pkg_resources +import json +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd +import pytest +from pytest import CaptureFixture, FixtureRequest +from scipy.sparse import csr_matrix +from sklearn.model_selection import train_test_split + +from tests import resources + + +@pytest.fixture(scope="session") +def clf_train_test_x_y( + request: FixtureRequest, +) -> Tuple[ + Union[pd.DataFrame, np.ndarray], + Union[pd.DataFrame, np.ndarray], + Union[np.ndarray, List], + Union[np.ndarray, List], +]: + """Returns stratified train/test features/targets sets as a `pytest.fixture` for binary classification problems. + + Parameters + ---------- + request : FixtureRequest + Fixture request for params + + Returns + ------- + Tuple[Union[pd.DataFrame, np.ndarray], + Union[pd.DataFrame, np.ndarray], + Union[np.ndarray, List], + Union[np.ndarray, List], + ] + """ + df = _load_test_data_from_csv( + filename="clf_test_data.csv", + ) + y = df["CLASS"].values + X = df.drop( + ["CLASS"], + axis=1, + ) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + test_size=0.2, + shuffle=True, + stratify=y, + random_state=1367, + ) + if request.param == "dataframe": + return (X_train, X_test, y_train, y_test) + elif request.param == "array": + return (X_train.values, X_test.values, y_train, y_test) + elif request.param == "list": + return (X_train, X_test, y_train.tolist(), y_test.tolist()) + else: + return None + + +@pytest.fixture(scope="session") +def clf_x_y( + request: FixtureRequest, +) -> Tuple[Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List]]: + """Returns features/targets sets as a `pytest.fixture` for binary classification problems. 
+ + Parameters + ---------- + request : FixtureRequest + Fixture request for params + + Returns + ------- + Tuple[Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List]] + """ + df = _load_test_data_from_csv( + filename="clf_test_data.csv", + ) + y = df["CLASS"].values + X = df.drop( + ["CLASS"], + axis=1, + ) + if request.param == "dataframe": + return (X, y) + elif request.param == "array": + return (X.values, y) + elif request.param == "list": + return (X, y.tolist()) + else: + return None + + +@pytest.fixture(scope="session") +def reg_train_test_x_y( + request: FixtureRequest, +) -> Tuple[ + Union[pd.DataFrame, np.ndarray], + Union[pd.DataFrame, np.ndarray], + Union[np.ndarray, List], + Union[np.ndarray, List], +]: + """Returns train/test features/targets sets as a `pytest.fixture` for regression problems. + + Parameters + ---------- + request : FixtureRequest + Fixture request for params + + Returns + ------- + Tuple[Union[pd.DataFrame, np.ndarray], + Union[pd.DataFrame, np.ndarray], + Union[np.ndarray, List], + Union[np.ndarray, List], + ] + """ + df = _load_test_data_from_csv( + filename="reg_test_data.csv", + ) + # TODO(amir): try to pull-out multi target regression as well here + y = df["TARGET1"].values + X = df.drop( + ["TARGET1", "TARGET2"], + axis=1, + ) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + test_size=0.2, + shuffle=True, + random_state=1367, + ) + if request.param == "dataframe": + return (X_train, X_test, y_train, y_test) + elif request.param == "array": + return (X_train.values, X_test.values, y_train, y_test) + elif request.param == "list": + return (X_train, X_test, y_train.tolist(), y_test.tolist()) + else: + return None + + +@pytest.fixture(scope="session") +def reg_x_y( + request: FixtureRequest, +) -> Tuple[Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List]]: + """Returns features/targets sets as a `pytest.fixture` for regression problems. 
+ + Parameters + ---------- + request : FixtureRequest + Fixture request for params + + Returns + ------- + Tuple[Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List]] + """ + df = _load_test_data_from_csv( + filename="reg_test_data.csv", + ) + # TODO(amir): try to pull-out multi target regression as well here + y = df["TARGET1"].values + X = df.drop( + ["TARGET1", "TARGET2"], + axis=1, + ) + if request.param == "dataframe": + return (X, y) + elif request.param == "array": + return (X.values, y) + elif request.param == "list": + return (X, y.tolist()) + else: + return None + + +@pytest.fixture(scope="session") +def datafarame_for_testing() -> pd.DataFrame: + """Returns a `pandas.DataFrame` as `pytest.fixture`. + + Returns + ------- + pd.DataFrame + """ + return _dummy_pandas_dataframe( + size=100, + random_state=1367, + ) + + +@pytest.fixture(scope="session") +def sparse_matrix_for_testing() -> csr_matrix: + """Returns a `scipy.sparse.csr_matrix` as `pytest.fixture`. + + Returns + ------- + csr_matrix + """ + return _dummy_sparse_matrix() + + +# TODO(amir): what if values is list ? +def _ids(values: Any) -> str: + """Returns a user-friendly test case ID from the parametrized values. + + Parameters + ---------- + values : Any + Test resource values + + Returns + ------- + str + """ + if isinstance(values, dict): + return ", ".join(f"{k} : {v}" for (k, v) in values.items()) + else: + return str(values) + + +def _load_test_scenarios_from_json(filename: str) -> Dict[str, Any]: + """Returns the contents of a JSON file containing valid and invalid test cases that can be used for `pytest.fixtures`. + + Parameters + ---------- + filename : str + Json filename + + Returns + ------- + Dict[str, Any] + """ + return json.loads( + pkg_resources.read_text( + resources, + filename, + ), + ) + + +def _load_test_data_from_csv(filename: str) -> pd.DataFrame: + """Returns a `pandas.DataFrame` data loaded from a csv file that can be used for `pytest.fixtures`. 
+ + Parameters + ---------- + filename : str + Data filename + + Returns + ------- + pd.DataFrame + """ + with pkg_resources.path(resources, filename) as path: + return pd.read_csv(path) + + +def _captured_log(capsys: CaptureFixture) -> Tuple[str, str]: + """Returns the captured standard output/error via `pytest.capsys` [1]_. + + Parameters + ---------- + capsys : CaptureFixture + Pytest capture fixture to read output and error + + References + ---------- + .. [1] https://docs.pytest.org/en/7.1.x/how-to/capture-stdout-stderr.html + + Returns + ------- + Tuple[str, str] + Captured output and captured error + """ + captured = capsys.readouterr() + return (captured.out, captured.err) + + +def _dummy_pandas_dataframe( + size: Optional[int] = 100, + random_state: Optional[int] = 1367, +) -> pd.DataFrame: + """Returns a dummy pandas DataFrame that can be used for `pytest.fixtures`. + + Notes + ----- + The DataFrame shape is (size, 4), two features ("feature_1", "feature_2"), and two targets + ("binary_target", "multi_target"). + + Parameters + ---------- + size : int, optional + Number of samples, by default 100 + + random_state : int, optional + Random seed, by default 1367 + + Returns + ------- + pd.DataFrame + """ + np.random.seed( + seed=random_state, + ) + return pd.DataFrame( + { + "feature_1": np.random.random_sample( + size=size, + ), + "feature_2": np.random.random_sample( + size=size, + ), + "binary_target": np.random.randint( + low=0, + high=2, + size=size, + dtype=int, + ), + "multi_target": np.random.randint( + low=0, + high=3, + size=size, + dtype=int, + ), + }, + ) + + +def _dummy_sparse_matrix() -> csr_matrix: + """Returns a sparse matrix in CSR format with a shape of (3,3) with float entries. 
+ + Notes + ----- + The numpy representation `_dummy_sparse_matrix().toarray()` is as follows: + array([[1., 0., 2.], + [0., 0., 3.], + [4., 5., 6.]]) + + Returns + ------- + csr_matrix + """ + row = np.array([0, 0, 1, 2, 2, 2]) + col = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6]) + return csr_matrix( + (data, (row, col)), + shape=(3, 3), + dtype=np.float64, + ) diff --git a/tests/slickml/classification/test_glmnet.py b/tests/slickml/classification/test_glmnet.py index 4ddeae4..51f58ca 100644 --- a/tests/slickml/classification/test_glmnet.py +++ b/tests/slickml/classification/test_glmnet.py @@ -7,11 +7,9 @@ import shap from assertpy import assert_that from matplotlib.figure import Figure -from pytest import FixtureRequest -from sklearn.model_selection import train_test_split from slickml.classification import GLMNetCVClassifier -from tests.utils import _ids, _load_test_data_from_csv +from tests.conftest import _ids # TODO(amir): add lolipop plot for coeff + unit-test @@ -19,42 +17,6 @@ class TestGLMNetCVClassifier: """Validates `GLMNetCVClassifier` instantiation.""" - @staticmethod - @pytest.fixture(scope="module") - def clf_x_y_data( - request: FixtureRequest, - ) -> Tuple[ - Union[pd.DataFrame, np.ndarray], - Union[pd.DataFrame, np.ndarray], - Union[np.ndarray, List], - Union[np.ndarray, List], - ]: - """Returns stratified train/test sets.""" - df = _load_test_data_from_csv( - filename="clf_test_data.csv", - ) - y = df["CLASS"].values - X = df.drop( - ["CLASS"], - axis=1, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, - y, - test_size=0.2, - shuffle=True, - stratify=y, - random_state=1367, - ) - if request.param == "dataframe": - return (X_train, X_test, y_train, y_test) - elif request.param == "array": - return (X_train.values, X_test.values, y_train, y_test) - elif request.param == "list": - return (X_train, X_test, y_train.tolist(), y_test.tolist()) - else: - return None - @pytest.mark.parametrize( ("kwargs"), [ @@ -76,18 
+38,18 @@ def test_glmnetcvclassifier_instantiation__fails__with_invalid_inputs(self, kwar GLMNetCVClassifier(**kwargs) @pytest.mark.parametrize( - ("clf_x_y_data"), + ("clf_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_glmnetcvclassifier__passes__with_defaults_and_no_test_targets( self, - clf_x_y_data: Tuple[ + clf_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -95,7 +57,7 @@ def test_glmnetcvclassifier__passes__with_defaults_and_no_test_targets( ], ) -> None: """Validates `GLMNetCVClassifier` instanation passes with default inputs.""" - X_train, X_test, y_train, _ = clf_x_y_data + X_train, X_test, y_train, _ = clf_train_test_x_y clf = GLMNetCVClassifier() clf.fit(X_train, y_train) y_pred_proba = clf.predict_proba(X_test) @@ -184,18 +146,18 @@ def test_glmnetcvclassifier__passes__with_defaults_and_no_test_targets( npt.assert_almost_equal(np.mean(clf.shap_values_train_), 0.01112, decimal=5) @pytest.mark.parametrize( - ("clf_x_y_data"), + ("clf_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_glmnetcvclassifier__passes__with_defaults( self, - clf_x_y_data: Tuple[ + clf_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -203,7 +165,7 @@ def test_glmnetcvclassifier__passes__with_defaults( ], ) -> None: """Validates `GLMNetCVClassifier` instanation passes with default inputs.""" - X_train, X_test, y_train, y_test = clf_x_y_data + X_train, X_test, y_train, y_test = clf_train_test_x_y clf = GLMNetCVClassifier() clf.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might night have @@ -296,7 +258,7 @@ def test_glmnetcvclassifier__passes__with_defaults( # TODO(amir): add a test for `lambda_path` parameter 
@pytest.mark.parametrize( - ("clf_x_y_data", "kwargs"), + ("clf_train_test_x_y", "kwargs"), [ ("dataframe", {"alpha": 0.9}), ("dataframe", {"n_lambda": 200}), @@ -308,12 +270,12 @@ def test_glmnetcvclassifier__passes__with_defaults( ("dataframe", {"random_state": 42}), ("dataframe", {"max_features": 10}), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_glmnetcvclassifier__passes__with_valid_inputs( self, - clf_x_y_data: Tuple[ + clf_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -322,7 +284,7 @@ def test_glmnetcvclassifier__passes__with_valid_inputs( kwargs: Optional[Dict[str, Any]], ) -> None: """Validates `GLMNetCVClassifier` instanation passes with valid inputs.""" - X_train, X_test, y_train, y_test = clf_x_y_data + X_train, X_test, y_train, y_test = clf_train_test_x_y clf = GLMNetCVClassifier(**kwargs) clf.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might night have @@ -404,7 +366,7 @@ def test_glmnetcvclassifier__passes__with_valid_inputs( @pytest.mark.parametrize( ( - "clf_x_y_data", + "clf_train_test_x_y", "waterfall_kwargs", "summary_kwargs", ), @@ -450,17 +412,17 @@ def test_glmnetcvclassifier__passes__with_valid_inputs( }, ), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_glmnetcvclassifier_shap_plots__passes__with_valid_inputs( self, - clf_x_y_data: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], + clf_train_test_x_y: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], waterfall_kwargs: Dict[str, Any], summary_kwargs: Dict[str, Any], ) -> None: """Validates `GLMNetCVClassifier` Shap plots passes with valid inputs.""" - X_train, X_test, y_train, y_test = clf_x_y_data + X_train, X_test, y_train, y_test = clf_train_test_x_y clf = GLMNetCVClassifier() clf.fit(X_train, y_train) _ = clf.predict_proba(X_test, y_test) diff --git 
a/tests/slickml/classification/test_xgboost.py b/tests/slickml/classification/test_xgboost.py index 582f10a..0435b68 100644 --- a/tests/slickml/classification/test_xgboost.py +++ b/tests/slickml/classification/test_xgboost.py @@ -9,53 +9,15 @@ import xgboost as xgb from assertpy import assert_that from matplotlib.figure import Figure -from pytest import FixtureRequest -from sklearn.model_selection import train_test_split from slickml.classification import XGBoostClassifier -from tests.utils import _ids, _load_test_data_from_csv +from tests.conftest import _ids # TODO(amir): Currently `SHAP` raises a lot of warnings. Please figure out a way to dump these warnings class TestXGBoostClassifier: """Validates `XGBoostClassifier` instantiation.""" - @staticmethod - @pytest.fixture(scope="module") - def clf_x_y_data( - request: FixtureRequest, - ) -> Tuple[ - Union[pd.DataFrame, np.ndarray], - Union[pd.DataFrame, np.ndarray], - Union[np.ndarray, List], - Union[np.ndarray, List], - ]: - """Returns stratified train/test sets.""" - df = _load_test_data_from_csv( - filename="clf_test_data.csv", - ) - y = df["CLASS"].values - X = df.drop( - ["CLASS"], - axis=1, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, - y, - test_size=0.2, - shuffle=True, - stratify=y, - random_state=1367, - ) - if request.param == "dataframe": - return (X_train, X_test, y_train, y_test) - elif request.param == "array": - return (X_train.values, X_test.values, y_train, y_test) - elif request.param == "list": - return (X_train, X_test, y_train.tolist(), y_test.tolist()) - else: - return None - @pytest.mark.parametrize( ("kwargs"), [ @@ -75,18 +37,18 @@ def test_xgboostclassifier_instantiation__fails__with_invalid_inputs(self, kwarg XGBoostClassifier(**kwargs) @pytest.mark.parametrize( - ("clf_x_y_data"), + ("clf_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def 
test_xgboostclassifier__passes__with_defaults_and_no_test_targets( self, - clf_x_y_data: Tuple[ + clf_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -94,7 +56,7 @@ def test_xgboostclassifier__passes__with_defaults_and_no_test_targets( ], ) -> None: """Validates `XGBoostClassifier` instanation passes with default inputs.""" - X_train, X_test, y_train, _ = clf_x_y_data + X_train, X_test, y_train, _ = clf_train_test_x_y clf = XGBoostClassifier() clf.fit(X_train, y_train) y_pred_proba = clf.predict_proba(X_test) @@ -188,18 +150,18 @@ def test_xgboostclassifier__passes__with_defaults_and_no_test_targets( npt.assert_almost_equal(np.mean(clf.shap_values_train_), 0.11710, decimal=5) @pytest.mark.parametrize( - ("clf_x_y_data"), + ("clf_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_xgboostclassifier__passes__with_defaults( self, - clf_x_y_data: Tuple[ + clf_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -207,7 +169,7 @@ def test_xgboostclassifier__passes__with_defaults( ], ) -> None: """Validates `XGBoostClassifier` instanation passes with default inputs.""" - X_train, X_test, y_train, y_test = clf_x_y_data + X_train, X_test, y_train, y_test = clf_train_test_x_y clf = XGBoostClassifier() clf.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might night have @@ -302,7 +264,7 @@ def test_xgboostclassifier__passes__with_defaults( npt.assert_almost_equal(np.mean(clf.shap_values_train_), 0.11710, decimal=5) @pytest.mark.parametrize( - ("clf_x_y_data", "kwargs"), + ("clf_train_test_x_y", "kwargs"), [ ("dataframe", {"num_boost_round": 300}), ("dataframe", {"sparse_matrix": True}), @@ -313,12 +275,12 @@ def test_xgboostclassifier__passes__with_defaults( ("dataframe", {"importance_type": "cover"}), 
("dataframe", {"params": {"max_depth": 4, "min_child_weight": 5}}), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_xgboostclassifier__passes__with_valid_inputs( self, - clf_x_y_data: Tuple[ + clf_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -327,7 +289,7 @@ def test_xgboostclassifier__passes__with_valid_inputs( kwargs: Optional[Dict[str, Any]], ) -> None: """Validates `XGBoostClassifier` instanation passes with valid inputs.""" - X_train, X_test, y_train, y_test = clf_x_y_data + X_train, X_test, y_train, y_test = clf_train_test_x_y clf = XGBoostClassifier(**kwargs) clf.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might night have @@ -392,7 +354,7 @@ def test_xgboostclassifier__passes__with_valid_inputs( @pytest.mark.parametrize( ( - "clf_x_y_data", + "clf_train_test_x_y", "waterfall_kwargs", "summary_kwargs", ), @@ -438,17 +400,17 @@ def test_xgboostclassifier__passes__with_valid_inputs( }, ), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_xgboostclassifier_shap_plots__passes__with_valid_inputs( self, - clf_x_y_data: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], + clf_train_test_x_y: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], waterfall_kwargs: Dict[str, Any], summary_kwargs: Dict[str, Any], ) -> None: """Validates `XGBoostClassifier` Shap plots passes with valid inputs.""" - X_train, X_test, y_train, y_test = clf_x_y_data + X_train, X_test, y_train, y_test = clf_train_test_x_y clf = XGBoostClassifier() clf.fit(X_train, y_train) _ = clf.predict_proba(X_test, y_test) diff --git a/tests/slickml/classification/test_xgboostcv.py b/tests/slickml/classification/test_xgboostcv.py index 227a1e9..975d405 100644 --- a/tests/slickml/classification/test_xgboostcv.py +++ b/tests/slickml/classification/test_xgboostcv.py @@ -9,53 +9,15 @@ import xgboost 
as xgb from assertpy import assert_that from matplotlib.figure import Figure -from pytest import FixtureRequest -from sklearn.model_selection import train_test_split from slickml.classification import XGBoostCVClassifier -from tests.utils import _ids, _load_test_data_from_csv +from tests.conftest import _ids # TODO(amir): Currently `SHAP` raises a lot of warnings. Please figure out a way to dump these warnings class TestXGBoostCVClassifier: """Validates `XGBoostClassifierCV` instantiation.""" - @staticmethod - @pytest.fixture(scope="module") - def clf_x_y_data( - request: FixtureRequest, - ) -> Tuple[ - Union[pd.DataFrame, np.ndarray], - Union[pd.DataFrame, np.ndarray], - Union[np.ndarray, List], - Union[np.ndarray, List], - ]: - """Returns stratified train/test sets.""" - df = _load_test_data_from_csv( - filename="clf_test_data.csv", - ) - y = df["CLASS"].values - X = df.drop( - ["CLASS"], - axis=1, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, - y, - test_size=0.2, - shuffle=True, - stratify=y, - random_state=1367, - ) - if request.param == "dataframe": - return (X_train, X_test, y_train, y_test) - elif request.param == "array": - return (X_train.values, X_test.values, y_train, y_test) - elif request.param == "list": - return (X_train, X_test, y_train.tolist(), y_test.tolist()) - else: - return None - @pytest.mark.parametrize( ("kwargs"), [ @@ -86,18 +48,18 @@ def test_xgboostcvclassifier_instantiation__fails__with_invalid_inputs( XGBoostCVClassifier(**kwargs) @pytest.mark.parametrize( - ("clf_x_y_data"), + ("clf_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_xgboostcvclassifier__passes__with_defaults( self, - clf_x_y_data: Tuple[ + clf_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -105,7 +67,7 @@ def test_xgboostcvclassifier__passes__with_defaults( ], ) -> None: """Validates 
`XGBoostClassifier` instanation passes with default inputs.""" - X_train, X_test, y_train, y_test = clf_x_y_data + X_train, X_test, y_train, y_test = clf_train_test_x_y clf = XGBoostCVClassifier() clf.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might night have @@ -227,18 +189,18 @@ def test_xgboostcvclassifier__passes__with_defaults( npt.assert_almost_equal(np.mean(clf.shap_values_train_), 0.03783, decimal=5) @pytest.mark.parametrize( - ("clf_x_y_data"), + ("clf_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_xgboostcvclassifier__passes__with_defaults_and_no_test_targets( self, - clf_x_y_data: Tuple[ + clf_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -246,7 +208,7 @@ def test_xgboostcvclassifier__passes__with_defaults_and_no_test_targets( ], ) -> None: """Validates `XGBoostClassifier` instanation passes with default inputs.""" - X_train, X_test, y_train, _ = clf_x_y_data + X_train, X_test, y_train, _ = clf_train_test_x_y clf = XGBoostCVClassifier() clf.fit(X_train, y_train) y_pred_proba = clf.predict_proba(X_test) @@ -364,7 +326,7 @@ def test_xgboostcvclassifier__passes__with_defaults_and_no_test_targets( npt.assert_almost_equal(np.mean(clf.shap_values_train_), 0.03783, decimal=5) @pytest.mark.parametrize( - ("clf_x_y_data", "kwargs"), + ("clf_train_test_x_y", "kwargs"), [ ("dataframe", {"n_splits": 10}), ("dataframe", {"early_stopping_rounds": 100}), @@ -382,12 +344,12 @@ def test_xgboostcvclassifier__passes__with_defaults_and_no_test_targets( ("dataframe", {"importance_type": "cover"}), ("dataframe", {"params": {"max_depth": 4, "min_child_weight": 5}}), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_xgboostcvclassifier__passes__with_valid_inputs( self, - clf_x_y_data: Tuple[ + clf_train_test_x_y: Tuple[ 
Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -396,7 +358,7 @@ def test_xgboostcvclassifier__passes__with_valid_inputs( kwargs: Optional[Dict[str, Any]], ) -> None: """Validates `XGBoostCVClassifier` instanation passes with valid inputs.""" - X_train, X_test, y_train, y_test = clf_x_y_data + X_train, X_test, y_train, y_test = clf_train_test_x_y clf = XGBoostCVClassifier(**kwargs) clf.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might night have @@ -473,7 +435,7 @@ def test_xgboostcvclassifier__passes__with_valid_inputs( @pytest.mark.parametrize( ( - "clf_x_y_data", + "clf_train_test_x_y", "waterfall_kwargs", "summary_kwargs", ), @@ -519,17 +481,17 @@ def test_xgboostcvclassifier__passes__with_valid_inputs( }, ), ], - indirect=["clf_x_y_data"], + indirect=["clf_train_test_x_y"], ids=_ids, ) def test_xgboostcvclassifier_shap_plots__passes__with_valid_inputs( self, - clf_x_y_data: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], + clf_train_test_x_y: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], waterfall_kwargs: Dict[str, Any], summary_kwargs: Dict[str, Any], ) -> None: """Validates `XGBoostCVClassifier` Shap plots passes with valid inputs.""" - X_train, X_test, y_train, y_test = clf_x_y_data + X_train, X_test, y_train, y_test = clf_train_test_x_y clf = XGBoostCVClassifier() clf.fit(X_train, y_train) _ = clf.predict_proba(X_test, y_test) diff --git a/tests/slickml/metrics/test_binary_classification_metrics.py b/tests/slickml/metrics/test_binary_classification_metrics.py index e5d4927..df7fb0f 100644 --- a/tests/slickml/metrics/test_binary_classification_metrics.py +++ b/tests/slickml/metrics/test_binary_classification_metrics.py @@ -8,7 +8,7 @@ from matplotlib.figure import Figure from slickml.metrics import BinaryClassificationMetrics -from tests.utils import _ids +from tests.conftest import _ids # TODO(amir): the case for `average_method = None` 
which is in `__post_init__` diff --git a/tests/slickml/metrics/test_regression_metrics.py b/tests/slickml/metrics/test_regression_metrics.py index d8127c7..6a960cc 100644 --- a/tests/slickml/metrics/test_regression_metrics.py +++ b/tests/slickml/metrics/test_regression_metrics.py @@ -8,7 +8,7 @@ from matplotlib.figure import Figure from slickml.metrics import RegressionMetrics -from tests.utils import _ids +from tests.conftest import _ids # TODO(amir): tests for multi-outputs + "variance_weighted" and "raw_values" methods are still missing diff --git a/tests/slickml/regression/test_glmnet.py b/tests/slickml/regression/test_glmnet.py index 4c3fa80..add5252 100644 --- a/tests/slickml/regression/test_glmnet.py +++ b/tests/slickml/regression/test_glmnet.py @@ -7,11 +7,9 @@ import shap from assertpy import assert_that from matplotlib.figure import Figure -from pytest import FixtureRequest -from sklearn.model_selection import train_test_split from slickml.regression import GLMNetCVRegressor -from tests.utils import _ids, _load_test_data_from_csv +from tests.conftest import _ids # TODO(amir): add lolipop plot for coeff + unit-test @@ -20,42 +18,6 @@ class TestGLMNetCVRegressor: """Validates `GLMNetCVRegressor` instantiation.""" - @staticmethod - @pytest.fixture(scope="module") - def reg_x_y_data( - request: FixtureRequest, - ) -> Tuple[ - Union[pd.DataFrame, np.ndarray], - Union[pd.DataFrame, np.ndarray], - Union[np.ndarray, List], - Union[np.ndarray, List], - ]: - """Returns train/test sets.""" - df = _load_test_data_from_csv( - filename="reg_test_data.csv", - ) - # TODO(amir): try to pull-out multi target regression as well here - y = df["TARGET1"].values - X = df.drop( - ["TARGET1", "TARGET2"], - axis=1, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, - y, - test_size=0.2, - shuffle=True, - random_state=1367, - ) - if request.param == "dataframe": - return (X_train, X_test, y_train, y_test) - elif request.param == "array": - return (X_train.values, 
X_test.values, y_train, y_test) - elif request.param == "list": - return (X_train, X_test, y_train.tolist(), y_test.tolist()) - else: - return None - @pytest.mark.parametrize( ("kwargs"), [ @@ -77,18 +39,18 @@ def test_glmnetcvregressor_instantiation__fails__with_invalid_inputs(self, kwarg GLMNetCVRegressor(**kwargs) @pytest.mark.parametrize( - ("reg_x_y_data"), + ("reg_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_glmnetcvregressor__passes__with_defaults_and_no_test_targets( self, - reg_x_y_data: Tuple[ + reg_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -96,7 +58,7 @@ def test_glmnetcvregressor__passes__with_defaults_and_no_test_targets( ], ) -> None: """Validates `GLMNetCVRegressor` instanation passes with default inputs.""" - X_train, X_test, y_train, _ = reg_x_y_data + X_train, X_test, y_train, _ = reg_train_test_x_y reg = GLMNetCVRegressor() reg.fit(X_train, y_train) y_pred = reg.predict(X_test) @@ -182,18 +144,18 @@ def test_glmnetcvregressor__passes__with_defaults_and_no_test_targets( npt.assert_almost_equal(np.mean(reg.shap_values_train_), 8.13e-06, decimal=5) @pytest.mark.parametrize( - ("reg_x_y_data"), + ("reg_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_glmnetcvregressor__passes__with_defaults( self, - reg_x_y_data: Tuple[ + reg_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -201,7 +163,7 @@ def test_glmnetcvregressor__passes__with_defaults( ], ) -> None: """Validates `GLMNetCVRegressor` instanation passes with default inputs.""" - X_train, X_test, y_train, y_test = reg_x_y_data + X_train, X_test, y_train, y_test = reg_train_test_x_y reg = GLMNetCVRegressor() reg.fit(X_train, y_train) # Note: we pass `y_test` for the 
sake of testing while in inference we might night have @@ -291,11 +253,11 @@ def test_glmnetcvregressor__passes__with_defaults( # TODO(amir): add a test for `lambda_path` parameter @pytest.mark.parametrize( - ("reg_x_y_data", "kwargs"), + ("reg_train_test_x_y", "kwargs"), [ - ("dataframe", {"alpha": 0.9}), - ("dataframe", {"n_lambda": 200}), - ("dataframe", {"n_splits": 10}), + ("dataframe", {"alpha": 0.1}), + ("dataframe", {"n_lambda": 50}), + ("dataframe", {"n_splits": 5}), ("dataframe", {"metric": "mean_squared_error"}), ("dataframe", {"scale": False, "sparse_matrix": True}), ("dataframe", {"fit_intercept": False}), @@ -303,12 +265,12 @@ def test_glmnetcvregressor__passes__with_defaults( ("dataframe", {"random_state": 42}), ("dataframe", {"max_features": 10}), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_glmnetcvregressor__passes__with_valid_inputs( self, - reg_x_y_data: Tuple[ + reg_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -317,7 +279,7 @@ def test_glmnetcvregressor__passes__with_valid_inputs( kwargs: Optional[Dict[str, Any]], ) -> None: """Validates `GLMNetCVRegressor` instanation passes with valid inputs.""" - X_train, X_test, y_train, y_test = reg_x_y_data + X_train, X_test, y_train, y_test = reg_train_test_x_y reg = GLMNetCVRegressor(**kwargs) reg.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might night have @@ -397,7 +359,7 @@ def test_glmnetcvregressor__passes__with_valid_inputs( @pytest.mark.parametrize( ( - "reg_x_y_data", + "reg_train_test_x_y", "waterfall_kwargs", "summary_kwargs", ), @@ -442,17 +404,17 @@ def test_glmnetcvregressor__passes__with_valid_inputs( }, ), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_glmnetcvregressor_shap_plots__passes__with_valid_inputs( self, - reg_x_y_data: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], 
+ reg_train_test_x_y: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], waterfall_kwargs: Dict[str, Any], summary_kwargs: Dict[str, Any], ) -> None: """Validates `GLMNetCVRegressor` Shap plots passes with valid inputs.""" - X_train, X_test, y_train, y_test = reg_x_y_data + X_train, X_test, y_train, y_test = reg_train_test_x_y reg = GLMNetCVRegressor() reg.fit(X_train, y_train) _ = reg.predict(X_test, y_test) diff --git a/tests/slickml/regression/test_xgboost.py b/tests/slickml/regression/test_xgboost.py index 9da343c..723374f 100644 --- a/tests/slickml/regression/test_xgboost.py +++ b/tests/slickml/regression/test_xgboost.py @@ -9,11 +9,9 @@ import xgboost as xgb from assertpy import assert_that from matplotlib.figure import Figure -from pytest import FixtureRequest -from sklearn.model_selection import train_test_split from slickml.regression import XGBoostRegressor -from tests.utils import _ids, _load_test_data_from_csv +from tests.conftest import _ids # TODO(amir): Currently `SHAP` raises a lot of warnings. 
Please figure out a way to dump these warnings @@ -21,42 +19,6 @@ class TestXGBoostRegressor: """Validates `XGBoostRegressor` instantiation.""" - @staticmethod - @pytest.fixture(scope="module") - def reg_x_y_data( - request: FixtureRequest, - ) -> Tuple[ - Union[pd.DataFrame, np.ndarray], - Union[pd.DataFrame, np.ndarray], - Union[np.ndarray, List], - Union[np.ndarray, List], - ]: - """Returns train/test sets.""" - df = _load_test_data_from_csv( - filename="reg_test_data.csv", - ) - # TODO(amir): try to pull-out multi target regression as well here - y = df["TARGET1"].values - X = df.drop( - ["TARGET1", "TARGET2"], - axis=1, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, - y, - test_size=0.2, - shuffle=True, - random_state=1367, - ) - if request.param == "dataframe": - return (X_train, X_test, y_train, y_test) - elif request.param == "array": - return (X_train.values, X_test.values, y_train, y_test) - elif request.param == "list": - return (X_train, X_test, y_train.tolist(), y_test.tolist()) - else: - return None - @pytest.mark.parametrize( ("kwargs"), [ @@ -76,18 +38,18 @@ def test_xgboostregressor_instantiation__fails__with_invalid_inputs(self, kwargs XGBoostRegressor(**kwargs) @pytest.mark.parametrize( - ("reg_x_y_data"), + ("reg_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_xgboostregressor__passes__with_defaults( self, - reg_x_y_data: Tuple[ + reg_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -95,7 +57,7 @@ def test_xgboostregressor__passes__with_defaults( ], ) -> None: """Validates `XGBoostRegressor` instanation passes with default inputs.""" - X_train, X_test, y_train, y_test = reg_x_y_data + X_train, X_test, y_train, y_test = reg_train_test_x_y reg = XGBoostRegressor() reg.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might 
night have @@ -189,18 +151,18 @@ def test_xgboostregressor__passes__with_defaults( npt.assert_almost_equal(np.mean(reg.shap_values_train_), -1.98011e-08, decimal=5) @pytest.mark.parametrize( - ("reg_x_y_data"), + ("reg_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_xgboostregressor__passes__with_defaults_and_no_test_targets( self, - reg_x_y_data: Tuple[ + reg_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -208,7 +170,7 @@ def test_xgboostregressor__passes__with_defaults_and_no_test_targets( ], ) -> None: """Validates `XGBoostRegressor` instanation passes with default inputs.""" - X_train, X_test, y_train, _ = reg_x_y_data + X_train, X_test, y_train, _ = reg_train_test_x_y reg = XGBoostRegressor() reg.fit(X_train, y_train) y_pred = reg.predict(X_test) @@ -298,7 +260,7 @@ def test_xgboostregressor__passes__with_defaults_and_no_test_targets( npt.assert_almost_equal(np.mean(reg.shap_values_train_), -1.98011e-08, decimal=5) @pytest.mark.parametrize( - ("reg_x_y_data", "kwargs"), + ("reg_train_test_x_y", "kwargs"), [ ("dataframe", {"num_boost_round": 300}), ("dataframe", {"sparse_matrix": True}), @@ -309,12 +271,12 @@ def test_xgboostregressor__passes__with_defaults_and_no_test_targets( ("dataframe", {"importance_type": "cover"}), ("dataframe", {"params": {"max_depth": 4, "min_child_weight": 5}}), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_xgboostregressor__passes__with_valid_inputs( self, - reg_x_y_data: Tuple[ + reg_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -323,7 +285,7 @@ def test_xgboostregressor__passes__with_valid_inputs( kwargs: Optional[Dict[str, Any]], ) -> None: """Validates `XGBoostRegressor` instanation passes with valid inputs.""" - X_train, X_test, y_train, y_test = 
reg_x_y_data + X_train, X_test, y_train, y_test = reg_train_test_x_y reg = XGBoostRegressor(**kwargs) reg.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might night have @@ -388,7 +350,7 @@ def test_xgboostregressor__passes__with_valid_inputs( @pytest.mark.parametrize( ( - "reg_x_y_data", + "reg_train_test_x_y", "waterfall_kwargs", "summary_kwargs", ), @@ -433,17 +395,17 @@ def test_xgboostregressor__passes__with_valid_inputs( }, ), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_xgboostregressor_shap_plots__passes__with_valid_inputs( self, - reg_x_y_data: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], + reg_train_test_x_y: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], waterfall_kwargs: Dict[str, Any], summary_kwargs: Dict[str, Any], ) -> None: """Validates `XGBoostRegressor` Shap plots passes with valid inputs.""" - X_train, X_test, y_train, y_test = reg_x_y_data + X_train, X_test, y_train, y_test = reg_train_test_x_y reg = XGBoostRegressor() reg.fit(X_train, y_train) _ = reg.predict(X_test, y_test) diff --git a/tests/slickml/regression/test_xgboostcv.py b/tests/slickml/regression/test_xgboostcv.py index bc0e2f3..a84ae19 100644 --- a/tests/slickml/regression/test_xgboostcv.py +++ b/tests/slickml/regression/test_xgboostcv.py @@ -9,11 +9,9 @@ import xgboost as xgb from assertpy import assert_that from matplotlib.figure import Figure -from pytest import FixtureRequest -from sklearn.model_selection import train_test_split from slickml.regression import XGBoostCVRegressor -from tests.utils import _ids, _load_test_data_from_csv +from tests.conftest import _ids # TODO(amir): Currently `SHAP` raises a lot of warnings. 
Please figure out a way to dump these warnings @@ -21,42 +19,6 @@ class TestXGBoostCVRegressor: """Validates `XGBoostCVRegressor` instantiation.""" - @staticmethod - @pytest.fixture(scope="module") - def reg_x_y_data( - request: FixtureRequest, - ) -> Tuple[ - Union[pd.DataFrame, np.ndarray], - Union[pd.DataFrame, np.ndarray], - Union[np.ndarray, List], - Union[np.ndarray, List], - ]: - """Returns train/test sets.""" - df = _load_test_data_from_csv( - filename="reg_test_data.csv", - ) - # TODO(amir): try to pull-out multi target regression as well here - y = df["TARGET1"].values - X = df.drop( - ["TARGET1", "TARGET2"], - axis=1, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, - y, - test_size=0.2, - shuffle=True, - random_state=1367, - ) - if request.param == "dataframe": - return (X_train, X_test, y_train, y_test) - elif request.param == "array": - return (X_train.values, X_test.values, y_train, y_test) - elif request.param == "list": - return (X_train, X_test, y_train.tolist(), y_test.tolist()) - else: - return None - @pytest.mark.parametrize( ("kwargs"), [ @@ -84,18 +46,18 @@ def test_xgboostcvregressor_instantiation__fails__with_invalid_inputs(self, kwar XGBoostCVRegressor(**kwargs) @pytest.mark.parametrize( - ("reg_x_y_data"), + ("reg_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_xgboostcvregressor__passes__with_defaults( self, - reg_x_y_data: Tuple[ + reg_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -103,7 +65,7 @@ def test_xgboostcvregressor__passes__with_defaults( ], ) -> None: """Validates `XGBoostCVRegressor` instanation passes with default inputs.""" - X_train, X_test, y_train, y_test = reg_x_y_data + X_train, X_test, y_train, y_test = reg_train_test_x_y reg = XGBoostCVRegressor() reg.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in 
inference we might night have @@ -215,18 +177,18 @@ def test_xgboostcvregressor__passes__with_defaults( assert_that(cv_results_fig).is_instance_of(Figure) @pytest.mark.parametrize( - ("reg_x_y_data"), + ("reg_train_test_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_xgboostcvregressor__passes__with_defaults_and_no_test_targets( self, - reg_x_y_data: Tuple[ + reg_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -234,7 +196,7 @@ def test_xgboostcvregressor__passes__with_defaults_and_no_test_targets( ], ) -> None: """Validates `XGBoostCVRegressor` instanation passes with default inputs.""" - X_train, X_test, y_train, _ = reg_x_y_data + X_train, X_test, y_train, _ = reg_train_test_x_y reg = XGBoostCVRegressor() reg.fit(X_train, y_train) y_pred = reg.predict(X_test) @@ -342,7 +304,7 @@ def test_xgboostcvregressor__passes__with_defaults_and_no_test_targets( npt.assert_almost_equal(np.mean(reg.shap_values_train_), -1.98011e-08, decimal=5) @pytest.mark.parametrize( - ("reg_x_y_data", "kwargs"), + ("reg_train_test_x_y", "kwargs"), [ ("dataframe", {"n_splits": 10}), ("dataframe", {"early_stopping_rounds": 100}), @@ -359,12 +321,12 @@ def test_xgboostcvregressor__passes__with_defaults_and_no_test_targets( ("dataframe", {"importance_type": "cover"}), ("dataframe", {"params": {"max_depth": 4, "min_child_weight": 5}}), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_xgboostcvregressor__passes__with_valid_inputs( self, - reg_x_y_data: Tuple[ + reg_train_test_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], @@ -373,7 +335,7 @@ def test_xgboostcvregressor__passes__with_valid_inputs( kwargs: Optional[Dict[str, Any]], ) -> None: """Validates `XGBoostCVRegressor` instanation passes with valid inputs.""" - X_train, X_test, y_train, y_test = 
reg_x_y_data + X_train, X_test, y_train, y_test = reg_train_test_x_y reg = XGBoostCVRegressor(**kwargs) reg.fit(X_train, y_train) # Note: we pass `y_test` for the sake of testing while in inference we might night have @@ -446,7 +408,7 @@ def test_xgboostcvregressor__passes__with_valid_inputs( @pytest.mark.parametrize( ( - "reg_x_y_data", + "reg_train_test_x_y", "waterfall_kwargs", "summary_kwargs", ), @@ -491,17 +453,17 @@ def test_xgboostcvregressor__passes__with_valid_inputs( }, ), ], - indirect=["reg_x_y_data"], + indirect=["reg_train_test_x_y"], ids=_ids, ) def test_xgboostcvregressor_shap_plots__passes__with_valid_inputs( self, - reg_x_y_data: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], + reg_train_test_x_y: Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray], waterfall_kwargs: Dict[str, Any], summary_kwargs: Dict[str, Any], ) -> None: """Validates `XGBoostCVRegressor` Shap plots passes with valid inputs.""" - X_train, X_test, y_train, y_test = reg_x_y_data + X_train, X_test, y_train, y_test = reg_train_test_x_y reg = XGBoostCVRegressor() reg.fit(X_train, y_train) _ = reg.predict(X_test, y_test) diff --git a/tests/slickml/selection/test_xgboost.py b/tests/slickml/selection/test_xgboost.py index 63ed672..f6bc40b 100644 --- a/tests/slickml/selection/test_xgboost.py +++ b/tests/slickml/selection/test_xgboost.py @@ -5,63 +5,15 @@ import pytest from assertpy import assert_that from matplotlib.figure import Figure -from pytest import FixtureRequest from sklearn.preprocessing import StandardScaler from slickml.selection import XGBoostFeatureSelector -from tests.utils import _ids, _load_test_data_from_csv +from tests.conftest import _ids class TestXGBoostFeatureSelector: """Validates `XGBoostFeatureSelector` instantiation.""" - @staticmethod - @pytest.fixture(scope="module") - def clf_x_y_data( - request: FixtureRequest, - ) -> Tuple[Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List]]: - """Returns stratified train/test sets.""" - df = 
_load_test_data_from_csv( - filename="clf_test_data.csv", - ) - y = df["CLASS"].values - X = df.drop( - ["CLASS"], - axis=1, - ) - if request.param == "dataframe": - return (X, y) - elif request.param == "array": - return (X.values, y) - elif request.param == "list": - return (X, y.tolist()) - else: - return None - - @staticmethod - @pytest.fixture(scope="module") - def reg_x_y_data( - request: FixtureRequest, - ) -> Tuple[Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List]]: - """Returns train/test sets.""" - df = _load_test_data_from_csv( - filename="reg_test_data.csv", - ) - # TODO(amir): try to pull-out multi target regression as well here - y = df["TARGET1"].values - X = df.drop( - ["TARGET1", "TARGET2"], - axis=1, - ) - if request.param == "dataframe": - return (X, y) - elif request.param == "array": - return (X.values, y) - elif request.param == "list": - return (X, y.tolist()) - else: - return None - @pytest.mark.parametrize( ("kwargs"), [ @@ -93,24 +45,24 @@ def test_clf_xgboostfeatureselector_instantiation__fails__with_invalid_inputs( XGBoostFeatureSelector(**kwargs) @pytest.mark.parametrize( - ("clf_x_y_data"), + ("clf_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["clf_x_y_data"], + indirect=["clf_x_y"], ids=_ids, ) def test_clf_xgboostfeatureselector__passes__with_defaults( self, - clf_x_y_data: Tuple[ + clf_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], ], ) -> None: """Validates `XGBoostFeatureSelector` instanation passes with default inputs for classification.""" - X, y = clf_x_y_data + X, y = clf_x_y xfs = XGBoostFeatureSelector() xfs.fit(X, y) params = xfs.get_params() @@ -192,24 +144,24 @@ def test_clf_xgboostfeatureselector__passes__with_defaults( assert_that(feature_frequency_fig).is_instance_of(Figure) @pytest.mark.parametrize( - ("reg_x_y_data"), + ("reg_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["reg_x_y_data"], + indirect=["reg_x_y"], ids=_ids, ) def 
test_reg_xgboostfeatureselector__passes__with_defaults( self, - reg_x_y_data: Tuple[ + reg_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], ], ) -> None: """Validates `XGBoostFeatureSelector` instanation passes with default inputs for regression.""" - X, y = reg_x_y_data + X, y = reg_x_y xfs = XGBoostFeatureSelector(metrics="rmse") xfs.fit(X, y) params = xfs.get_params() @@ -287,24 +239,24 @@ def test_reg_xgboostfeatureselector__passes__with_defaults( # TODO(amir): change the code for `callback=True` and add the unit-test @pytest.mark.parametrize( - ("clf_x_y_data"), + ("clf_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["clf_x_y_data"], + indirect=["clf_x_y"], ids=_ids, ) def test_clf_xgboostfeatureselector__passes__with_valid_inputs( self, - clf_x_y_data: Tuple[ + clf_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], ], ) -> None: """Validates `XGBoostFeatureSelector` instanation passes with valid inputs for classification.""" - X, y = clf_x_y_data + X, y = clf_x_y # TODO(amir): callbacks=True has not been tested yet. 
The look of `self._cv()` would change # therefore the logic for `if _feature_gain["feature"].str.contains("noisy").sum() != 0:` # should be changed accordingly @@ -394,24 +346,24 @@ def test_clf_xgboostfeatureselector__passes__with_valid_inputs( assert_that(feature_frequency_fig).is_instance_of(Figure) @pytest.mark.parametrize( - ("reg_x_y_data"), + ("reg_x_y"), [ ("array"), ("dataframe"), ("list"), ], - indirect=["reg_x_y_data"], + indirect=["reg_x_y"], ids=_ids, ) def test_reg_xgboostfeatureselector__passes__with_valid_inputs( self, - reg_x_y_data: Tuple[ + reg_x_y: Tuple[ Union[pd.DataFrame, np.ndarray], Union[np.ndarray, List], ], ) -> None: """Validates `XGBoostFeatureSelector` instanation passes with valid inputs for regression.""" - X, y = reg_x_y_data + X, y = reg_x_y xfs = XGBoostFeatureSelector( n_iter=1, metrics="rmse", diff --git a/tests/slickml/utils/test_transform.py b/tests/slickml/utils/test_transform.py index c1d8fab..cd6b90a 100644 --- a/tests/slickml/utils/test_transform.py +++ b/tests/slickml/utils/test_transform.py @@ -10,30 +10,12 @@ from scipy.sparse import csr_matrix from slickml.utils import add_noisy_features, array_to_df, df_to_csr, memory_use_csr -from tests.utils import ( - _captured_log, - _dummy_pandas_dataframe, - _dummy_sparse_matrix, - _ids, -) - - -@pytest.fixture -def datafarame_for_testing(): - """Returns pandas.DataFrame as `pytest.fixture`""" - return _dummy_pandas_dataframe( - size=100, - random_state=1367, - ) +from tests.conftest import _captured_log, _ids -@pytest.fixture -def sparse_matrix_for_testing(): - """Returns csr matrix as `pytest.fixture`""" - return _dummy_sparse_matrix() - - -def test_df_to_csr__passes__with_default_inputs(datafarame_for_testing: pd.DataFrame) -> None: +def test_df_to_csr__passes__with_default_inputs( + datafarame_for_testing: pd.DataFrame, +) -> None: """Validates conversion of a pandas DataFrame into CSR matrix with default inputs.""" df = datafarame_for_testing csr = df_to_csr(df) diff --git 
a/tests/slickml/utils/test_validation.py b/tests/slickml/utils/test_validation.py index 5652f31..0aa5a75 100644 --- a/tests/slickml/utils/test_validation.py +++ b/tests/slickml/utils/test_validation.py @@ -6,7 +6,7 @@ from assertpy import assert_that from slickml.utils import check_var -from tests.utils import _ids +from tests.conftest import _ids @pytest.mark.parametrize( diff --git a/tests/utils.py b/tests/utils.py deleted file mode 100644 index e16a7b3..0000000 --- a/tests/utils.py +++ /dev/null @@ -1,151 +0,0 @@ -import importlib.resources as pkg_resources -import json -from typing import Any, Dict, Optional, Tuple - -import numpy as np -import pandas as pd -from pytest import CaptureFixture -from scipy.sparse import csr_matrix - -from tests import resources - - -# TODO(amir): what if values is list ? -def _ids(values: Any) -> str: - """Returns a user-friendly test case ID from the parametrized values. - - Parameters - ---------- - values : Any - Test resource values - - Returns - ------- - str - """ - if isinstance(values, dict): - return ", ".join(f"{k} : {v}" for (k, v) in values.items()) - else: - return str(values) - - -def _load_test_scenarios_from_json(filename: str) -> Dict[str, Any]: - """Returns a json file contains valid and invalid test cases that can be used for `pytest.fixtures`. - - Parameters - ---------- - filename : str - Json filename - - Returns - ------- - Dict[str, Any] - """ - return json.loads( - pkg_resources.read_text( - resources, - filename, - ), - ) - - -def _load_test_data_from_csv(filename: str) -> pd.DataFrame: - """Returns a `pandas.DataFrame` data loaded from a csv file that can be used for `pytest.fixtures`. 
- - Parameters - ---------- - filename : str - Data filename - - Returns - ------- - pd.DataFrame - """ - with pkg_resources.path(resources, filename) as path: - return pd.read_csv(path) - - -def _captured_log(capsys: CaptureFixture) -> Tuple[str, str]: - """Returns the captured standard output/error via `pytest.capsys` [1]_. - - References - ---------- - .. [1] https://docs.pytest.org/en/7.1.x/how-to/capture-stdout-stderr.html - - Returns - ------- - Tuple[str] - Captured output and caputred error - """ - captured = capsys.readouterr() - return (captured.out, captured.err) - - -def _dummy_pandas_dataframe( - size: Optional[int] = 100, - random_state: Optional[int] = 1367, -) -> pd.DataFrame: - """Returns a dummy pandas DataFrame that can be used for `pytest.fixtures`. - - The DataFrame shape is (size, 4), two features ("feature_1", "feature_2"), and two targets - ("binary_target", "multi_target"). - - Parameters - ---------- - size : Optional[int], optional - Number of samples, by default 100 - - random_state : Optional[int], optional - Random seed, by default 1367 - - Returns - ------- - pd.DataFrame - """ - np.random.seed( - seed=random_state, - ) - return pd.DataFrame( - { - "feature_1": np.random.random_sample( - size=size, - ), - "feature_2": np.random.random_sample( - size=size, - ), - "binary_target": np.random.randint( - low=0, - high=2, - size=size, - dtype=int, - ), - "multi_target": np.random.randint( - low=0, - high=3, - size=size, - dtype=int, - ), - }, - ) - - -def _dummy_sparse_matrix() -> csr_matrix: - """Returns a sparse matrix in CSR format with a shape of (3,3) with float entries. - - The numpy representation `_dummy_sparse_matrix().toarray()` is as follows: - array([[1., 0., 2.], - [0., 0., 3.], - [4., 5., 6.]]) - - Returns - ------- - csr_matrix - """ - row = np.array([0, 0, 1, 2, 2, 2]) - col = np.array([0, 2, 2, 0, 1, 2]) - data = np.array([1, 2, 3, 4, 5, 6]) - return csr_matrix( - (data, (row, col)), - shape=(3, 3), - dtype=np.float64, - )