diff --git a/.azure-pipelines/linux-CI-nightly.yml b/.azure-pipelines/linux-CI-nightly.yml index a2be2d330..c232ae702 100644 --- a/.azure-pipelines/linux-CI-nightly.yml +++ b/.azure-pipelines/linux-CI-nightly.yml @@ -73,16 +73,7 @@ jobs: displayName: 'install scikit-learn' - script: | - if [ '$(onnx.version)' == 'git' ] - then - git clone https://github.com/onnx/onnx.git --recursive - export ONNX_ML=1 - cd onnx - python setup.py install - cd .. - else - pip install onnx$(onnx.version) - fi + pip install onnx$(onnx.version) displayName: 'install onnx' - script: | diff --git a/.azure-pipelines/linux-conda-CI.yml b/.azure-pipelines/linux-conda-CI.yml index 70fac27c0..5c1dc92eb 100644 --- a/.azure-pipelines/linux-conda-CI.yml +++ b/.azure-pipelines/linux-conda-CI.yml @@ -14,7 +14,7 @@ jobs: strategy: matrix: - Py311-Onnx150-Rt161-Skl131: + Py311-Onnx150-Rt161-Skl132: do.bench: '0' python.version: '3.11' numpy.version: '>=1.21.1' @@ -22,7 +22,8 @@ jobs: onnx.version: 'onnx==1.15.0' # -i https://test.pypi.org/simple/ onnx==1.15.0rc2' onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.16.1' - sklearn.version: '>=1.3.1' + sklearn.version: '>=1.3.2' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '1' @@ -35,7 +36,8 @@ jobs: onnx.version: 'onnx==1.14.1' onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.16.0' - sklearn.version: '>=1.3.1' + sklearn.version: '==1.3.1' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '1' @@ -49,6 +51,7 @@ jobs: onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.15.1' sklearn.version: '==1.3.0' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' @@ -62,6 +65,7 @@ jobs: onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.14.0' sklearn.version: '==1.2.2' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' @@ -75,6 +79,7 @@ jobs: onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.14.0' sklearn.version: '==1.2.1' + 
pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' @@ -88,6 +93,7 @@ jobs: onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.13.1' sklearn.version: '==1.2.0' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' @@ -101,6 +107,7 @@ jobs: onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.13.1' sklearn.version: '==1.2.0' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' @@ -113,102 +120,72 @@ jobs: onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.12.1' sklearn.version: '==1.1.3' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' Py39-Onnx120-Rt1111-Skl11: do.bench: '0' python.version: '3.9' - numpy.version: '>=1.21.0' + numpy.version: '>=1.21.0,<1.23.0' scipy.version: '>=1.7.0' onnx.version: 'onnx==1.12.0' onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.11.1' sklearn.version: '==1.1.3' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' Py39-Onnx1110-Rt1111-Skl11: do.bench: '0' python.version: '3.9' - numpy.version: '>=1.21.0' + numpy.version: '>=1.21.0,<1.23.0' scipy.version: '>=1.7.0' onnx.version: 'onnx==1.11.0' onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.11.1' sklearn.version: '==1.1.3' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' Py39-Onnx1110-Rt1111-Skl10: do.bench: '0' python.version: '3.9' - numpy.version: '>=1.22.3' + numpy.version: '>=1.21.0,<1.23.0' scipy.version: '>=1.7.0' onnx.version: 'onnx==1.11.0' onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.11.1' sklearn.version: '==1.0.2' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' Py39-Onnx1110-Rt1100-Skl10: do.bench: '0' python.version: '3.9' - numpy.version: '>=1.21.0' + numpy.version: '>=1.21.0,<1.23.0' scipy.version: '>=1.7.0' onnx.version: 'onnx==1.11.0' onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.10.0' sklearn.version: '==1.0.2' + pandas.version: 
'' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' Py39-Onnx1101-Rt190-Skl10: do.bench: '0' python.version: '3.9' - numpy.version: '>=1.21.0' + numpy.version: '>=1.21.0,<1.23.0' scipy.version: '>=1.7.0' onnx.version: 'onnx==1.10.1' onnx.target_opset: '' onnxrt.version: 'onnxruntime==1.9.0' sklearn.version: '==1.0.2' - lgbm.version: '' - onnxcc.version: '>=1.8.1' - run.example: '0' - Py39-Onnx1101-Rt190-Skl0242: - do.bench: '0' - python.version: '3.9' - numpy.version: '<1.21.0' - scipy.version: '' - onnx.version: 'onnx==1.10.1' - onnx.target_opset: '' - onnxrt.version: 'onnxruntime==1.9.0' - sklearn.version: '==0.24.2' - lgbm.version: '' - onnxcc.version: '>=1.8.1' - run.example: '0' - Py39-Onnx190-Rt180-Skl0242: - do.bench: '0' - python.version: '3.9' - numpy.version: '<1.21.0' - scipy.version: '' - onnx.version: 'onnx==1.9.0' - onnx.target_opset: '' - onnxrt.version: 'onnxruntime>=1.8.1' - sklearn.version: '==0.24.2' - lgbm.version: '' - onnxcc.version: '>=1.8.1' - run.example: '0' - Py39-Onnx190-Rt180-Skl0232: - do.bench: '0' - python.version: '3.9' - numpy.version: '<1.21.0' - scipy.version: '' - onnx.version: 'onnx==1.9.0' - onnx.target_opset: '' - onnxrt.version: 'onnxruntime>=1.8.1' - sklearn.version: '==0.23.2' + pandas.version: '' lgbm.version: '' onnxcc.version: '>=1.8.1' run.example: '0' @@ -255,37 +232,8 @@ jobs: displayName: 'install scikit-learn' - script: | - if [ '$(onnx.version)' == 'git' ] - then - pip install typing-extensions>=3.6.2.1 - git clone https://github.com/onnx/onnx.git --recursive - export ONNX_ML=1 - cd onnx - python setup.py install - cd .. 
- else - if [ '$(onnx.version)' == 'test' ] - then - pip install typing-extensions>=3.6.2.1 - pip install -i https://test.pypi.org/simple/ onnx - else - pip install typing-extensions>=3.6.2.1 - pip install $(onnx.version) - fi - fi - displayName: 'install onnx' - - - script: | - pip show onnx - displayName: 'onnx version' - - - script: | - pip install $(onnxrt.version) - displayName: 'install onnxruntime' - - - script: | - pip show onnx - displayName: 'onnx version' + pip install $(onnx.version) $(onnxrt.version) + displayName: 'install onnxruntime, onnx' - script: | if [ '$(onnxcc.version)' == 'git' ] @@ -296,10 +244,6 @@ jobs: fi displayName: 'install onnxconverter-common' - - script: | - pip show onnx - displayName: 'onnx version' - - script: | pip install -r requirements.txt pip install -r requirements-dev.txt @@ -309,6 +253,10 @@ jobs: pip show onnx displayName: 'onnx version' + - script: | + pip install $(onnx.version) $(onnxrt.version) "numpy$(numpy.version)" "scikit-learn$(sklearn.version)" "scipy$(scipy.version)" "pandas$(pandas.version)" + displayName: install onnxruntime, onnx, numpy, scikit-learn, pandas again # to make sure the proper version is installed + - script: | pip install -e . 
displayName: 'install' @@ -317,6 +265,8 @@ jobs: echo "---------------" pip show numpy echo "---------------" + pip show pandas + echo "---------------" pip show scipy echo "---------------" pip show pandas @@ -340,6 +290,7 @@ jobs: fi cd tests python -c "from numpy import __version__;print('numpy', __version__)" + python -c "from pandas import __version__;print('pandas', __version__)" python -c "from scipy import __version__;print('scipy', __version__)" python -c "from sklearn import __version__;print('sklearn', __version__)" python -c "from onnxruntime import __version__;print('onnxruntime', __version__)" diff --git a/.gitignore b/.gitignore index 4a5128a88..4ca6f0c5f 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ build/ *.bat # test generated files +*.onnx .pytest_cache .cache htmlcov diff --git a/CHANGELOGS.md b/CHANGELOGS.md index 6d5f0133c..9345b4f6b 100644 --- a/CHANGELOGS.md +++ b/CHANGELOGS.md @@ -2,12 +2,20 @@ ## 1.16.0 +* Add an example on how to handle FunctionTransformer + [#1042](https://github.com/onnx/sklearn-onnx/pull/1042), + Versions of `scikit-learn < 1.0` are not tested any more. 
+* FeatureHasher, raise an error when the delimiter length is > 1, + [#1036](https://github.com/onnx/sklearn-onnx/pull/1036) +* skl2onnx works with onnx==1.15.0, + [#1034](https://github.com/onnx/sklearn-onnx/pull/1034) * fix OneHotEncoder when categories indices to drop are not None [#1028](https://github.com/onnx/sklearn-onnx/pull/1028) * fix converter for AdaBoost estimators in scikit-learn==1.3.1 [#1027](https://github.com/onnx/sklearn-onnx/pull/1027) -* add function 'add_onnx_graph' to insert onnx graph coming from other converting +* add function 'add_onnx_graph' to insert onnx graph coming from other converting, libraries within the converter mapped to a custom estimator - [#1023](https://github.com/onnx/sklearn-onnx/pull/1023) + [#1023](https://github.com/onnx/sklearn-onnx/pull/1023), + [#1024](https://github.com/onnx/sklearn-onnx/pull/1024) * add option 'language' to converters of CountVectorizer, TfIdfVectorizer [#1020](https://github.com/onnx/sklearn-onnx/pull/1020) diff --git a/docs/tutorial/plot_jfunction_transformer.py b/docs/tutorial/plot_jfunction_transformer.py new file mode 100644 index 000000000..6e9162f37 --- /dev/null +++ b/docs/tutorial/plot_jfunction_transformer.py @@ -0,0 +1,292 @@ +""" +Issues with FunctionTransformer +=============================== + +A pipeline including a `FunctionTransformer +<https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html>`_ +cannot be automatically converted into onnx because there is no converter able to +convert custom python code into ONNX. A custom converter needs to be written +specifically for it. + +Initial try ++++++++++++ + +A very simple pipeline and the first attempt to convert it into ONNX. 
+""" +import numpy as np +from numpy.testing import assert_allclose +from onnx.version_converter import convert_version +from pandas import DataFrame +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.tree import DecisionTreeClassifier +from sklearn.preprocessing import FunctionTransformer +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline +from skl2onnx import to_onnx + +# For the custom converter +from skl2onnx import update_registered_converter +from skl2onnx.common.utils import check_input_and_output_numbers +from skl2onnx.algebra.onnx_ops import OnnxSlice, OnnxSub, OnnxDiv, OnnxMul, OnnxCastLike +from skl2onnx.helpers import add_onnx_graph +import onnxscript +from onnxscript import opset18 as op + +# To check discrepancies +from onnx.reference import ReferenceEvaluator +from onnxruntime import InferenceSession + + +def calculate_growth(df): + df["c"] = 100 * (df["a"] - df["b"]) / df["b"] + return df + + +mapper = ColumnTransformer( + transformers=[ + ("c", FunctionTransformer(calculate_growth), ["a", "b"]), + ], + remainder="passthrough", + verbose_feature_names_out=False, +) +mapper.set_output(transform="pandas") + +pipe = Pipeline([("mapper", mapper), ("classifier", DecisionTreeClassifier())]) + +data = DataFrame( + [ + dict(a=2, b=1, f=5), + dict(a=50, b=4, f=10), + dict(a=5, b=2, f=4), + dict(a=100, b=6, f=20), + ] +) +y = np.array([0, 1, 0, 1], dtype=np.int64) +pipe.fit(data, y) + +try: + to_onnx(pipe, data[:1], options={"zipmap": False}) +except Exception as e: + print("It does not work:", e) + +################################## +# Use of custom transformer +# +++++++++++++++++++++++++ +# +# It is easier to write a custom converter if the FunctionTransformer +# is implemented as a custom transformer. 
+ + +class GrowthCalculator(BaseEstimator, TransformerMixin): + def __init__(self): + pass + + def calculate_growth(self, x, y): + return 100 * (x - y) / y + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + x = X.apply(lambda x: self.calculate_growth(x.a, x.b), axis=1) + return x.values.reshape((-1, 1)) + + +mapper = ColumnTransformer( + transformers=[ + ("ab", FunctionTransformer(), ["a", "b"]), # We keep the first column. + ("c", GrowthCalculator(), ["a", "b"]), # We add a new one. + ], + remainder="passthrough", + verbose_feature_names_out=False, +) + +pipe_tr = Pipeline([("mapper", mapper), ("classifier", DecisionTreeClassifier())]) +pipe_tr.fit(data, y) + +############################# +# Both pipelines return the same output. +assert_allclose(pipe.predict_proba(data), pipe_tr.predict_proba(data)) + +############################# +# Let's check it produces the same number of features. +assert_allclose(pipe.steps[0][-1].transform(data), pipe_tr.steps[0][-1].transform(data)) + +############################# +# But the conversion still fails with a different error message. + +try: + to_onnx(pipe_tr, data[:1], options={"zipmap": False}) +except Exception as e: + print("It does not work:", e) + + +################################# +# Custom converter +# ++++++++++++++++ +# +# We need to implement the method `calculate_growth` in ONNX. +# The first function returns the expected type and shape. + + +def growth_shape_calculator(operator): + check_input_and_output_numbers(operator, input_count_range=1, output_count_range=1) + # Gets the input type, the transformer works on any numerical type. + input_type = operator.inputs[0].type.__class__ + # The first dimension is usually dynamic (batch dimension). + input_dim = operator.inputs[0].get_first_dimension() + operator.outputs[0].type = input_type([input_dim, 1]) + + +def growth_converter(scope, operator, container): + # No need to retrieve the fitted estimator, it is not trained. 
+ # op = operator.raw_operator + opv = container.target_opset + X = operator.inputs[0] + + # 100 * (x-y)/y --> 100 * (X[0] - X[1]) / X[1] + + zero = np.array([0], dtype=np.int64) + one = np.array([1], dtype=np.int64) + two = np.array([2], dtype=np.int64) + hundred = np.array([100], dtype=np.float32) + + # Slice(data, starts, ends, axes) + x0 = OnnxSlice(X, zero, one, one, op_version=opv) + x1 = OnnxSlice(X, one, two, one, op_version=opv) + z = OnnxMul( + OnnxCastLike(hundred, X, op_version=opv), + OnnxDiv(OnnxSub(x0, x1, op_version=opv), x1, op_version=opv), + op_version=opv, + output_names=operator.outputs[0], + ) + z.add_to(scope, container) + + +update_registered_converter( + GrowthCalculator, + "AliasGrowthCalculator", + growth_shape_calculator, + growth_converter, +) + + +onx = to_onnx(pipe_tr, data[:1], target_opset=18, options={"zipmap": False}) + +############################ +# Let's check there are no discrepancies +# ++++++++++++++++++++++++++++++++++++++ +# +# First the expected values + +expected = (pipe_tr.predict(data), pipe_tr.predict_proba(data)) +print(expected) + +############################## +# Then let's check with :class:`onnx.reference.ReferenceEvaluator`. + +feeds = { + "a": data["a"].values.reshape((-1, 1)), + "b": data["b"].values.reshape((-1, 1)), + "f": data["f"].values.reshape((-1, 1)), +} + +# verbose=10 to show intermediate results +ref = ReferenceEvaluator(onx, verbose=0) +got = ref.run(None, feeds) + +assert_allclose(expected[0], got[0]) +assert_allclose(expected[1], got[1]) + +####################################### +# Then with the runtime used to deploy, onnxruntime for example. 
+ +ref = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"]) +got = ref.run(None, feeds) + +assert_allclose(expected[0], got[0]) +assert_allclose(expected[1], got[1]) + +################################# +# Custom converter with onnxscript +# ++++++++++++++++++++++++++++++++ +# +# `onnxscript <https://github.com/microsoft/onnxscript>`_ +# offers a less verbose API than the one the onnx package implements. +# Let's see how to use it to write the converters. + + +@onnxscript.script() +def calculate_onnxscript_verbose(X): + # Every step is spelled out with explicit ONNX operators + # taken from the opset imported as op. + x0 = op.Slice(X, [0], [1], [1]) + x1 = op.Slice(X, [1], [2], [1]) + return op.Mul(op.Div(op.Sub(x0, x1), x1), 100) + + +######################################### +# This version uses the strict definition of ONNX operators. +# The code can be simpler if regular python operators are used. +# Not all of them can be converted into ONNX, but an error message +# is raised in that case. + + +@onnxscript.script() +def calculate_onnxscript(X): + # onnxscript must define an opset. We use an identity node + # from a specific opset to set it (otherwise it fails). + xi = op.Identity(X) + x0 = xi[:, :1] + x1 = xi[:, 1:] + return (x0 - x1) / x1 * 100 + + +######################################### +# We can also check that it is equivalent to the python implementation. +f_expected = calculate_growth(data)["c"].values +f_got = calculate_onnxscript(data[["a", "b"]].values.astype(np.float32)) +assert_allclose(f_expected.ravel(), f_got.ravel(), atol=1e-6) + +######################################### +# Let's use it in the converter. + + +def growth_converter_onnxscript(scope, operator, container): + # No need to retrieve the fitted estimator, it is not trained. 
+ # op = operator.raw_operator + opv = container.target_opset + + # 100 * (x-y)/y --> 100 * (X[0] - X[1]) / X[1] + proto = calculate_onnxscript.to_model_proto() + # The function is written with opset 18, it needs to be converted + # to the opset required by the user when the conversion starts. + proto_version = convert_version(proto, opv) + add_onnx_graph(scope, operator, container, proto_version) + + +update_registered_converter( + GrowthCalculator, + "AliasGrowthCalculator", + growth_shape_calculator, + growth_converter_onnxscript, +) + +################################### +# Let's check it works. + +onx = to_onnx(pipe_tr, data[:1], target_opset=18, options={"zipmap": False}) + + +################################### +# And again the discrepancies. + +ref = ReferenceEvaluator(onx, verbose=0) +got = ref.run(None, feeds) +assert_allclose(expected[0], got[0]) +assert_allclose(expected[1], got[1]) + + +####################################### +# Finally. +print("done.") diff --git a/docs/tutorial_2_new_converter.rst b/docs/tutorial_2_new_converter.rst index 507e663a7..101909cfb 100644 --- a/docs/tutorial_2_new_converter.rst +++ b/docs/tutorial_2_new_converter.rst @@ -42,6 +42,7 @@ an example. auto_tutorial/plot_icustom_converter auto_tutorial/plot_jcustom_syntax + auto_tutorial/plot_jfunction_transformer auto_tutorial/plot_kcustom_converter_wrapper auto_tutorial/plot_lcustom_options auto_tutorial/plot_mcustom_parser diff --git a/requirements-dev.txt b/requirements-dev.txt index d0ebc86f0..7d137d5d0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,6 +11,7 @@ wheel # docs furo +onnxscript sphinx sphinxcontrib-blockdiag tqdm diff --git a/skl2onnx/__init__.py b/skl2onnx/__init__.py index 8e73eed22..1201cee88 100644 --- a/skl2onnx/__init__.py +++ b/skl2onnx/__init__.py @@ -3,7 +3,7 @@ """ Main entry point to the converter from the *scikit-learn* to *onnx*. 
""" -__version__ = "1.16.1" +__version__ = "1.16.0" __author__ = "Microsoft" __producer__ = "skl2onnx" __producer_version__ = __version__ diff --git a/skl2onnx/operator_converters/feature_hasher.py b/skl2onnx/operator_converters/feature_hasher.py index 959a46d50..f001a05f5 100644 --- a/skl2onnx/operator_converters/feature_hasher.py +++ b/skl2onnx/operator_converters/feature_hasher.py @@ -64,7 +64,7 @@ def convert_sklearn_feature_hasher( ) else: raise RuntimeError( - f"Only one character separator are supported but delimiter is {separator!r}." + f"Only one character separators are supported but delimiter is {separator!r}." ) shape = scope.get_unique_variable_name(f"shape{i}") container.add_node("Shape", [col_to_split.full_name], [shape])