dask · SfinxCZ · Jan 8, 2019 · Dec 3, 2018 · Dec 5, 2018 · Dec 5, 2018
diff --git a/dask_lightgbm/core.py b/dask_lightgbm/core.py
@@ -1,13 +1,13 @@
 import logging
 from collections import defaultdict
 
+import dask.array as da
+import dask.dataframe as dd
 import lightgbm
 import numpy as np
 import pandas as pd
 from lightgbm.basic import _safe_call, _LIB
 from toolz import first, assoc
-import dask.dataframe as dd
-import dask.array as da
 
 try:
  import sparse
@@ -85,7 +85,6 @@ def _fit_local(params, model_factory, list_of_parts, worker_addresses, return_mo
  return None
 
 
-
 def train(client, X, y, params, model_factory, sample_weight=None, **kwargs):
  data_parts = X.to_delayed()
  label_parts = y.to_delayed()
@@ -215,6 +214,13 @@ def predict_proba(self, X, client=None, **kwargs):
  return predict(client, self.to_local(), X, proba=True, dtype=self.classes_[0].dtype, **kwargs)
  predict_proba.__doc__ = lightgbm.LGBMClassifier.predict_proba.__doc__
 
+    def score(self, X, y, client=None, compute=True):
+        """Return the mean accuracy of ``self.predict(X)`` against ``y``; lazy when ``compute`` is False."""
+        # Source: dask_ml.metrics.accuracy_score
+        predicted = self.predict(X, client=client)
+        accuracy = (y == predicted).mean()
+        return accuracy.compute() if compute else accuracy
+
  def to_local(self):
  model = lightgbm.LGBMClassifier(**self.get_params())
  model._Booster = self._Booster
@@ -227,3 +233,60 @@ def to_local(self):
  model._best_score = self._best_score
 
  return model
+
+
+class LGBMRegressor(lightgbm.LGBMRegressor):
+    """``lightgbm.LGBMRegressor`` that trains and predicts via a ``client`` (the default one if omitted)."""
+
+    def fit(self, X, y=None, sample_weight=None, client=None, **kwargs):
+        if client is None:
+            client = default_client()
+        model_factory = lightgbm.LGBMRegressor
+        params = self.get_params(True)
+
+        model = train(client, X, y, params, model_factory, sample_weight, **kwargs)
+        self.set_params(**model.get_params())
+        self._Booster = model._Booster
+        self._n_features = model._n_features
+        self._evals_result = model._evals_result
+        self._best_iteration = model._best_iteration
+        self._best_score = model._best_score
+
+        return self
+    fit.__doc__ = lightgbm.LGBMRegressor.fit.__doc__
+
+    def _network_params(self):
+        return {"machines": self.machines}
+
+    def predict(self, X, client=None, **kwargs):
+        if client is None:
+            client = default_client()
+        return predict(client, self.to_local(), X, **kwargs)
+    predict.__doc__ = lightgbm.LGBMRegressor.predict.__doc__
+
+    def score(self, X, y, client=None, compute=True):
+        """Return R^2 = 1 - SS_res / SS_tot of ``self.predict(X)`` against ``y``; lazy when ``compute`` is False."""
+        # Adapted from dask_ml.metrics.r2_score; dask frames/series are converted to dask arrays first.
+        X = X.values if isinstance(X, dd._Frame) else X
+        y = y.values if isinstance(y, dd._Frame) else y
+
+        yp = self.predict(X, client=client)
+        numerator = ((y - yp) ** 2).sum()
+        # SS_tot is taken around the mean of the targets, not the mean of the predictions.
+        denominator = ((y - y.mean()) ** 2).sum()
+
+        result = 1 - numerator / denominator
+        if compute:
+            result = result.compute()
+        return result
+
+    def to_local(self):
+        """Return a plain ``lightgbm.LGBMRegressor`` carrying this model's params and fitted attributes."""
+        model = lightgbm.LGBMRegressor(**self.get_params())
+        model._Booster = self._Booster
+        model._n_features = self._n_features
+        model._evals_result = self._evals_result
+        model._best_iteration = self._best_iteration
+        model._best_score = self._best_score
+
+        return model
diff --git a/dask_lightgbm/tests/test_core.py b/dask_lightgbm/tests/test_core.py
@@ -1,30 +1,34 @@
+# Workaround for conflict with distributed 1.23.0
+# https://github.com/dask/dask-xgboost/pull/27#issuecomment-417474734
+from concurrent.futures import ThreadPoolExecutor
+
+import dask.array as da
+import dask.dataframe as dd
+import distributed.comm.utils
+import lightgbm
 import numpy as np
 import pandas as pd
-import sparse
-import lightgbm
-import scipy.sparse
 import pytest
-
-import dask.array as da
+import scipy.sparse
+import sparse
 from dask.array.utils import assert_eq
-import dask.dataframe as dd
 from dask.distributed import Client
-from sklearn.datasets import make_blobs
 from distributed.utils_test import gen_cluster, loop, cluster # noqa
+from sklearn.datasets import make_blobs, make_regression
 from sklearn.metrics import confusion_matrix
 
 import dask_lightgbm.core as dlgbm
 
-# Workaround for conflict with distributed 1.23.0
-# https:/dask/dask-xgboost/pull/27#issuecomment-417474734
-from concurrent.futures import ThreadPoolExecutor
-import distributed.comm.utils
-
 distributed.comm.utils._offload_executor = ThreadPoolExecutor(max_workers=2)
 
 
-def _create_data(n_samples=100, centers=2, output="array", chunk_size=50):
- X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
+def _create_data(objective, n_samples=100, centers=2, output="array", chunk_size=50):
+ if objective == 'classification':
+ X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
+ elif objective == 'regression':
+ X, y = make_regression(n_samples=n_samples, random_state=42)
+ else:
+ raise ValueError(objective)
  rnd = np.random.RandomState(42)
  w = rnd.rand(X.shape[0])*0.01
 
@@ -63,19 +67,24 @@ def _create_data(n_samples=100, centers=2, output="array", chunk_size=50):
 def test_classifier(loop, output, listen_port, centers):
  with cluster() as (s, [a, b]):
  with Client(s['address'], loop=loop) as client:
- X, y, w, dX, dy, dw = _create_data(output=output, centers=centers)
+ X, y, w, dX, dy, dw = _create_data('classification', output=output, centers=centers)
 
  a = dlgbm.LGBMClassifier(local_listen_port=listen_port)
  a = a.fit(dX, dy, sample_weight=dw)
  p1 = a.predict(dX, client=client)
  p1 = p1.compute()
+ s1 = a.score(dX, dy)
 
  b = lightgbm.LGBMClassifier()
  b.fit(X, y, sample_weight=w)
  p2 = b.predict(X)
+ s2 = b.score(X, y)
  print(confusion_matrix(y, p1))
  print(confusion_matrix(y, p2))
 
+ assert_eq(s1, s2)
+ print(s1)
+
  assert_eq(p1, p2)
  assert_eq(y, p1)
  assert_eq(y, p2)
@@ -94,7 +103,7 @@ def test_classifier(loop, output, listen_port, centers):
 def test_classifier_proba(loop, output, listen_port, centers):
  with cluster() as (s, [a, b]):
  with Client(s['address'], loop=loop) as client:
- X, y, w, dX, dy, dw = _create_data(output=output, centers=centers)
+ X, y, w, dX, dy, dw = _create_data('classification', output=output, centers=centers)
 
  a = dlgbm.LGBMClassifier(local_listen_port=listen_port)
  a = a.fit(dX, dy, sample_weight=dw)
@@ -111,9 +120,9 @@ def test_classifier_proba(loop, output, listen_port, centers):
 def test_classifier_local_predict(loop): #noqa
  with cluster() as (s, [a, b]):
  with Client(s['address'], loop=loop):
- X, y, w, dX, dy, dw = _create_data(output="array")
+ X, y, w, dX, dy, dw = _create_data('classification', output="array")
 
- a = dlgbm.LGBMClassifier(local_listen_port=11400)
+ a = dlgbm.LGBMClassifier(local_listen_port=10400)
  a = a.fit(dX, dy, sample_weight=dw)
  p1 = a.to_local().predict(dX)
 
@@ -126,6 +135,88 @@ def test_classifier_local_predict(loop): #noqa
  assert_eq(y, p2)
 
 
+@pytest.mark.parametrize("output, listen_port", [
+    ('array', 31400),
+    ('scipy_csr_matrix', 32400),
+    ('sparse', 33400),
+    ('dataframe', 34400),
+])
+def test_regressor(loop, output, listen_port):
+    """Distributed regressor should score and predict like a locally trained lightgbm model."""
+    with cluster() as (scheduler, [_w1, _w2]), Client(scheduler['address'], loop=loop) as client:
+        X, y, w, dX, dy, dw = _create_data('regression', output=output)
+
+        dask_model = dlgbm.LGBMRegressor(local_listen_port=listen_port, seed=42)
+        dask_model = dask_model.fit(dX, dy, client=client, sample_weight=dw)
+        dask_score = dask_model.score(dX, dy, client=client)
+        dask_pred = dask_model.predict(dX, client=client).compute()
+
+        local_model = lightgbm.LGBMRegressor(seed=42)
+        local_model.fit(X, y, sample_weight=w)
+        local_score = local_model.score(X, y)
+        local_pred = local_model.predict(X)
+
+        # Both scores must agree within an absolute tolerance of 0.01
+        assert_eq(dask_score, local_score, atol=.01)
+        print(dask_score)
+
+        # Each prediction vector only has to track the targets loosely
+        assert_eq(y, dask_pred, rtol=1., atol=50.)
+        assert_eq(y, local_pred, rtol=1., atol=50.)
+
+
+@pytest.mark.parametrize("output, listen_port, alpha", [
+    ('array', 41400, .1),
+    ('array', 42400, .5),
+    ('array', 43400, .9),
+    ('scipy_csr_matrix', 44400, .1),
+    ('scipy_csr_matrix', 45400, .5),
+    ('scipy_csr_matrix', 46400, .9),
+    ('sparse', 47400, .1),
+    ('sparse', 48400, .5),
+    ('sparse', 49400, .9),
+    ('dataframe', 50400, .1),
+    ('dataframe', 51400, .5),
+    ('dataframe', 52400, .9),
+])
+def test_regressor_quantile(loop, output, listen_port, alpha):
+    """Fraction of targets below the quantile prediction should be close to ``alpha``."""
+    with cluster() as (s, [a, b]), Client(s['address'], loop=loop) as client:
+        X, y, w, dX, dy, dw = _create_data('regression', output=output)
+
+        a = dlgbm.LGBMRegressor(local_listen_port=listen_port, seed=42, objective='quantile', alpha=alpha)
+        a = a.fit(dX, dy, client=client, sample_weight=dw)
+        p1 = a.predict(dX, client=client).compute()
+        q1 = np.count_nonzero(y < p1) / y.shape[0]
+
+        b = lightgbm.LGBMRegressor(seed=42, objective='quantile', alpha=alpha)
+        b.fit(X, y, sample_weight=w)
+        p2 = b.predict(X)
+        q2 = np.count_nonzero(y < p2) / y.shape[0]
+
+        # Quantiles should be right; a bare np.isclose() call checks nothing, so assert it
+        assert np.isclose(q1, alpha, atol=.1)
+        assert np.isclose(q2, alpha, atol=.1)
+
+
+def test_regressor_local_predict(loop):
+    """``to_local()`` should reproduce the distributed model's predictions and score."""
+    with cluster() as (s, [a, b]), Client(s['address'], loop=loop):
+        X, y, w, dX, dy, dw = _create_data('regression', output="array")
+
+        a = dlgbm.LGBMRegressor(local_listen_port=30400, seed=42)
+        a = a.fit(dX, dy, sample_weight=dw)
+        p1 = a.predict(dX).compute()
+        p2 = a.to_local().predict(X)
+        s1 = a.score(dX, dy)
+        s2 = a.to_local().score(X, y)
+        print(s1)
+
+        # Predictions must match; scores must agree within 0.01 (was an unasserted np.isclose call)
+        assert_eq(p1, p2)
+        assert np.isclose(s1, s2, atol=.01)
+
+
 def test_build_network_params():
  workers_ips = [
  "tcp://192.168.0.1:34545",

diff --git a/system_tests/test_fit_predict.py b/system_tests/test_fit_predict.py
@@ -27,11 +27,26 @@ def test_classify_newsread(self):
  dX = data.iloc[:, :-1]
  dy = data.iloc[:, -1]
 
- d_classif = dlgbm.LGBMClassifier(n_estimators=50)
+ d_classif = dlgbm.LGBMClassifier(n_estimators=50, local_listen_port=12400)
  d_classif.fit(dX, dy)
 
- dy_pred = d_classif.predict(dX)
+ dy_pred = d_classif.predict(dX, client=self.client)
 
  print(confusion_matrix(dy.compute(), dy_pred.compute()))
 
- self.assertGreaterEqual((dy == dy_pred).sum()/len(dy), 0.9)
+ s1 = (dy == dy_pred).sum()/len(dy)
+ s2 = d_classif.score(dX, dy, client=self.client)
+ self.assertEqual(s1, s2)
+ self.assertGreaterEqual(s2, 0.8)
+
+    def test_regress_newsread(self):
+        """Fit the dask regressor on the gzipped CSV data and require a score of at least 0.8."""
+        frame = dd.read_csv("./system_tests/data/*.gz", compression="gzip", blocksize=None)
+        features, target = frame.iloc[:, 1:], frame.iloc[:, 0]
+
+        regressor = dlgbm.LGBMRegressor(n_estimators=50, local_listen_port=13400)
+        regressor.fit(features, target)
+
+        # Smoke-check predict(); its result is not inspected
+        regressor.predict(features, client=self.client)
+        self.assertGreaterEqual(regressor.score(features, target, client=self.client), 0.8)