intel · seraphimstreets · Jun 22, 2022 · Jul 29, 2022 · pdxjohnny · Jul 24, 2022
diff --git a/operations/data/dffml_operations_data/definitions.py b/operations/data/dffml_operations_data/definitions.py
@@ -3,13 +3,17 @@
 
 definitions = [
  Definition(name="input_data", primitive="List[List[int]]"),
+ Definition(name="target_data", primitive="List[int]"),
  Definition(name="output_data", primitive="List[List[int]]"),
  Definition(name="n_components", primitive="int"),
  Definition(name="n_iter", primitive="int"),
  Definition(name="random_state", primitive="int"),
  Definition(name="missing_values", primitive="Any"),
  Definition(name="strategy", primitive="str"),
  Definition(name="categories", primitive="List[List[Any]]"),
+ Definition(name="percentile", primitive="int"),
+ Definition(name="k", primitive="int"),
+ Definition(name="score_func", primitive="function")
 ]
 
 for definition in definitions:

diff --git a/operations/data/dffml_operations_data/operations.py b/operations/data/dffml_operations_data/operations.py
@@ -2,6 +2,7 @@
 from sklearn.decomposition import PCA, TruncatedSVD
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.impute import SimpleImputer
+from sklearn.feature_selection import f_classif, SelectKBest, SelectPercentile
 
 from dffml.df.base import op
 
@@ -14,6 +15,10 @@
  random_state,
  n_components,
  missing_values,
+ target_data,
+ k,
+ percentile,
+ score_func
 )
 
 
@@ -211,3 +216,63 @@ async def ordinal_encoder(data):
  enc.fit(data)
  new_data = enc.transform(data).toarray()
  return {"result": new_data}
+
+@op(
+ inputs={"data": input_data, "target_data": target_data, "k": k, "score_func": score_func},
+ outputs={"result": output_data}
+)
+async def select_k_best(data, target_data, score_func=f_classif, k=10):
+ """
+ Select the top k features, based on the score function.
+
+ Parameters
+ ----------
+ data : List[List[int]]
+ Input data, excluding the target column
+ target_data : List[int]
+ 1D list containing values for the target column.
+ score_func : function
+ Function that takes in data and target_data, and returns 
+ a pair of arrays (scores, pvalues) or a single array with
+ scores.
+ k : int
+ Number of top features to select.
+
+ Returns
+ -------
+ result: Input data reduced to the k highest-scoring features
+ """
+
+ selector = SelectKBest(score_func, k=k)
+ new_data = selector.fit_transform(data, target_data)
+ return {"result": new_data}
+
+@op(
+ inputs={"data": input_data, "target_data": target_data, "percentile": percentile, "score_func": score_func},
+ outputs={"result": output_data}
+)
+async def select_percentile(data, target_data, score_func=f_classif, percentile=10):
+ """
+ Select a certain top percentile of features, based on the score function.
+
+ References:
+
+ - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
+
+ Parameters
+ ----------
+ data : List[List[int]]
+ Input data, excluding the target column
+ target_data : List[int]
+ 1D list containing values for the target column.
+ score_func : function
+ Function that takes in data and target_data, and returns 
+ a pair of arrays (scores, pvalues) or a single array with
+ scores.
+ percentile : int
+ Percentile of top features to select.
+
+ Returns
+ -------
+ result: Input data reduced to the features scoring in the top percentile
+ """
+
+ selector = SelectPercentile(score_func, percentile=percentile)
+ new_data = selector.fit_transform(data, target_data)
+ return {"result": new_data}
diff --git a/operations/data/tests/test_operations.py b/operations/data/tests/test_operations.py
@@ -5,7 +5,7 @@
 from dffml.operation.output import GetSingle
 from dffml.df.memory import MemoryOrchestrator
 from dffml.util.asynctestcase import AsyncTestCase
-
+from sklearn.feature_selection import f_classif
 from dffml_operations_data.operations import *
 
 
@@ -225,3 +225,76 @@ async def test_ordinal_encoder(self):
  == output_data
  ).all()
  )
+
+ async def test_select_k_best(self):
+ input_data = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
+ target_data = [1,2,1,2,1,2]
+ output_data = [[1], [2], [1], [2], [1], [1]]
+
+ async for ctx, results in MemoryOrchestrator.run(
+ DataFlow.auto(select_k_best, GetSingle),
+ [
+ Input(
+ value=[select_k_best.op.outputs["result"].name],
+ definition=GetSingle.op.inputs["spec"],
+ ),
+ Input(
+ value=input_data,
+ definition=select_k_best.op.inputs["data"],
+ ),
+ Input(
+ value=target_data,
+ definition=select_k_best.op.inputs["target_data"],
+ ),
+ Input(
+ value=f_classif,
+ definition=select_k_best.op.inputs["score_func"],
+ ),
+ Input(
+ value=1,
+ definition=select_k_best.op.inputs["k"],
+ ),
+ ],
+ ):
+ self.assertTrue(
+ (
+ results[select_k_best.op.outputs["result"].name]
+ == output_data
+ ).all()
+ )
+ async def test_select_percentile(self):
+ input_data = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
+ target_data = [1,2,1,2,1,2]
+ output_data = [[1], [2], [1], [2], [1], [1]]
+
+ async for ctx, results in MemoryOrchestrator.run(
+ DataFlow.auto(select_percentile, GetSingle),
+ [
+ Input(
+ value=[select_percentile.op.outputs["result"].name],
+ definition=GetSingle.op.inputs["spec"],
+ ),
+ Input(
+ value=input_data,
+ definition=select_percentile.op.inputs["data"],
+ ),
+ Input(
+ value=target_data,
+ definition=select_percentile.op.inputs["target_data"],
+ ),
+ Input(
+ value=f_classif,
+ definition=select_percentile.op.inputs["score_func"],
+ ),
+ Input(
+ value=50,
+ definition=select_percentile.op.inputs["percentile"],
+ ),
+ ],
+ ):
+ self.assertTrue(
+ (
+ results[select_percentile.op.outputs["result"].name]
+ == output_data
+ ).all()
+ )