Skip to content
This repository has been archived by the owner on Aug 25, 2024. It is now read-only.

Adding feature selection operations #1398

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions operations/data/dffml_operations_data/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@

# Definitions shared by the data operations. Each entry pairs the name an
# operation uses for an input/output with the primitive type it carries.
definitions = [
    # Data flowing into and out of the operations.
    Definition(name="input_data", primitive="List[List[int]]"),
    Definition(name="target_data", primitive="List[int]"),
    Definition(name="output_data", primitive="List[List[int]]"),
    # Tuning parameters accepted by individual operations.
    Definition(name="n_components", primitive="int"),
    Definition(name="n_iter", primitive="int"),
    Definition(name="random_state", primitive="int"),
    Definition(name="missing_values", primitive="Any"),
    Definition(name="strategy", primitive="str"),
    Definition(name="categories", primitive="List[List[Any]]"),
    # Feature selection parameters (select_k_best / select_percentile).
    Definition(name="percentile", primitive="int"),
    Definition(name="k", primitive="int"),
    Definition(name="score_func", primitive="function"),
]

for definition in definitions:
Expand Down
65 changes: 65 additions & 0 deletions operations/data/dffml_operations_data/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import f_classif, SelectKBest, SelectPercentile

from dffml.df.base import op

Expand All @@ -14,6 +15,10 @@
random_state,
n_components,
missing_values,
target_data,
k,
percentile,
score_func
)


Expand Down Expand Up @@ -211,3 +216,63 @@ async def ordinal_encoder(data):
enc.fit(data)
new_data = enc.transform(data).toarray()
return {"result": new_data}

@op(
    inputs={
        "data": input_data,
        "target_data": target_data,
        "k": k,
        "score_func": score_func,
    },
    outputs={"result": output_data},
)
async def select_k_best(data, target_data, score_func=f_classif, k=10):
    """
    Select the top k features, based on the score function.

    References:

    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

    Parameters
    ----------
    data : List[List[int]]
        Input data, excluding the target column
    target_data : List[int]
        1D list containing values for the target column.
    score_func : function
        Function that takes in data and target_data, and returns
        a pair of arrays (scores, pvalues) or a single array with
        scores.
    k : int
        Number of top features to select.

    Returns
    -------
    result: Input data reduced to the k highest scoring features
    """
    # Fit the selector against the target and keep only the winning columns
    # in a single step.
    selector = SelectKBest(score_func, k=k)
    new_data = selector.fit_transform(data, target_data)
    return {"result": new_data}

@op(
    inputs={
        "data": input_data,
        "target_data": target_data,
        "percentile": percentile,
        "score_func": score_func,
    },
    outputs={"result": output_data},
)
async def select_percentile(
    data, target_data, score_func=f_classif, percentile=10
):
    """
    Select a certain top percentile of features, based on the score function.

    References:

    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html

    Parameters
    ----------
    data : List[List[int]]
        Input data, excluding the target column
    target_data : List[int]
        1D list containing values for the target column.
    score_func : function
        Function that takes in data and target_data, and returns
        a pair of arrays (scores, pvalues) or a single array with
        scores.
    percentile : int
        Percent of the highest scoring features to keep.

    Returns
    -------
    result: Input data reduced to the features within the top percentile
    """
    # Fit the selector against the target and keep only the winning columns
    # in a single step.
    selector = SelectPercentile(score_func, percentile=percentile)
    new_data = selector.fit_transform(data, target_data)
    return {"result": new_data}
75 changes: 74 additions & 1 deletion operations/data/tests/test_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from dffml.operation.output import GetSingle
from dffml.df.memory import MemoryOrchestrator
from dffml.util.asynctestcase import AsyncTestCase

from sklearn.feature_selection import f_classif
from dffml_operations_data.operations import *


Expand Down Expand Up @@ -225,3 +225,76 @@ async def test_ordinal_encoder(self):
== output_data
).all()
)

async def test_select_k_best(self):
    """Run select_k_best with k=1 through a dataflow and check its result."""
    # Two feature columns; the expected result is the second column alone.
    features = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
    labels = [1, 2, 1, 2, 1, 2]
    expected = [[1], [2], [1], [2], [1], [1]]

    dataflow = DataFlow.auto(select_k_best, GetSingle)
    op_inputs = select_k_best.op.inputs
    result_name = select_k_best.op.outputs["result"].name

    test_inputs = [
        # Ask GetSingle to surface the operation's result.
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=features, definition=op_inputs["data"]),
        Input(value=labels, definition=op_inputs["target_data"]),
        Input(value=f_classif, definition=op_inputs["score_func"]),
        Input(value=1, definition=op_inputs["k"]),
    ]

    async for ctx, results in MemoryOrchestrator.run(dataflow, test_inputs):
        matches = results[result_name] == expected
        self.assertTrue(matches.all())
async def test_select_percentile(self):
    """Run select_percentile at 50 percent and check its result."""
    # Two feature columns; the expected result is the second column alone.
    features = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
    labels = [1, 2, 1, 2, 1, 2]
    expected = [[1], [2], [1], [2], [1], [1]]

    dataflow = DataFlow.auto(select_percentile, GetSingle)
    op_inputs = select_percentile.op.inputs
    result_name = select_percentile.op.outputs["result"].name

    test_inputs = [
        # Ask GetSingle to surface the operation's result.
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=features, definition=op_inputs["data"]),
        Input(value=labels, definition=op_inputs["target_data"]),
        Input(value=f_classif, definition=op_inputs["score_func"]),
        Input(value=50, definition=op_inputs["percentile"]),
    ]

    async for ctx, results in MemoryOrchestrator.run(dataflow, test_inputs):
        matches = results[result_name] == expected
        self.assertTrue(matches.all())