Commit
Addressed situation when assign_default_confidence() returns only dataframe with all NaN confidence values (#548)

Ok, so here was the problem:

When the dataframe whose redundant rows had to be filtered out had only
`NaN` values in its confidence column, the line


https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L441

returned an empty dataframe as `df` and the entire source dataframe as
`nan_df`.
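
To make the failure mode concrete, here is a minimal sketch of that split, assuming the line partitions rows by whether confidence is `NaN` (illustrative code, not the verbatim source):

```python
import numpy as np
import pandas as pd

# Assumed shape of the split at util.py#L441: rows with a confidence
# score stay in `df`, rows without one go to `nan_df`.
source = pd.DataFrame({"confidence": [np.nan, np.nan, np.nan]})
nan_df = source[source["confidence"].isna()]
df = source[~source["confidence"].isna()]
print(len(df), len(nan_df))  # 0 3 -- `df` is empty when every confidence is NaN
```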

Due to this, the following line:


https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L447

resulted in `dfmax` being empty and of type `pandas.Series` rather than
the annotated `pd.DataFrame`. Hence the confusion.
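
The degradation is easy to reproduce on an empty frame (illustrative column names; the key point is that the groupby/apply no longer yields a `pd.DataFrame`):

```python
import pandas as pd

# On an empty dataframe, grouping and applying max returns an empty
# object; on the pandas versions affected it is a pandas.Series, not
# the DataFrame the downstream iterrows() loop expects.
df = pd.DataFrame(columns=["subject_id", "object_id", "confidence"])
dfmax = df.groupby(["subject_id", "object_id"], as_index=False)["confidence"].apply(max)
print(type(dfmax))
```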

The correct way to handle this is simply adding an `if` statement:


https://github.com/mapping-commons/sssom-py/blob/ffa2109616020f994196cbb827d71bca17192014/src/sssom/util.py#L447-L469

I've added an explicit test and it passes. Fixes #546
hrshdhgd authored Jun 26, 2024
1 parent edf432a commit e0dfcb3
Showing 2 changed files with 29 additions and 23 deletions.
45 changes: 22 additions & 23 deletions src/sssom/util.py
@@ -444,33 +444,32 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate: bool = False) -> pd.DataFrame:
     else:
         key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID]
     dfmax: pd.DataFrame
-    dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
-    max_conf: Dict[Tuple[str, ...], float] = {}
-    for _, row in dfmax.iterrows():
+    if not df.empty:
+        dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
+        max_conf: Dict[Tuple[str, ...], float] = {}
+        for _, row in dfmax.iterrows():
+            if ignore_predicate:
+                max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
+            else:
+                max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
         if ignore_predicate:
-            max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
+            df = df[
+                df.apply(
+                    lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
+                    axis=1,
+                )
+            ]
         else:
-            max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
-    if ignore_predicate:
-        df = df[
-            df.apply(
-                lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
-                axis=1,
-            )
-        ]
-    else:
-        df = df[
-            df.apply(
-                lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
-                axis=1,
-            )
-        ]
+            df = df[
+                df.apply(
+                    lambda x: x[CONFIDENCE]
+                    >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
+                    axis=1,
+                )
+            ]
     # We are preserving confidence = NaN rows without making assumptions.
     # This means that there are potential duplicate mappings
     # FutureWarning: The frame.append method is deprecated and
     # will be removed from pandas in a future version.
     # Use pandas.concat instead.
     # return_df = df.append(nan_df).drop_duplicates()

     confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates()

     # Reconciling dataframe rows based on the predicates with equal confidence.
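
For context on the preserved comment block above: `pd.concat` replaces the deprecated `frame.append`, and the held-out NaN-confidence rows are reattached rather than dropped. A minimal sketch with hypothetical rows:

```python
import numpy as np
import pandas as pd

# Scored rows survive the max-confidence filter; NaN rows are simply
# concatenated back. drop_duplicates only removes exact duplicate rows,
# so the NaN-confidence mapping is preserved.
df = pd.DataFrame({"subject_id": ["A:1"], "confidence": [0.9]})
nan_df = pd.DataFrame({"subject_id": ["A:2"], "confidence": [np.nan]})
confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates()
print(len(confidence_reconciled_df))  # 2 -- the NaN row is kept
```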
7 changes: 7 additions & 0 deletions tests/test_reconcile.py
@@ -22,6 +22,13 @@ def test_filter(self):
         df2 = filter_redundant_rows(self.msdf2.df)
         self.assertEqual(18, len(df2.index))

+        # Create a new dataframe with the confidence column having NaN values
+        import numpy as np
+
+        self.msdf1.df["confidence"] = np.NAN
+        df3 = filter_redundant_rows(self.msdf1.df)
+        self.assertEqual(11, len(df3.index))
+
     def test_deal_with_negation(self):
         """Test handling negating returns the right number of rows."""
         df1 = deal_with_negation(self.msdf1.df)
