From ed02dec00b5aac8415f28abbd51e39c9547ded80 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 6 Apr 2023 14:16:01 -0400 Subject: [PATCH 1/2] Count only for the first hit for subject or tissue within filename Also added assertion so we do not count incorrectly. But maybe it should be just a warning? Closes #172 --- dandischema/metadata.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/dandischema/metadata.py b/dandischema/metadata.py index b1f4134..8fbd809 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -311,13 +311,16 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None: stats = _get_samples(value, stats, hierarchy) break + # which components already found, so we do not count more than + # once in some incorrectly named datasets + found: Dict[str, str] = {} for part in Path(assetmeta["path"]).name.split(".")[0].split("_"): - if part.startswith("sub-"): - subject = part.replace("sub-", "") + if found.get("subject") and part.startswith("sub-"): + found["subject"] = subject = part.split("sub-", 1)[1] if subject not in stats["subjects"]: stats["subjects"].append(subject) - if part.startswith("sample-"): - sample = part.replace("sample-", "") + if not found.get("sample") and part.startswith("sample-"): + found["sample"] = sample = part.replace("sample-", "") if sample not in stats["tissuesample"]: stats["tissuesample"].append(sample) @@ -338,10 +341,13 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict: stats: _stats_type = {} for meta in metadata: _add_asset_to_stats(meta, stats) - stats["numberOfBytes"] = stats.get("numberOfBytes", 0) stats["numberOfFiles"] = stats.get("numberOfFiles", 0) stats["numberOfSubjects"] = len(stats.pop("subjects", [])) or None + if stats["numberOfSubjects"]: + # Must not happen. 
If it does -- it is a bug in software + assert stats["numberOfFiles"] + assert stats["numberOfSubjects"] <= stats["numberOfFiles"] stats["numberOfSamples"] = ( len(stats.pop("tissuesample", [])) + len(stats.pop("slice", [])) ) or None From d22ac9c9fcc7bdabeed5348c27c164973603381f Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 19 Apr 2023 11:33:17 -0400 Subject: [PATCH 2/2] Fix logic to count only if not yet present --- dandischema/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dandischema/metadata.py b/dandischema/metadata.py index 8fbd809..0dcf779 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -315,7 +315,7 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None: # once in some incorrectly named datasets found: Dict[str, str] = {} for part in Path(assetmeta["path"]).name.split(".")[0].split("_"): - if found.get("subject") and part.startswith("sub-"): + if not found.get("subject") and part.startswith("sub-"): found["subject"] = subject = part.split("sub-", 1)[1] if subject not in stats["subjects"]: stats["subjects"].append(subject)