From f25740813787c37fb2c99c5373acdefb18260297 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Fri, 29 May 2020 14:41:14 -0400 Subject: [PATCH 1/4] Updates to CrosscheckFingerprints documentation. --- .../fingerprint/CrosscheckFingerprints.java | 22 +++++------ .../picard/fingerprint/CrosscheckMetric.java | 37 ++++++++++--------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java index 2747cfa52a..44bfc5a9f9 100644 --- a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java +++ b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java @@ -78,8 +78,8 @@ * and sample for VCF files) and then optionally aggregates it by library, sample or file, to increase power and provide * results at the desired resolution. Output is in a "Moltenized" format, one row per comparison. The results will * be emitted into a metric file for the class {@link CrosscheckMetric}. - * In this format the output will include the LOD score and also tumor-aware LOD score which can - * help assess identity even in the presence of a severe loss of heterozygosity with high purity (which could + * In this format, the output will include both the LOD score and the tumor-aware LOD score; the latter can + * help assess identity even in the presence of a severe loss of heterozygosity with high purity (the tool could * otherwise fail to notice that samples are from the same individual.) * A matrix output is also available to facilitate visual inspection of crosscheck results. *
@@ -87,14 +87,14 @@ * as a follow-up step to running CrosscheckFingerprints. *
* There are cases where one would like to identify a few groups out of a collection of many possible groups (say - * to link a bam to it's correct sample in a multi-sample vcf. In this case one would not case for the cross-checking + * to link a bam to its correct sample in a multi-sample vcf. In this case, one would not care for the cross-checking * of the various samples in the VCF against each other, but only in checking the identity of the bam against the various * samples in the vcf. The {@link #SECOND_INPUT} is provided for this use-case. With {@link #SECOND_INPUT} provided, CrosscheckFingerprints * does the following: * *
  • aggregation of data happens independently for the input files in {@link #INPUT} and {@link #SECOND_INPUT}.
  • *
  • aggregation of data happens at the SAMPLE level.
  • - *
  • each samples from {@link #INPUT} will only be compared to that same sample in {@link #INPUT}.
  • + *
  • each sample from {@link #INPUT} will only be compared to that same sample in {@link #SECOND_INPUT}.
  • *
  • {@link #MATRIX_OUTPUT} is disabled.
  • *
    *
    @@ -131,13 +131,13 @@ * * This tool calculates the LOD score for identity check between "groups" of data in the INPUT files as defined by * the CROSSCHECK_BY argument. A positive value indicates that the data seems to have come from the same individual - * or, in other words the identity checks out. The scale is logarithmic (base 10), so a LOD of 6 indicates - * that it is 1,000,000 more likely that the data matches the genotypes than not. A negative value indicates + * or, in other words, the identity checks out. The scale is logarithmic (base 10), so a LOD of 6 indicates + * that it is 1,000,000 times more likely that the data matches the genotypes than not. A negative value indicates * that the data do not match. A score that is near zero is inconclusive and can result from low coverage * or non-informative genotypes. Each group is assigned a sample identifier (for SAM this is taken from the SM tag in * the appropriate readgroup header line, for VCF this is taken from the column label in the file-header. * After combining all the data from the same "group" together, an all-against-all comparison is performed. Results are - * categorized a {@link FingerprintResult} enum: EXPECTED_MATCH, EXPECTED_MISMATCH, UNEXPECTED_MATCH, UNEXPECTED_MISMATCH, + * categorized as a {@link FingerprintResult} enum: EXPECTED_MATCH, EXPECTED_MISMATCH, UNEXPECTED_MATCH, UNEXPECTED_MISMATCH, * or AMBIGUOUS depending on the LOD score and on whether the sample identifiers of the groups agree: LOD scores that are * less than LOD_THRESHOLD are considered mismatches, and those greater than -LOD_THRESHOLD are matches (between is ambiguous). * If the sample identifiers are equal, the groups are expected to match. They are expected to mismatch otherwise. @@ -317,7 +317,7 @@ public class CrosscheckFingerprints extends CommandLineProgram { "the groups are from the same individual. 
") public double LOD_THRESHOLD = 0; - @Argument(doc = "Specificies which data-type should be used as the basic comparison unit. Fingerprints from readgroups can " + + @Argument(doc = "Specifies which data-type should be used as the basic comparison unit. Fingerprints from readgroups can " + "be \"rolled-up\" to the LIBRARY, SAMPLE, or FILE level before being compared." + " Fingerprints from VCF can be be compared by SAMPLE or FILE.") public CrosscheckMetric.DataType CROSSCHECK_BY = CrosscheckMetric.DataType.READGROUP; @@ -325,7 +325,7 @@ public class CrosscheckFingerprints extends CommandLineProgram { @Argument(doc = "The number of threads to use to process files and generate fingerprints.") public int NUM_THREADS = 1; - @Argument(doc = "specifies whether the Tumor-aware result should be calculated. These are time consuming and can roughly double the " + + @Argument(doc = "Specifies whether the Tumor-aware result should be calculated. These are time consuming and can roughly double the " + "runtime of the tool. When crosschecking many groups not calculating the tumor-aware results can result in a significant speedup.") public boolean CALCULATE_TUMOR_AWARE_RESULTS = true; @@ -337,7 +337,7 @@ public class CrosscheckFingerprints extends CommandLineProgram { "the expected sample. Must be greater than zero. 
") public double GENOTYPING_ERROR_RATE = 0.01; - @Argument(doc = "If true then only groups that do not relate to each other as expected will have their LODs reported.") + @Argument(doc = "If true, then only groups that do not relate to each other as expected will have their LODs reported.") public boolean OUTPUT_ERRORS_ONLY = false; @Argument(doc = "The rate at which a heterozygous genotype in a normal sample turns into a homozygous (via loss of heterozygosity) " + @@ -352,7 +352,7 @@ public class CrosscheckFingerprints extends CommandLineProgram { @Argument(doc = "When one or more mismatches between groups is detected, exit with this value instead of 0.") public int EXIT_CODE_WHEN_MISMATCH = 1; - @Argument(doc = "When all LOD score are zero, exit with this value.") + @Argument(doc = "When all LOD scores are zero, exit with this value.") public int EXIT_CODE_WHEN_NO_VALID_CHECKS = 1; @Hidden diff --git a/src/main/java/picard/fingerprint/CrosscheckMetric.java b/src/main/java/picard/fingerprint/CrosscheckMetric.java index f94a1b0050..2f86d04fa3 100644 --- a/src/main/java/picard/fingerprint/CrosscheckMetric.java +++ b/src/main/java/picard/fingerprint/CrosscheckMetric.java @@ -59,6 +59,7 @@ public Boolean isMatch() { } } + /** The data type. */ public enum DataType { FILE, SAMPLE, @@ -66,44 +67,46 @@ public enum DataType { READGROUP } + /** The LEFT group value. */ public String LEFT_GROUP_VALUE; + /** The RIGHT group value. */ public String RIGHT_GROUP_VALUE; - // The overall result of the match + /** The overall result of the match. */ public FingerprintResult RESULT; - // The data type that was being compared + /** The data type that was being compared. */ public DataType DATA_TYPE; - // The resulting LOD score comparing LEFT and RIGHT data + /** The resulting LOD score comparing LEFT and RIGHT data. 
*/ public Double LOD_SCORE; - // The resulting LOD score comparing LEFT as tumor and RIGHT as normal + /** The resulting LOD score comparing LEFT as tumor and RIGHT as normal. */ public Double LOD_SCORE_TUMOR_NORMAL; - // The resulting LOD score comparing LEFT as normal and RIGHT as tumor + /** The resulting LOD score comparing LEFT as normal and RIGHT as tumor. */ public Double LOD_SCORE_NORMAL_TUMOR; - // The LEFT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG + /** The LEFT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG. */ public String LEFT_RUN_BARCODE; - // The LEFT lane + /** The LEFT lane. */ public Integer LEFT_LANE; - // The LEFT molecular (sample) barcode + /** The LEFT molecular (sample) barcode. */ public String LEFT_MOLECULAR_BARCODE_SEQUENCE; - // The LEFT library identifier + /** The LEFT library identifier. */ public String LEFT_LIBRARY; - // The LEFT sample identifier + /** The LEFT sample identifier. */ public String LEFT_SAMPLE; - // The LEFT file from which the fingerprint was obtained + /** The LEFT file from which the fingerprint was obtained. */ public String LEFT_FILE; - // The RIGHT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG + /** The RIGHT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG. */ public String RIGHT_RUN_BARCODE; - // The LEFT lane + /** The RIGHT lane. */ public Integer RIGHT_LANE; - // The LEFT molecular (sample) barcode + /** The RIGHT molecular (sample) barcode. */ public String RIGHT_MOLECULAR_BARCODE_SEQUENCE; - // The LEFT library identifier + /** The RIGHT library identifier. */ public String RIGHT_LIBRARY; - // The LEFT sample identifier + /** The RIGHT sample identifier. */ public String RIGHT_SAMPLE; - // The LEFT file from which the fingerprint was obtained + /** The RIGHT file from which the fingerprint was obtained. 
*/ public String RIGHT_FILE; } From 17b9cdbc3c98b40b435fc3a8fa54aa9a6d991f21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Derek=20Caetano-Anoll=C3=A9s?= Date: Wed, 15 Sep 2021 20:49:30 +0200 Subject: [PATCH 2/4] Update CrosscheckFingerprints summary I've revised the introduction to the CrosscheckFingerprints doc for improved readability. The previous reference to "moltenized" output has been changed to "molten" for the sake of standardization. --- .../fingerprint/CrosscheckFingerprints.java | 48 +++++-------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java index c28bf84667..80f568ffbf 100644 --- a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java +++ b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java @@ -157,45 +157,23 @@ @CommandLineProgramProperties( summary = - "Checks that all data in the set of input files appear to come from the same " + - "individual. Can be used to cross-check readgroups, libraries, samples, or files. " + - "Operates on SAM/BAM/CRAM and VCF (including gVCF and gzipped-VCF). " + + "Checks the odds that all data in the set of input files come from the same individual. Can be used to cross-check readgroups, libraries, samples, or files. Acceptable inputs include BAM/SAM/CRAM and VCF/GVCF files. Output delivers LOD scores in the form of a CrosscheckMetric file. \n" + "\n" + "

    Summary

    \n" + - "Checks if all the genetic data within a set of files appear to come from the same individual. " + - "It quickly determines whether a group's genotype matches that of an input file by selective sampling, " + - "and has been designed to work well for low-depth SAM (as well as high depth ones and VCFs.) " + - "The tool collects fingerprints (essentially, genotype information from different parts of the genome) " + - "at the finest level available in the data (readgroup for read-data files " + - "and sample for variant-data files) and then optionally aggregates it by library, sample or file, to increase power and provide " + - "results at the desired resolution. Output is in a \"Moltenized\" format, one row per comparison. The results are " + - "emitted into a CrosscheckMetric metric file. " + - "In this format the output will include the LOD score and also tumor-aware LOD score which can " + - "help assess identity even in the presence of a severe loss of heterozygosity with high purity (which could cause it to " + - "otherwise fail to notice that samples are from the same individual.) " + - "A matrix output is also available to facilitate visual inspection of crosscheck results.\n " + + "CrosscheckFingerprints rapidly checks the odds that all of the genetic data within a set of files come from the same individual. This is accomplished by selectively sampling from the input files, and determining whether the genotypes of the specified readgroups match each other. \n " + + "Output is generated in the form of a “molten” (one row per comparison) CrosscheckMetric file that includes the Logarithm of the Odds (LOD) score, as well as the tumor-aware LOD score. Tumor-aware LOD scores can be used to assess genotypic identity in the presence of a severe Loss of Heterozygosity (LOH) with high purity—this could otherwise lead to a failure of the tool to identify that samples are from the same individual. 
Output is also available as a matrix, to facilitate visual inspection of crosscheck results. \n" + + "Metric files can contain many rows of output. We therefore recommend following up CrosscheckFingerprints with a step using [ClusterCrosscheckMetrics (Picard)](https://gatk.broadinstitute.org/hc/en-us/articles/360045798972--Tool-Documentation-Index); this tool will cluster groups together that pass a designated LOD threshold, ensuring that groups within the cluster are related to each other. \n" + + "There may be cases where several groups out of a collection of possible groups must be identified—for example, to link a BAM to its correct sample in a multi-sample VCF. In this case, it would not be necessary to cross-check the various samples in the VCF against each other, but only to check the identity of the BAM against the various samples in the VCF. For this application, the SECOND_INPUT argument is provided. With SECOND_INPUT, CrosscheckFingerprints does the following: \n" + + " - Independently aggregates data for the input files in INPUT and SECOND_INPUT. \n" + + " - Aggregates data at the SAMPLE level. \n" + + " - Compares samples from INPUT to the same sample in SECOND_INPUT. \n" + + " - Disables MATRIX_OUTPUT. \n" + "\n" + - "Since there can be many rows of output in the metric file, we recommend the use of ClusterCrosscheckMetrics " + - "as a follow-up step to running CrosscheckFingerprints.\n " + + "In other cases, the groups collected may not have any observations (‘reads’ for BAM files, or ‘calls’ for VCF files) at fingerprinting sites. Alternatively, a sample in INPUT may be missing from SECOND_INPUT. 
These cases are handled as follows: \n" + + " - If running in CHECK_SAME_SAMPLES mode with the INPUT and SECOND_INPUT sets of input files: when either set of inputs (1) includes a sample not found in the other, or (2) contains a sample with no observations at any fingerprinting sites, then an error will be logged and the tool will return EXIT_CODE_WHEN_MISMATCH. \n" + + " - If running in any other running mode: when a group which is being crosschecked does not have any observations at fingerprinting sites, a warning will be logged. \n" + "\n" + - "There are cases where one would like to identify a few groups out of a collection of many possible groups (say " + - "to link a SAM to its correct sample in a multi-sample VCF. In this case one would not case for the cross-checking " + - "of the various samples in the VCF against each other, but only in checking the identity of the SAM against the various " + - "samples in the VCF. The SECOND_INPUT is provided for this use-case. With SECOND_INPUT provided, CrosscheckFingerprints " + - "does the following:\n" + - " - aggregation of data happens independently for the input files in INPUT and SECOND_INPUT. \n" + - " - aggregation of data happens at the SAMPLE level \n" + - " - each samples from INPUT will only be compared to that same sample in SECOND_INPUT. \n" + - " - MATRIX_OUTPUT is disabled. " + - "\n" + - "In some cases, the groups collected may not have any observations (calls for a VCF, reads for a SAM) at fingerprinting sites, or " + - "a sample in INPUT may be missing from the SECOND_INPUT. These cases are handled as follows: If running in CHECK_SAME_SAMPLES mode " + - "with INPUT and SECOND_INPUT, and either INPUT or SECOND_INPUT includes a sample not found in the other, or contains a sample with " + - "no observations at any fingerprinting sites, an error will be logged and the tool will return EXIT_CODE_WHEN_MISMATCH. 
In all other " + - "running modes, when any group which is being crosschecked does not have any observations at fingerprinting sites, a warning is " + - "logged. As long as there is at least one comparison where both sides have observations at fingerprinting sites, the tool will " + - "return zero. However, if all comparisons have at least one side with no observations at fingerprinting sites, an error will be " + - "logged and the tool will return EXIT_CODE_WHEN_NO_VALID_CHECKS." + + "Note that, as long as there is at least one comparison in which both sides have observations at fingerprinting sites, the tool will return a ‘zero’. However, an error will be logged and the tool will return EXIT_CODE_WHEN_NO_VALID_CHECKS if all comparisons have at least one side without observations at fingerprinting sites (i.e., all LOD scores are zero). \n" + "\n" + "
    " + "

    Examples

    " + From 68f779a1e825066f7012cbcca93969e163e6d64b Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Fri, 29 May 2020 14:41:14 -0400 Subject: [PATCH 3/4] Updates to CrosscheckFingerprints documentation. --- .../fingerprint/CrosscheckFingerprints.java | 22 +++++------ .../picard/fingerprint/CrosscheckMetric.java | 37 ++++++++++--------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java index c8c2893821..c28bf84667 100644 --- a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java +++ b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java @@ -79,8 +79,8 @@ * and sample for VCF files) and then optionally aggregates it by library, sample or file, to increase power and provide * results at the desired resolution. Output is in a "Moltenized" format, one row per comparison. The results will * be emitted into a metric file for the class {@link CrosscheckMetric}. - * In this format the output will include the LOD score and also tumor-aware LOD score which can - * help assess identity even in the presence of a severe loss of heterozygosity with high purity (which could + * In this format, the output will include both the LOD score and the tumor-aware LOD score; the latter can + * help assess identity even in the presence of a severe loss of heterozygosity with high purity (the tool could * otherwise fail to notice that samples are from the same individual.) * A matrix output is also available to facilitate visual inspection of crosscheck results. *
    @@ -88,14 +88,14 @@ * as a follow-up step to running CrosscheckFingerprints. *
    * There are cases where one would like to identify a few groups out of a collection of many possible groups (say - * to link a bam to it's correct sample in a multi-sample vcf. In this case one would not case for the cross-checking + * to link a bam to its correct sample in a multi-sample vcf. In this case, one would not care for the cross-checking * of the various samples in the VCF against each other, but only in checking the identity of the bam against the various * samples in the vcf. The {@link #SECOND_INPUT} is provided for this use-case. With {@link #SECOND_INPUT} provided, CrosscheckFingerprints * does the following: * *
  • aggregation of data happens independently for the input files in {@link #INPUT} and {@link #SECOND_INPUT}.
  • *
  • aggregation of data happens at the SAMPLE level.
  • - *
  • each samples from {@link #INPUT} will only be compared to that same sample in {@link #INPUT}.
  • + *
  • each sample from {@link #INPUT} will only be compared to that same sample in {@link #SECOND_INPUT}.
  • *
  • {@link #MATRIX_OUTPUT} is disabled.
  • *
    *
    @@ -132,13 +132,13 @@ * * This tool calculates the LOD score for identity check between "groups" of data in the INPUT files as defined by * the CROSSCHECK_BY argument. A positive value indicates that the data seems to have come from the same individual - * or, in other words the identity checks out. The scale is logarithmic (base 10), so a LOD of 6 indicates - * that it is 1,000,000 more likely that the data matches the genotypes than not. A negative value indicates + * or, in other words, the identity checks out. The scale is logarithmic (base 10), so a LOD of 6 indicates + * that it is 1,000,000 times more likely that the data matches the genotypes than not. A negative value indicates * that the data do not match. A score that is near zero is inconclusive and can result from low coverage * or non-informative genotypes. Each group is assigned a sample identifier (for SAM this is taken from the SM tag in * the appropriate readgroup header line, for VCF this is taken from the column label in the file-header. * After combining all the data from the same "group" together, an all-against-all comparison is performed. Results are - * categorized a {@link FingerprintResult} enum: EXPECTED_MATCH, EXPECTED_MISMATCH, UNEXPECTED_MATCH, UNEXPECTED_MISMATCH, + * categorized as a {@link FingerprintResult} enum: EXPECTED_MATCH, EXPECTED_MISMATCH, UNEXPECTED_MATCH, UNEXPECTED_MISMATCH, * or AMBIGUOUS depending on the LOD score and on whether the sample identifiers of the groups agree: LOD scores that are * less than LOD_THRESHOLD are considered mismatches, and those greater than -LOD_THRESHOLD are matches (between is ambiguous). * If the sample identifiers are equal, the groups are expected to match. They are expected to mismatch otherwise. @@ -328,7 +328,7 @@ public class CrosscheckFingerprints extends CommandLineProgram { "the groups are from the same individual. 
") public double LOD_THRESHOLD = 0; - @Argument(doc = "Specificies which data-type should be used as the basic comparison unit. Fingerprints from readgroups can " + + @Argument(doc = "Specifies which data-type should be used as the basic comparison unit. Fingerprints from readgroups can " + "be \"rolled-up\" to the LIBRARY, SAMPLE, or FILE level before being compared." + " Fingerprints from VCF can be be compared by SAMPLE or FILE.") public CrosscheckMetric.DataType CROSSCHECK_BY = CrosscheckMetric.DataType.READGROUP; @@ -336,7 +336,7 @@ public class CrosscheckFingerprints extends CommandLineProgram { @Argument(doc = "The number of threads to use to process files and generate fingerprints.") public int NUM_THREADS = 1; - @Argument(doc = "specifies whether the Tumor-aware result should be calculated. These are time consuming and can roughly double the " + + @Argument(doc = "Specifies whether the Tumor-aware result should be calculated. These are time consuming and can roughly double the " + "runtime of the tool. When crosschecking many groups not calculating the tumor-aware results can result in a significant speedup.") public boolean CALCULATE_TUMOR_AWARE_RESULTS = true; @@ -348,7 +348,7 @@ public class CrosscheckFingerprints extends CommandLineProgram { "the expected sample. Must be greater than zero. 
") public double GENOTYPING_ERROR_RATE = 0.01; - @Argument(doc = "If true then only groups that do not relate to each other as expected will have their LODs reported.") + @Argument(doc = "If true, then only groups that do not relate to each other as expected will have their LODs reported.") public boolean OUTPUT_ERRORS_ONLY = false; @Argument(doc = "The rate at which a heterozygous genotype in a normal sample turns into a homozygous (via loss of heterozygosity) " + @@ -363,7 +363,7 @@ public class CrosscheckFingerprints extends CommandLineProgram { @Argument(doc = "When one or more mismatches between groups is detected, exit with this value instead of 0.") public int EXIT_CODE_WHEN_MISMATCH = 1; - @Argument(doc = "When all LOD score are zero, exit with this value.") + @Argument(doc = "When all LOD scores are zero, exit with this value.") public int EXIT_CODE_WHEN_NO_VALID_CHECKS = 1; @Argument(doc = "Maximal effect of any single haplotype block on outcome (-log10 of maximal likelihood difference between the different values for the three possible genotypes).", minValue = 0) diff --git a/src/main/java/picard/fingerprint/CrosscheckMetric.java b/src/main/java/picard/fingerprint/CrosscheckMetric.java index f94a1b0050..2f86d04fa3 100644 --- a/src/main/java/picard/fingerprint/CrosscheckMetric.java +++ b/src/main/java/picard/fingerprint/CrosscheckMetric.java @@ -59,6 +59,7 @@ public Boolean isMatch() { } } + /** The data type. */ public enum DataType { FILE, SAMPLE, @@ -66,44 +67,46 @@ public enum DataType { READGROUP } + /** The LEFT group value. */ public String LEFT_GROUP_VALUE; + /** The RIGHT group value. */ public String RIGHT_GROUP_VALUE; - // The overall result of the match + /** The overall result of the match. */ public FingerprintResult RESULT; - // The data type that was being compared + /** The data type that was being compared. 
*/ public DataType DATA_TYPE; - // The resulting LOD score comparing LEFT and RIGHT data + /** The resulting LOD score comparing LEFT and RIGHT data. */ public Double LOD_SCORE; - // The resulting LOD score comparing LEFT as tumor and RIGHT as normal + /** The resulting LOD score comparing LEFT as tumor and RIGHT as normal. */ public Double LOD_SCORE_TUMOR_NORMAL; - // The resulting LOD score comparing LEFT as normal and RIGHT as tumor + /** The resulting LOD score comparing LEFT as normal and RIGHT as tumor. */ public Double LOD_SCORE_NORMAL_TUMOR; - // The LEFT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG + /** The LEFT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG. */ public String LEFT_RUN_BARCODE; - // The LEFT lane + /** The LEFT lane. */ public Integer LEFT_LANE; - // The LEFT molecular (sample) barcode + /** The LEFT molecular (sample) barcode. */ public String LEFT_MOLECULAR_BARCODE_SEQUENCE; - // The LEFT library identifier + /** The LEFT library identifier. */ public String LEFT_LIBRARY; - // The LEFT sample identifier + /** The LEFT sample identifier. */ public String LEFT_SAMPLE; - // The LEFT file from which the fingerprint was obtained + /** The LEFT file from which the fingerprint was obtained. */ public String LEFT_FILE; - // The RIGHT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG + /** The RIGHT run barcode (PU field) expected to look like : D047KACXX110901.1.ACCAACTG. */ public String RIGHT_RUN_BARCODE; - // The LEFT lane + /** The RIGHT lane. */ public Integer RIGHT_LANE; - // The LEFT molecular (sample) barcode + /** The RIGHT molecular (sample) barcode. */ public String RIGHT_MOLECULAR_BARCODE_SEQUENCE; - // The LEFT library identifier + /** The RIGHT library identifier. */ public String RIGHT_LIBRARY; - // The LEFT sample identifier + /** The RIGHT sample identifier. 
*/ public String RIGHT_SAMPLE; - // The LEFT file from which the fingerprint was obtained + /** The RIGHT file from which the fingerprint was obtained. */ public String RIGHT_FILE; } From de1f23708fc6928f61dc2073dffb25f81d3d405b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Derek=20Caetano-Anoll=C3=A9s?= Date: Thu, 3 Nov 2022 17:35:14 +0100 Subject: [PATCH 4/4] Updated according to comments Apologies for removing the line breaks, I wasn't aware I was affecting the readability. My focus was on the content. Hopefully I've addressed everything. --- .../fingerprint/CrosscheckFingerprints.java | 127 +++++++++++------- 1 file changed, 79 insertions(+), 48 deletions(-) diff --git a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java index 80f568ffbf..54f0864346 100644 --- a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java +++ b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java @@ -65,33 +65,33 @@ import static picard.fingerprint.Fingerprint.CrosscheckMode.CHECK_SAME_SAMPLE; /** - * Checks that all data in the set of input files appear to come from the same - * individual. Can be used to compare according to readgroups, libraries, samples, or files. - * Operates on bams/sams and vcfs (including gvcfs). + * Checks that all data in the set of input files appear to come from the same + * individual. Can be used to compare according to readgroups, libraries, samples, or files. + * Operates on bams/sams and vcfs (including gvcfs). * *

    Summary

    - * Checks if all the genetic data within a set of files appear to come from the same individual. - * It quickly determines whether a "group's" genotype matches that of an input SAM/BAM/VCF by selective sampling, - * and has been designed to work well even for low-depth SAM/BAMs. + * Checks if all the genetic data within a set of files appear to come from the same individual. + * It quickly determines whether a "group's" genotype matches that of an input SAM/BAM/VCF by selective sampling, + * and has been designed to work well even for low-depth SAM/BAMs. *
    - * The tool collects "fingerprints" (essentially genotype information from different parts of the genome) - * at the finest level available in the data (readgroup for SAM files - * and sample for VCF files) and then optionally aggregates it by library, sample or file, to increase power and provide - * results at the desired resolution. Output is in a "Moltenized" format, one row per comparison. The results will - * be emitted into a metric file for the class {@link CrosscheckMetric}. - * In this format, the output will include both the LOD score and the tumor-aware LOD score; the latter can - * help assess identity even in the presence of a severe loss of heterozygosity with high purity (the tool could - * otherwise fail to notice that samples are from the same individual.) - * A matrix output is also available to facilitate visual inspection of crosscheck results. + * The tool collects "fingerprints" (essentially genotype information from different parts of the genome) + * at the finest level available in the data (readgroup for SAM files + * and sample for VCF files) and then optionally aggregates it by library, sample or file, to increase power and provide + * results at the desired resolution. Output is in a "Moltenized" format, one row per comparison. The results will + * be emitted into a metric file for the class {@link CrosscheckMetric}. + * In this format, the output will include both the LOD score and the tumor-aware LOD score; the latter can + * help assess identity even in the presence of a severe loss of heterozygosity with high purity (the tool could + * otherwise fail to notice that samples are from the same individual.) + * A matrix output is also available to facilitate visual inspection of crosscheck results. *
    - * Since there can be many rows of output in the metric file, we recommend the use of {@link ClusterCrosscheckMetrics} - * as a follow-up step to running CrosscheckFingerprints. + * Since there can be many rows of output in the metric file, we recommend the use of {@link ClusterCrosscheckMetrics} + * as a follow-up step to running CrosscheckFingerprints. *
    - * There are cases where one would like to identify a few groups out of a collection of many possible groups (say - * to link a bam to its correct sample in a multi-sample vcf. In this case, one would not care for the cross-checking - * of the various samples in the VCF against each other, but only in checking the identity of the bam against the various - * samples in the vcf. The {@link #SECOND_INPUT} is provided for this use-case. With {@link #SECOND_INPUT} provided, CrosscheckFingerprints - * does the following: + * There are cases where one would like to identify a few groups out of a collection of many possible groups (say + * to link a bam to its correct sample in a multi-sample vcf. In this case, one would not care for the cross-checking + * of the various samples in the VCF against each other, but only in checking the identity of the bam against the various + * samples in the vcf. The {@link #SECOND_INPUT} is provided for this use-case. With {@link #SECOND_INPUT} provided, + * CrosscheckFingerprints does the following: * *
  • aggregation of data happens independently for the input files in {@link #INPUT} and {@link #SECOND_INPUT}.
  • *
  • aggregation of data happens at the SAMPLE level.
  • @@ -99,11 +99,14 @@ *
  • {@link #MATRIX_OUTPUT} is disabled.
  • *
    *
    - * In some cases, the groups collected may not have any observations (calls for a vcf, reads for a bam) at fingerprinting sites, or a sample in INPUT may be missing from the SECOND_INPUT. - * These cases are handled as follows: If running in CHECK_SAME_SAMPLES mode with INPUT and SECOND_INPUT, and either INPUT or SECOND_INPUT includes a sample - * not found in the other, or contains a sample with no observations at any fingerprinting sites, an error will be logged and the tool will return EXIT_CODE_WHEN_MISMATCH. - * In all other running modes, when any group which is being crosschecked does not have any observations at fingerprinting sites, a warning is logged. As long as there is at least - * one comparison where both sides have observations at fingerprinting sites, the tool will return zero. However, if all comparisons have at least one side with no observations + * In some cases, the groups collected may not have any observations (calls for a vcf, reads for a bam) at fingerprinting + * sites, or a sample in INPUT may be missing from the SECOND_INPUT. These cases are handled as follows: If running in + * CHECK_SAME_SAMPLES mode with INPUT and SECOND_INPUT, and either INPUT or SECOND_INPUT includes a sample not found in the + * other, or contains a sample with no observations at any fingerprinting sites, an error will be logged and the tool will + * return EXIT_CODE_WHEN_MISMATCH. + * In all other running modes, when any group which is being crosschecked does not have any observations at fingerprinting + * sites, a warning is logged. As long as there is at least one comparison where both sides have observations at + * fingerprinting sites, the tool will return zero. However, if all comparisons have at least one side with no observations * at fingerprinting sites, an error will be logged and the tool will return EXIT_CODE_WHEN_NO_VALID_CHECKS. *

    Examples

    *

    Check that all the readgroups from a sample match each other:

    @@ -157,23 +160,48 @@ @CommandLineProgramProperties( summary = - "Checks the odds that all data in the set of input files come from the same individual. Can be used to cross-check readgroups, libraries, samples, or files. Acceptable inputs include BAM/SAM/CRAM and VCF/GVCF files. Output delivers LOD scores in the form of a CrosscheckMetric file. \n" + + "Checks the odds that all data in the set of input files come from the same " + + "individual. Can be used to cross-check readgroups, libraries, samples, or files. " + + "Acceptable inputs include BAM/SAM/CRAM and VCF/GVCF files. Output delivers LOD " + + "scores in the form of a CrosscheckMetric file. \n" + "\n" + "

    Summary

    \n" + - "CrosscheckFingerprints rapidly checks the odds that all of the genetic data within a set of files come from the same individual. This is accomplished by selectively sampling from the input files, and determining whether the genotypes of the specified readgroups match to each other. \n " + - "Output is generated in the form of a “molten” (one row per comparison) CrosscheckMetric file that includes the Logarithm of the Odds (LOD) score, as well as the tumor-aware LOD score. Tumor-aware LOD scores can be used to assess genotypic identity in the presence of a severe Loss of Heterozygosity (LOH) with high purity—this could otherwise lead to a failure of the tool to identify samples are from the same individual. Output is also available as a matrix, to facilitate visual inspection of crosscheck results. \n" + - "Metric files can contain many rows of output. We therefore recommend following up CrosscheckFingerprints with a step using [ClusterCrosscheckMetrics (Picard)](https://gatk.broadinstitute.org/hc/en-us/articles/360045798972--Tool-Documentation-Index); this tool will cluster groups together that pass a designated LOD threshold, ensuring that groups within the cluster are related to each other. \n" + - "There may be cases where several groups out of a collection of possible groups must be identified—for example, to link a BAM to its correct sample in a multi-sample VCF. In this case, it would not be necessary to cross-check the various samples in the VCF against each other, but only to check the identity of the BAM against the various samples in the VCF. For this application, the SECOND_INPUT argument is provided. With SECOND_INPUT, CrosscheckFingerprints does the following: \n" + - " - Independently aggregates data for the input files in INPUT and SECOND_INPUT. \n" + - " - Aggregates data at the SAMPLE level. \n" + - " - Compares samples from INPUT to the same sample in SECOND_INPUT. 
\n" + + "CrosscheckFingerprints rapidly checks the odds that all of the genetic data within " + + "a set of files come from the same individual. This is accomplished by selectively " + + "sampling from the input files, and determining whether the genotypes of the " + + "specified Groups match to each other. (Groups are defined by the input and the argument " + + "CROSSCHECK_BY; they can be READ_GROUP, LIBRARY, SAMPLE, or FILE.) \n " + + "Output is generated in the form of a “molten” (one row per comparison) CrosscheckMetric " + + "file that includes the Logarithm of the Odds (LOD) score, as well as the tumor-aware LOD " + + "score. Tumor-aware LOD scores can be used to assess genotypic identity in the presence of " + + "a severe Loss of Heterozygosity (LOH) with high purity—this could otherwise lead to a " + + "failure of the tool to identify samples are from the same individual. Output is also available " + + "as a matrix, to facilitate visual inspection of crosscheck results. \n" + + "Metric files can contain many rows of output. We therefore recommend following up CrosscheckFingerprints " + + "with a step using [ClusterCrosscheckMetrics (Picard)](https://gatk.broadinstitute.org/hc/en-us/articles/360045798972--Tool-Documentation-Index); this tool will cluster groups together that pass a designated LOD threshold, " + + "ensuring that groups within the cluster are related to each other. \n" + + "There may be cases where several groups out of a collection of possible groups must be identified---for example, " + + "to link a BAM to its correct sample in a multi-sample VCF. In this case, it would not be necessary to cross-check " + + "the various samples in the VCF against each other, but only to check the identity of the BAM against the various " + + "samples in the VCF. For this application, the SECOND_INPUT argument is provided. 
With SECOND_INPUT, " + + "CrosscheckFingerprints can do the following: \n" + + " - Independently aggregate data for the input files in INPUT and SECOND_INPUT. \n" + + " - Aggregate data at the SAMPLE level. \n" + + " - Compare samples from INPUT to the same sample in SECOND_INPUT. \n" + " - Disables MATRIX_OUTPUT. \n" + "\n" + - "In other cases, the groups collected may not have any observations (‘reads’ for BAM files, or ‘calls’ for VCF files) at fingerprinting sites. Alternatively, a sample in INPUT may be missing from SECOND_INPUT. These cases are handled as follows: \n" + - " - If running in CHECK_SAME_SAMPLES mode with the INPUT and SECOND_INPUT sets of input files: when either set of inputs (1) includes a sample not found in the other, or (2) contains a sample with no observations at any fingerprinting sites, then an error will be logged and the tool will return EXIT_CODE_WHEN_MISMATCH. \n" + - " - If running in any other running mode: when a group which is being crosschecked does not have any observations at fingerprinting sites, a warning will be logged. \n" + + "In other cases, the groups collected may not have any observations (‘reads’ for BAM files, or ‘calls’ for VCF files) " + + "at fingerprinting sites. Alternatively, a sample in INPUT may be missing from SECOND_INPUT. These cases are handled " + + "as follows: \n" + + " - If running in CHECK_SAME_SAMPLES mode with the INPUT and SECOND_INPUT sets of input files: when either set of inputs " + + "(1) includes a sample not found in the other, or (2) contains a sample with no observations at any fingerprinting sites, " + + "then an error will be logged and the tool will return EXIT_CODE_WHEN_MISMATCH. \n" + + " - If running in any other running mode: when a group which is being crosschecked does not have any observations at " + + "fingerprinting sites, a warning will be logged. 
\n" + "\n" + - "Note that, as long as there is at least one comparison in which both files have observations at fingerprinting sites, the tool will return a ‘zero’. However, an error will be logged and the tool will return EXIT_CODE_WHEN_NO_VALID_CHECKS if all comparisons have at least one side without observations at a fingerprinting site (ie. all LOD scores are zero). \n" + + "Note that, as long as there is at least one comparison in which both files have observations at fingerprinting sites, " + + "the tool will return a ‘zero’. However, an error will be logged and the tool will return EXIT_CODE_WHEN_NO_VALID_CHECKS " + + "if all comparisons have at least one side without observations at a fingerprinting site (i.e., all LOD scores are zero). \n" + "\n" + "
    " + "

    Examples

    " + @@ -267,14 +295,16 @@ public class CrosscheckFingerprints extends CommandLineProgram { "Should only be used with SECOND_INPUT. ", optional = true) public File SECOND_INPUT_SAMPLE_MAP; - @Argument(doc = "A tsv with two columns representing the individual with which each sample is associated. The first column is the sample id, and the second " + - "column is the associated individual id. Values in the first column must be unique. If INPUT_SAMPLE_MAP or SECOND_INPUT_SAMPLE_MAP is also specified, " + - "then the values in the first column of this file should be the sample aliases specified in the second columns of INPUT_SAMPLE_MAP and SECOND_INPUT_SAMPLE_MAP, " + - "respectively. When this input is specified, expectations for matches will be based on the equality or inequality of the individual ids associated with two " + - "samples, as opposed to the sample ids themselves. Samples which are not listed in this file will have their sample id used as their individual id, for the " + - "purposes of match expectations. This means that one sample id could be used as the individual id for another sample, but not included in the map itself, and " + - "these two samples would be considered to have come from the same individual. Note that use of this parameter only affects labelling of matches and mismatches as " + - "EXPECTED or UNEXPECTED. It has no affect on how data is grouped for crosschecking.", optional = true) + @Argument(doc = "A tsv with two columns representing the individual with which each sample is associated. The first column " + + "is the sample id, and the second column is the associated individual id. Values in the first column must be unique. " + + "If INPUT_SAMPLE_MAP or SECOND_INPUT_SAMPLE_MAP is also specified, then the values in the first column of this file " + + "should be the sample aliases specified in the second columns of INPUT_SAMPLE_MAP and SECOND_INPUT_SAMPLE_MAP, " + + "respectively. 
When this input is specified, expectations for matches will be based on the equality or inequality of " + + "the individual ids associated with two samples, as opposed to the sample ids themselves. Samples which are not listed " + + "in this file will have their sample id used as their individual id, for the purposes of match expectations. This means " + + "that one sample id could be used as the individual id for another sample, but not included in the map itself, and these " + + "two samples would be considered to have come from the same individual. Note that use of this parameter only affects " + + "labelling of matches and mismatches as EXPECTED or UNEXPECTED. It has no effect on how data is grouped for crosschecking.", optional = true) public File SAMPLE_INDIVIDUAL_MAP; @Argument(doc = "An argument that controls how crosschecking with both INPUT and SECOND_INPUT should occur. ") @@ -344,7 +374,8 @@ public class CrosscheckFingerprints extends CommandLineProgram { @Argument(doc = "When all LOD scores are zero, exit with this value.") public int EXIT_CODE_WHEN_NO_VALID_CHECKS = 1; - @Argument(doc = "Maximal effect of any single haplotype block on outcome (-log10 of maximal likelihood difference between the different values for the three possible genotypes).", minValue = 0) + @Argument(doc = "Maximal effect of any single haplotype block on outcome (-log10 of maximal likelihood difference between the different " + + "values for the three possible genotypes).", minValue = 0) public double MAX_EFFECT_OF_EACH_HAPLOTYPE_BLOCK = 3.0; @Hidden