diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..7c3fe7b --- /dev/null +++ b/.coveragerc @@ -0,0 +1,22 @@ +# .coveragerc to control coverage.py +[run] +branch = True + +[report] +# Regexes for lines to exclude from consideration +exclude_lines = + # Have to re-enable the standard pragma + pragma: no cover + + # Don't complain about missing debug-only code: + def __repr__ + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: + +ignore_errors = True diff --git a/.travis.yml b/.travis.yml index 3d27758..7afe4a0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ install: - conda create -q -n test-environment --file requirements/conda.txt python=$TRAVIS_PYTHON_VERSION pytest coverage numpy=1.9.2 bcolz=0.10 cython scipy PyYAML>=3.10 ipython pandas>=0.11.0 pyflakes pyzmq - source activate test-environment - conda install -c https://conda.anaconda.org/bioconda pysam pybedtools - - pip install gemini + - pip install gemini==0.18.2 # install dependencies - pip install -r requirements/dev.txt diff --git a/puzzle/plugins/gemini/mixins/variant.py b/puzzle/plugins/gemini/mixins/variant.py index 28f08c9..17b9e62 100644 --- a/puzzle/plugins/gemini/mixins/variant.py +++ b/puzzle/plugins/gemini/mixins/variant.py @@ -1,13 +1,14 @@ import logging from gemini import GeminiQuery +from gemini.gim import (DeNovo, AutoRec, AutoDom, CompoundHet) from puzzle.plugins import BaseVariantMixin from puzzle.plugins.constants import Results from puzzle.models import (Compound, Variant, Gene, Genotype, Transcript,) from puzzle.utils import (get_most_severe_consequence, get_omim_number, - get_cytoband_coord) + get_cytoband_coord, build_gemini_query, Args) from . import VariantExtras @@ -16,21 +17,48 @@ class VariantMixin(BaseVariantMixin, VariantExtras): """Class to store variant specific functions for gemini plugin""" + def variant(self, case_id, variant_id): + """Return a specific variant. - def build_gemini_query(self, query, extra_info): - """Append sql to a gemini query + We solve this by building a gemini query and send it to _variants - Args: - query(str): The gemini query - extra_info(str): The text that should be added + Args: + case_id (str): Path to a gemini database + variant_id (int): A gemini variant id + + Returns: + variant_obj (dict): A puzzle variant - Return: - extended_query(str) """ - if 'WHERE' in query: - return "{0} AND {1}".format(query, extra_info) - else: - return "{0} WHERE {1}".format(query, extra_info) + #Use the gemini id for fast lookup + variant_id = int(variant_id) + gemini_query = "SELECT * from variants WHERE variant_id = {0}".format( + variant_id + ) + + individuals = [] + # Get the individuals for the case + case_obj = self.case(case_id) + for individual in case_obj.individuals: + individuals.append(individual) + + self.db = case_obj.variant_source + self.variant_type = case_obj.variant_type + + gq = GeminiQuery(self.db) + gq.run(gemini_query) + + for gemini_variant in gq: + variant = self._format_variant( + case_id=case_id, + gemini_variant=gemini_variant, + individual_objs=individuals, + index=gemini_variant['variant_id'], + add_all_info = True + ) + return variant + + return None def variants(self, case_id, skip=0, count=1000, filters=None): """Return count variants for a case. @@ -65,54 +93,19 @@ def variants(self, case_id, skip=0, count=1000, filters=None): logger.debug("Looking for variants in {0}".format(case_id)) limit = count + skip - - gemini_query = filters.get('gemini_query') or "SELECT * from variants v" + + genetic_models = None + if filters.get('genetic_models'): + gemini_query = build_gemini_query(filters, add_where=False) + else: + gemini_query = build_gemini_query(filters) any_filter = False - if filters.get('frequency'): - frequency = filters['frequency'] - - extra_info = "(v.max_aaf_all < {0} or v.max_aaf_all is"\ - " Null)".format(frequency) - gemini_query = self.build_gemini_query(gemini_query, extra_info) - - if filters.get('cadd'): - cadd_score = filters['cadd'] - - extra_info = "(v.cadd_scaled > {0})".format(cadd_score) - gemini_query = self.build_gemini_query(gemini_query, extra_info) - - if filters.get('gene_ids'): - gene_list = [gene_id.strip() for gene_id in filters['gene_ids']] - - gene_string = "v.gene in (" - for index, gene_id in enumerate(gene_list): - if index == 0: - gene_string += "'{0}'".format(gene_id) - else: - gene_string += ", '{0}'".format(gene_id) - gene_string += ")" - - gemini_query = self.build_gemini_query(gemini_query, gene_string) - - if filters.get('range'): - chrom = filters['range']['chromosome'] - if not chrom.startswith('chr'): - chrom = "chr{0}".format(chrom) - - range_string = "v.chrom = '{0}' AND "\ - "((v.start BETWEEN {1} AND {2}) OR "\ - "(v.end BETWEEN {1} AND {2}))".format( - chrom, - filters['range']['start'], - filters['range']['end'] - ) - gemini_query = self.build_gemini_query(gemini_query, range_string) - filtered_variants = self._variants( case_id=case_id, gemini_query=gemini_query, + genetic_models=filters.get('genetic_models') ) if filters.get('consequence'): @@ -143,50 +136,7 @@ def variants(self, case_id, skip=0, count=1000, filters=None): return Results(variants, len(variants)) - def variant(self, case_id, variant_id): - """Return a specific variant. - - We solve this by building a gemini query and send it to _variants - - Args: - case_id (str): Path to a gemini database - variant_id (int): A gemini variant id - - Returns: - variant_obj (dict): A puzzle variant - - """ - #Use the gemini id for fast lookup - variant_id = int(variant_id) - gemini_query = "SELECT * from variants WHERE variant_id = {0}".format( - variant_id - ) - - individuals = [] - # Get the individuals for the case - case_obj = self.case(case_id) - for individual in case_obj.individuals: - individuals.append(individual) - - self.db = case_obj.variant_source - self.variant_type = case_obj.variant_type - - gq = GeminiQuery(self.db) - gq.run(gemini_query) - - for gemini_variant in gq: - variant = self._format_variant( - case_id=case_id, - gemini_variant=gemini_variant, - individual_objs=individuals, - index=gemini_variant['variant_id'], - add_all_info = True - ) - return variant - - return None - - def _variants(self, case_id, gemini_query): + def _variants(self, case_id, gemini_query, genetic_models=None): """Return variants found in the gemini database Args: @@ -205,37 +155,90 @@ def _variants(self, case_id, gemini_query): self.db = case_obj.variant_source self.variant_type = case_obj.variant_type - - gq = GeminiQuery(self.db) - - gq.run(gemini_query) + + variant_generators = [] + + models_found = [] + if genetic_models: + for genetic_model in genetic_models: + + if genetic_model in ['XR', 'XR_dn', 'XD', 'XD_dn']: + chrom_x_string = "chrom = 'chrX'" + if not gemini_query: + gemini_query = chrom_x_string + else: + gemini_query = "{0} AND {1}".format(gemini_query, chrom_x_string) + + if genetic_model in ['AR_hom', 'AR_hom_dn', 'XR', 'XR_dn']: + results = AutoRec(Args(db=self.db, + columns="*", + filter=gemini_query, + families=case_id)) # pragma: no cover + variant_generators.append(results.report_candidates())# pragma: no cover + models_found = ['AR_hom'] + elif genetic_model in ['AD', 'XD']: + results = AutoDom(Args(db=self.db, + columns="*", + filter=gemini_query, + families=case_id)) # pragma: no cover + variant_generators.append(results.report_candidates())# pragma: no cover + models_found = [genetic_model] + elif genetic_model in ['AD_dn', 'XD_dn']: + results = DeNovo(Args(db=self.db, + columns="*", + filter=gemini_query, + families=case_id)) # pragma: no cover + variant_generators.append(results.report_candidates()) # pragma: no cover + models_found = ['AD_dn'] + elif genetic_model in ['AR_comp', 'AR_comp_dn']: + results = CompoundHet(Args(db=self.db, + columns="*", + filter=gemini_query, + families=case_id)) # pragma: no cover + models_found = ['AR_comp'] + variant_generators.append(results.report_candidates()) # pragma: no cover + + + else: + gq = GeminiQuery(self.db) + gq.run(gemini_query) + variant_generators.append(gq) index = 0 - for gemini_variant in gq: - variant = None - - # Check if variant is non ref in the individuals - is_variant = self._is_variant(gemini_variant, individuals) - - if self.variant_type == 'snv' and not is_variant: + for variants in variant_generators: + for gemini_variant in variants: variant = None - - else: - index += 1 - logger.debug("Updating index to: {0}".format(index)) - variant = self._format_variant( - case_id=case_id, - gemini_variant=gemini_variant, - individual_objs=individuals, - index=index - ) - - if variant: - - yield variant + + if not genetic_models: + # Check if variant is non ref in the individuals + is_variant = self._is_variant(gemini_variant, individuals) + + if self.variant_type == 'snv' and not is_variant: + variant = None + + else: + index += 1 + logger.debug("Updating index to: {0}".format(index)) + variant = self._format_variant( + case_id=case_id, + gemini_variant=gemini_variant, + individual_objs=individuals, + index=index, + models_found=models_found + ) + + else: + index += 1 + is_variant = True + variant = self.variant(case_id, gemini_variant['variant_id']) + variant['index'] = index + + if variant: + + yield variant def _format_variant(self, case_id, gemini_variant, individual_objs, - index=0, add_all_info=False): + index=0, add_all_info=False, models_found = []): """Make a puzzle variant from a gemini variant Args: @@ -248,6 +251,7 @@ def _format_variant(self, case_id, gemini_variant, individual_objs, variant (dict): A Variant object """ chrom = gemini_variant['chrom'] + if chrom.startswith('chr') or chrom.startswith('CHR'): chrom = chrom[3:] @@ -269,7 +273,6 @@ def _format_variant(self, case_id, gemini_variant, individual_objs, variant.variant_id)) variant['index'] = index - # Add the most severe consequence self._add_most_severe_consequence(variant, gemini_variant) @@ -288,7 +291,6 @@ def _format_variant(self, case_id, gemini_variant, individual_objs, else: ### Consequence and region annotations #Add the transcript information - self._add_transcripts(variant, gemini_variant) self._add_thousand_g(variant, gemini_variant) self._add_exac(variant, gemini_variant) self._add_gmaf(variant, gemini_variant) @@ -305,20 +307,17 @@ def _format_variant(self, case_id, gemini_variant, individual_objs, sift = gemini_variant['sift_pred'] if sift: variant.add_severity('SIFT', sift) - - #Add the genes based on the hgnc symbols - self._add_hgnc_symbols(variant) - if self.variant_type == 'snv': - self._add_genes(variant) - - self._add_consequences(variant) - + ### GENOTYPE ANNOATTIONS ### #Get the genotype info if add_all_info: + self._add_transcripts(variant, gemini_variant) self._add_genotypes(variant, gemini_variant, case_id, individual_objs) - if self.variant_type == 'sv': - self._add_genes(variant) + self._add_genes(variant) + + self._add_consequences(variant, gemini_variant) + self._add_hgnc_symbols(variant, gemini_variant) + variant.genetic_models = models_found return variant diff --git a/puzzle/plugins/gemini/mixins/variant_extras/consequences.py b/puzzle/plugins/gemini/mixins/variant_extras/consequences.py index b2672c9..ea00a9b 100644 --- a/puzzle/plugins/gemini/mixins/variant_extras/consequences.py +++ b/puzzle/plugins/gemini/mixins/variant_extras/consequences.py @@ -5,14 +5,17 @@ class ConsequenceExtras(object): """Methods to handle consequences""" - def _add_consequences(self, variant_obj): + def _add_consequences(self, variant_obj, gemini_variant): """Add the consequences found in all transcripts Args: variant_obj (puzzle.models.Variant) """ - consequences = set() + + if gemini_variant['impact_so']: + consequences.add(gemini_variant['impact_so']) + for transcript in variant_obj.transcripts: for consequence in transcript.consequence.split('&'): consequences.add(consequence) diff --git a/puzzle/plugins/gemini/mixins/variant_extras/genes.py b/puzzle/plugins/gemini/mixins/variant_extras/genes.py index 290e465..9fcc204 100644 --- a/puzzle/plugins/gemini/mixins/variant_extras/genes.py +++ b/puzzle/plugins/gemini/mixins/variant_extras/genes.py @@ -4,17 +4,20 @@ class GeneExtras(object): """Methods for genes""" - def _add_hgnc_symbols(self, variant_obj): + def _add_hgnc_symbols(self, variant_obj, gemini_variant): """Add hgnc symbols to the variant If there are transcripts use the symbols found here, otherwise use phizz to get the gene ids. """ hgnc_symbols = set() + #if there are transcripts we get the gene symbols from these if variant_obj.transcripts: for transcript in variant_obj.transcripts: if transcript.hgnc_symbol: hgnc_symbols.add(transcript.hgnc_symbol) + elif gemini_variant['gene']: + hgnc_symbols.add(gemini_variant['gene']) else: chrom = variant_obj.CHROM start = variant_obj.start diff --git a/puzzle/plugins/gemini/mixins/variant_extras/genotypes.py b/puzzle/plugins/gemini/mixins/variant_extras/genotypes.py index 2f5d68d..1d681e6 100644 --- a/puzzle/plugins/gemini/mixins/variant_extras/genotypes.py +++ b/puzzle/plugins/gemini/mixins/variant_extras/genotypes.py @@ -19,6 +19,7 @@ def _add_genotypes(self, variant_obj, gemini_variant, case_id, individual_objs (list(dict)): A list of Individuals """ + for ind in individual_objs: index = ind.ind_index variant_obj.add_individual(Genotype( diff --git a/puzzle/plugins/gemini/plugin.py b/puzzle/plugins/gemini/plugin.py index 6ae96b5..d0b303b 100644 --- a/puzzle/plugins/gemini/plugin.py +++ b/puzzle/plugins/gemini/plugin.py @@ -35,6 +35,7 @@ def __init__(self, vtype='snv'): self.filters.can_query_gemini = True self.filters.can_filter_sv_len = True self.filters.can_filter_range = True + self.filters.can_filter_inheritance = True def test_gemini_db(self): """Check if self.db is a valid gemini database diff --git a/puzzle/utils/__init__.py b/puzzle/utils/__init__.py index 5757171..92f7994 100644 --- a/puzzle/utils/__init__.py +++ b/puzzle/utils/__init__.py @@ -5,4 +5,5 @@ from .ped import get_individuals, get_cases from .phenomizer import hpo_genes from .constants import IMPACT_SEVERITIES -from .get_file_info import (get_file_type, get_variant_type) \ No newline at end of file +from .get_file_info import (get_file_type, get_variant_type) +from .gemini_extras import (build_gemini_query, Args) \ No newline at end of file diff --git a/puzzle/utils/gemini_extras.py b/puzzle/utils/gemini_extras.py new file mode 100644 index 0000000..4abd96f --- /dev/null +++ b/puzzle/utils/gemini_extras.py @@ -0,0 +1,116 @@ +import logging + +logger = logging.getLogger(__name__) + +class Args(object): + """ + >>> args = Args(db='some.db') + >>> args.db + 'some.db' + """ + _opts = ("columns", "db", "filter", "min_kindreds", "families", + "pattern_only", "max_priority", # only for compound_het + "allow_unaffected", "min_gq", "lenient", "min_sample_depth") + def __init__(self, **kwargs): + if not "min_gq" in kwargs: + kwargs['min_gq'] = 0 + + if not "lenient" in kwargs: + kwargs['lenient'] = False + + for k in ("families", "filter"): + if not k in kwargs: + kwargs[k] = None + + if not "gt_phred_ll" in kwargs: + kwargs['gt_phred_ll'] = None + + if not "min_sample_depth" in kwargs: + kwargs['min_sample_depth'] = 0 + + for k in ("min_kindreds", "max_priority"): + if not k in kwargs: kwargs[k] = 1 + + for k in ("pattern_only", "allow_unaffected"): + if not k in kwargs: kwargs[k] = False + + self.__dict__.update(**kwargs) + + +def add_to_query(query, extra_info, add_where=True): + """Append sql to a gemini query + + Args: + query(str): The gemini query + extra_info(str): The text that should be added + + Return: + extended_query(str) + """ + if add_where: + if not 'WHERE' in query: + return "{0} WHERE {1}".format(query, extra_info) + else: + return "{0} AND {1}".format(query, extra_info) + else: + if not query: + return "{0}".format(extra_info) + else: + return "{0} AND {1}".format(query, extra_info) + +def build_gemini_query(filters, add_where=True): + """Build a gemini query based on the filters + + Args: + filters (dict): A dictionary with filters + + Returns: + gemini_query (str): A gemini query string + """ + if add_where: + gemini_query = filters.get('gemini_query') or "SELECT * from variants" + else: + gemini_query = "" + + if filters.get('frequency'): + frequency = filters['frequency'] + + extra_info = "(max_aaf_all < {0} or max_aaf_all is"\ + " Null)".format(frequency) + gemini_query = add_to_query(gemini_query, extra_info, add_where) + + if filters.get('cadd'): + cadd_score = filters['cadd'] + + extra_info = "(cadd_scaled > {0})".format(cadd_score) + gemini_query = add_to_query(gemini_query, extra_info, add_where) + + if filters.get('gene_ids'): + gene_list = [gene_id.strip() for gene_id in filters['gene_ids']] + + gene_string = "gene in (" + for index, gene_id in enumerate(gene_list): + if index == 0: + gene_string += "'{0}'".format(gene_id) + else: + gene_string += ", '{0}'".format(gene_id) + gene_string += ")" + + gemini_query = add_to_query(gemini_query, gene_string, add_where) + + if filters.get('range'): + chrom = filters['range']['chromosome'] + if not chrom.startswith('chr'): + chrom = "chr{0}".format(chrom) + + range_string = "chrom = '{0}' AND "\ + "((start BETWEEN {1} AND {2}) OR "\ + "(end BETWEEN {1} AND {2}))".format( + chrom, + filters['range']['start'], + filters['range']['end'] + ) + gemini_query = add_to_query(gemini_query, range_string, add_where) + + return gemini_query + diff --git a/tests/conftest.py b/tests/conftest.py index 9bca53b..77831b1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -53,6 +53,12 @@ def gemini_path(request): gemini_db = "tests/fixtures/HapMapFew.db" return gemini_db +@pytest.fixture(scope='function') +def gemini_large_path(request): + """Return the path of a gemini database""" + gemini_db = "tests/fixtures/HapMapMany.db" + return gemini_db + @pytest.fixture(scope='function') def vcf_file(request): diff --git a/tests/plugins/gemini/test_gemini_init.py b/tests/plugins/gemini/test_gemini_init.py index f8d748b..a913c44 100644 --- a/tests/plugins/gemini/test_gemini_init.py +++ b/tests/plugins/gemini/test_gemini_init.py @@ -13,7 +13,7 @@ def test_setup_no_db(): assert adapter.variant_type == 'snv' assert adapter.filters.can_filter_gene == True - assert adapter.filters.can_filter_inheritance == False + assert adapter.filters.can_filter_inheritance == True def test_setup_with_variant_type(gemini_path): adapter = GeminiPlugin('sv') diff --git a/tests/plugins/gemini/test_gemini_variant_mixin.py b/tests/plugins/gemini/test_gemini_variant_mixin.py index da69ac0..565c198 100644 --- a/tests/plugins/gemini/test_gemini_variant_mixin.py +++ b/tests/plugins/gemini/test_gemini_variant_mixin.py @@ -73,17 +73,6 @@ def test_is_variant(gemini_case_obj): assert not plugin._is_variant(gemini_variant, ind_objs) -def test_build_gemini_query(): - plugin = GeminiPlugin() - query = "SELECT * from variants" - extra_info = "max_aaf_all < 0.01" - new_query = plugin.build_gemini_query(query, extra_info) - assert new_query == "SELECT * from variants WHERE max_aaf_all < 0.01" - - extra_info = "cadd_score > 10" - new_query = plugin.build_gemini_query(new_query, extra_info) - assert new_query == "SELECT * from variants WHERE max_aaf_all < 0.01 AND cadd_score > 10" - class TestFilters: def test_filters_no_filters(self, gemini_case_obj): @@ -250,3 +239,48 @@ def test_filters_range(self, gemini_case_obj): assert variant_obj.stop <= end assert nr_of_variants == 1 + +class TestGeminiInheritance: + def test_filters_ar(self, gemini_case_obj): + plugin = GeminiPlugin() + plugin.add_case(gemini_case_obj) + + filters = {'genetic_models': ['AR_hom']} + result = plugin.variants('643594', filters=filters, count=1000) + variants = result.variants + nr_of_variants = result.nr_of_variants + + assert nr_of_variants == 0 + + def test_filters_ad(self, gemini_case_obj): + plugin = GeminiPlugin() + plugin.add_case(gemini_case_obj) + + filters = {'genetic_models': ['AD']} + result = plugin.variants('643594', filters=filters, count=1000) + variants = result.variants + nr_of_variants = result.nr_of_variants + + assert nr_of_variants == 0 + + def test_filters_ad_dn(self, gemini_case_obj): + plugin = GeminiPlugin() + plugin.add_case(gemini_case_obj) + + filters = {'genetic_models': ['AD_dn']} + result = plugin.variants('643594', filters=filters, count=1000) + variants = result.variants + nr_of_variants = result.nr_of_variants + + assert nr_of_variants == 6 + + def test_filters_ar_comp(self, gemini_case_obj): + plugin = GeminiPlugin() + plugin.add_case(gemini_case_obj) + + filters = {'genetic_models': ['AR_comp']} + result = plugin.variants('643594', filters=filters, count=10000) + variants = result.variants + nr_of_variants = result.nr_of_variants + + assert nr_of_variants == 0 diff --git a/tests/utils/test_gemini_query.py b/tests/utils/test_gemini_query.py new file mode 100644 index 0000000..d2b1b1c --- /dev/null +++ b/tests/utils/test_gemini_query.py @@ -0,0 +1,36 @@ +from puzzle.utils.gemini_extras import (add_to_query, build_gemini_query) + +def test_add_to_query(): + query = "SELECT * from variants" + extra_info = "max_aaf_all < 0.01" + new_query = add_to_query(query, extra_info) + assert new_query == "SELECT * from variants WHERE max_aaf_all < 0.01" + + extra_info = "cadd_score > 10" + new_query = add_to_query(new_query, extra_info) + assert new_query == "SELECT * from variants WHERE max_aaf_all < 0.01 AND cadd_score > 10" + +def test_build_gemini_query(): + filters = { + 'frequency' : 0.01 + } + query = build_gemini_query(filters) + + assert query == "SELECT * from variants WHERE (max_aaf_all < 0.01 or max_aaf_all is Null)" + +def test_build_gemini_query_models(): + filters = { + 'frequency' : 0.01 + } + query = build_gemini_query(filters, add_where = False) + + assert query == "(max_aaf_all < 0.01 or max_aaf_all is Null)" + +def test_build_gemini_query_models_two(): + filters = { + 'frequency' : 0.01, + 'cadd' : 15.0 + } + query = build_gemini_query(filters, add_where = False) + + assert query == "(max_aaf_all < 0.01 or max_aaf_all is Null) AND (cadd_scaled > 15.0)" \ No newline at end of file