-
Notifications
You must be signed in to change notification settings - Fork 1
/
__init__.py
381 lines (318 loc) · 16.6 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
"""
Part of the PyMAF package. This file holds useful high-level representations of
multiple species alignments and the top-level interface to query the MAF-block
database and obtain alignments for arbitrary regions
"""
__author__ = "Marvin Jens"
__copyright__ = "Copyright 2016, Massachusetts Institute of Technology"
__credits__ = ["Marvin Jens"]
__license__ = "MIT"
__version__ = "0.7"
__email__ = "[email protected]"
__status__ = "beta"
import os,re,sys
from byo import rev_comp,complement
from byo.track import Track, Accessor
from byo.io import fasta_chunks
import logging
from PyMAF.maf_hdf5 import MAFBlockDB
from PyMAF.alignment import Alignment
from byo.io.genome_accessor import GenomeCache, RemoteCache
class MAFCoverageCollector(object):
    """
    A helper class to aggregate the information from multiple MAF blocks
    spanning a reference region. The main task is to keep track of the
    start and end coordinates of the orthologous sequences in other
    species.

    Rows are fed one at a time through add_record(); afterwards
    get_sequences() yields one record per usable species.
    """

    def __init__(self, ref, ref_start, ref_end, genome_provider, excess_threshold=2, lack_threshold=0.1, min_len=1, species=None):
        """
        ref              -- name of the reference species
        ref_start        -- start of the requested reference region, or None
                            to adopt the bounds of the reference rows fed in
        ref_end          -- end of the requested reference region, or None
        genome_provider  -- object indexed by species name; each value must
                            offer get_oriented(chrom, start, end, strand) and
                            a no_data attribute (see get_sequences)
        excess_threshold -- skip species whose span is longer than
                            excess_threshold * reference length
        lack_threshold   -- skip species whose span is shorter than
                            lack_threshold * reference length
        min_len          -- skip species whose span is shorter than this
        species          -- optional iterable of species names to restrict
                            the output to; empty/None means "all species"
        """
        from collections import defaultdict

        # None replaces the former mutable [] default; normalise so that
        # logging and set construction behave exactly as before.
        if species is None:
            species = []

        self.logger = logging.getLogger('pymaf.MAFCoverageCollector')
        self.excess_threshold = excess_threshold
        self.lack_threshold = lack_threshold
        self.min_len = min_len

        self.ref = ref
        self.ref_start = ref_start
        self.ref_end = ref_end
        self.genome_provider = genome_provider

        # alignment columns (in the current reference row) that correspond
        # to ref_start / ref_end
        self.left_col = None
        self.right_col = None

        self.species_min_max = {}
        self.species_intervals = defaultdict(list)
        self.species_strands = defaultdict(set)
        self.species_chroms = defaultdict(set)
        self.species_left_adjust = defaultdict(int)
        self.species_right_adjust = defaultdict(int)
        # species in order of first appearance
        self.species = []
        self.select_species = set(species)
        self.logger.debug("select_species = {0}".format(species))

        self.ref_start_thisblock = None
        self.ref_end_thisblock = None

    def add_record(self, row):
        """
        this is used to feed data. Changes in MAF block are detected by change
        in ref_start_thisblock. This requires that the reference species is
        always passed first! Currently not checked, but silently assumed!!!

        row must provide the attributes species, chrom, start, end, sense
        and seq (the gapped alignment row, '-' marking gaps).
        """
        if row.species == self.ref:
            # we are looking at the reference species!
            self.ref_start_thisblock = row.start
            self.ref_end_thisblock = row.end

            # no ref_start given, use bounds of this block!
            if self.ref_start is None:
                self.ref_start = row.start
            if self.ref_end is None:
                self.ref_end = row.end

            if row.start <= self.ref_start:
                # how many columns do we have to skip until we reach
                # the desired start position (gaps!)?
                delta = self.ref_start - row.start
                j = 0
                while delta > 0:
                    if row.seq[j] != '-':
                        delta -= 1
                    j += 1
                self.left_col = j

            if row.end >= self.ref_end:
                # same for the end, what column corresponds to ref_end?
                delta = row.end - self.ref_end
                j = len(row.seq) - 1
                while delta > 0:
                    if row.seq[j] != '-':
                        delta -= 1
                    j -= 1
                self.right_col = j + 1
        else:
            # for other species, how many genomic positions do we need to skip
            # until we are at left_col?
            if self.left_col is not None and self.ref_start_thisblock <= self.ref_start:
                # count non-gap positions before left_col/ref_start
                self.species_left_adjust[row.species] = self.left_col - row.seq[:self.left_col].count('-')

            if self.right_col is not None and self.ref_end_thisblock >= self.ref_end:
                # count non-gap positions after the ref_end/right_col
                self.species_right_adjust[row.species] = (len(row.seq) - self.right_col) - row.seq[self.right_col:].count('-')

        # record the encountered species, their contigs and strands
        strand = row.sense
        if row.species not in self.species_strands:
            self.species.append(row.species)

        self.species_strands[row.species].add(strand)
        self.species_chroms[row.species].add(row.chrom)

        # build a "maximal cover", the largest chunk of orthologous sequence
        # according to the MAF blocks
        if row.species not in self.species_min_max:
            self.species_min_max[row.species] = (row.start, row.end)

        S, E = self.species_min_max[row.species]
        self.species_min_max[row.species] = (min(row.start, S), max(row.end, E))
        self.species_intervals[row.species].append((row.start, row.end))

    def get_sequences(self, sense):
        """
        This is a generator, yielding (species,chrom,start,end,strand,seq)
        for as many species as it can. Actual genome access is deferred to
        the genome provider passed to the constructor.

        A species is skipped (with a logged warning) when its rows are spread
        over several contigs or both strands, when its span violates the
        excess/lack/min_len limits, or when its genome reports no_data.
        If sense is '-', the retrieved sequence is complemented (not
        reversed) before it is yielded.
        """
        ref_len = self.ref_end - self.ref_start
        for species in self.species:
            # an empty selection means "no restriction"
            if self.select_species and species not in self.select_species:
                continue

            if len(self.species_chroms[species]) > 1:
                chrom_list = ','.join(sorted(self.species_chroms[species]))
                self.logger.warning('sequence is split across different contigs/chroms {chrom_list} in species {species}'.format(chrom_list=chrom_list, species=species))
                continue
            chrom = list(self.species_chroms[species])[0]

            if len(self.species_strands[species]) > 1:
                self.logger.warning('sequence is split across different strands in species {species}'.format(species=species))
                continue
            strand = list(self.species_strands[species])[0]

            start, end = self.species_min_max[species]
            genome = self.genome_provider[species]

            if species == self.ref:
                start, end = self.ref_start, self.ref_end
            elif strand == '+':
                start += self.species_left_adjust[species]
                end -= self.species_right_adjust[species]
            else:
                # Dr. minus strikes again...
                end -= self.species_left_adjust[species]
                start += self.species_right_adjust[species]

            # NOTE(review): the original indentation was lost; these length
            # filters are applied to every species (for the reference only
            # the min_len check can ever trigger) -- confirm against upstream.
            if end - start > ref_len * self.excess_threshold:
                self.logger.warning('homologous sequence exceeds {0}x times the reference sequence for {1}. skipping!'.format(self.excess_threshold, species))
                continue

            if end - start < ref_len * self.lack_threshold:
                self.logger.warning('skipping homologous sequence shorter than {0}x times the reference sequence for {1}. skipping!'.format(self.lack_threshold, species))
                continue

            if end - start < self.min_len:
                self.logger.warning('skipping homologous sequence of length {0} < min_len of {1} for {2}!'.format(end - start, self.min_len, species))
                continue

            # only the part of the contig name before the first '.' is used
            seq = genome.get_oriented(chrom.split('.')[0], start, end, strand).upper()
            if genome.no_data:
                self.logger.warning("skipping {species} due to missing genome".format(species=species))
                continue

            if sense == '-':
                # Dr. minus again. This time my own convention makes it
                # even more weird. But it makes sense if you think about
                # it: get_data always preserves the order "left-to-right"
                # in chromosome logic. If you expect rev_comp, use
                # get_oriented() instead.
                seq = complement(seq)

            yield species, chrom, start, end, strand, seq
class MAFBlockMultiGenomeAccessor(Accessor):
    """
    This class uses an indexed HDF5 version of the raw MAF blocks to find
    the MAF blocks overlapping the start and end of an arbitrary genomic
    span in O(log(N)) time. It then parses these MAF blocks, extracts the
    orthologous coordinates for the aligned species and retrieves the
    orthologous genomic sequences (with the help of MAFCoverageCollector).
    """

    def __init__(self, maf_path, chrom, sense, sense_specific=False, empty="", genome_provider=None, excess_threshold=5., lack_threshold=.0, min_len=0, aln_class=Alignment, muscle_iterations=0, in_memory=False, **kwargs):
        """
        maf_path          -- directory holding the per-chromosome
                             "<chrom>.maf.hdf5" files
        chrom, sense      -- forwarded to the Accessor base class
        sense_specific    -- accepted for interface compatibility; the base
                             class is always initialised with False
        empty             -- stored on the instance as self.empty
        genome_provider   -- handed to MAFCoverageCollector for sequence
                             retrieval
        excess_threshold  -- must be > 1; see MAFCoverageCollector
        lack_threshold    -- must be < 1; see MAFCoverageCollector
        min_len           -- minimal length of an orthologous sequence
        aln_class         -- class used to wrap the multi-FASTA result
        muscle_iterations -- if non-zero, get_oriented() returns
                             aln.MUSCLE(n_iter=muscle_iterations)
        in_memory         -- forwarded to MAFBlockDB
        """
        super(MAFBlockMultiGenomeAccessor, self).__init__(maf_path, chrom, sense, sense_specific=False, **kwargs)
        self.logger = logging.getLogger("pymaf.MAFBlockMultiGenomeAccessor")
        self.maf_path = maf_path
        self.empty = empty
        # self.system is not set here; presumably the Accessor base class
        # stores it from the 'system' keyword -- TODO confirm
        self.reference = self.system
        self.aln_class = aln_class
        self.genome_provider = genome_provider
        self.excess_threshold = excess_threshold
        self.lack_threshold = lack_threshold
        assert lack_threshold < 1., "lack_threshold must be < 1"
        assert excess_threshold > 1., "excess_threshold must be > 1"
        self.min_len = min_len
        self.muscle_iterations = muscle_iterations

        h5path = os.path.join(self.maf_path, chrom + ".maf.hdf5")
        self.maf_file = MAFBlockDB(h5path, in_memory=in_memory)

    @staticmethod
    def _records_to_mfa(records):
        """Render (species,chrom,start,end,strand,seq) tuples as multi-FASTA text."""
        # A dedicated helper keeps the loop variables out of the callers'
        # scope: under Python 2 a list comprehension leaks them and would
        # silently overwrite e.g. the caller's 'species' argument.
        return "\n".join(
            ">{species} {chrom}:{start}-{end}{strand}\n{seq}".format(
                species=rec_species, chrom=rec_chrom, start=rec_start,
                end=rec_end, strand=rec_strand, seq=rec_seq)
            for rec_species, rec_chrom, rec_start, rec_end, rec_strand, rec_seq in records
        )

    def find_inside_region_by_species(self, ref_chrom, ref_start, ref_end, ref_sense, species, spc_chrom, spc_start, spc_end, select_species=None):
        """
        Map the interval spc_start..spc_end on spc_chrom of 'species' back to
        reference coordinates, considering only MAF blocks indexed between
        ref_start and ref_end.

        Returns a (ref_start, ref_end) tuple, or None when no block
        containing spc_start or spc_end could be found.

        ref_chrom and select_species are accepted but not used: only the
        reference and 'species' are ever loaded.
        """
        # TODO implement binary search for relevant maf block such that a MUSCLE alignment can be requested for a smaller region.
        self.logger.debug("collecting candidate blocks inside {ref_start}-{ref_end}".format(ref_start=ref_start, ref_end=ref_end))
        # self.data / self.index are not defined in this class; presumably
        # provided by the Accessor base class -- TODO confirm
        start_i = self.data[ref_start]
        end_i = self.data[ref_end]

        # every index entry is a comma-separated list of hexadecimal values
        block_starts = set()
        for comb in self.index[start_i:end_i + 1]:
            block_starts |= set(comb.split(','))

        block_starts = sorted([int(bs, 16) for bs in block_starts])
        n_rows = len(block_starts)
        self.logger.debug('find_inside_region_by_species({species} {spc_start}-{spc_end}) binary-searching {n_rows} blocks'.format(species=species, spc_start=spc_start, spc_end=spc_end, n_rows=n_rows))

        # these are the only two we need to consider
        select_species = [self.reference, species]

        def search_block(candidates, pos):
            """Return (block_start, block) for the block containing pos, else None."""

            def recurse(candidates, pos):
                if not candidates:
                    return None

                if len(candidates) == 1:
                    # load_block is not defined in this class as shown;
                    # the result is used like a MAFCoverageCollector
                    # -- TODO confirm where it comes from
                    blk = self.load_block(candidates[0], species=select_species)
                    s, e = blk.species_min_max.get(species, (-1, -1))
                    if (s == e) or blk.species_chroms[species] != set([spc_chrom]):
                        # species absent/empty in this block, or on another contig
                        return "ignore", candidates[0], blk
                    elif s <= pos <= e:
                        return "hit", candidates[0], blk
                    elif e < pos:
                        # block lies left of pos
                        return "left", candidates[0], blk
                    else:
                        # pos < s: block lies right of pos. (The former
                        # 's < pos' test could never be true here, so None
                        # was returned and the caller's unpacking crashed.)
                        return "right", candidates[0], blk
                else:
                    # floor division keeps the index an int on Python 3 too
                    mid = len(candidates) // 2
                    code, blk_start, blk = recurse([candidates[mid]], pos)
                    if code == "hit":
                        return code, blk_start, blk
                    elif code == "left":
                        return recurse(candidates[mid:], pos)
                    elif code == "right":
                        return recurse(candidates[:mid], pos)
                    elif code == "ignore":
                        # drop the unusable block and retry on the rest
                        candidates.pop(mid)
                        return recurse(candidates, pos)

            # work on a copy, recurse() may pop entries
            res = recurse(list(candidates), pos)
            if not res:
                return None

            code, blk_start, blk = res
            if code == 'hit':
                return blk_start, blk
            return None

        start_search = search_block(block_starts, spc_start)
        end_search = search_block(block_starts, spc_end)

        if not (start_search and end_search):
            return

        start_row, start_cov = start_search
        end_row, end_cov = end_search

        start_mfa = self._records_to_mfa(start_cov.get_sequences(ref_sense))
        start_aln = self.aln_class(start_mfa, ref=self.reference, acc=self)

        end_mfa = self._records_to_mfa(end_cov.get_sequences(ref_sense))
        end_aln = self.aln_class(end_mfa, ref=self.reference, acc=self)

        # offsets of the two blocks in species and reference coordinates
        start_ofs_spc = start_cov.species_min_max[species][0]
        start_ofs_ref = start_cov.species_min_max[self.reference][0]

        end_ofs_spc = end_cov.species_min_max[species][0]
        end_ofs_ref = end_cov.species_min_max[self.reference][0]

        ref_start = start_aln.lift_over(species, spc_start - start_ofs_spc, self.reference) + start_ofs_ref
        ref_end = end_aln.lift_over(species, spc_end - end_ofs_spc, self.reference) + end_ofs_ref

        return ref_start, ref_end

    def get_data(self, chrom, ref_start, ref_end, ref_sense, select_species=None):
        """
        Collect the orthologous sequences for ref_start..ref_end.

        Returns a list of (species,chrom,start,end,strand,seq) tuples as
        produced by MAFCoverageCollector.get_sequences(ref_sense).
        Note that the 'chrom' argument is not used; the query always runs
        against self.chrom.
        """
        # None replaces the former mutable [] default
        if select_species is None:
            select_species = []

        coverage = MAFCoverageCollector(
            self.reference,
            ref_start,
            ref_end,
            self.genome_provider,
            excess_threshold=self.excess_threshold,
            lack_threshold=self.lack_threshold,
            min_len=self.min_len,
            species=select_species
        )
        for block in self.maf_file.query_interval(self.reference, self.chrom, ref_start, ref_end, select_species):
            for row in block:
                coverage.add_record(row)

        return list(coverage.get_sequences(ref_sense))

    def get_oriented(self, chrom, start, end, sense, select_species=None):
        """
        Return an alignment object (aln_class) of the orthologous sequences
        for start..end; for sense '-' the sequences are reverse-complemented.
        If muscle_iterations is non-zero, the result of
        aln.MUSCLE(n_iter=muscle_iterations) is returned instead.
        """
        if select_species is None:
            select_species = []

        res = self.get_data(chrom, start, end, sense, select_species=select_species)
        if sense == '-':
            # get_data returned already complement(seq). Only need to reverse.
            res = [rec[:5] + (rec[5][::-1],) for rec in res]

        mfa = self._records_to_mfa(res)
        aln = self.aln_class(mfa, ref=self.reference, acc=self)
        if self.muscle_iterations:
            return aln.MUSCLE(n_iter=self.muscle_iterations)
        else:
            return aln

    def get_dummy(self, chrom, start, end, sense):
        """Return an empty list, whatever the arguments."""
        return []

    def flush(self):
        """Close the underlying MAF block database."""
        self.maf_file.close()
def get_track(genome_path, maf_path, reference_species, excess_threshold=2, lack_threshold=.0, min_len=1, new_chrom_flush=False, in_memory=False, muscle_iterations = 0):
    """
    Build a Track over the MAF blocks in maf_path, backed by
    MAFBlockMultiGenomeAccessor.

    Genomes are served by a RemoteCache when genome_path starts with
    'http://', otherwise by a GenomeCache. All remaining arguments are
    forwarded as keyword options to Track; reference_species is passed
    as 'system' and new_chrom_flush as 'auto_flush'.
    """
    # choose the genome source from the shape of genome_path
    provider_class = RemoteCache if genome_path.startswith('http://') else GenomeCache

    track_options = dict(
        auto_flush=new_chrom_flush,
        sense_specific=False,
        genome_provider=provider_class(genome_path),
        excess_threshold=excess_threshold,
        lack_threshold=lack_threshold,
        min_len=min_len,
        system=reference_species,
        in_memory=in_memory,
        muscle_iterations=muscle_iterations,
    )
    return Track(maf_path, MAFBlockMultiGenomeAccessor, **track_options)