-
Notifications
You must be signed in to change notification settings - Fork 4
/
CigarString.py
executable file
·133 lines (118 loc) · 4.18 KB
/
CigarString.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#=========================================================================
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
# License (GPL) version 3, as described at www.opensource.org.
# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
#=========================================================================
from __future__ import (absolute_import, division, print_function,
unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
from Rex import Rex
rex=Rex()
from CigarOp import CigarOp
from Interval import Interval
#=========================================================================
# Attributes:
# ops : array of CigarOp
# Instance Methods:
# cigar=CigarString(string)
# bool=cigar.completeMatch()
# numOps=cigar.length()
# cigarOp=cigar[i] # returns a CigarOp object
# str=cigar.toString()
# cigar.computeIntervals(refPos)
# ops=cigar.matchesByLength() # sorted by decreasing length
# op=cigar.longestMatch() # returns a CigarOp object (or None)
# L=cigar.longestMatchLen() # returns integer
# (numMatches,numMismatches)=cigar.longestMatchStats(seq1,seq2)
# # ^ Returns none if no match; must call computeIntervals() first!
# L=cigar.totalAlignmentLength()
# cigar.setOps(ops)
#=========================================================================
class CigarString:
"""CigarString parses CIGAR strings (alignments)"""
def __init__(self,cigar):
self.ops=self.parse(cigar)
def setOps(self,ops):
self.ops=ops
def matchesByLength(self):
matches=[]
for op in self.ops:
if(op.getOp() in ("M","=","X")): matches.append(op)
matches.sort(key=lambda x: -x.getLength())
return matches
def countIndelBases(self):
N=0
for op in self.ops:
c=op.getOp()
if(c=="I" or c=="D"): N+=1
return N
def totalAlignmentLength(self):
L=0
for op in self.ops:
if(op.getOp() in ("M","=","X")): L+=op.getLength()
return L
def computeIntervals(self,refPos):
ops=self.ops
n=len(ops)
begin1=0; begin2=refPos
for i in range(n):
op=ops[i]
L=op.getLength()
end1=begin1; end2=begin2
if(op.advanceInQuery()): end1+=L
if(op.advanceInRef()): end2+=L
op.interval1=Interval(begin1,end1)
op.interval2=Interval(begin2,end2)
begin1=end1; begin2=end2
def length(self):
return len(self.ops)
def __getitem__(self,i):
return self.ops[i]
def completeMatch(self):
ops=self.ops
return len(ops)==1 and ops[0].op in ("M","=","X")
def longestMatchStats(self,query,ref):
m=self.longestMatch()
if(m is None): return None
sub1=query[m.interval1.getBegin():m.interval1.getEnd()]
sub2=ref[m.interval2.getBegin():m.interval2.getEnd()]
matches=0; mismatches=0
for i in range(len(sub1)):
if(sub1[i]==sub2[i]): matches+=1
else: mismatches+=1
return (matches,mismatches)
def longestMatchLen(self):
m=self.longestMatch()
if(m is None): return 0
return m.getLength()
def longestMatch(self):
longest=None
longestLength=0
for op in self.ops:
if(op.getOp()in ("M","=","X")):
L=op.getLength()
if(L>longestLength):
longest=op
longestLength=L
return longest
def toString(self):
ops=self.ops
s=""
for op in ops:
s+=str(op.length)
s+=op.op
return s
def parse(self,cigar):
array=[]
if(cigar=="*"): return array
while(len(cigar)>0):
if(not rex.find("(\d+)(.)",cigar)):
raise Exception("Can't parse CIGAR string: "+cigar)
L=int(rex[1])
op=rex[2]
rec=CigarOp(op,L)
array.append(rec)
rex.find("\d+.(.*)",cigar)
cigar=rex[1]
return array