-
Notifications
You must be signed in to change notification settings - Fork 4
/
SamRecord.py
executable file
·161 lines (135 loc) · 5.35 KB
/
SamRecord.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#=========================================================================
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
# License (GPL) version 3, as described at www.opensource.org.
# 2018 William H. Majoros (bmajoros@allumni.duke.edu)
#=========================================================================
from __future__ import (absolute_import, division, print_function,
unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
from Rex import Rex
rex=Rex()
from SamMDtagParser import SamMDtagParser
#=========================================================================
# Attributes:
# ID = read identifier
# refName = name of reference sequence the read aligns to
# refPos = position in reference where alignment begins
# CIGAR = CigarString
# seq = read sequence
# flags = bitfield
# tags = array of tags at end of record (MD:Z:122G25, NM:i:1, etc.)
# Instance Methods:
# rec=SamRecord(ID,refName,refPos,cigar,seq,flags,tags)
# ID=rec.getID()
# cigar=rec.getCigar() # returns CigarString object
# seq=rec.getSequence()
# L=rec.seqLength()
# refName=rec.getRefName()
# refPos=rec.getRefPos()
# tags=rec.getTags()
# fields=rec.parseMDtag()
# N=rec.countMismatches() # uses MD tag
# tag=getTag("MD") # returns the third field, e.g. "122G25" in MD:Z:122G25
# bool=rec.flag_hasMultipleSegments()
# bool=rec.flag_properlyAligned()
# bool=rec.flag_unmapped()
# bool=rec.flag_nextSegmentUnmapped()
# bool=rec.flag_revComp()
# bool=rec.flag_nextSegmentRevComp()
# bool=rec.flag_firstOfPair()
# bool=rec.flag_secondOfPair()
# bool=rec.flag_secondaryAlignment()
# bool=rec.flag_failedFilters()
# bool=rec.flag_PCRduplicate()
# bool=rec.flag_supplAlignment()
# Class Methods:
#=========================================================================
class SamRecord:
"""SamRecord"""
def __init__(self,ID,refName,refPos,CIGAR,seq,flags,tags):
self.ID=ID
self.refName=refName
self.refPos=refPos
self.CIGAR=CIGAR
self.seq=seq
self.flags=flags
self.tags=tags
def seqLength(self):
return len(self.seq)
def getTags(self):
return self.tags
def getTag(self,label):
for tag in self.tags:
if(not rex.find("^([^:]+):[^:]+:(\S+)",tag)):
raise Exception("Can't parse SAM tag: "+tag)
if(rex[1]==label): return rex[2]
return None
def countMismatches(self):
N=SamMDtagParser.countMismatches(self.parseMDtag())
return N
def parseMDtag(self):
md=self.getTag("MD")
fields=[]
if(md is None): return None
while(len(md)>0):
if(rex.find("^(\d+)(.*)",md)):
fields.append(rex[1])
md=rex[2]
elif(rex.find("^([ACGTN])(.*)",md)):
fields.append(rex[1])
md=rex[2]
elif(rex.find("^(\^[ACGTN]+)(.*)",md)):
fields.append(rex[1])
md=rex[2]
else:
raise Exception("Can't parse MD tag: "+md)
return fields
def getRefName(self):
return self.refName
def getRefPos(self):
return self.refPos
def getCigar(self):
return self.CIGAR
def getID(self):
return self.ID
def getSequence(self):
return self.seq
def flag_hasMultipleSegments(self):
return bool(self.flags & 0x1)
def flag_properlyAligned(self):
return bool(self.flags & 0x2)
def flag_unmapped(self):
return bool(self.flags & 0x4)
def flag_nextSegmentUnmapped(self):
return bool(self.flags & 0x8)
def flag_revComp(self):
return bool(self.flags & 0x10)
def flag_nextSegmentRevComp(self):
return bool(self.flags & 0x20)
def flag_firstOfPair(self):
return bool(self.flags & 0x40)
def flag_secondOfPair(self):
return bool(self.flags & 0x80)
def flag_secondaryAlignment(self):
return bool(self.flags & 0x100)
def flag_failedFilters(self):
return bool(self.flags & 0x200)
def flag_PCRduplicate(self):
return bool(self.flags & 0x400)
def flag_supplAlignment(self):
return bool(self.flags & 0x800)
# FLAGS:
# 0x1 template having multiple segments in sequencing
# 0x2 each segment properly aligned according to the aligner
# > 0x4 segment unmapped
# > 0x8 next segment in the template unmapped
# > 0x10 SEQ being reverse complemented
# 0x20 SEQ of the next segment in the template being reverse complemented
# > 0x40 the first segment in the template
# > 0x80 the last segment in the template
# 0x100 secondary alignment
# 0x200 not passing filters, such as platform/vendor quality controls
# > 0x400 PCR or optical duplicate
# 0x800 supplementary alignment
# M03884:303:000000000-C4RM6:1:1101:1776:15706 99 chrX:31786371-31797409 6687 44 150M = 6813 271 ATACTATTGCTGCGGTAATAACTGTAACTGCAGTTACTATTTAGTGATTTGTATGTAGATGTAGATGTAGTCTATGTCAGACACTATGCTGAGCATTTTATGGTTGCTATGTACTGATACATACAGAAACAAGAGGTACGTTCTTTTACA BBBBFFFFFFFGGGGGEFGGFGHFHFFFHHHFFHHHFHFHHHGFHEDGGHFHBGFHGBDHFHFFFHHHHFHHHHHGHGFFBGGGHFHFFHHFFFFHHHHGHGFHHGFHGHHHGFHFFHHFHHFFGFFFFGGEHFFEHHFGHHHGHHHHFB AS:i:300 XN:i:0