-
Notifications
You must be signed in to change notification settings - Fork 1
/
cds_fold.py
207 lines (164 loc) · 7.31 KB
/
cds_fold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import gzip
from Bio import SeqIO
import argparse
parser = argparse.ArgumentParser(description="Generate list of 0-fold and 4-fold sites from a fasta formatted reference genome and gff annotation file.")
parser.add_argument("reference", type=str, help="Fasta formatted reference file")
parser.add_argument("gff", type=str, help="version 3 gff file.")
#start_check = True, complete_check = True)
parser.add_argument('-c', '--complete_check', action='store_false', default = True,
help='Switch to disable check if the the translated protein has any missing amino acids.')
parser.add_argument('-s', '--start_check', action='store_false', default = True,
help='Switch to disable check if the the translated protein has a start codon.')
parser.add_argument("-o", "--outfile", type=str, help="Name of the output file", required = True)
args = parser.parse_args()
###############
## FUNCTIONS ##
###############
def translate(seq):
seq = seq.upper()
if len(seq) % 3 != 0:
raise ValueError('input sequence not divisible by 3')
if False in [s in ["A", "T", "G", "C", "N"] for s in seq]:
print(f'found non nucleotide in codon {seq}.')
table = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
protein = ""
for i in range(0, len(seq), 3):
codon = seq[i:i + 3]
try:
protein += table.get(codon, "_")
except:
protein += "-"
return protein
#check all mutations at each position to determine fold
def condon(seq, pos):
if len(seq) != 3:
raise ValueError('input sequence should be 3 bps')
if pos not in [1,2,3]:
raise ValueError('pos argument must be 1, 2, or 3')
if pos == 1: return [translate(b + seq[1:]) for b in ["A", "T", "G", "C"]]
if pos == 2: return [translate(seq[0] + b + seq[2]) for b in ["A", "T", "G", "C"]]
if pos == 3: return [translate(seq[0:2] + b) for b in ["A", "T", "G", "C"]]
#count potential number unique amino acids present at a locus
def n_fold(seq, gene_name):
if len(seq) % 3 != 0:
print(f"skipping gene: {gene_name}. input sequence not divisible by 3")
return None
else:
return [len(set(condon(seq[i:i+3], j))) for i in range(0, len(seq), 3) for j in [1,2,3]]
#quality check for issues with protein sequence
def build_seq(c_sequence, seqid, start, end, strand):
if strand == "-":
c_seq = fasta_dict[seqid].seq[start-1:end]
return c_seq.reverse_complement() + c_sequence
if strand == "+":
c_seq = fasta_dict[seqid].seq[start-1:end]
return c_sequence + c_seq
#quality check for issues with protein sequence
def protein_qc(seq, gene_name, start_check = True, complete_check = True):
if len(seq) % 3 != 0:
print(f"Warning! Skipping gene {gene_name}. Protein is not divisible by 3")
return False
if len(seq) < 3:
print(f"Warning! Skipping gene {gene_name}. DNA sequence is too short.")
return False
protein = translate(seq)
if len(protein) < 1:
print(f"Warning! Skipping gene {gene_name}. Protein sequence is too short.")
return False
n_stops = protein.count("_")
start_aa = protein[0]
last_aa = protein[-1]
if n_stops != 1 or last_aa != "_":
print(f"Warning! Skipping gene: {gene_name}. Incorrect number and/or location of stop codons.")
return False
if start_aa != "M" and start_check:
print(f"Warning! Skipping gene: {gene_name}. Does not start with M")
return False
if "-" in protein and complete_check:
print(f"Warning! Skipping gene: {gene_name}. Contains a non-amino acid three base pair condon")
return False
return True
#add to dictionary to track fold types at reference position
def append_fold(fold_dict, pos_key, fold):
if pos_key not in fold_dict:
fold_dict[pos_key] = []
fold_dict[pos_key].append(fold)
else:
fold_dict[pos_key].append(fold)
#########################################################################
## READ REFERENCE GENOME SEQUENCE INTO A BIOPYTHON SEQUENCE DICITONARY ##
#########################################################################
fasta_dict = SeqIO.to_dict(SeqIO.parse(args.reference, "fasta"))
#################
## PROCESS GFF ##
#################
def openfile(filename):
if filename.endswith(".gz"):
return gzip.open(filename, "rt")
else:
return open(filename, "r")
s = 0
smx = 1
old_name = ""
old_strand = ""
old_seqid = ""
c_sequence = ""
seq_pos = []
fold_dict = {}
with openfile(args.gff) as gff:
for line in gff:
if line[0] != "#":
seqid, source, s_type, start, end, score, strand, phase, attr = line.strip().split('\t')
start = int(start)
end = int(end)
if s_type == "CDS":
name = attr.split("ID=")[1].split(";")[0]
if old_name == "": old_name = name
if old_strand == "": old_strand = strand
if old_seqid == "" : old_seqid = seqid
if name == old_name:
c_sequence = build_seq(c_sequence, seqid, start, end, strand)
seq_pos += list(range(start, end+1))
else:
gene_folds = n_fold(c_sequence, old_name)
prot_qc = protein_qc(c_sequence, old_name, start_check = args.start_check, complete_check = args.complete_check)
if prot_qc:
if gene_folds:
n_folds = len(gene_folds)
if old_strand == "-":
gene_folds = gene_folds[::-1]
for idx, fold in enumerate(gene_folds):
if fold == 1:
append_fold(fold_dict, f"{old_seqid} {seq_pos[idx]}", 4)
if fold == 4:
append_fold(fold_dict, f"{old_seqid} {seq_pos[idx]}", 0)
old_name = name
old_strand = strand
old_seqid = seqid
c_sequence = ""
c_sequence = build_seq(c_sequence, seqid, start, end, strand)
seq_pos = list(range(start, end+1))
outfile = open(args.outfile, 'w')
print(f"#chrom pos fold", file = outfile)
#print(f"chrom pos fold", file = zero_fold)
for k,v, in fold_dict.items():
if len(set(v)) == 1:
fold_one = list(set(v))[0]
print(k, fold_one, file = outfile)