-
Notifications
You must be signed in to change notification settings - Fork 3
/
parse_antismash_genbank.py
executable file
·109 lines (89 loc) · 3.09 KB
/
parse_antismash_genbank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
"""
Extract PK, NRP and terpene synthases from an antiSMASH 4.3 GenBank.
Cameron Gilchrist
"""
import argparse
from collections import defaultdict, namedtuple
from Bio import SeqIO
def parse_record(record):
"""Parse BioPython SeqRecord object for synthases.
Returns an iterator
"""
for feature in record.features:
if feature.type == "CDS":
synthase = parse_feature(feature)
if not synthase:
continue
else:
yield synthase
def parse_feature(feature):
"""Parse CDS SeqFeature, returning True if it is a synthase."""
if "sec_met" not in feature.qualifiers:
# Need a /sec_met="Type: ..." qualifier
return None
field = feature.qualifiers["sec_met"][0]
Synthase = namedtuple("Synthase", ["protein_id", "type", "translation"])
if field == "Type: nrps-t1pks":
synthase_type = "PKS-NRPS"
elif "nrps" in field:
synthase_type = "NRPS"
elif "pks" in field:
synthase_type = "PKS"
elif "terpene" in field:
synthase_type = "Terpene"
else: # P450, misc crap etc
return None
if "protein_id" not in feature.qualifiers:
return None
return Synthase(
protein_id=feature.qualifiers["protein_id"][0],
translation=feature.qualifiers["translation"][0],
type=synthase_type,
)
def write_fasta(synthases, output):
"""Write FASTA of Synthase namedtuples to open file handle."""
fasta = [
">{}\n{}".format(synthase.protein_id, wrap_fasta(synthase.translation))
for synthase in synthases
]
output.write("\n".join(fasta))
def wrap_fasta(sequence, limit=80):
"""Wrap FASTA record to 80 characters per line."""
return "\n".join(sequence[i : i + limit] for i in range(0, len(sequence), limit))
def main(args):
""" Run script.
"""
synthases = defaultdict(list)
for record in SeqIO.parse(args.genbank, "genbank"):
print(f"Parsing: {record.id}")
for synthase in parse_record(record):
print("Found synthase:", synthase.type, synthase.protein_id)
synthases[synthase.type].append(synthase)
with open(f"{args.output}_pks.faa", "w") as pks, open(
f"{args.output}_nrps.faa", "w"
) as nrps, open(f"{args.output}_terpene.faa", "w") as terpene, open(
f"{args.output}_hybrid.faa", "w"
) as hybrid:
write_fasta(synthases["PKS"], pks)
write_fasta(synthases["NRPS"], nrps)
write_fasta(synthases["Terpene"], terpene)
write_fasta(synthases["PKS-NRPS"], hybrid)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse GenBank file generated by antiSMASH for" " synthases."
)
parser.add_argument(
"-g",
dest="genbank",
help="Final antiSMASH GenBank file. Usually formatted" " *.final.gbk.",
required=True,
)
parser.add_argument(
"-o",
dest="output",
help="Base name for output files, i.e. output_pks.faa",
required=True,
)
args = parser.parse_args()
main(args)