-
Notifications
You must be signed in to change notification settings - Fork 0
/
mlk_tfbs_cisbp.py
129 lines (99 loc) · 5.53 KB
/
mlk_tfbs_cisbp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# TFBS Annotation
# Uses CIS-BP output to generate genbank annotations
# http://bioinfo.life.hust.edu.cn/AnimalTFDB/#!/tfbs_predict
import os
import sys
import argparse
import pandas as pd
import numpy
import datetime
import statistics
from collections import defaultdict
from scipy import stats
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Alphabet import IUPAC
parser = argparse.ArgumentParser()
parser.add_argument("tfbs_file", help="input file of the transcription factor binding site")
parser.add_argument("-s", "--score", help="score cutoff; defualt = 0", default = 0, type = float)
parser.add_argument("-r", "--relativescore", help="relative sscore cutoff; default = 0", default = 0, type = float)
parser.add_argument("-x", "--sequence", help="raw sequence to process")
parser.add_argument("-v", "--verbose", action='store_true', help="print detailed stats")
arguments = parser.parse_args()
# Read in TFBS file
tfbs_db = pd.read_csv(arguments.tfbs_file, sep=',', header=1)
#build genbank file entry
if arguments.sequence is not None:
sequence_string = arguments.sequence
sequence_object = Seq(sequence_string, IUPAC.unambiguous_dna)
#create a record
genbank_record = SeqRecord(sequence_object,
id='123456789', # random accession number
name=arguments.tfbs_file.rsplit('.')[0] + '-tfbs',
description='Autogenerated Genbank file annotated with TFBS data')#,
#date=datetime.datetime.now())
#calculate relative score
list_scores = [float(x) for x in tfbs_db["Score"]]
print(stats.percentileofscore(list_scores, 9.538))
# data structure to keep track of previously added tfbs
#todo
tfbs_dict_previous_added = defaultdict(list)
all_scores = []
all_relativescores = []
passed_scores = []
passed_relativescores = []
print('')
print('Thresholding cutoffs at: ')
print('score: ', arguments.score)
print('relativescore: ', arguments.relativescore)
for entry in tfbs_db.iterrows():
all_scores.append(entry[1]['Score'])
all_relativescores.append(stats.percentileofscore(list_scores, entry[1]['Score']))
#cuttoff by score
if float(entry[1]['Score']) > arguments.score and float(stats.percentileofscore(list_scores, entry[1]['Score'])) > float(arguments.relativescore):
#check for redundancy
if entry[1]['Name'] not in tfbs_dict_previous_added.keys() or (entry[1]['From'], entry[1]['To']) not in tfbs_dict_previous_added[entry[1]['Name']]:
#append
tfbs_dict_previous_added[entry[1]['Name']].append((entry[1]['From'], entry[1]['To']))
feature = SeqFeature(FeatureLocation(start=entry[1]['From'], end=entry[1]['To']), type='TFBS', strand=0)
feature.qualifiers['note'] = str(entry[1]['Score']) + ' [' + str(stats.percentileofscore(list_scores, entry[1]['Score'])) + ']' + ' ' + entry[1]['Direction']+ entry[1]['Sequence']
feature.qualifiers['label'] = str(entry[1]['Name'])
#color picker
if stats.percentileofscore(list_scores, entry[1]['Score']) < 60 :
feature.qualifiers['ApEinfo_revcolor'] = '#FFFFFF' #white
feature.qualifiers['ApEinfo_fwdcolor'] = '#FFFFFF' #white
if stats.percentileofscore(list_scores, entry[1]['Score']) >= 60 and stats.percentileofscore(list_scores, entry[1]['Score']) <70:
feature.qualifiers['ApEinfo_revcolor'] = '#D3D3D3' #light gray
feature.qualifiers['ApEinfo_fwdcolor'] = '#D3D3D3' #light gray
if stats.percentileofscore(list_scores, entry[1]['Score']) >= 70 and stats.percentileofscore(list_scores, entry[1]['Score']) < 80:
feature.qualifiers['ApEinfo_revcolor'] = '#808080' #gray
feature.qualifiers['ApEinfo_fwdcolor'] = '#808080' #gray
if stats.percentileofscore(list_scores, entry[1]['Score']) >= 80 and stats.percentileofscore(list_scores, entry[1]['Score']) < 90:
feature.qualifiers['ApEinfo_revcolor'] = '#505050' #dark gray
feature.qualifiers['ApEinfo_fwdcolor'] = '#505050' #dark gray
if stats.percentileofscore(list_scores, entry[1]['Score']) >= 90 and stats.percentileofscore(list_scores, entry[1]['Score']) <= 100:
feature.qualifiers['ApEinfo_revcolor'] = '#000000' #black
feature.qualifiers['ApEinfo_fwdcolor'] = '#000000' #black
genbank_record.features.append(feature)
passed_scores.append(entry[1]['Score'])
passed_scores.append(stats.percentileofscore(list_scores, entry[1]['Score']))
else:
pass
print('duplicate entry')
print('')
print('added ', len(tfbs_dict_previous_added.keys()), ' features to ', arguments.tfbs_file.rsplit('.')[0])
if arguments.verbose is True:
# print run metrics
print('')
print('Stats of Entire Set: (score, relative score)')
print('Ranges:', ((min(all_scores),max(all_scores)), (min(all_relativescores), (max(all_relativescores)))))
print('Means:', statistics.mean(all_scores), statistics.mean(all_relativescores))
print('')
print('Stats of Passed Set: (score, relative score)')
print('Ranges:', (min(passed_scores),max(passed_scores)), (min(passed_relativescores),max(passed_relativescores)))
print('Means:', statistics.mean(passed_scores), statistics.mean(passed_relativescores))
# Save as GenBank file
output_file = open(arguments.tfbs_file.rsplit('.')[0] + '-tfbs_annotated.gb', 'w')
SeqIO.write(genbank_record, output_file, 'genbank')