-
Notifications
You must be signed in to change notification settings - Fork 3
/
annotate_bedpe.py
executable file
·372 lines (312 loc) · 18.6 KB
/
annotate_bedpe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
#!/usr/bin/env python
# This script takes input in bedpe format, ie from converting breakdancer output to bedpe format with convertBreakdancerBedToBedpe.py,
# and categorizes the calls into their own separate files based on their implied SV type and overlap with annotation features. The script
# uses bedpe format so that it can use the "anchoring regions" of each call - the regions in which the supporting reads aligned - as well as
# the actual called affected region (the "inner span" of the bedpe).
# Calls are first divided based on orientation, length and inter/intra chromosome status to separate out candidate deletions,
# insertions, tranlocations, and inversions. Then based on the SV event size, score, and overlaps with annotated features they are further
# categorized. The remaining calls of each type in the "Stringent" category should hypothetically contain the most true positives.
# The annotation files are:
# te_file: This is a track of transposable elements. Currently we are using the repeatmasker track downloaded from UCSC for hg19. This file
# is used in the following ways:
#
# 1) Apparent translocations in which one anchoring region overlaps a TE are more likely to be TE insertions in the sample as opposed to the
# reference, and the mapping to a distant TE is likely an alignment artifact.
# 2) Deletions where the deleted portion overlaps a TE are more likely TE insertions in the reference than actual deletions in the sample.
#
# Optional annotation files:
#
# common_deletions: This is meant to be a set of variants commonly occuring in the population (and therefore less likely to be implicated in
# cancer) Currently taken from the 1000 Genomes project SV pilot data.
#
# seg_dups: segmental duplications track from UCSC
#
# cent_tel: centromeric and telomeric regions, currently just taken from UCSC by adding 100kb of flanking sequence to their centromere/telomere annotations
#
# Other parameters:
#
# insert_size: expected insert size of the library
# output_dir: directory to be populated with annotated files
# sample_name: name of the sample to be added to track names in bed files
import sys
import os
import pybedtools
import subprocess
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("bedpe_file", help="BEDPE file of sv call anchoring regions")
parser.add_argument("insert_size", type=int, help="Insert size of the library")
parser.add_argument("output_dir", help="directory into which to put annotations")
parser.add_argument("sample_name", help="name of the sample being analyzed")
parser.add_argument("te_file", help="BED file of transposable element annotations")
parser.add_argument("--seg_dups_file", help="BED file of segmental duplications")
parser.add_argument("--cent_tel_file", help="BED file of centromeric/telomeric regions")
parser.add_argument("--common_deletions_file", help="BED file of common deletions")
args = parser.parse_args()
bedpe_file = args.bedpe_file
te_file = args.te_file
common_deletions_file = args.common_deletions_file
insert_size = args.insert_size
output_dir = args.output_dir
seg_dups_file = args.seg_dups_file
cent_tel_file = args.cent_tel_file
sample_name = args.sample_name
print 'analyzing bedpe file: ' + bedpe_file
log = open(output_dir + "/annotate.log", "w")
log.write("input file: {0}\n".format(bedpe_file))
log.write("sample name: ".format(sample_name))
log.write("te file: {0}\n".format(te_file))
log.write("common deletions file: {0}\n".format(common_deletions_file))
log.write("insert size: {0}\n".format(insert_size))
log.write("segmental duplications file: {0}\n".format(seg_dups_file))
log.write("centromeres and telomeres file: {0}\n".format(cent_tel_file))
common_deletion_overlap_pct = 0.5
# filter out calls that are longer than length
def bedpe_lt_length_filter(feature, length):
if int(feature[4]) - int(feature[2]) < length:
return True
return False
# filter out calls that are shorter than length
def bedpe_gt_length_filter(feature, length):
if int(feature[4]) - int(feature[2]) > length:
return True
return False
# filter out calls that aren't transchromosomal
def inter_chr_filter(feature):
if feature[0] == feature[3]:
return False
return True
# filter out calls that are transchromosomal
def intra_chr_filter(feature):
if feature[0] == feature[3]:
return True
return False
# filter out calls that have a score less than score
def score_gte_filter(feature, score):
if int(feature[7]) >= score:
return True
return False
# filter out calls that have a score greater than score
def score_lt_filter(feature, score):
if int(feature[7]) < score:
return True
return False
# filter out calls where the orientation of the reads in the clusters isn't the expected PE
def expected_orientation_filter(feature, matches):
expected = False
if (feature[8] == '+' and feature[9] == '-') or (feature[8] == '-' and feature[9] == '+'):
expected = True
if matches:
return expected
else:
return not expected
# filter out calls by name
def name_not_in_set_filter(feature, name_set):
if feature[6] in name_set:
return False
else:
return True
# filter out calls where the ends match a te feature (added to the feature by pair_to_bed)
def bedpe_reciprocal_overlap_ends_filter(feature, overlap_pct):
te_chr = feature[22]
te_start = int(feature[23])
te_end = int(feature[24])
te_length = te_end - te_start
return overlaps_by(te_chr, te_start, te_end, feature[0], int(feature[1]), int(feature[2]), overlap_pct) or overlaps_by(te_chr, te_start, te_end, feature[3], int(feature[4]), int(feature[5]), overlap_pct)
# filter out calls where the inner span overlaps a TE
def bedpe_reciprocal_overlap_ispan_filter(feature, overlap_pct):
te_chr = feature[22]
te_start = int(feature[23])
te_end = int(feature[24])
te_length = te_end - te_start
return overlaps_by(te_chr, te_start, te_end, feature[0], int(feature[2]), int(feature[4]), overlap_pct)
# Determine if the the two features overlap by overlap_pct
def overlaps_by(chr1, start1, end1, chr2, start2, end2, overlap_pct):
if chr1 != chr2:
return False
max_start = max(start1,start2)
min_end = min(end1,end2)
overlap_len = min_end - max_start
len1 = end1 - start1
len2 = end2 - start2
return float(overlap_len) / len1 >= overlap_pct and float(overlap_len) / len2 >= overlap_pct
# write out a bed file
def write_bed(call, fh):
for f in call.fields:
fh.write(str(f) + "\t")
fh.write("\n")
# find duplicate calls by looking for pairs of calls where the both ends match with a little slop
def merge_duplicate_breaks(calls, slop):
dups = calls.pair_to_pair(calls, type="both", rdn=True, slop=slop).saveas()
low_scoring_dups = set()
for call in dups:
n1 = call[6]
n2 = call[28]
s1 = int(call[7])
s2 = int(call[29])
low_score_name = ""
if s1 < s2:
low_score_name = n1
else:
low_score_name = n2
low_scoring_dups.add(low_score_name)
return calls.filter(name_not_in_set_filter, low_scoring_dups).saveas()
# call script to convert bedpe to blocked bed12 format for a visualizable track
def convert_bedpe_to_bed12(bedpe_file, track_name):
bed12_file = open(bedpe_file + ".bed", 'w')
subprocess.call("bedpeToBed12.py -i {0} -d 1000000000 -n \"{1}\"".format(bedpe_file, track_name), shell=True, stdout=bed12_file)
# use sort -u to get rid of duplicate bed entries
def uniqify(bedtool, output_dir, file_name):
ufile = open(output_dir + "/" + file_name, 'w')
subprocess.call("sort -u " + output_dir + "/tmp." + file_name, shell=True, stdout=ufile)
os.remove(output_dir + "/tmp." + file_name)
return pybedtools.BedTool(ufile.name)
# break down the calls of type sv_type by region, overlaps
def save_output(master_out_bed, calls, output_dir, file_name, sample_name, sv_type, seg_dups, cent_tel):
if len(calls) == 0:
log.write("Zero calls of type " + sv_type + "\n")
return
track_name = sample_name + "_" + sv_type
calls.saveas(output_dir + "/" + file_name + '.bedpe')
convert_bedpe_to_bed12(output_dir + "/" + file_name + '.bedpe', track_name)
# get rid of duplicate calls
calls = merge_duplicate_breaks(calls, 1000)
print sv_type + "\tNON_DUPLICATE\t" + str(len(calls))
log.write(sv_type + "\tNON_DUPLICATE\t" + str(len(calls)) + "\n")
calls.saveas(output_dir + "/" + file_name + '_dedup.bedpe')
convert_bedpe_to_bed12(output_dir + "/" + file_name + '_dedup.bedpe', track_name)
# find calls overlapping segmental duplications
if seg_dups is not None:
seg_dup_overlap = calls.pair_to_bed(seg_dups, f=1, type="either").cut(xrange(0,22)).saveas(output_dir + "/tmp." + file_name + "_dedup_segdup.bedpe")
seg_dup_overlap = uniqify(seg_dup_overlap, output_dir, file_name + "_dedup_segdup.bedpe")
convert_bedpe_to_bed12(output_dir + "/" + file_name + '_dedup_segdup.bedpe', track_name + "_IN_SEG_DUPS")
subprocess.call("cat " + output_dir + "/" + file_name + '_dedup_segdup.bedpe.bed', shell=True, stdout=master_out_bed)
# find calls overlapping peri- centromeric and telomeric regions
if cent_tel is not None:
cent_tel_overlap = calls.pair_to_bed(cent_tel, f=1, type="either").cut(xrange(0,22)).saveas(output_dir + "/tmp." + file_name + "_dedup_cent_tel.bedpe")
cent_tel_overlap = uniqify(cent_tel_overlap, output_dir, file_name + "_dedup_cent_tel.bedpe")
convert_bedpe_to_bed12(output_dir + "/" + file_name + '_dedup_cent_tel.bedpe', track_name + "_IN_CENT_TEL")
subprocess.call("cat " + output_dir + "/" + file_name + '_dedup_cent_tel.bedpe.bed', shell=True, stdout=master_out_bed)
# subract SD and C/T calls from the stringent set
if seg_dups is not None and len(seg_dup_overlap) > 0:
stringent_minus_sd = calls.pair_to_pair(seg_dup_overlap, type="notboth").saveas()
else:
stringent_minus_sd = calls
if cent_tel is not None and len(cent_tel_overlap) > 0:
stringent_minus_ct = stringent_minus_sd.pair_to_pair(cent_tel_overlap, type="notboth").saveas()
else:
stringent_minus_ct = stringent_minus_sd
print sv_type + "\tTOTAL_STRINGENT\t" + str(len(stringent_minus_ct))
log.write(sv_type + "\tTOTAL_STRINGENT\t" + str(len(stringent_minus_ct)) + "\n")
if len(stringent_minus_ct) == 0:
return
# filter out very short (< 1kb) and short (< 5kb) calls into their own category
very_short_stringent = stringent_minus_ct.filter(bedpe_lt_length_filter, 1000).saveas()
very_short_stringent.saveas(output_dir + "/" + file_name + "_dedup_stringent_very_short.bedpe")
convert_bedpe_to_bed12(output_dir + "/" + file_name + '_dedup_stringent_very_short.bedpe', track_name + "_STRINGENT_LT_1KB")
subprocess.call("cat " + output_dir + "/" + file_name + '_dedup_stringent_very_short.bedpe.bed', shell=True, stdout=master_out_bed)
if len(very_short_stringent) > 0:
stringent_minus_vs = stringent_minus_ct.pair_to_pair(very_short_stringent, type="notboth").saveas()
else:
stringent_minus_vs = stringent_minus_ct.saveas()
print sv_type + "\tTOTAL_STRINGENT_MINUS_VERY_SHORT\t" + str(len(stringent_minus_vs))
log.write(sv_type + "\tTOTAL_STRINGENT_MINUS_VERY_SHORT\t" + str(len(stringent_minus_vs)) + "\n")
if len(stringent_minus_vs) == 0:
return
short_stringent = stringent_minus_vs.filter(bedpe_lt_length_filter, 5000).saveas()
short_stringent.saveas(output_dir + "/" + file_name + "_dedup_stringent_short.bedpe")
convert_bedpe_to_bed12(output_dir + "/" + file_name + '_dedup_stringent_short.bedpe', track_name + "_STRINGENT_LT_5KB")
subprocess.call("cat " + output_dir + "/" + file_name + '_dedup_stringent_short.bedpe.bed', shell=True, stdout=master_out_bed)
if len(short_stringent) > 0:
stringent_minus_vss = stringent_minus_vs.pair_to_pair(short_stringent, type="notboth").saveas()
else:
stringent_minus_vss = stringent_minus_vs.saveas()
# from the remaining calls separate those with high score (breakdancer score = 99) from those with lower scores
stringent_high_score = stringent_minus_vss.filter(score_gte_filter, 99).saveas(output_dir + "/" + file_name + "_dedup_stringent_high_score.bedpe")
convert_bedpe_to_bed12(output_dir + "/" + file_name + '_dedup_stringent_high_score.bedpe', track_name + "_STRINGENT_HIGH_SCORE")
subprocess.call("cat " + output_dir + "/" + file_name + '_dedup_stringent_high_score.bedpe.bed', shell=True, stdout=master_out_bed)
stringent_low_score = stringent_minus_vss.filter(score_lt_filter, 99).saveas(output_dir + "/" + file_name + "_dedup_stringent_low_score.bedpe")
convert_bedpe_to_bed12(output_dir + "/" + file_name + '_dedup_stringent_low_score.bedpe', track_name + "_STRINGENT_LOW_SCORE")
subprocess.call("cat " + output_dir + "/" + file_name + '_dedup_stringent_low_score.bedpe.bed', shell=True, stdout=master_out_bed)
if seg_dups is not None:
print sv_type + "\tSEGMENTAL_DUPLICATION\t" + str(len(seg_dup_overlap))
if cent_tel is not None:
print sv_type + "\tIN_PERI_CENTROMERE_TELOMERE\t" + str(len(cent_tel_overlap))
print sv_type + "\tSTRINGENT_VERY_SHORT\t" + str(len(very_short_stringent))
print sv_type + "\tSTRINGENT_SHORT\t" + str(len(short_stringent))
print sv_type + "\tTOTAL_STRINGENT\t" + str(len(stringent_minus_vss))
print sv_type + "\tSTRINGENT_LOW_SCORE\t" + str(len(stringent_low_score))
print sv_type + "\tSTRINGENT_HIGH_SCORE\t" + str(len(stringent_high_score))
if seg_dups is not None:
log.write( sv_type + "\tSEGMENTAL_DUPLICATION\t" + str(len(seg_dup_overlap)) + "\n")
if cent_tel is not None:
log.write( sv_type + "\tIN_PERI_CENTROMERE_TELOMERE\t" + str(len(cent_tel_overlap)) + "\n")
log.write( sv_type + "\tSTRINGENT_VERY_SHORT\t" + str(len(very_short_stringent)) + "\n")
log.write( sv_type + "\tSTRINGENT_SHORT\t" + str(len(short_stringent)) + "\n")
log.write( sv_type + "\tTOTAL_STRINGENT\t" + str(len(stringent_minus_vss)) + "\n")
log.write( sv_type + "\tSTRINGENT_LOW_SCORE\t" + str(len(stringent_low_score)) + "\n")
log.write( sv_type + "\tSTRINGENT_HIGH_SCORE\t" + str(len(stringent_high_score)) + "\n")
bedpe_calls = pybedtools.BedTool(bedpe_file)
tes = pybedtools.BedTool(te_file)
if seg_dups_file is not None:
seg_dups = pybedtools.BedTool(seg_dups_file)
else:
seg_dups = None
if cent_tel_file is not None:
cent_tel = pybedtools.BedTool(cent_tel_file)
else:
cent_tel = None
if common_deletions_file is not None:
common_deletions = pybedtools.BedTool(common_deletions_file)
else:
common_deletions = None
master_out_bed = open(output_dir + "/" + sample_name + "_svs.bed", 'a')
num_calls = len(bedpe_calls)
print "TOTAL\tALL\t" + str(num_calls)
log.write("TOTAL\tALL\t" + str(num_calls) + "\n")
# divide calls by type:
# translocations: inter-chromosomal
inter_calls = bedpe_calls.filter(inter_chr_filter).saveas()
print "TRANSLOCATIONS\tALL\t" + str(len(inter_calls))
log.write("TRANSLOCATIONS\tALL\t" + str(len(inter_calls)) + "\n")
save_output(master_out_bed, inter_calls, output_dir, "translocations", sample_name, "TRANSLOCATIONS", seg_dups, cent_tel)
# translocations where one end maps to a TE are more like TE insertions in the sample
possible_te_insertions = inter_calls.pair_to_bed(tes, f=.75).saveas()
filtered_possible_te_insertions = possible_te_insertions.filter(bedpe_reciprocal_overlap_ends_filter, 0.75).saveas()
print "TRANSLOCATIONS-POSSIBLE_TE_INSERTIONS\tALL\t" + str(len(filtered_possible_te_insertions))
log.write("TRANSLOCATIONS-POSSIBLE_TE_INSERTIONS\tALL\t" + str(len(filtered_possible_te_insertions)) + "\n")
save_output(master_out_bed, filtered_possible_te_insertions, output_dir, "translocations_possible_te_insertions", sample_name, "TRANSLOCATIONS-POSSIBLE_TE_INSERTIONS", seg_dups, cent_tel)
# now working with intra chromosomal calls
intra_calls = bedpe_calls.filter(intra_chr_filter).saveas()
# expected paired end fragment orientation
expected_orientation = intra_calls.filter(expected_orientation_filter, matches=True).saveas()
# deletions <- right orientation, long fragment size
long_indel_intra_calls = expected_orientation.filter(bedpe_gt_length_filter, insert_size).saveas()
print "DELETIONS\tALL\t" + str(len(long_indel_intra_calls))
log.write("DELETIONS\tALL\t" + str(len(long_indel_intra_calls)) + "\n")
save_output(master_out_bed, long_indel_intra_calls, output_dir, "deletions", sample_name, "DELETIONS", seg_dups, cent_tel)
# special case of deletions - if the deleted area ("ispan") is a TE, more likely a TE insertion in the reference
possible_te_reference_insertions = long_indel_intra_calls.pair_to_bed(tes, type="ispan", f=.75).saveas()
filtered_possible_te_reference_insertions = possible_te_reference_insertions.filter(bedpe_reciprocal_overlap_ispan_filter, 0.75).saveas()
print "POSSIBLE_TE_INSERTIONS_IN_REFERENCE\tALL\t" + str(len(filtered_possible_te_reference_insertions))
log.write("POSSIBLE_TE_INSERTIONS_IN_REFERENCE\tALL\t" + str(len(filtered_possible_te_reference_insertions)) + "\n")
save_output(master_out_bed, filtered_possible_te_reference_insertions, output_dir, "possible_te_reference_insertions", sample_name, "POSSIBLE_TE_INSERTIONS_IN_REFERENCE", seg_dups, cent_tel)
# deletions that match population variants
if common_deletions is not None:
common_deletions = long_indel_intra_calls.pair_to_bed(common_deletions, type="ispan", f=common_deletion_overlap_pct).saveas()
filtered_possible_common_deletions = common_deletions.filter(bedpe_reciprocal_overlap_ispan_filter, common_deletion_overlap_pct).saveas()
print "COMMON_DELETIONS\tALL\t" + str(len(filtered_possible_common_deletions))
log.write("COMMON_DELETIONS\tALL\t" + str(len(filtered_possible_common_deletions)) + "\n")
save_output(master_out_bed, filtered_possible_common_deletions, output_dir, "possible_common_deletions", sample_name, "COMMON_DELETIONS", seg_dups, cent_tel)
# insertions are shorter than the fragment size
short_indel_intra_calls = expected_orientation.filter(bedpe_lt_length_filter, insert_size).saveas()
print "INSERTIONS\tALL\t" + str(len(short_indel_intra_calls))
log.write("INSERTIONS\tALL\t" + str(len(short_indel_intra_calls)) + "\n")
save_output(master_out_bed, short_indel_intra_calls, output_dir, "insertions", sample_name, "INSERTIONS", seg_dups, cent_tel)
# inversions are what's left
unexpected_orientation = intra_calls.filter(expected_orientation_filter, matches=False).saveas()
print "INVERSION\tALL\t" + str(len(unexpected_orientation))
log.write("INVERSION\tALL\t" + str(len(unexpected_orientation)) + "\n")
save_output(master_out_bed, unexpected_orientation, output_dir, "inversions", sample_name, "INVERSIONS", seg_dups, cent_tel)
pybedtools.cleanup()
log.close()