-
Notifications
You must be signed in to change notification settings - Fork 0
/
bamutils_rpkm2vcf2.py
executable file
·131 lines (111 loc) · 3.71 KB
/
bamutils_rpkm2vcf2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
"""
Convert bamutils RPKM output to VCF; meant for pan-genome.
bamutils_rpkm2vcf.py
--present <lower threshold for proportion of mean considered present>
--absent <upper threshold for proportion of mean considered absent>
<input directory>
Output is to stdout.
The input directory should have files named <strain>.<replicon>.tsv.
Assumes input is in 0-based coordinates.
--absent < --ambig < --greater
"""
#==============================================================================#
import argparse
import csv
import math
import os
import os.path as osp
import sys
#==============================================================================#
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument('--absent', type=float)
parser.add_argument('--present', type=float)
parser.add_argument('indir')
args = parser.parse_args()
absent_threshold = args.absent
present_threshold = args.present
indir = args.indir
if absent_threshold >= present_threshold:
raise Exception('--absent must be < --present')
strains_ = set()
replicons = set()
nfiles = 0
for fname in os.listdir(indir):
if fname.endswith('.tsv'):
strain, replicon, _ = fname.split('.')
strains_.add(strain)
replicons.add(replicon)
nfiles += 1
if nfiles != len(strains_) * len(replicons):
raise Exception('Some strain/replicons missing or extra files')
strains = sorted(strains_)
mean_depth = {}
strain_data = {}
gene_data = {}
genes_list = []
for strain in strains:
strain_data[strain] = {}
sum_depth = 0.
n_genes = 0
for replicon in replicons:
fname = osp.join(indir, strain + '.' + replicon + '.tsv')
with open(fname) as handle:
lp = handle.tell()
while handle.readline().startswith('#'):
lp = handle.tell()
continue
handle.seek(lp)
rdr = csv.DictReader(handle, delimiter='\t')
for row in rdr:
rpkm = float(row['RPKM'])
gene = row['name']
strain_data[strain][gene] = rpkm
sum_depth += rpkm
n_genes += 1
if gene not in gene_data:
gene_data[gene] = (row['chrom'], row['start'])
genes_list.append(gene)
mean_depth[strain] = sum_depth / n_genes
genotype_calls = {}
for strain in strains:
genotype_calls[strain] = {}
mean = mean_depth[strain]
for gene in genes_list:
ratio = strain_data[strain][gene] / mean
if ratio >= present_threshold:
call = 'T'
elif ratio <= absent_threshold:
call = 'A'
else:
call = 'N'
genotype_calls[strain][gene] = call
sys.stdout.write('##fileformat=VCFv4.1\n')
sys.stdout.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT')
sys.stdout.write('\t' + '\t'.join(strains) + '\n')
for gene in genes_list:
chrom = gene_data[gene][0]
pos = gene_data[gene][1]
var_id = gene
gts = []
for strain in strains:
try:
gts.append(genotype_calls[strain][gene])
except:
import pdb; pdb.set_trace()
unique_gts = set(gts) - set('N')
ref_gt = 'T'
alt_gts = list(unique_gts - set(['T', 'N']))
if len(alt_gts) == 0:
alt_gts = ['.']
sys.stdout.write(chrom + '\t' + str(pos) + '\t' + var_id + '\tT\t')
sys.stdout.write(','.join(alt_gts) + '\t.\t.\t.\tGT')
for gt in gts:
if gt == 'N':
sys.stdout.write('\t./.')
elif gt == ref_gt:
sys.stdout.write('\t0/0')
else:
g = str(alt_gts.index(gt) + 1)
sys.stdout.write('\t' + g + '/' + g)
sys.stdout.write('\n')