forked from roberta-davidson/PopGen-Scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vcf_to_pos_BED_snp.sh
40 lines (28 loc) · 1.18 KB
/
vcf_to_pos_BED_snp.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/bin/bash
#bash script to create a .snp, .pos and .BED files from a VCF. Files required by pileupcaller for genotyping
VCF=$1
#Need to create .snp and .pos and .BED file.
#.pos file looks like:
#22 16050075
#22 16050115
#.snp file looks like it:
#rs587697622 22 0 16050075 A G
#rs587755077 22 0 16050115 G A
#.BED file looks like this:
#22 16050074 16050075
#22 16050114 16050115
#count header rows
L="$(grep "#" ${VCF}.vcf | wc -l)"
echo "Header rows - ${L}"
#remove vcf headers
awk 'NR>l {print $0}' l="${L}" ${VCF}.vcf > ${VCF}.nohead.vcf
#To create the .snp of only SNPs, remove longer variants and rename snpID to "CHR_SITE", you do:
awk -v OFS='\t' 'length($5)<2 && length($4)<2 {print $1"_"$2,$1,0,$2,$4,$5}' ${VCF}.nohead.vcf > ${VCF}.snp
#Create file correlating snpID with CHR_POS for safekeeping
awk -v OFS='\t' '{print $1"_"$2,$3}' ${VCF}.nohead.vcf > ${VCF}.snp_names
#To create the .pos from snp sites you do:
awk -v OFS='\t' '{print $3,$4}' ${VCF}.snp > ${VCF}.pos
#Create .BED from snp sites file
awk -v OFS='\t' '{n=1;print $2,$4-n,$4}' ${VCF}.snp > ${VCF}.BED
# Clean up
rm ${VCF}.nohead.vcf