# mappingtools.py

#!/usr/bin/env python
# Copyright (C) <2020> PMBL;South China Agricultural University. All rights reserved
# Module metadata: author credit and script version string.
__author__ = "Write by Fangping Li"
__version__ = '0.1.0'

import argparse
import sys
import os
import collections

def get_options():
    """Parse the command-line arguments of mappingtools.py.

    The pan-genome FASTA (-i), both paired-end read files (-1 / -2) and the
    .goc guide (-g) are mandatory because the script concatenates them into
    shell commands and opens the guide file; when one of them is missing,
    argparse prints a usage error and exits with status 2 instead of letting
    a ``None`` value fail later with a TypeError.

    Returns:
        argparse.Namespace with the attributes ``inpan``, ``threads``,
        ``inbase``, ``pairend1``, ``pairend2``, ``location``, ``gocguide``,
        ``output``, ``covfliter``, ``clean`` and ``createfasta``.
    """
    example = "mappingtools.py -i pangenome.fa -1 illnumina_R1.fq.gz -2 illnumina_R3.fq.gz -g guide.goc -l location.lg -o outputcov -c 5 "
    description = "Create one-by-one mapping and pav example: " + example
    parser = argparse.ArgumentParser(description=description, prog='mappingtools.py')

    # Mandatory inputs.
    parser.add_argument('-i', '--inpan', action='store', type=str, required=True,
                        help='input your reference .fasta')
    parser.add_argument('-1', '--pairend1', action='store', type=str, required=True,
                        help='input your pairend1 .fastq')
    parser.add_argument('-2', '--pairend2', action='store', type=str, required=True,
                        help='input your pairend2 .fastq')
    parser.add_argument('-g', '--gocguide', action='store', type=str, required=True,
                        help='input your .goc file generated by multiple.py')

    # Tuning parameters, all with defaults.
    parser.add_argument('-t', '--threads', action='store', type=int, default=1,
                        help='how many thread do you want to use')
    parser.add_argument('-b', '--inbase', action='store', type=int, default=20,
                        help='how many base-pair will you consider it as a total insertion rather than a replace; if these PAVs are considered as insertion, the coverage of this locus will be -1')
    parser.add_argument('-c', '--covfliter', action='store', type=int, default=5,
                        help='coverage fliter')

    # File locations / output naming.
    parser.add_argument('-l', '--location', action='store', type=str, default="location.lg",
                        help='location of software "mummer" "lastz" and "svmu","samtools","bowtie"')
    parser.add_argument('-o', '--output', action='store', type=str, default="output",
                        help='name of the  pan-genome coverage output')

    # Optional post-processing switches ("yes" / "no").
    parser.add_argument('-clean', '--clean', action='store', type=str, choices=('yes', 'no'),
                        default="no", help='Clean all of the middle file!"')
    parser.add_argument('-create', '--createfasta', action='store', type=str, choices=('yes', 'no'),
                        default="no", help='Create a optimal reference genome for short read"')

    return parser.parse_args()

def acm(a,b):
    """Return the sum of the values in *a* divided by *b*.

    Args:
        a: iterable of numbers to add up (may be empty, giving a sum of 0).
        b: divisor; the caller passes the interval length, so values that
            are absent from *a* effectively count as zero.

    Returns:
        ``sum(a) / b`` as a float.

    Raises:
        ZeroDivisionError: if *b* is 0.
    """
    # Use the built-in sum() instead of shadowing it with a local accumulator.
    return sum(a) / b


# Parse the command line once; every later step reads its settings from here.
gosome = get_options()
location = gosome.location

# The location file is read positionally: each of the first five lines is
# split on "=" and the second piece (stripped) is used as a tool path prefix.
# NOTE(review): the original comment and the -l help text list samtools before
# bowtie2, but line 4 is read as bowtie2 and line 5 as samtools here; confirm
# against the location.lg template.
with open(location, "r") as parame:
    parameline = parame.readlines()
mum = parameline[0].split("=")[1].strip()
lastz = parameline[1].split("=")[1].strip()
svmu = parameline[2].split("=")[1].strip()
bowtie2 = parameline[3].split("=")[1].strip()
samtools = parameline[4].split("=")[1].strip()

p1 = gosome.pairend1
p2 = gosome.pairend2
panin = gosome.inpan
panout = gosome.output

# Build the bowtie2 index; the output prefix doubles as the index name.
buildcmd = bowtie2+"bowtie2-build "+"--threads "+str(gosome.threads)+" "+panin+" "+panout
print(buildcmd)
if os.system(buildcmd) != 0:
    # Report the failure instead of ignoring it; the mapping step below
    # still decides whether the run can continue.
    print("bowtie2-build error!")

print("begin mapping")
# bosm.sh is an external helper script; presumably it aligns the reads and
# writes <output>.sort.bam (that file name is what the later steps use).
na1 = os.system("bosm.sh "+bowtie2+" "+panout+" "+samtools+" "+p1+" "+p2+" "+panout+" "+str(gosome.threads))

if na1 == 0:
    print("mapping done")
    mapout = panout+".sort.bam"
else:
    print("mapping error!")
    # Exit with a failure status; quit() is a site helper and reports success.
    sys.exit(1)

    
# Load the .goc guide; the raw lines are kept in gocl for the coverage pass.
gocguide = gosome.gocguide
with open(gocguide, "r") as goc:
    gocl = goc.readlines()

# Write one "cadepth.sh" command per data row into the job file "golist2".
# Rows containing "Org" are headers; blank rows are skipped.
with open("golist2", "w") as golist2:
    for i in gocl:
        if i.find("Org") != -1:
            continue
        i = i.split()
        if not i:
            continue
        ch = i[1]
        loc1 = i[2]
        if "more" in i:
            # With a "more" marker the end coordinate sits two fields before it.
            lc = i.index("more")
            loc2 = i[lc-2]
        else:
            loc2 = i[5]

        print("cadepth.sh "+samtools+" "+mapout+" "+ch+" "+loc1+" "+loc2, file=golist2)

# Run the job file through GNU parallel. A failure is only reported, as in
# the original flow; the coverage pass then works with whatever depth files
# exist.
na2 = os.system("parallel --jobs "+str(gosome.threads)+" < golist2")

if na2 == 0:
    print("Cov acquire done!")
else:
    print("Cov acquire error!")

# Second pass over the .goc rows: for every data row, read its .depth file and
# write one line "<ID> <PanPosition> <RefPosition> <coverage>..." to
# <output>.cov, with one coverage column per (start, end) pair of the row.

# Running offset subtracted from the first coordinate to get RefPosition.
# It is reset on every row containing "Org"; initialising it here also covers
# guide files that do not start with such a header row.
loc00 = 0

with open(gosome.output+".cov", "w") as finaloutput:
    print("ID\tPanPosition\tRefPosition\tcoverage", file=finaloutput)

    for i in gocl:
        if i.find("Org") != -1:
            loc00 = 0
            continue

        i = i.split()
        if not i:
            # Blank row: nothing to report.
            continue

        ch = i[1]
        # lc is the index of the "more" marker, or 7 when there is none, so
        # that i[lc-2] is the last coordinate and i[lc-1] the offset increment
        # (i[5] and i[6] for a row without "more").
        lc = i.index("more") if "more" in i else 7
        locstart = i[2]
        locend = i[lc-2]

        # Depth file name built from the same fields used for the cadepth.sh job.
        with open(mapout+"_"+ch+"_"+locstart+"_"+locend+".bam.depth", "r") as depthfile:
            filenamego = depthfile.readlines()
        depthrows = None  # parsed lazily, at most once per row

        maplc = i[2:lc-1]
        print(maplc)
        loc0 = str(int(maplc[0]))
        loc0c = int(loc0) - int(loc00)
        print(i[0], loc0, loc0c, sep="\t", end="\t", file=finaloutput)

        loc00 += int(i[lc-1])

        # maplc holds consecutive (start, end) pairs.
        for k in range(0, len(maplc), 2):
            loc1 = maplc[k]
            loc2 = maplc[k+1]
            start = int(loc1)
            end = int(loc2)

            if end - start > gosome.inbase:
                print(loc1)
                print(loc2)
                if depthrows is None:
                    # Columns 2 and 3 of each depth line: position, depth.
                    depthrows = [(int(row[1]), row[2])
                                 for row in (line.split() for line in filenamego)]
                que = [int(depth) for pos, depth in depthrows if start <= pos < end]
                try:
                    mean = acm(que, end - start)
                except ZeroDivisionError:
                    quem = -1
                else:
                    quem = str(mean)
                    if "e" in quem:
                        # Plain slicing would turn e.g. "7.5e-05" into
                        # "7.5e-0", which reads back as 7.5.
                        quem = format(mean, "f")
                    # Keep six characters as before, but never cut into the
                    # integer part of a large value.
                    quem = quem[:max(6, quem.find("."))]
            else:
                # Intervals no longer than --inbase are reported as -1.
                quem = -1
            print(quem, end="\t", file=finaloutput)

        print("", file=finaloutput)

# Optional cleanup: with "-clean yes", delete the intermediate files matching
# the two glob patterns below from the current working directory.
if gosome.clean == "yes":
    for pattern in ("*depth", "*chr*_*_*.bam"):
        os.system("rm " + pattern)

# Tell the user where the coverage table was written.
for message in ("mapping coverage caculation done!",
                "Your ouput is " + gosome.output + ".cov"):
    print(message)


# Turn the coverage table into a presence/absence table: every coverage value
# strictly above the --covfliter threshold becomes "1", everything else "0".
covfliter = int(gosome.covfliter)
with open(gosome.output+".cov", "r") as goin:
    goinf = goin.readlines()

dicg = collections.OrderedDict()

# The header is always the first line of the .cov file, so skip it by position
# rather than by searching for "Pan", which would also drop any data row whose
# text happens to contain "Pan".
for i in goinf[1:]:
    i = i.split()
    if not i:
        continue
    dicname = i[0]
    # A repeated ID replaces the earlier entry, as before.
    dicg[dicname] = [i[1], i[2]]
    dicg[dicname].extend("1" if float(j) > covfliter else "0" for j in i[3:])

with open(gosome.output+".cov"+".hapc", "w") as goout:
    print("ID\tPanPosition\tRefPosition\tHaplotypeComposition\tHaplotype", file=goout)
    for hap, fields in dicg.items():
        calls = fields[2:]
        # ID, both positions and every 0/1 call, each followed by a tab ...
        print(hap, fields[0], fields[1], *calls, sep="\t", end="\t", file=goout)
        # ... then the calls joined into a single "Hap<digits>" label.
        print("Hap" + "".join(calls), file=goout)

print("Hap caculation done!")
print("Your ouput is "+gosome.output+".cov"+".hapc")

# Optional last step: with "-create yes", call the external bestred.py script
# with the pan-genome, the .goc guide and the haplotype table; its output file
# is <output>.cov.hapc.fasta.
if gosome.createfasta == "yes":
    print("Go to create a optimal reference genome for short read")
    naa = os.system("bestred.py -i "+panin+" -g "+gocguide+" -hap "+gosome.output+".cov"+".hapc"+" -o "+ gosome.output+".cov"+".hapc.fasta" )
    if naa == 0:
        print("Success!")
        print("Your optimal reference ouput is "+gosome.output+".cov"+".hapc.fasta")
    else:
        # Report the failure instead of finishing silently.
        print("Create optimal reference error!")