# create_bkg_seqs.py

#!/usr/bin/env python3

"""
Create background sequences from input FASTA files.
"""

__author__ = "Akshay Paropkari"
__version__ = "0.3.4"


import argparse
from os import mkdir
from os.path import abspath, exists, isfile, join
from random import choice, randint, random
from sys import exit
from time import strftime

from utils import (
    dna_iupac_codes,
    gc_len_matched_bkg_seq_gen,
    get_transmat,
    parse_fasta,
    random_dna,
)

try:
    from rpy2.robjects.packages import importr
except ImportError:
    exit("\nPlease install rpy2 package.\n")
else:
    try:
        dnashaper = importr("DNAshapeR")
    except Exception:
        exit("\nPlease install bioconductor-dnashaper package.\n")
try:
    import pandas as pd
except ImportError:
    exit("\nPlease install pandas package\n")


def handle_program_options():
    """
    Build the command-line parser for this script and parse ``sys.argv``.

    Positional arguments, in order: the foreground FASTA file, the transcription
    factor name (restricted to the listed choices), one or more genome FASTA
    files and the output directory. Optional flags are
    ``-m/--mononucleotide_shuffle`` (boolean switch) and ``-t/--tolerance``
    (integer, default 1).

    Returns the ``argparse.Namespace`` holding ``fg_fasta_file``,
    ``protein_name``, ``genome_fasta_files`` (list of str), ``output_dir``,
    ``mononucleotide_shuffle`` and ``tolerance``.
    """
    parser = argparse.ArgumentParser(
        description="Using foreground sequences, generate background sequences. "
        "Background sequences will be generated firstly by matching foreground motif "
        "GC-percent and length. Secondly, the foreground sequences will be shuffled to "
        "keep dinucleotide composition constant.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "fg_fasta_file",
        metavar="/path/to/true_binding_sequences.fasta",
        type=str,
        help="Path to foreground/true positive sequence dataset FASTA "
        "format file [REQUIRED]",
    )
    parser.add_argument(
        "protein_name",
        type=str,
        choices=["bcr1", "brg1", "efg1", "ndt80", "rob1", "tec1"],
        help="Specify the name of transcription factor. Please see the "
        "list of valid choices for this parameter [REQUIRED]",
    )
    parser.add_argument(
        "genome_fasta_files",
        type=str,
        nargs="+",
        metavar="/path/to/other_organism_CDS_exonic_sequences.fasta",
        # Adjacent literals are concatenated verbatim, so every fragment that
        # continues a sentence must carry its own separating space.
        help="Specify path to one or more genome files to use as template"
        " from which to generate random background sequences. These genome "
        "file(s) must be FASTA file with very low probability of "
        "containing sequences with binding motifs. For example, these "
        "files can be FASTA file of exonic regions of non-related "
        "species. Please do not supply gzipped files. [REQUIRED]",
    )
    parser.add_argument(
        "output_dir",
        type=str,
        metavar="/path/to/descriptive_output_folder_name",
        help="Specify a directory to save background sequence data. [REQUIRED]",
    )
    parser.add_argument(
        "-m",
        "--mononucleotide_shuffle",
        action="store_true",
        help="Supply this parameter to generate mononucleotide shuffled background "
        "sequences. Mononucleotide shuffled sequences will not be generated by default. "
        "Write `-m` to turn on this parameter.",
    )
    parser.add_argument(
        "-t",
        "--tolerance",
        type=int,
        default=1,
        help="Percent tolerance allowed for matching GC content of "
        "background sequence with foreground sequence. The default value "
        "is one percent difference between background and foreground "
        "sequence. A value of zero will increase execution time for this"
        " script.",
    )
    return parser.parse_args()


##########################################################################################
# altschulEriksonDinuclShuffle.py
# P. Clote, Oct 2003


def computeCountAndLists(s):
    """
    Tally mono- and dinucleotide occurrences of the sequence ``s``.

    ``s`` is upper-cased before counting. Returns a 3-tuple:
      * dict mapping each of A/C/G/T to its number of occurrences,
      * nested dict ``first -> second -> count`` of (overlapping) dinucleotides,
      * dict mapping each of A/C/G/T to the list of nucleotides that directly
        follow it, in order of appearance.

    An empty string raises IndexError; a character outside A/C/G/T in a
    sequence longer than one base raises KeyError.
    """
    # str.count() only counts non-overlapping words (e.g. "UU" occurs once in
    # "UUU" according to count), hence adjacent positions are walked explicitly.
    alphabet = ("A", "C", "G", "T")
    seq = s.upper()
    mono_counts = dict.fromkeys(alphabet, 0)
    di_counts = {first: dict.fromkeys(alphabet, 0) for first in alphabet}
    successors = {nt: [] for nt in alphabet}

    # The first base has no predecessor, so it is counted up front; every
    # other base is counted as the second member of a pair below.
    mono_counts[seq[0]] = 1
    for first, second in zip(seq, seq[1:]):
        successors[first].append(second)
        mono_counts[second] += 1
        di_counts[first][second] += 1

    # Sanity checks on the totals.
    assert sum(mono_counts.values()) == len(seq)
    assert sum(sum(row.values()) for row in di_counts.values()) == len(seq) - 1
    return mono_counts, di_counts, successors


def chooseEdge(x, dinuclCnt):
    """
    Randomly pick a nucleotide to follow ``x`` and consume that dinucleotide.

    The choice is weighted by the counts in ``dinuclCnt[x]`` (keys A, C, G, T).
    The count of the selected dinucleotide is decremented in place and the
    selected nucleotide is returned. Raises ZeroDivisionError when all four
    counts for ``x`` are zero.
    """
    followers = dinuclCnt[x]
    total = followers["A"] + followers["C"] + followers["G"] + followers["T"]
    draw = random()

    # Walk the cumulative distribution over A, C, G; T takes whatever is left.
    cumulative = 0
    for nt in ("A", "C", "G"):
        cumulative += followers[nt]
        if draw < float(cumulative) / float(total):
            followers[nt] -= 1
            return nt
    followers["T"] -= 1
    return "T"


def connectedToLast(edgeList, nuclList, lastCh):
    """
    Check that every nucleotide in ``nuclList`` can reach ``lastCh`` by
    following the edges in ``edgeList``.

    ``edgeList`` holds ``[source, target]`` pairs. Returns 1 when every
    nucleotide other than ``lastCh`` is marked as reaching it, otherwise 0.
    """
    reaches_last = dict.fromkeys(nuclList, 0)

    # Seed: sources whose edge points straight at the last character.
    for edge in edgeList:
        if edge[1] == lastCh:
            reaches_last[edge[0]] = 1

    # Propagate backwards along the edges with two sweeps.
    # NOTE(review): two sweeps look sufficient for at most three edges over a
    # four-letter alphabet -- confirm before reusing with a larger alphabet.
    for _ in range(2):
        for edge in edgeList:
            if reaches_last[edge[1]] == 1:
                reaches_last[edge[0]] = 1

    stranded = [nt for nt in nuclList if nt != lastCh and reaches_last[nt] == 0]
    return 0 if stranded else 1


def eulerian(s):
    """
    Draw one candidate set of "last edges" for the dinucleotide shuffle of ``s``.

    For every nucleotide present in ``s`` except the final one, a successor is
    drawn with ``chooseEdge``; ``connectedToLast`` then reports whether those
    edges all lead to the final nucleotide.

    The sequence is upper-cased first so that the membership test and the
    last character agree with ``computeCountAndLists``, which upper-cases its
    input as well (lower-case input previously produced an empty nucleotide
    list and a trivially "ok" result).

    Returns ``(ok, edgeList, nuclList, lastCh)`` where ``ok`` is 1 or 0,
    ``edgeList`` is a list of ``[source, target]`` pairs, ``nuclList`` lists
    the nucleotides occurring in ``s`` (in A, C, G, T order) and ``lastCh`` is
    the final nucleotide of the upper-cased sequence.
    """
    s = s.upper()
    _, dinuclCnt, _ = computeCountAndLists(s)

    # nucleotides appearing in s
    nuclList = [x for x in ("A", "C", "G", "T") if x in s]

    # one randomly chosen outgoing edge per vertex other than the last one
    lastCh = s[-1]
    edgeList = [[x, chooseEdge(x, dinuclCnt)] for x in nuclList if x != lastCh]

    ok = connectedToLast(edgeList, nuclList, lastCh)
    return ok, edgeList, nuclList, lastCh


def shuffleEdgeList(L):
    """
    Shuffle the list ``L`` in place and return it.

    Walks a shrinking barrier from the end of the list: an index below the
    barrier is drawn with ``random()`` and its element is swapped into the
    last unfixed position.
    """
    for barrier in range(len(L), 1, -1):
        z = int(random() * barrier)
        L[z], L[barrier - 1] = L[barrier - 1], L[z]
    return L


def dinuclShuffle(s):
    """
    Return a dinucleotide-preserving shuffle of ``s`` with 2-nt flanks.

    The first and last nucleotides of the sequence are kept in place; the
    interior is rebuilt by walking shuffled successor lists, after fixing one
    "last edge" per nucleotide obtained from ``eulerian``. The result is
    wrapped between two ``random_dna(2, False)`` flanks.

    Sequences shorter than two nucleotides cannot be shuffled and are returned
    unchanged between the flanks (previously a single base was duplicated and
    an empty sequence raised IndexError).
    """
    # Resolve the sequence once.
    # NOTE(review): dna_iupac_codes presumably expands IUPAC ambiguity codes
    # into concrete candidate sequences -- confirm against utils.
    candidates = dna_iupac_codes(s)
    if len(candidates) != 1:
        s = choice(candidates)

    # computeCountAndLists works on the upper-cased sequence, so the path
    # below must index its successor lists with upper-case characters too.
    s = s.upper()

    if len(s) < 2:
        return random_dna(2, False) + s + random_dna(2, False)

    # Redraw the last edges until they all lead to the final nucleotide.
    ok = 0
    while not ok:
        ok, edgeList, nuclList, _ = eulerian(s)
    _, _, List = computeCountAndLists(s)

    # remove last edges from each vertex list, shuffle, then add back
    # the removed edges at end of vertex lists.
    for x, y in edgeList:
        List[x].remove(y)
    for x in nuclList:
        shuffleEdgeList(List[x])
    for x, y in edgeList:
        List[x].append(y)

    # construct the eulerian path
    L = [s[0]]
    prevCh = s[0]
    for _ in range(len(s) - 2):
        ch = List[prevCh].pop(0)
        L.append(ch)
        prevCh = ch
    L.append(s[-1])
    t = "".join(L)
    return random_dna(2, False) + t + random_dna(2, False)


##########################################################################################


def mono_nt_shuffle(seqA: str) -> str:
    """
    Return a randomly permuted copy of seqA between two 2-nt flanks.

    The permutation is built incrementally: each incoming nucleotide is
    appended and then swapped with a uniformly drawn position of the list
    built so far (possibly its own). The result is wrapped between two
    ``random_dna(2, False)`` flanks.
    """
    permuted = []
    for filled, nt in enumerate(seqA):
        # randint is inclusive, so the new element may stay where it is
        slot = randint(0, filled)
        permuted.append(nt)
        permuted[slot], permuted[filled] = permuted[filled], permuted[slot]
    core = "".join(permuted)
    left_flank = random_dna(2, False)
    right_flank = random_dna(2, False)
    return left_flank + core + right_flank


def _log(message):
    """Print ``message`` prefixed with the current date/time stamp."""
    # The stamp is formatted on its own so that '%' characters inside file
    # paths are never interpreted as strftime directives.
    print(strftime("%x %X") + " | " + message)


def main():
    """
    Generate background sequences for the supplied foreground FASTA file.

    Steps, in order:
      1. parse the command-line options and load the foreground sequences into
         a DataFrame with ``location`` and ``sequence`` columns;
      2. create the output directory if needed;
      3. optionally add mononucleotide-shuffled sequences (``-m``);
      4. add dinucleotide-shuffled sequences;
      5. add GC-content/length matched sequences using transition matrices
         computed from the supplied genome FASTA files;
      6. write all background sequences to
         ``<output_dir>/<protein_name>_bkg_seqs.fasta``;
      7. call DNAshapeR ``getDNAShape`` on that file for each shape feature.

    Exits with an error message (non-zero status) when the foreground FASTA
    file does not exist.
    """
    # Local import: makedirs also creates missing parent directories.
    from os import makedirs

    print("#" * 90, "\n\n", strftime("%x %X | START BACKGROUND SEQUENCES GENERATION\n"))
    args = handle_program_options()

    # Explicit check instead of assert: asserts are stripped under `python -O`.
    if not isfile(args.fg_fasta_file):
        exit(
            "Error with input foreground FASTA file(s). Please check supplied FASTA "
            f"file - {args.fg_fasta_file}"
        )

    # parse foreground sequence FASTA file
    _log(f"Parsing foreground sequence FASTA file {args.fg_fasta_file}")
    fg_seqs_df = (
        pd.DataFrame.from_dict(
            {header: seq for header, seq in parse_fasta(args.fg_fasta_file)},
            orient="index",
            columns=["sequence"],
        )
        .reset_index()
        .rename(columns={"index": "location"})
    )

    # The output path is needed whether or not the directory already existed.
    outdir = abspath(args.output_dir)
    makedirs(outdir, exist_ok=True)
    outfnh = join(outdir, f"{args.protein_name}_bkg_seqs.fasta")

    if args.mononucleotide_shuffle:
        ################################################
        # MONONUCLEOTIDE SHUFFLED FOREGROUND SEQUENCES #
        ################################################
        _log("Generating mononucleotide shuffled background sequences")
        fg_seqs_df["mononuc_shuffled_bkg_seq"] = fg_seqs_df["sequence"].apply(
            mono_nt_shuffle
        )

    ##############################################
    # DINUCLEOTIDE SHUFFLED FOREGROUND SEQUENCES #
    ##############################################
    _log("Generating dinucleotide shuffled background sequences")
    fg_seqs_df["dinuc_shuffled_bkg_seq"] = fg_seqs_df["sequence"].apply(dinuclShuffle)

    ################################################################
    # GC CONTENT AND LENGTH MATCHED BACKGROUND SEQUENCE GENERATION #
    ################################################################
    # parse unrelated genome FASTA files containing CDS sequences
    _log("Calculating transition probability for non-Candida genomes")
    degree = 2
    # Keyed by the part of the file name before the first underscore; files
    # sharing that prefix overwrite each other in this dict.
    cds_transmat = {
        f.split("/")[-1].split("_")[0]: get_transmat(f, degree)
        for f in args.genome_fasta_files
    }
    _log("Generating GC content and length matched background sequences")
    # NOTE(review): args.tolerance is parsed but not forwarded anywhere in this
    # function -- confirm whether gc_len_matched_bkg_seq_gen should receive it.
    fg_seqs_df["gc_len_matched_bkg"] = fg_seqs_df["sequence"].apply(
        gc_len_matched_bkg_seq_gen, args=(cds_transmat, list(fg_seqs_df["sequence"]),)
    )

    _log(f"Writing background sequences to {outfnh}")
    # Shuffle the row order reproducibly before writing.
    fg_seqs_df = fg_seqs_df.sample(frac=1.0, random_state=39)
    with open(outfnh, "w") as outf:
        # One FASTA record per (background column, foreground location) pair.
        for bkg, data in (
            fg_seqs_df.set_index("location", verify_integrity=True)
            .drop(columns=["sequence"])
            .to_dict()
            .items()
        ):
            for location, seq in data.items():
                outf.write(f">{bkg + '_for_' + location}\n{seq}\n")

    ######################################################
    # CALCULATE DNA SHAPE VALUE FOR BACKGROUND SEQUENCES #
    ######################################################
    _log(f"Writing background sequences shape data to {outdir}")
    for shape in ["MGW", "Roll", "HelT", "ProT", "EP"]:
        dnashaper.getDNAShape(outfnh, shape)

    print(strftime("\n%x %X | END BACKGROUND SEQUENCES GENERATION\n"))


# Run the pipeline only when executed as a script; main()'s return value is
# handed to SystemExit as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())