Skip to content

Commit

Permalink
Support for single-end data
Browse files Browse the repository at this point in the history
  • Loading branch information
mcmero committed Nov 1, 2021
1 parent 88b2868 commit 603a57d
Show file tree
Hide file tree
Showing 3 changed files with 325 additions and 1 deletion.
310 changes: 310 additions & 0 deletions MINTIE_SE.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,310 @@
/*
__ __ ___ _ _ _____ ___ _____
| \/ |_ _| \ | |_ _|_ _| ____|
| |\/| || || \| | | | | || _|
| | | || || |\ | | | | || |___
|_| |_|___|_| \_| |_| |___|_____|
Method for Inferring Novel Transcripts and Isoforms using Equivalence classes
Author: Marek Cmero
*/

code_base = file(bpipe.Config.config.script).parentFile.absolutePath
load code_base + "/tools.groovy"
load code_base + "/references.groovy"

// Fallback values for pipeline parameters. A value is only installed when the
// parameter is not already present in the script binding, so anything supplied
// by the user takes precedence.
def pipeline_defaults = [
    fastqCaseFormat      : 'cases/%_R*.fastq.gz',
    fastqControlFormat   : 'controls/%_R*.fastq.gz',
    assemblyFasta        : '',
    run_de_step          : 'true',
    splice_motif_mismatch: 0
]
pipeline_defaults.each { param_name, fallback ->
    if (!binding.variables.containsKey(param_name)) {
        binding.setVariable(param_name, fallback)
    }
}

// Stage: run the configured $dedupe tool on the branch's gzipped FASTQ input.
// The result is written to <branch name>/<branch name>.1.fastq.gz.
fastq_dedupe = {
    from("*.gz") {
        output.dir = branch.name
        def deduped_fastq = branch.name + '.1.fastq.gz'
        produce(deduped_fastq) {
            exec """
                $dedupe $input.gz $output
            """, "fastq_dedupe"
        }
    }
}

// Stage: quality-trim the single-end reads with Trimmomatic (SE mode) and gzip
// the result to <branch name>/trim1.fastq.gz.
//
// When a pre-built assembly is supplied (assemblyFasta != ''), trimming is
// skipped and an empty placeholder is created so downstream stages still find
// the expected output file.
trim = {
    output.dir = branch.name
    produce('trim1.fastq.gz') {
        if (assemblyFasta != '') {
            // no need to trim if assembly provided
            // Only $output1 is touched: this single-end stage declares exactly
            // one output, so there is no second output to create.
            exec """
                touch $output1
            """
        } else {
            // Trimmomatic writes the uncompressed file ($output1.prefix), which
            // is then gzipped to yield the declared .gz output.
            exec """
                $trimmomatic SE -threads $threads -phred$scores $input1
                $output1.prefix
                LEADING:$minQScore TRAILING:$minQScore MINLEN:$min_read_length ;
                gzip $output1.prefix ;
            """, "trim"
        }
    }
}

// Stage: de novo assemble the single-end reads of the current sample into
// <sample>/<sample>_denovo_filt.fasta.
//
// One of four paths is taken:
//   * assemblyFasta set      -> symlink the user-provided assembly
//   * assembler == 'trinity' -> Trinity, reads passed with --single
//   * assembler == 'spades'  -> rnaSPAdes, reads passed with -s
//   * anything else          -> SOAPdenovo-Trans over every k in Ks, followed by
//                               dedupe, fasta formatting and a minimum contig
//                               length filter
//
// Only $input1 is used: this is the single-end pipeline, so there is no second
// read file to hand to the assembler.
assemble = {
    def sample_name = branch.name
    // SOAP is run once per k-mer size in a shell `for` loop, which needs the
    // sizes space separated rather than comma separated.
    def Ks_for_soap = Ks.toString().contains(',') ? Ks.split(',').join(' ') : Ks
    output.dir = sample_name
    produce(sample_name + '_denovo_filt.fasta') {
        if (assemblyFasta != '') {
            exec """
                ln -s $assemblyFasta $output ;
            """
        } else if (assembler.toLowerCase() == 'trinity') {
            // The link target is relative to the directory holding the link
            // (the sample directory), hence no $sample_name prefix.
            exec """
                $Trinity --seqType fq --max_memory ${assembly_mem}G --output $sample_name/trinity_assembly \
                --single $input1 --CPU $threads ;
                ln -s trinity_assembly/Trinity.fasta $output ;
            """, "assemble"
        } else if (assembler.toLowerCase() == 'spades') {
            exec """
                $rnaspades -s $input1 -k $Ks -t $threads -m $assembly_mem -o $sample_name/SPAdes_assembly ;
                ln -s SPAdes_assembly/contigs.fasta $output ;
            """, "assemble"
        } else {
            // 1. scan the reads for the minimum and maximum read length
            //    (minimum is capped at min_read_length),
            // 2. write a SOAP config pointing at the reads,
            // 3. assemble at each k no larger than the minimum read length,
            //    prefixing contig names with the k used,
            // 4. dedupe, reformat and length-filter the pooled contigs,
            // 5. remove the output and report an error if nothing survived.
            exec """
                rlens=`gunzip -c $input1 \
                | awk -v mrl=$min_read_length 'BEGIN {minlen = mrl; maxlen = 0} {
                if (NR % 4 == 2) {
                rlen = length(\$1) ;
                if (rlen > maxlen) {maxlen = rlen}
                if (rlen < minlen) {minlen = rlen}
                }} END {print minlen" "maxlen}'` ;
                min_rlen=\${rlens% *} ;
                max_rlen=\${rlens#* } ;
                if [ ! -d $output.dir/SOAPassembly ]; then
                mkdir $output.dir/SOAPassembly ;
                fi ;
                cd $output.dir/SOAPassembly ;
                echo \"max_rd_len=\$max_rlen\" > config.config ;
                echo -e \"[LIB]\\nq=../../$input1\" >> config.config ;
                if [ -e SOAP.fasta ]; then rm SOAP.fasta ; fi ;
                for k in $Ks_for_soap ; do
                if [ \$k -gt \$min_rlen ]; then
                echo "WARNING: Kmer size \$k exceeds minimum read length \${min_rlen}. Please double check parameters." ;
                else
                $soapdenovotrans pregraph -s config.config -o outputGraph_\$k -K \$k -p $threads ;
                $soapdenovotrans contig -g outputGraph_\$k ;
                cat outputGraph_\$k.contig | sed "s/^>/>k\${k}_/g" >> SOAP.fasta ;
                fi ;
                done ;
                cd ../../ ;
                $dedupe in=$sample_name/SOAPassembly/SOAP.fasta out=stdout.fa threads=$threads overwrite=true | \
                $fasta_formatter | \
                awk '!/^>/ { next } { getline seq } length(seq) > $min_contig_len { print \$0 "\\n" seq }' > $output ;
                if [ ! -s $output ] ; then
                rm $output ;
                echo "ERROR: de novo assembled contigs fasta file is empty." ;
                echo "Please check paths for SOAPdenovoTrans, dedupe and fasta" ;
                echo "formatter are correct, and their dependencies are installed." ;
                fi ;
            """, "assemble"
        }
    }
}

// Stage: concatenate the reference transcriptome ($trans_fasta) with the input
// fasta and build a salmon index over the combined sequences in
// <branch name>/all_fasta_index.
create_salmon_index = {
    def case_name = branch.name
    def index_dir = case_name + "/all_fasta_index"
    output.dir = index_dir
    def combined_fasta = output.dir + "/" + case_name + ".fasta"
    produce(combined_fasta, '*.bin') {
        exec """
            cat $trans_fasta $input.fasta > $output1 ;
            $salmon index -t $output1 -i $index_dir -p $threads ;
        """, "create_salmon_index"
    }
}

// Stage: quantify single-end reads against the sample's salmon index and dump
// equivalence classes (produces eq_classes.txt* under .../aux_info).
//
// The stage is used twice, selected by the `type` variable:
//   * "controls": output goes to
//       <case>/<controls dir>/<control>_salmon_out/aux_info
//     and the index is reached via ../all_fasta_index
//   * otherwise:  output goes to <case>/salmon_out/aux_info
//
// The command changes directory before running salmon, so the read file is
// passed as an absolute path. Inputs that are already absolute are used as
// given; relative inputs are resolved against the working directory.
run_salmon = {
    def workingDir = System.getProperty("user.dir");
    // Only the first input is quantified (single-end data).
    def rf = inputs.split().collect {
        new File(it.toString()).isAbsolute() ? it.toString() : workingDir+"/$it"
    }[0]
    def salmon_index="all_fasta_index"
    def base_outdir = "salmon_out"
    // Name of the directory that directly contains the control fastq files.
    def controls_dir = fastqControlFormat.split("/")[-2]
    def sample_name = branch.name

    if(type == "controls"){
        // The control branch is nested two levels below the case branch, so
        // the case sample's name comes from the grandparent branch.
        sample_name = branch.parent.parent.name
        def control_name = branch.name

        output.dir = sample_name + "/" + controls_dir + "/" + control_name + "_salmon_out/aux_info"
        base_outdir = control_name + "_salmon_out"
        salmon_index = "../all_fasta_index"
    } else {
        output.dir = sample_name + "/salmon_out/aux_info"
    }

    produce("eq_classes.txt*"){
        // cd two levels up from aux_info so that $salmon_index and
        // $base_outdir resolve as relative paths.
        exec """
            cd $output.dir/../.. ;
            $salmon quant --dumpEq --seqBias --validateMappings --hardFilter -i $salmon_index -l A -r $rf -p $threads -o $base_outdir
        """, "run_salmon"
    }
}

// Stage: merge the per-sample equivalence class files into
// <case>/ec_count_matrix.txt.
//
// Sample labels are taken from each input path: the directory three levels from
// the end, with everything from "_salmon_out" onwards dropped. The first label
// is then overwritten with the case sample's name (the first input is expected
// to be the case; the rest are controls).
create_ec_count_matrix = {
    def case_name = branch.name
    def labels = inputs.split().collect { eq_path ->
        def salmon_dir = eq_path.split('/')[-3]
        salmon_dir.split('_salmon_out')[0]
    }
    labels.set(0, case_name) // case sample, rest are controls
    def sample_names = labels.join(',')
    output.dir = case_name
    produce("ec_count_matrix.txt") {
        exec """
            $python $code_base/DE/create_ec_count_matrix.py $inputs $sample_names $output1 ;
        """, "create_ec_count_matrix"
    }
}

// Stage: produce <case>/eq_classes_de.txt.
//
// With run_de_step true, compare_eq_classes.R is run on the input using the
// FDR / minCPM / minLogFC thresholds. With run_de_step false,
// get_novel_contigs.py is run instead against the sample's assembled contigs.
run_de = {
    def case_name = branch.name
    def skip_de = !run_de_step.toBoolean()
    output.dir = case_name
    produce("eq_classes_de.txt") {
        if (skip_de) {
            // NOTE(review): this command never references $output - confirm
            // how get_novel_contigs.py creates eq_classes_de.txt in this mode.
            exec """
                $python $code_base/DE/get_novel_contigs.py $input $trans_fasta $output.dir/${case_name}_denovo_filt.fasta
            """, "run_de"
        } else {
            exec """
                ${R}script $code_base/DE/compare_eq_classes.R $case_name $input $trans_fasta $output --FDR=$fdr --minCPM=$min_cpm --minLogFC=$min_logfc
            """, "run_de"
        }
    }
}

// Stage: filter the sample's assembled contigs (<case>_denovo_filt.fasta) using
// the "contig" column of the input table, writing <case>/de_contigs.fasta.
filter_on_significant_ecs = {
    def case_name = branch.name
    output.dir = case_name
    produce("de_contigs.fasta") {
        exec """
            $python ${code_base}/util/filter_fasta.py $output.dir/${case_name}_denovo_filt.fasta $input.txt --col_id contig > $output1 ;
        """, "filter_sig_ecs"
    }
}

// Stage: align the input contig fasta to the genome with gmap, writing SAM to
// <branch name>/aligned_contigs_against_genome.sam.
align_contigs_against_genome = {
    output.dir = branch.name
    produce('aligned_contigs_against_genome.sam') {
        exec """
            $gmap -D $gmap_refdir -d $gmap_genome -f samse -t $threads -x $min_gap --max-intronlength-ends=500000 -n 0 $input.fasta > $output
        """, "align_contigs_against_genome"
    }
}

// Stage: run annotate_contigs.py on the aligned contigs. The VCF is captured
// from the script's stdout; the BAM and TSV paths are passed as arguments, and
// a log is written to <case>/annotate.log.
annotate_contigs = {
    def case_name = branch.name
    output.dir = case_name
    produce("annotated_contigs.vcf", "annotated_contigs_info.tsv", "annotated_contigs.bam") {
        exec """
            $python $code_base/annotate/annotate_contigs.py \
            $case_name $input.bam \
            $ann_info $tx_annotation \
            $output.bam $output.tsv \
            --minClip $min_clip \
            --minGap $min_gap \
            --minMatch $min_match \
            --log $output.dir/annotate.log > $output.vcf
        """, "annotate_contigs"
    }
}

// Stage: refine the contig annotations with refine_annotations.py, index the
// resulting BAM, then filter the input fasta by the "contig_id" column of the
// refined TSV. The VCF is captured from the refinement script's stdout; the
// script is also given $output.prefix as an argument.
refine_contigs = {
    output.dir = branch.name
    produce("novel_contigs.vcf", "novel_contigs_info.tsv", "novel_contigs.bam", "novel_contigs.fasta") {
        exec """
            $python $code_base/annotate/refine_annotations.py \
            $input.tsv $input.vcf $input.bam $tx_annotation \
            $genome_fasta $output.prefix \
            --minClip $min_clip \
            --minGap $min_gap \
            --mismatches $splice_motif_mismatch \
            --log $output.dir/refine.log > $output.vcf ;
            $samtools index $output.bam ;
            $python ${code_base}/util/filter_fasta.py $input.fasta $output.tsv --col_id contig_id > $output.fasta ;
        """, "refine_contigs"
    }
}

// Stage: run estimate_VAF.R with the sample's EC count matrix, its salmon
// quant.sf, the input TSV, the reference transcriptome and the tx2gene mapping,
// writing <case>/vaf_estimates.txt.
calculate_VAF = {
    def case_name = branch.name
    output.dir = case_name
    produce("vaf_estimates.txt") {
        exec """
            ${R}script $code_base/annotate/estimate_VAF.R ${case_name}/ec_count_matrix.txt ${case_name}/salmon_out/quant.sf $input.tsv $trans_fasta $tx2gene $output
        """, "calculate_VAF"
    }
}

// Stage: run post_process.py to produce <case>/<case>_results.tsv (captured
// from stdout), with a log at <case>/postprocess.log.
//
// The optional --gene_filter / --var_filter arguments are only passed when the
// corresponding parameter is non-empty; var_filter is converted from a comma
// separated list to a space separated one first.
post_process = {
    def case_name = branch.name
    def spaced_var_filter = var_filter.split(',').join(' ')
    def gf_arg = ''
    if (gene_filter != '') {
        gf_arg = '--gene_filter ' + gene_filter
    }
    def vf_arg = ''
    if (spaced_var_filter != '') {
        vf_arg = '--var_filter ' + spaced_var_filter
    }
    output.dir = case_name
    produce(case_name + '_results.tsv') {
        exec """
            $python $code_base/annotate/post_process.py \
            $case_name \
            $input.tsv \
            $input.fasta \
            $case_name/eq_classes_de.txt \
            $case_name/vaf_estimates.txt \
            $gf_arg \
            $vf_arg \
            --log $output.dir/postprocess.log > $output
        """
    }
}

// Stage: coordinate-sort the input SAM into a BAM with samtools and index it.
// The BAM is written to the directory that holds the input SAM.
sort_and_index_bam = {
    output.dir = new File(input.sam).parentFile
    transform('sam') to ('bam') {
        exec """
            $samtools sort -@ $threads -m $sort_ram $input.sam -o $output ;
            $samtools index $output
        """, "sort_and_index_bam"
    }
}

// Pipeline definition.
//
// Input files are split into branches by fastqCaseFormat; each case branch runs
// the stages below, chained with `+`. After the salmon index is built, the
// inputs are split again: run_salmon runs with type "case" on files matching
// fastqCaseFormat and with type "controls" on files matching
// fastqControlFormat. The remaining stages follow the salmon step within the
// case branch.
run { fastqCaseFormat * [ fastq_dedupe +
trim +
assemble +
create_salmon_index +
[ fastqCaseFormat * [ run_salmon.using(type: "case") ],
fastqControlFormat * [ run_salmon.using(type:"controls") ] ] +
create_ec_count_matrix +
run_de +
filter_on_significant_ecs +
align_contigs_against_genome +
sort_and_index_bam +
annotate_contigs +
refine_contigs +
calculate_VAF +
post_process ]
}
3 changes: 2 additions & 1 deletion mintie
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ do
echo -e "\nusage (setup references): mintie -r "
echo -e "\nusage (setup test data): mintie -t "
echo -e "\nusage (wrapper): mintie -w -p [params.txt] cases/*.fastq.gz controls/*.fastq.gz "
echo -e "\nusage (direct):\n export \$MINTIEDIR=$MINTIE_HOME;\n bpipe run -@$MINTIEDIR/params.txt [ <other bpipe options >] \n\t \$MINTIEDIR/MINTIE.groovy cases/*.fastq.gz controls/*fastq.gz"
echo -e "\nusage (direct):\n export \$MINTIEDIR=$MINTIE_HOME;\n bpipe run -@\$MINTIEDIR/params.txt [ <other bpipe options >] \n\t \$MINTIEDIR/MINTIE.groovy cases/*.fastq.gz controls/*fastq.gz"
echo -e "\nusage (direct single-end):\n export \$MINTIEDIR=$MINTIE_HOME;\n bpipe run -@\$MINTIEDIR/params.txt [ <other bpipe options >] \n\t \$MINTIEDIR/MINTIE_SE.groovy cases/*.fastq.gz controls/*fastq.gz"
echo ""
exit 0
shift
Expand Down
13 changes: 13 additions & 0 deletions test/run_test_se.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/sh
# Run the single-end MINTIE pipeline on the bundled test data.
#
# references.groovy is temporarily rewritten so that trans_fasta points at the
# test control transcriptome. It is put back when the script exits, including
# when the pipeline fails or the run is interrupted. The script's exit status
# is the exit status of bpipe, so a failed test run is visible to the caller.

# Put references.groovy back; runs on every exit path via the EXIT trap.
restore_references() {
    if [ -f backup_references.groovy ]; then
        mv backup_references.groovy references.groovy
    fi
}

# modify reference to control fasta
grep -v allvars-control references.groovy > backup_references.groovy
trap restore_references EXIT
# Turn signals into a normal exit so the EXIT trap also runs when interrupted.
trap 'exit 1' HUP INT TERM
echo "trans_fasta=\"$PWD/test/data/allvars-control.fasta\"" > tmp.txt
cat backup_references.groovy tmp.txt > references.groovy ; rm tmp.txt

# Single-end test: use only the R1 file of each sample.
cases=`ls test/data/cases/*gz | grep R1`
controls=`ls test/data/controls/*gz | grep R1`
bpipe @test/test_params.txt MINTIE_SE.groovy $cases $controls
status=$?

# restore original references (done by the EXIT trap) and report bpipe's result
exit $status

0 comments on commit 603a57d

Please sign in to comment.