Skip to content

Latest commit

 

History

History
230 lines (184 loc) · 6.92 KB

readme.md

File metadata and controls

230 lines (184 loc) · 6.92 KB

Workflow

De novo TE anno

Identify raw TEs

type=tir #or ltr helitron
perl ~/las/git_bin/EDTA/EDTA_raw.pl --genome $genome --species Maize --type $type -t $threads

Generate de novo TE library for each genome and perform initial annotation

perl ~/las/git_bin/EDTA/EDTA.pl --genome $genome --species Maize -t $threads --cds $cds --curatedlib maizeTE02052020 --anno 1

Make pan-genome TE library

Filter out single-copy annotations

# $genome.out is the RepeatMasker .out file generated from the last step, located in $genome.mod.EDTA.anno/
for i in `cat list.cds|awk '{print $1}'`; do
    perl ~/las/git_bin/EDTA/util/output_by_list.pl \
      1 \
     <(perl -nle 's/#.*//; print $_' $i.mod.EDTA.TElib.novel.fa) \
     1 \
     <(perl ~/las/git_bin/EDTA/util/find_flTE.pl $i.mod.out | \
     awk '{print $10}'| \
     sort| \
     uniq -c |\
     perl -nle 'my ($count, $id) = (split); if ($id=~/LTR/){next if $count<=2} else {next if $count ==1} print $_' |\
     awk '{print $2}') -FA > $i.mod.EDTA.TElib.novel.fa.real &
done

get classification info and convert #unknown to #DNA/Helitron

for j in *mod.EDTA.TElib.novel.fa; do 
    for i in `cat $j.real`; do 
        grep $i $j; 
    done| \
    perl -nle 's/#unknown/#DNA\/Helitron/; print $_' > $j.real.ori & 
done

aggregate novel TE libraries

i=0
for j in *real.ori; do
  i=$(($i+5000));
  perl ~/las/git_bin/EDTA/util/rename_TE.pl $j $i;
done > NAM.EDTA1.8.0.EDTA.TElib.novel.fa.raw
perl ~/las/git_bin/EDTA/util/rename_TE.pl NAM.EDTA1.8.0.EDTA.TElib.novel.fa.raw > NAM.EDTA1.8.0.EDTA.TElib.novel.fa.raw2
mv NAM.EDTA1.8.0.EDTA.TElib.novel.fa.raw2 NAM.EDTA1.8.0.EDTA.TElib.novel.fa.raw

remove redundant

nohup perl ~/las/git_bin/EDTA/util/cleanup_nested.pl \
    -in NAM.EDTA1.8.0.EDTA.TElib.novel.fa.raw \
    -cov 0.95 \
    -minlen 80 \
    -miniden 80 \
    -t 36 &

remove a number of false TEs and rename IDs

RepeatMasker -pa 36 -q -no_is -norna -nolow -div 40 -lib rm.fa -cutoff 225 NAM.EDTA1.8.0.EDTA.TElib.novel.fa.raw.cln
perl ~/las/git_bin/EDTA/util/output_by_list.pl 1 NAM.EDTA1.8.0.EDTA.TElib.novel.fa.raw.cln 1 \
    <(awk '{print $5}' NAM.EDTA1.8.0.EDTA.TElib.novel.fa.raw.cln.out|grep TE) -ex -FA | \
    perl ~/las/git_bin/EDTA/util/rename_TE.pl - > NAM.EDTA1.8.0.EDTA.TElib.novel.fa

make comprehensive TE library

cat maizeTE02052020 NAM.EDTA1.8.0.EDTA.TElib.novel.v2.fa > NAM.EDTA1.8.0.MTEC02052020.TElib.fa

Annotate pan-genome TE

Re-mask all genomes with the pan-genome TE lib

lib=NAM.EDTA1.8.0.MTEC02052020.TElib.fa
RepeatMasker -pa 36 -q -div 40 -lib $lib -cutoff 225 -gff $genome

Re-run EDTA final on each genome

lib=NAM.EDTA1.8.0.MTEC02052020.TElib.fa
perl ~/las/git_bin/EDTA/EDTA.pl \
   --genome $genome \
   --species Maize \
   -t $threads \
   --step final \
   --anno 1 \
   --rmout $genome.out \
   --curatedlib \$lib \
   --cds $cds

calculate LAI

for i in `awk '{print $1}' list.cds`; do
 perl ~/las/git_bin/LTR_retriever/LAI \
    -genome $(echo $i|perl -nle 's/\..*//; print $_')/$i \
    -intact $(echo $i|perl -nle 's/\..*//; print $_')/$i.mod.EDTA.raw/LTR/$i.mod.pass.list \
    -all $i.out \
    -q -iden 94.854 \
    -totLTR 76.34 \
    -t 2 &
done

Summarize pan-genome TE

aggregate TE summary info

for i in *mod.EDTA.TEanno.sum; do
   cat <(echo $i| perl -nle 's/\..*//; print "$_\t${_}_cp\t${_}_bp\t${_}_pcnt"')\
       <(head -32 $i|grep -v -P "\-\-|=|total|masked"| perl -0777 -ne 's/\s+unknown/\nLTR_unknown/; print $_' |grep %) |\
   perl transpose3.pl -; \
done > NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum

grep summary for each superfamily

cat head <(grep pcnt NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum) > NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.pcnt.txt
cat head <(grep bp NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum) > NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.bp.txt
cat head <(grep cp NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum) > NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.cp.txt

extract family percent

for i in *mod.EDTA.TEanno.sum; do
  perl get_TE_fam_pcnt.pl $i &
done

aggregate into a big table

cat *mod.EDTA.TEanno.sum.fam | perl combine_TE_fam_pcnt.pl pcnt - > NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum.fam
cat *mod.EDTA.TEanno.sum.fam | perl combine_TE_fam_pcnt.pl bp - > NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum.fam.bp
perl -i -nle 's/#/_/g; print $_' NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum.fam
perl -i -nle 's/#/_/g; print $_' NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum.fam.bp

keep TE only

grep -v -P "CL569186.1|AF013103.1|\)n|cent|Cent|telo|knob|TR-1|osed|sela" NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum.fam > NAM.EDTA1.9.0.MTEC02052020.TE.v1.1.anno.sum.fam
grep -v -P "CL569186.1|AF013103.1|\)n|cent|Cent|telo|knob|TR-1|osed|sela" NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.sum.fam.bp > NAM.EDTA1.9.0.MTEC02052020.TE.v1.1.anno.sum.fam.bp

count effective TEs

grep -v -P 'long_terminal_repeat|repeat_region|target_site_duplication' *mod.EDTA.TEanno.gff3 |\
perl -nle 'next unless s/ID=//; \
           my ($cla, $id)=(split)[2,8]; \
           $id=~s/.*; \
           Name=(.*);Classific.*/$1/; \
           $id=~s/;.*//; $id=~s/#/_/; \
           print "$id\t$cla"' |\
grep -v -P "\)n|A-rich|G-rich|begin|position" |\
sort -u > NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.TEfam.list &
grep -v : NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.TEfam.list|wc -l

sum intact size for LTR TIR and Helitron in each genome

for j in `ls *fasta.mod.EDTA.intact.gff3`; do
   echo -n "$j ";
  for i in LTR_ /DT Heli; do
     grep ID $j | grep $i | awk '{print $1"\t"$4"\t"$5"\t"$3}'|\
     perl ~/las/git_bin/EDTA/util/combine_overlap.pl - |\
    perl ~/las/git_bin/EDTA/util/count_mask.pl -;
  done;
done |\
perl -ne 'chomp; print "\n" if /gff/; print "$_\t"' > NAM.EDTA1.9.0.MTEC02052020.intact.sum &
perl -i -nle 'next if /^$/; s/.*\///; s/\..*gff3//; print $_' NAM.EDTA1.9.0.MTEC02052020.intact.sum
cat <(echo "Genome LTR TIR Helitron") NAM.EDTA1.9.0.MTEC02052020.intact.sum > NAM.EDTA1.9.0.MTEC02052020.intact.sum.txt

get intact LTR info

for i in `ls *mod.EDTA.intact.gff3 |grep -v -P 'Ab10|AB10'`; do
  grep LTR_retrotransposon $i |\
  perl -nle 'my ($chr, undef, $supfam, $from, $to, undef, $str, undef, $info)=(split); \
             my $genome = $1 if $chr=~s/^(.*?)_//; \
             my ($id, $classification, $SO, $iden, $motif, $tsd)=($1, $2, $3, $4, $5, $6) if $info=~/Name=(.*);\
             Classification=(.*); \
             Sequence_ontology=(.*); \
             ltr_identity=(.*); \
             Method=structural; \
             motif=(.*);tsd=(.*)$/;\
            print "$genome\t$chr\t$supfam\t$classification\t$from\t$to\t$str\t$id\t$SO\t$motif\t$tsd\t$iden"';
done > NAM.26.intact.LTR.list &

plot figures in R

Rscript NAM_TE_summary.R

plot figures in R

Rscript Suppl. Fig. S4.R
Rscript Suppl. Fig. S5.R
Rscript Suppl. Fig. S6.R