diff --git a/CHANGELOG.md b/CHANGELOG.md index 7db7d40..b0c9159 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -472,6 +472,7 @@ ________________________________________________________________
Daily Change Log: +* [2024.8.30] - Added ${N_JOBS} to download scripts with default set to maximum threads available * [2024.8.29] - Added `VERSION` file created in `download_databases.sh` * [2024.7.11] - Alignment fraction threshold for genome clustering only applied to reference but should also apply to query. Added `--af_mode` with either `relaxed = max([Alignment_fraction_ref, Alignment_fraction_query]) > minimum_af` or `strict = (Alignment_fraction_ref > minimum_af) & (Alignment_fraction_query > minimum_af)` to `edgelist_to_clusters.py`, `global_clustering.py`, `local_clustering.py`, and `cluster.py`. * [2024.7.3] - Added `pigz` to `VEBA-annotate_env` which isn't a problem with most `conda` installations but needed for `docker` containers. diff --git a/README.md b/README.md index e4ca980..0ab5b2e 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ ___________________________________________________________________ ### Announcements -* **Current Stable Version:** [`v2.2.1`](https://github.com/jolespin/veba/releases/tag/v2.2.0) +* **Current Stable Version:** [`v2.2.1`](https://github.com/jolespin/veba/releases/tag/v2.2.1) * **Current Database Version:** [`VDB_v7`](install/DATABASE.md) diff --git a/install/download_databases-annotate.sh b/install/download_databases-annotate.sh index f3e44f4..7e8d5cc 100644 --- a/install/download_databases-annotate.sh +++ b/install/download_databases-annotate.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2024.6.8" +# __version__ = "2024.8.30" # VEBA_DATABASE_VERSION = "VDB_v7" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" # usage: bash veba/download_databases-annotate.sh /path/to/veba_database_destination/ @@ -9,7 +9,8 @@ DATABASE_DIRECTORY=${1:-"."} REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") +N_JOBS=${2:-${MAXIMUM_NUMBER_OF_CPU}} # Database structure 
echo ". .. ... ..... ........ ............." @@ -78,19 +79,19 @@ mkdir -v -p ${DATABASE_DIRECTORY}/Annotate/UniRef wget -v -P ${DATABASE_DIRECTORY}/Annotate/UniRef/ https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref90/uniref90.release_note wget -v -P ${DATABASE_DIRECTORY} https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz -diamond makedb --in ${DATABASE_DIRECTORY}/uniref90.fasta.gz --db ${DATABASE_DIRECTORY}/Annotate/UniRef/uniref90.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/uniref90.fasta.gz --db ${DATABASE_DIRECTORY}/Annotate/UniRef/uniref90.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/uniref90.fasta.gz wget -v -P ${DATABASE_DIRECTORY}/Annotate/UniRef/ https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref50/uniref50.release_note wget -v -P ${DATABASE_DIRECTORY} https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz -diamond makedb --in ${DATABASE_DIRECTORY}/uniref50.fasta.gz --db ${DATABASE_DIRECTORY}/Annotate/UniRef/uniref50.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/uniref50.fasta.gz --db ${DATABASE_DIRECTORY}/Annotate/UniRef/uniref50.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/uniref50.fasta.gz #MiBIG mkdir -v -p ${DATABASE_DIRECTORY}/Annotate/MIBiG wget -v -P ${DATABASE_DIRECTORY} https://dl.secondarymetabolites.org/mibig/mibig_prot_seqs_3.1.fasta seqkit rmdup -s ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.fasta > ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta -diamond makedb --in ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta --db ${DATABASE_DIRECTORY}/Annotate/MIBiG/mibig_v3.1.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta --db ${DATABASE_DIRECTORY}/Annotate/MIBiG/mibig_v3.1.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.fasta rm -rf ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta @@ -104,13 +105,13 @@ rm -rf 
${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta mkdir -v -p ${DATABASE_DIRECTORY}/Annotate/VFDB wget -v -P ${DATABASE_DIRECTORY} http://www.mgc.ac.cn/VFs/Down/VFDB_setA_pro.fas.gz wget -v -P ${DATABASE_DIRECTORY}/Annotate/VFDB/ http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz -diamond makedb --in ${DATABASE_DIRECTORY}/VFDB_setA_pro.fas.gz --db ${DATABASE_DIRECTORY}/Annotate/VFDB/VFDB_setA_pro.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/VFDB_setA_pro.fas.gz --db ${DATABASE_DIRECTORY}/Annotate/VFDB/VFDB_setA_pro.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/VFDB_setA_pro.fas.gz # CAZy mkdir -v -p ${DATABASE_DIRECTORY}/Annotate/CAZy wget -v -P ${DATABASE_DIRECTORY} https://bcb.unl.edu/dbCAN2/download/CAZyDB.07262023.fa -diamond makedb --in ${DATABASE_DIRECTORY}/CAZyDB.07262023.fa --db ${DATABASE_DIRECTORY}/Annotate/CAZy/CAZyDB.07262023.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/CAZyDB.07262023.fa --db ${DATABASE_DIRECTORY}/Annotate/CAZy/CAZyDB.07262023.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/CAZyDB.07262023.fa diff --git a/install/download_databases-classify.sh b/install/download_databases-classify.sh index 4689b3c..b234167 100644 --- a/install/download_databases-classify.sh +++ b/install/download_databases-classify.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2024.6.8.1" +# __version__ = "2024.8.30" # VEBA_DATABASE_VERSION = "VDB_v7" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" # usage: bash veba/download_databases-classify.sh /path/to/veba_database_destination/ @@ -9,7 +9,8 @@ DATABASE_DIRECTORY=${1:-"."} REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") +N_JOBS=${2:-${MAXIMUM_NUMBER_OF_CPU}} # Database structure echo ". .. ... ..... ........ ............." 
@@ -81,7 +82,7 @@ wget -v -P ${DATABASE_DIRECTORY} https://portal.nersc.gov/CheckV/checkv-db-${CHE tar xvzf ${DATABASE_DIRECTORY}/checkv-db-${CHECKVDB_VERSION}.tar.gz -C ${DATABASE_DIRECTORY} mv ${DATABASE_DIRECTORY}/checkv-db-${CHECKVDB_VERSION} ${DATABASE_DIRECTORY}/Classify/CheckV echo "${CHECKV_VERSION}" > ${DATABASE_DIRECTORY}/Classify/CheckV/database_version -diamond makedb --in ${DATABASE_DIRECTORY}/Classify/CheckV/genome_db/checkv_reps.faa --db ${DATABASE_DIRECTORY}/Classify/CheckV/genome_db/checkv_reps.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/Classify/CheckV/genome_db/checkv_reps.faa --db ${DATABASE_DIRECTORY}/Classify/CheckV/genome_db/checkv_reps.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/checkv-db-${CHECKVDB_VERSION}.tar.gz # geNomad diff --git a/install/download_databases-contamination.sh b/install/download_databases-contamination.sh index 30c0d6b..2cc5a1e 100644 --- a/install/download_databases-contamination.sh +++ b/install/download_databases-contamination.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2024.6.8" +# __version__ = "2024.8.30" # VEBA_DATABASE_VERSION = "VDB_v7" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" # usage: bash veba/download_databases-contamination.sh /path/to/veba_database_destination/ @@ -9,7 +9,8 @@ DATABASE_DIRECTORY=${1:-"."} REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") +N_JOBS=${2:-${MAXIMUM_NUMBER_OF_CPU}} # Database structure echo ". .. ... ..... ........ ............." 
diff --git a/install/download_databases-markers.sh b/install/download_databases-markers.sh index 185336e..1d715d4 100644 --- a/install/download_databases-markers.sh +++ b/install/download_databases-markers.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2024.6.8" +# __version__ = "2024.8.30" # VEBA_DATABASE_VERSION = "VDB_v8" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" # usage: bash veba/download_databases-preprocess.sh /path/to/veba_database_destination/ @@ -9,7 +9,8 @@ DATABASE_DIRECTORY=${1:-"."} REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") +N_JOBS=${2:-${MAXIMUM_NUMBER_OF_CPU}} # Database structure echo ". .. ... ..... ........ ............." diff --git a/install/download_databases.sh b/install/download_databases.sh index fae2c96..1de8f0d 100644 --- a/install/download_databases.sh +++ b/install/download_databases.sh @@ -1,7 +1,7 @@ #!/bin/bash -# __version__ = "2024.8.29" +# __version__ = "2024.8.30" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" -# usage: bash veba/download_databases.sh /path/to/veba_database_destination/ [optional positional argument: /path/to/conda_environments/] +# usage: bash veba/download_databases.sh /path/to/veba_database_destination/ [optional positional argument: /path/to/conda_environments/ number_of_threads] # Version VEBA_DATABASE_VERSION="VDB_v7" @@ -12,7 +12,12 @@ SCRIPT_DIRECTORY=$(dirname "$0") CONDA_ENVS_PATH=${2:-"$(conda info --base)/envs/"} -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") +N_JOBS=${3:-${MAXIMUM_NUMBER_OF_CPU}} +echo ". .. ... ..... ........ ............." +echo "Detected ${MAXIMUM_NUMBER_OF_CPU} available threads" +echo "Using ${N_JOBS} threads" +echo ". .. ... ..... ........ ............." # Database structure echo ". .. ... ..... ........ ............." 
@@ -33,24 +38,24 @@ echo $VEBA_DATABASE_VERSION > ${DATABASE_DIRECTORY}/VERSION echo ". .. ... ..... ........ ............." echo "Downloading and configuring database (markers)" echo ". .. ... ..... ........ ............." -bash ${SCRIPT_DIRECTORY}/download_databases-markers.sh ${DATABASE_DIRECTORY} | grep -v "\[partial-database\]" +bash ${SCRIPT_DIRECTORY}/download_databases-markers.sh ${DATABASE_DIRECTORY} ${N_JOBS} | grep -v "\[partial-database\]" echo ". .. ... ..... ........ ............." echo "Downloading and configuring database (contamination)" echo ". .. ... ..... ........ ............." -bash ${SCRIPT_DIRECTORY}/download_databases-contamination.sh ${DATABASE_DIRECTORY} | grep -v "\[partial-database\]" +bash ${SCRIPT_DIRECTORY}/download_databases-contamination.sh ${DATABASE_DIRECTORY} ${N_JOBS} | grep -v "\[partial-database\]" echo ". .. ... ..... ........ ............." echo "Downloading and configuring database (classify)" echo ". .. ... ..... ........ ............." echo "This might take a while depending on source database i/o speed..." -bash ${SCRIPT_DIRECTORY}/download_databases-classify.sh ${DATABASE_DIRECTORY} | grep -v "\[partial-database\]" +bash ${SCRIPT_DIRECTORY}/download_databases-classify.sh ${DATABASE_DIRECTORY} ${N_JOBS} | grep -v "\[partial-database\]" echo ". .. ... ..... ........ ............." echo "Downloading and configuring database (annotate)" echo ". .. ... ..... ........ ............." echo "This might take a while depending on source database i/o speed..." -bash ${SCRIPT_DIRECTORY}/download_databases-annotate.sh ${DATABASE_DIRECTORY} | grep -v "\[partial-database\]" +bash ${SCRIPT_DIRECTORY}/download_databases-annotate.sh ${DATABASE_DIRECTORY} ${N_JOBS} | grep -v "\[partial-database\]" # Environment variables echo ". .. ... ..... ........ ............."