Skip to content

Commit

Permalink
Merge commit 'd6e24d8d917df06cf5932f01f05b8d12778b7a11'
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Aug 26, 2023
2 parents 0081807 + d6e24d8 commit 45d68ee
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 7 deletions.
6 changes: 3 additions & 3 deletions lib/mmseqs/data/workflow/databases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ case "${SELECTION}" in
;;
"GTDB")
if notExists "${TMP_PATH}/download.done"; then
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/VERSION" "${TMP_PATH}/version"
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/VERSION.txt" "${TMP_PATH}/version"
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz" "${TMP_PATH}/gtdb.tar.gz"
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_taxonomy.tsv" "${TMP_PATH}/bac120_taxonomy.tsv"
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_taxonomy.tsv" "${TMP_PATH}/ar53_taxonomy.tsv"
Expand Down Expand Up @@ -371,9 +371,9 @@ case "${INPUT_TYPE}" in
;;
"GTDB")
# shellcheck disable=SC2086
"${MMSEQS}" tar2db "${TMP_PATH}/gtdb.tar.gz" "${TMP_PATH}/tardb" --tar-include 'faa$' ${THREADS_PAR} \
"${MMSEQS}" tar2db "${TMP_PATH}/gtdb.tar.gz" "${TMP_PATH}/tardb" --tar-include 'faa.gz$' ${THREADS_PAR} \
|| fail "tar2db died"
sed 's|_protein\.faa||g' "${TMP_PATH}/tardb.lookup" > "${TMP_PATH}/tardb.lookup.tmp"
sed 's|_protein\.faa\.gz||g' "${TMP_PATH}/tardb.lookup" > "${TMP_PATH}/tardb.lookup.tmp"
mv -f -- "${TMP_PATH}/tardb.lookup.tmp" "${TMP_PATH}/tardb.lookup"
# shellcheck disable=SC2086
"${MMSEQS}" createdb "${TMP_PATH}/tardb" "${OUTDB}" ${COMP_PAR} \
Expand Down
1 change: 1 addition & 0 deletions lib/mmseqs/src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ extern int profile2repseq(int argc, const char **argv, const Command& command);
extern int proteinaln2nucl(int argc, const char **argv, const Command& command);
extern int rescorediagonal(int argc, const char **argv, const Command& command);
extern int ungappedprefilter(int argc, const char **argv, const Command& command);
extern int gappedprefilter(int argc, const char **argv, const Command& command);
extern int unpackdb(int argc, const char **argv, const Command& command);
extern int rbh(int argc, const char **argv, const Command& command);
extern int result2flat(int argc, const char **argv, const Command& command);
Expand Down
8 changes: 8 additions & 0 deletions lib/mmseqs/src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,14 @@ std::vector<Command> baseCommands = {
CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
{"gappedprefilter", gappedprefilter, &par.gappedprefilter, COMMAND_PREFILTER,
"Optimal Smith-Waterman-based prefiltering (slow)",
NULL,
"Martin Steinegger <martin.steinegger@snu.ac.kr>",
"<i:queryDB> <i:targetDB> <o:prefilterDB>",
CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
{"kmermatcher", kmermatcher, &par.kmermatcher, COMMAND_PREFILTER,
"Find bottom-m-hashed k-mer matches within sequence DB",
NULL,
Expand Down
17 changes: 17 additions & 0 deletions lib/mmseqs/src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,23 @@ Parameters::Parameters():
ungappedprefilter.push_back(&PARAM_COMPRESSED);
ungappedprefilter.push_back(&PARAM_V);
// gappedprefilter
gappedprefilter.push_back(&PARAM_SUB_MAT);
gappedprefilter.push_back(&PARAM_GAP_OPEN);
gappedprefilter.push_back(&PARAM_GAP_EXTEND);
gappedprefilter.push_back(&PARAM_E);
gappedprefilter.push_back(&PARAM_C);
gappedprefilter.push_back(&PARAM_COV_MODE);
gappedprefilter.push_back(&PARAM_NO_COMP_BIAS_CORR);
gappedprefilter.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
gappedprefilter.push_back(&PARAM_MIN_DIAG_SCORE);
gappedprefilter.push_back(&PARAM_MAX_SEQS);
gappedprefilter.push_back(&PARAM_TAXON_LIST);
gappedprefilter.push_back(&PARAM_PRELOAD_MODE);
gappedprefilter.push_back(&PARAM_THREADS);
gappedprefilter.push_back(&PARAM_COMPRESSED);
gappedprefilter.push_back(&PARAM_V);
// clustering
clust.push_back(&PARAM_CLUSTER_MODE);
clust.push_back(&PARAM_MAXITERATIONS);
Expand Down
1 change: 1 addition & 0 deletions lib/mmseqs/src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,7 @@ class Parameters {
PARAMETER(PARAM_LOCAL_TMP)
std::vector<MMseqsParameter*> prefilter;
std::vector<MMseqsParameter*> ungappedprefilter;
std::vector<MMseqsParameter*> gappedprefilter;

// alignment
PARAMETER(PARAM_ALIGNMENT_MODE)
Expand Down
41 changes: 38 additions & 3 deletions lib/mmseqs/src/prefiltering/ungappedprefilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
#include <omp.h>
#endif


int ungappedprefilter(int argc, const char **argv, const Command &command) {
int prefilterInternal(int argc, const char **argv, const Command &command, int mode) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);
DBWriter resultWriter(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_PREFILTER_RES);
Expand Down Expand Up @@ -141,7 +140,37 @@ int ungappedprefilter(int argc, const char **argv, const Command &command) {
continue;
}

int score = aligner.ungapped_alignment(tSeq.numSequence, tSeq.L);
int score;
if (mode == 0) {
score = aligner.ungapped_alignment(tSeq.numSequence, tSeq.L);
} else {
std::string backtrace;
s_align res;
if (isIdentity) {
res = aligner.scoreIdentical(
tSeq.numSequence, tSeq.L, evaluer, Matcher::SCORE_ONLY, backtrace
);
} else {
res = aligner.ssw_align(
tSeq.numSequence,
tSeq.numConsensusSequence,
tSeq.getAlignmentProfile(),
tSeq.L,
backtrace,
par.gapOpen.values.aminoacid(),
par.gapExtend.values.aminoacid(),
Matcher::SCORE_ONLY,
par.evalThr,
evaluer,
par.covMode,
par.covThr,
par.correlationScoreWeight,
qSeq.L / 2,
tId
);
}
score = res.score1;
}
bool hasDiagScore = (score > par.minDiagScoreThr);
double evalue = 0.0;
// check if evalThr != inf
Expand Down Expand Up @@ -199,4 +228,10 @@ int ungappedprefilter(int argc, const char **argv, const Command &command) {
return 0;
}

int ungappedprefilter(int argc, const char **argv, const Command &command) {
return prefilterInternal(argc, argv, command, 0);
}

int gappedprefilter(int argc, const char **argv, const Command &command) {
return prefilterInternal(argc, argv, command, 1);
}
2 changes: 1 addition & 1 deletion lib/mmseqs/src/util/createdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ int createdb(int argc, const char **argv, const Command& command) {
const size_t testForNucSequence = 100;
size_t isNuclCnt = 0;
Debug::Progress progress;
std::vector<unsigned short>* sourceLookup = new std::vector<unsigned short>[shuffleSplits]();
std::vector<unsigned int>* sourceLookup = new std::vector<unsigned int>[shuffleSplits]();
for (size_t i = 0; i < shuffleSplits; ++i) {
sourceLookup[i].reserve(16384);
}
Expand Down

0 comments on commit 45d68ee

Please sign in to comment.