diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 34e429811..302919d4c 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -152,6 +152,7 @@ Parameters::Parameters(): PARAM_IGNORE_MULTI_KMER(PARAM_IGNORE_MULTI_KMER_ID, "--ignore-multi-kmer", "Skip repeating k-mers", "Skip k-mers occurring multiple times (>=2)", typeid(bool), (void *) &ignoreMultiKmer, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_HASH_SHIFT(PARAM_HASH_SHIFT_ID, "--hash-shift", "Shift hash", "Shift k-mer hash initialization", typeid(int), (void *) &hashShift, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_PICK_N_SIMILAR(PARAM_PICK_N_SIMILAR_ID, "--pick-n-sim-kmer", "Add N similar to search", "Add N similar k-mers to search", typeid(int), (void *) &pickNbest, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), + PARAM_MATCH_ADJACENT_SEQ(PARAM_MATCH_ADJACENT_SEQ_ID, "--match-adjacent-seq", "Compare adjacent sequences to k-mers", "Compare sequence information adjacent to k-mers and elect multiple representative sequences per cluster", typeid(bool), (void *) &matchAdjacentSeq, "", MMseqsParameter::COMMAND_CLUSTLINEAR), PARAM_ADJUST_KMER_LEN(PARAM_ADJUST_KMER_LEN_ID, "--adjust-kmer-len", "Adjust k-mer length", "Adjust k-mer length based on specificity (only for nucleotides)", typeid(bool), (void *) &adjustKmerLength, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_RESULT_DIRECTION(PARAM_RESULT_DIRECTION_ID, "--result-direction", "Result direction", "result is 0: query, 1: target centric", typeid(int), (void *) &resultDirection, "^[0-1]{1}$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_WEIGHT_FILE(PARAM_WEIGHT_FILE_ID, "--weights", "Weight file name", "Weights used for cluster priorization", typeid(std::string), (void*) &weightFile, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT ), @@ -984,6 +985,7 @@ Parameters::Parameters(): kmermatcher.push_back(&PARAM_V); kmermatcher.push_back(&PARAM_WEIGHT_FILE); kmermatcher.push_back(&PARAM_WEIGHT_THR); + kmermatcher.push_back(&PARAM_MATCH_ADJACENT_SEQ); // kmermatcher kmersearch.push_back(&PARAM_SEED_SUB_MAT); @@ -2513,6 +2515,9 @@ void Parameters::setDefaults() { resultDirection = Parameters::PARAM_RESULT_DIRECTION_TARGET; weightThr = 0.9; weightFile = ""; + // TODO: change to true after fixing regression tests + matchAdjacentSeq = false; + hashSeqBuffer = 1.05; // result2stats stat = ""; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 745d2f809..0fa75e597 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -552,6 +552,8 @@ class Parameters { int resultDirection; float weightThr; std::string weightFile; + bool matchAdjacentSeq; + float hashSeqBuffer; // indexdb int checkCompatible; @@ -865,6 +867,7 @@ class Parameters { PARAMETER(PARAM_IGNORE_MULTI_KMER) PARAMETER(PARAM_HASH_SHIFT) PARAMETER(PARAM_PICK_N_SIMILAR) + PARAMETER(PARAM_MATCH_ADJACENT_SEQ) PARAMETER(PARAM_ADJUST_KMER_LEN) PARAMETER(PARAM_RESULT_DIRECTION) PARAMETER(PARAM_WEIGHT_FILE) diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 3e0740c59..7bb3f076e 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -38,17 +38,17 @@ uint64_t hashUInt64(uint64_t in, uint64_t seed) { return XXH64(&in, sizeof(uint64_t), seed); } -template -KmerPosition *initKmerPositionMemory(size_t size) { - KmerPosition * hashSeqPair = new(std::nothrow) KmerPosition[size + 1]; +template +KmerPosition *initKmerPositionMemory(size_t size) { + KmerPosition * hashSeqPair = new(std::nothrow) KmerPosition[size + 1]; Util::checkAllocation(hashSeqPair, "Can not allocate memory"); - size_t pageSize = Util::getPageSize()/sizeof(KmerPosition); + size_t pageSize = Util::getPageSize()/sizeof(KmerPosition); #pragma omp parallel { #pragma omp for schedule(static) for (size_t page = 0; page < size+1; page += pageSize) { size_t readUntil = std::min(size+1, page + pageSize) - page; - memset(hashSeqPair+page, 0xFF, sizeof(KmerPosition)* readUntil); + memset(hashSeqPair+page, 0xFF, sizeof(KmerPosition)* readUntil); } } return hashSeqPair; @@ -67,7 +67,7 @@ void maskSequence(int maskMode, int maskLowerCase, float maskProb, Sequence &seq maskProb /*options.minMaskProb*/, probMatrix->hardMaskTable); } if(maskLowerCase == 1 && (Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS) || - Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) { + Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) { const char * charSeq = seq.getSeqData(); for (int i = 0; i < seq.L; i++) { seq.numSequence[i] = (islower(charSeq[i])) ? maskLetter : seq.numSequence[i]; @@ -75,10 +75,8 @@ void maskSequence(int maskMode, int maskLowerCase, float maskProb, Sequence &seq } } -template -std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, - size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution){ +template +std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution){ size_t offset = 0; int querySeqType = seqDbr.getDbtype(); size_t longestKmer = par.kmerSize; @@ -98,6 +96,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz #pragma omp parallel { unsigned int thread_idx = 0; + const unsigned char xIndex = subMat->aa2num[static_cast('X')]; #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif @@ -114,7 +113,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz Indexer idxer(subMat->alphabetSize - 1, par.kmerSize); const unsigned int BUFFER_SIZE = 1048576; size_t bufferPos = 0; - KmerPosition * threadKmerBuffer = new KmerPosition[BUFFER_SIZE]; + KmerPosition * threadKmerBuffer = new KmerPosition[BUFFER_SIZE]; SequencePosition * kmers = (SequencePosition *) malloc((par.pickNbest * (par.maxSeqLen + 1) + 1) * sizeof(SequencePosition)); size_t kmersArraySize = par.maxSeqLen; const size_t flushSize = 100000000; @@ -244,6 +243,9 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz threadKmerBuffer[bufferPos].id = seqId; threadKmerBuffer[bufferPos].pos = 0; threadKmerBuffer[bufferPos].seqLen = seq.L; + for (size_t i = 0; i < 6; i++) { + threadKmerBuffer[bufferPos].setAdjacentSeq(i, xIndex); + } if(hashDistribution != NULL){ __sync_fetch_and_add(&hashDistribution[static_cast(seqHash)], 1); } @@ -252,7 +254,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos); if(writeOffset + bufferPos < kmerArraySize){ if(kmerArray!=NULL){ - memcpy(kmerArray + writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); + memcpy(kmerArray + writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); } } else{ Debug(Debug::ERROR) << "Kmer array overflow. currKmerArrayOffset="<< writeOffset @@ -321,6 +323,32 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz threadKmerBuffer[bufferPos].id = seqId; threadKmerBuffer[bufferPos].pos = (kmers + kmerIdx)->pos; threadKmerBuffer[bufferPos].seqLen = seq.L; + // store adjacent seq information + unsigned int startPos = (kmers + kmerIdx)->pos; + unsigned int endPos = (kmers + kmerIdx)->pos + adjustedKmerSize - 1; + for (size_t i = 0; i < 6; i++) { + threadKmerBuffer[bufferPos].setAdjacentSeq(i, xIndex); + } + if (startPos >= 3) { + threadKmerBuffer[bufferPos].setAdjacentSeq(0, seq.numSequence[startPos - 3]); + threadKmerBuffer[bufferPos].setAdjacentSeq(1, seq.numSequence[startPos - 2]); + threadKmerBuffer[bufferPos].setAdjacentSeq(2, seq.numSequence[startPos - 1]); + }else if (startPos == 2) { + threadKmerBuffer[bufferPos].setAdjacentSeq(1, seq.numSequence[startPos - 2]); + threadKmerBuffer[bufferPos].setAdjacentSeq(2, seq.numSequence[startPos - 1]); + }else if (startPos == 1) { + threadKmerBuffer[bufferPos].setAdjacentSeq(2, seq.numSequence[startPos - 1]); + } + if (endPos + 3 <= static_cast(seq.L) - 1) { + threadKmerBuffer[bufferPos].setAdjacentSeq(3, seq.numSequence[endPos + 1]); + threadKmerBuffer[bufferPos].setAdjacentSeq(4, seq.numSequence[endPos + 2]); + threadKmerBuffer[bufferPos].setAdjacentSeq(5, seq.numSequence[endPos + 3]); + }else if (endPos + 2 == static_cast(seq.L) - 1) { + threadKmerBuffer[bufferPos].setAdjacentSeq(3, seq.numSequence[endPos + 1]); + threadKmerBuffer[bufferPos].setAdjacentSeq(4, seq.numSequence[endPos + 2]); + }else if (endPos + 1 == static_cast(seq.L) - 1) { + threadKmerBuffer[bufferPos].setAdjacentSeq(3, seq.numSequence[endPos + 1]); + } bufferPos++; if(hashDistribution != NULL){ __sync_fetch_and_add(&hashDistribution[(kmers + kmerIdx)->score], 1); @@ -331,7 +359,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz if(writeOffset + bufferPos < kmerArraySize){ if(kmerArray!=NULL) { memcpy(kmerArray + writeOffset, threadKmerBuffer, - sizeof(KmerPosition) * bufferPos); + sizeof(KmerPosition) * bufferPos); } } else{ Debug(Debug::ERROR) << "Kmer array overflow. currKmerArrayOffset="<< writeOffset @@ -362,7 +390,7 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz if(bufferPos > 0){ size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos); if(kmerArray != NULL){ - memcpy(kmerArray+writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); + memcpy(kmerArray+writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); } } free(kmers); @@ -386,8 +414,8 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz } -template -void swapCenterSequence(KmerPosition *hashSeqPair, size_t splitKmerCount, SequenceWeights &seqWeights) { +template +void swapCenterSequence(KmerPosition *hashSeqPair, size_t splitKmerCount, SequenceWeights &seqWeights) { size_t prevHash = hashSeqPair[0].kmer; @@ -433,24 +461,28 @@ void swapCenterSequence(KmerPosition *hashSeqPair, size_t splitKmerCount, Seq } } -template void swapCenterSequence<0, short>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); -template void swapCenterSequence<0, int>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); -template void swapCenterSequence<1, short>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); -template void swapCenterSequence<1, int>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<0, short, false>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<0, int, false>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<1, short, false>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<1, int, false>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<0, short, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<0, int, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<1, short, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); +template void swapCenterSequence<1, int, true>(KmerPosition *kmers, size_t splitKmerCount, SequenceWeights &seqWeights); -template -KmerPosition * doComputation(size_t totalKmers, size_t hashStartRange, size_t hashEndRange, std::string splitFile, - DBReader & seqDbr, Parameters & par, BaseMatrix * subMat) { +template +KmerPosition * doComputation(size_t &totalKmers, size_t hashStartRange, size_t hashEndRange, std::string splitFile, + DBReader & seqDbr, Parameters & par, BaseMatrix * subMat) { - KmerPosition * hashSeqPair = initKmerPositionMemory(totalKmers); + KmerPosition * hashSeqPair = initKmerPositionMemory(totalKmers); size_t elementsToSort; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); + std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); elementsToSort = ret.first; par.kmerSize = ret.second; Debug(Debug::INFO) << "\nAdjusted k-mer length " << par.kmerSize << "\n"; }else{ - std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); + std::pair ret = fillKmerPositionArray(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL); elementsToSort = ret.first; } if(hashEndRange == SIZE_T_MAX){ @@ -460,9 +492,9 @@ KmerPosition * doComputation(size_t totalKmers, size_t hashStartRange, size_t Debug(Debug::INFO) << "Sort kmer "; Timer timer; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { - SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPosReverse); + SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPosReverse); }else{ - SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPos); + SORT_PARALLEL(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPos); } Debug(Debug::INFO) << timer.lap() << "\n"; @@ -472,9 +504,9 @@ KmerPosition * doComputation(size_t totalKmers, size_t hashStartRange, size_t sequenceWeights = new SequenceWeights(par.weightFile.c_str()); if (sequenceWeights != NULL) { if (Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { - swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); + swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); } else { - swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); + swapCenterSequence(hashSeqPair, totalKmers, *sequenceWeights); } } } @@ -483,20 +515,25 @@ KmerPosition * doComputation(size_t totalKmers, size_t hashStartRange, size_t // The longest sequence is the first since we sorted by kmer, seq.Len and id size_t writePos; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr); + writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer); }else{ - writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr); + writePos = assignGroup(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr, sequenceWeights, par.weightThr, subMat, par.hashSeqBuffer); } delete sequenceWeights; + if (writePos == SIZE_T_MAX) { + delete [] hashSeqPair; + totalKmers = 0; + return NULL; + } // sort by rep. sequence (stored in kmer) and sequence id Debug(Debug::INFO) << "Sort by rep. sequence "; timer.reset(); if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiagReverse); + SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiagReverse); }else{ - SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiag); + SORT_PARALLEL(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiag); } //kx::radix_sort(hashSeqPair, hashSeqPair + elementsToSort, SequenceComparision()); // for(size_t i = 0; i < writePos; i++){ @@ -517,10 +554,10 @@ KmerPosition * doComputation(size_t totalKmers, size_t hashStartRange, size_t } - template -size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, - SequenceWeights * sequenceWeights, float weightThr) { +size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, + SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *, float &) { + size_t writePos=0; size_t prevHash = hashSeqPair[0].kmer; size_t repSeqId = hashSeqPair[0].id; @@ -594,8 +631,6 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool inc diagonal = queryPos - targetPos; rId = (queryNeedsToBeRev) ? BIT_CLEAR(rId, 63) : BIT_SET(rId, 63); } -// std::cout << diagonal << "\t" << repSeq_i_pos << "\t" << hashSeqPair[i].pos << std::endl; - bool canBeExtended = diagonal < 0 || (diagonal > (queryLen - hashSeqPair[i].seqLen)); bool canBecovered = Util::canBeCovered(covThr, covMode, @@ -609,7 +644,6 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool inc writePos++; } } -// hashSeqPair[i].kmer = SIZE_T_MAX; hashSeqPair[i].kmer = (i != writePos - 1) ? SIZE_T_MAX : hashSeqPair[i].kmer; } prevSetSize = 0; @@ -639,10 +673,187 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool inc return writePos; } -template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr); -template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr); -template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr); -template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr); +template +size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, + SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer) { + + // change splitKmerCount to exclude additional memory + splitKmerCount = static_cast(splitKmerCount / hashSeqBuffer); + // declare variables + size_t writePos = 0; + size_t writeTmp = 0; + size_t prevHash = hashSeqPair[0].kmer; + size_t repSeqId = hashSeqPair[0].id; + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + bool isReverse = (BIT_CHECK(hashSeqPair[0].kmer, 63) == false); + repSeqId = (isReverse) ? BIT_CLEAR(repSeqId, 63) : BIT_SET(repSeqId, 63); + prevHash = BIT_SET(prevHash, 63); + } + size_t prevHashStart = 0; + size_t prevSetSize = 0; + size_t skipByWeightCount = 0; + bool repIsReverse; + T queryLen; + T repSeq_i_pos; + unsigned char repAdjacent[6]; + // modulate # of rep seq selected in same kmer groups + size_t repSeqNum = 20; + + for (size_t elementIdx = 0; elementIdx < splitKmerCount+1; elementIdx++) { + size_t currKmer = hashSeqPair[elementIdx].kmer; + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + currKmer = BIT_SET(currKmer, 63); + } + if (prevHash != currKmer) { + size_t repIdx = prevHashStart; + for (size_t k = 0; k < repSeqNum; k++) { + // initialize variables + repSeqId = hashSeqPair[repIdx].id; + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + repIsReverse = (BIT_CHECK(hashSeqPair[repIdx].kmer, 63) == 0); + repSeqId = (repIsReverse) ? repSeqId : BIT_SET(repSeqId, 63); + } + queryLen = hashSeqPair[repIdx].seqLen; + repSeq_i_pos = hashSeqPair[repIdx].pos; + for (size_t i = 0; i < 6; i++) { + repAdjacent[i] = hashSeqPair[repIdx].getAdjacentSeq(i); + } + int matchIdx = 0; + for (size_t n = 0; n < 6; n++) { + matchIdx += subMat->subMatrix[repAdjacent[n]][repAdjacent[n]]; + } + int matchThreshold = matchIdx; + + for (size_t i = prevHashStart; i < elementIdx; i++) { + // skip target sequences if weight > weightThr + if (i > prevHashStart && sequenceWeights != NULL + && sequenceWeights->getWeightById(hashSeqPair[i].id) > weightThr) + continue; + size_t rId = (prevSetSize-skipByWeightCount == 1) ? SIZE_T_MAX : repSeqId; + // remove singletones from set + if (rId != SIZE_T_MAX) { + int diagonal = repSeq_i_pos - hashSeqPair[i].pos; + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + // 00 No problem here both are forward + // 01 We can revert the query of target, lets invert the query. + // 10 Same here, we can revert query to match the not inverted target + // 11 Both are reverted so no problem! + // So we need just 1 bit of information to encode all four states + bool targetIsReverse = (BIT_CHECK(hashSeqPair[i].kmer, 63) == false); + bool queryNeedsToBeRev = false; + // we now need 2 byte of information (00),(01),(10),(11) + // we need to flip the coordinates of the query + T queryPos = 0; + T targetPos = 0; + // revert kmer in query hits normal kmer in target + // we need revert the query + if (repIsReverse == true && targetIsReverse == false) { + queryPos = repSeq_i_pos; + targetPos = hashSeqPair[i].pos; + queryNeedsToBeRev = true; + // both k-mers were extracted on the reverse strand + // this is equal to both are extract on the forward strand + // we just need to offset the position to the forward strand + }else if (repIsReverse == true && targetIsReverse == true) { + queryPos = (queryLen - 1) - repSeq_i_pos; + targetPos = (hashSeqPair[i].seqLen - 1) - hashSeqPair[i].pos; + queryNeedsToBeRev = false; + // query is not revers but target k-mer is reverse + // instead of reverting the target, we revert the query and offset the the query/target position + }else if (repIsReverse == false && targetIsReverse == true) { + queryPos = (queryLen - 1) - repSeq_i_pos; + targetPos = (hashSeqPair[i].seqLen - 1) - hashSeqPair[i].pos; + queryNeedsToBeRev = true; + // both are forward, everything is good here + }else { + queryPos = repSeq_i_pos; + targetPos = hashSeqPair[i].pos; + queryNeedsToBeRev = false; + } + diagonal = queryPos - targetPos; + rId = (queryNeedsToBeRev) ? BIT_CLEAR(rId, 63) : BIT_SET(rId, 63); + } + + bool canBeExtended = diagonal < 0 || (diagonal > (queryLen - hashSeqPair[i].seqLen)); + bool canBecovered = Util::canBeCovered(covThr, covMode, + static_cast(queryLen), + static_cast(hashSeqPair[i].seqLen)); + int matchCount = 0; + for (size_t n = 0; n < 6; n++) { + matchCount += subMat->subMatrix[repAdjacent[n]][hashSeqPair[i].getAdjacentSeq(n)]; + } + if ((matchCount <= matchIdx) && (repIdx < i)) { + matchIdx = matchCount; + repIdx = i; + } + // store new connection information in hashSeqPair + if ((includeOnlyExtendable == false && canBecovered) || (canBeExtended && includeOnlyExtendable == true)) { + // if possible, store information in an in-place manner + if (writePos < prevHashStart) { + hashSeqPair[writePos].kmer = rId; + hashSeqPair[writePos].pos = diagonal; + hashSeqPair[writePos].seqLen = hashSeqPair[i].seqLen; + hashSeqPair[writePos].id = hashSeqPair[i].id; + writePos++; + // otherwise, store information sequentially starting from the splitKmerCount + }else if (splitKmerCount + writeTmp < hashSeqBuffer * splitKmerCount) { + hashSeqPair[splitKmerCount + writeTmp].kmer = rId; + hashSeqPair[splitKmerCount + writeTmp].pos = diagonal; + hashSeqPair[splitKmerCount + writeTmp].seqLen = hashSeqPair[i].seqLen; + hashSeqPair[splitKmerCount + writeTmp].id = hashSeqPair[i].id; + writeTmp++; + // if both are impossible, increase hashSeqPair's memory and split again + }else { + // calculate elaborate buffer size based on the progress rate + hashSeqBuffer = 1 + (static_cast(splitKmerCount) / static_cast(writePos)) * (hashSeqBuffer - 1) * 1.2; + Debug(Debug::INFO) << "\n" << "Buffer size is unsufficient, splitting again" << "\n\n"; + return SIZE_T_MAX; + } + } + } + } + // terminate iteration of the search within the same kmer group + if (matchIdx == matchThreshold) { + break; + } + } + // re-initialize variables + prevHashStart = elementIdx; + prevSetSize = 0; + skipByWeightCount = 0; + } + if (hashSeqPair[elementIdx].kmer == SIZE_T_MAX) { + break; + } + prevSetSize++; + if (prevSetSize > 1 && sequenceWeights != NULL + && sequenceWeights->getWeightById(hashSeqPair[elementIdx].id) > weightThr) + skipByWeightCount++; + prevHash = hashSeqPair[elementIdx].kmer; + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + prevHash = BIT_SET(prevHash, 63); + } + } + // re-order hashSeqPair + for (size_t i = 0; i < writeTmp; i++) { + hashSeqPair[writePos + i] = hashSeqPair[splitKmerCount + i]; + } + writePos = writePos + writeTmp; + // mark the end index of the valid information as SIZE_T_MAX + hashSeqPair[writePos].kmer = SIZE_T_MAX; + + return writePos; +} + +template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); + +template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, SequenceWeights *sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); void setLinearFilterDefault(Parameters *p) { @@ -666,13 +877,13 @@ size_t computeKmerCount(DBReader &reader, size_t KMER_SIZE, size_t return totalKmers; } -template +template size_t computeMemoryNeededLinearfilter(size_t totalKmer) { - return sizeof(KmerPosition) * totalKmer; + return sizeof(KmerPosition) * totalKmer; } -template +template int kmermatcherInner(Parameters& par, DBReader& seqDbr) { int querySeqType = seqDbr.getDbtype(); @@ -696,19 +907,19 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { Debug(Debug::INFO) << "\n"; float kmersPerSequenceScale = (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) ? par.kmersPerSequenceScale.values.nucleotide() : par.kmersPerSequenceScale.values.aminoacid(); - size_t totalKmers = computeKmerCount(seqDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale); - size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); + size_t totalKmers = static_cast(computeKmerCount(seqDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale) * par.hashSeqBuffer); + size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); // compute splits size_t splits = static_cast(std::ceil(static_cast(totalSizeNeeded) / memoryLimit)); size_t totalKmersPerSplit = std::max(static_cast(1024+1), - static_cast(std::min(totalSizeNeeded, memoryLimit)/sizeof(KmerPosition))+1); + static_cast(std::min(totalSizeNeeded, memoryLimit)/sizeof(KmerPosition))+1); - std::vector> hashRanges = setupKmerSplits(par, subMat, seqDbr, totalKmersPerSplit, splits); + std::vector> hashRanges = setupKmerSplits(par, subMat, seqDbr, totalKmersPerSplit, splits); if(splits > 1){ Debug(Debug::INFO) << "Process file into " << hashRanges.size() << " parts\n"; } std::vector splitFiles; - KmerPosition *hashSeqPair = NULL; + KmerPosition *hashSeqPair = NULL; size_t mpiRank = 0; #ifdef HAVE_MPI @@ -731,7 +942,12 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { for(size_t split = fromSplit; split < fromSplit+splitCount; split++) { std::string splitFileName = par.db2 + "_split_" +SSTR(split); - hashSeqPair = doComputation(totalKmers, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); + hashSeqPair = doComputation(totalKmers, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); + } + // detect insufficient buffer size + if (totalKmers == 0) { + delete subMat; + return EXIT_SUCCESS; } MPI_Barrier(MPI_COMM_WORLD); if(mpiRank == 0){ @@ -747,7 +963,12 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { std::string splitFileNameDone = splitFileName + ".done"; if(FileUtil::fileExists(splitFileNameDone.c_str()) == false){ - hashSeqPair = doComputation(totalKmersPerSplit, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); + hashSeqPair = doComputation(totalKmersPerSplit, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat); + } + // detect insufficient buffer size + if (totalKmersPerSplit == 0) { + delete subMat; + return EXIT_SUCCESS; } splitFiles.push_back(splitFileName); @@ -815,7 +1036,7 @@ int kmermatcherInner(Parameters& par, DBReader& seqDbr) { return EXIT_SUCCESS; } -template +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits){ std::vector> hashRanges; if (splits > 1) { @@ -824,9 +1045,9 @@ std::vector> setupKmerSplits(Parameters &par, BaseMatr size_t * hashDist = new size_t[USHRT_MAX+1]; memset(hashDist, 0 , sizeof(size_t) * (USHRT_MAX+1)); if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - fillKmerPositionArray(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist); + fillKmerPositionArray(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist); }else{ - fillKmerPositionArray(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist); + fillKmerPositionArray(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist); } seqDbr.remapData(); // figure out if machine has enough memory to run this job @@ -837,12 +1058,15 @@ std::vector> setupKmerSplits(Parameters &par, BaseMatr } } if(maxBucketSize > totalKmers){ - Debug(Debug::INFO) << "Not enough memory to run the kmermatcher. Minimum is at least " << maxBucketSize* sizeof(KmerPosition) << " bytes\n"; + Debug(Debug::INFO) << "Not enough memory to run the kmermatcher. Minimum is at least " << maxBucketSize* sizeof(KmerPosition) << " bytes\n"; EXIT(EXIT_FAILURE); } // define splits size_t currBucketSize = 0; size_t currBucketStart = 0; + // reflect increment of buffer size + totalKmers = totalKmers / par.hashSeqBuffer; + for(size_t i = 0; i < (USHRT_MAX+1); i++){ if(currBucketSize+hashDist[i] >= totalKmers){ hashRanges.emplace_back(currBucketStart, i - 1); @@ -859,13 +1083,43 @@ std::vector> setupKmerSplits(Parameters &par, BaseMatr return hashRanges; } -int kmermatcher(int argc, const char **argv, const Command &command) { - MMseqsMPI::init(argc, argv); - - Parameters &par = Parameters::getInstance(); - setLinearFilterDefault(&par); - par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); +// https://github.com/soedinglab/MMseqs2/pull/873#issue-2464876011 +void matchWithAdjacentSeq(Parameters &par,int argc, const char **argv, const Command &command) { + float hashSeqBuffer; + bool firstIt = true; + do { + // FIXME: currently have to reopen DB every iteration due to DBReader::unmapData() + DBReader seqDbr( + par.db1.c_str(), + par.db1Index.c_str(), + par.threads, + DBReader::USE_INDEX | DBReader::USE_DATA + ); + seqDbr.open(DBReader::NOSORT); + int querySeqType = seqDbr.getDbtype(); + + // print is executed only once + if (firstIt) { + firstIt = false; + setKmerLengthAndAlphabet(par, seqDbr.getAminoAcidDBSize(), querySeqType); + std::vector *params = command.params; + par.printParameters(command.cmd, argc, argv, *params); + Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; + } + + // if the buffer size is insufficient, par.hashSeqBuffer is changed and repeat split again + hashSeqBuffer = par.hashSeqBuffer; + if (seqDbr.getMaxSeqLen() < SHRT_MAX) { + kmermatcherInner(par, seqDbr); + } + else { + kmermatcherInner(par, seqDbr); + } + seqDbr.close(); + } while (hashSeqBuffer != par.hashSeqBuffer); +} +void matchWithoutAdjacentSeq(Parameters &par,int argc, const char **argv, const Command &command) { DBReader seqDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); seqDbr.open(DBReader::NOSORT); @@ -877,20 +1131,38 @@ int kmermatcher(int argc, const char **argv, const Command &command) { Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; if (seqDbr.getMaxSeqLen() < SHRT_MAX) { - kmermatcherInner(par, seqDbr); + kmermatcherInner(par, seqDbr); } else { - kmermatcherInner(par, seqDbr); + kmermatcherInner(par, seqDbr); } seqDbr.close(); +} + +int kmermatcher(int argc, const char **argv, const Command &command) { + MMseqsMPI::init(argc, argv); + + Parameters &par = Parameters::getInstance(); + setLinearFilterDefault(&par); + par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); + + bool matchAdjacentSeq = par.matchAdjacentSeq; + + if (matchAdjacentSeq) { + matchWithAdjacentSeq(par, argc, argv, command); + } else { + // overwrite value (no need for buffer) + par.hashSeqBuffer = 1.0; + matchWithoutAdjacentSeq(par, argc, argv, command); + } return EXIT_SUCCESS; } -template +template void writeKmerMatcherResult(DBWriter & dbw, - KmerPosition *hashSeqPair, size_t totalKmers, + KmerPosition *hashSeqPair, size_t totalKmers, std::vector &repSequence, size_t threads) { std::vector threadOffsets; size_t splitSize = totalKmers/threads; @@ -1186,8 +1458,8 @@ void mergeKmerFilesAndOutput(DBWriter & dbw, } -template -void writeKmersToDisk(std::string tmpFile, KmerPosition *hashSeqPair, size_t totalKmers) { +template +void writeKmersToDisk(std::string tmpFile, KmerPosition *hashSeqPair, size_t totalKmers) { size_t repSeqId = SIZE_T_MAX; size_t lastTargetId = SIZE_T_MAX; seqLenType lastDiagonal=0; @@ -1309,26 +1581,44 @@ void setKmerLengthAndAlphabet(Parameters ¶meters, size_t aaDbSize, int seqTy } } -template std::pair fillKmerPositionArray<0, short>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<1, short>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<2, short>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<0, int>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<1, int>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); -template std::pair fillKmerPositionArray<2, int>(KmerPosition< int> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); - -template KmerPosition *initKmerPositionMemory(size_t size); -template KmerPosition *initKmerPositionMemory(size_t size); - -template size_t computeMemoryNeededLinearfilter(size_t totalKmer); -template size_t computeMemoryNeededLinearfilter(size_t totalKmer); - -template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); -template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); +template std::pair fillKmerPositionArray<0, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<1, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<2, short, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<0, int, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<1, int, false>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<2, int, false>(KmerPosition< int, false> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<0, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<1, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<2, short, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<0, int, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<1, int, true>(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); +template std::pair fillKmerPositionArray<2, int, true>(KmerPosition< int, true> * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); + +template KmerPosition *initKmerPositionMemory(size_t size); +template KmerPosition *initKmerPositionMemory(size_t size); +template KmerPosition *initKmerPositionMemory(size_t size); +template KmerPosition *initKmerPositionMemory(size_t size); + +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); + +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); #undef SIZE_T_MAX diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index 43a7413a2..1b7d158b4 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -4,6 +4,7 @@ #include "DBWriter.h" #include "Parameters.h" #include "BaseMatrix.h" +#include "SequenceWeights.h" #include @@ -46,14 +47,36 @@ struct SequencePosition{ } }; -template -struct __attribute__((__packed__))KmerPosition { +template +struct AdjacentSeqArray { + void setAdjacentSeq(int index, const unsigned char val) { + _adjacentSeq[index] = val; + } + unsigned char getAdjacentSeq(int index) { + return _adjacentSeq[index]; + } + + unsigned char _adjacentSeq[6]; +}; + +// save memory when adjacent sequence is unused +template <> +struct AdjacentSeqArray { + void setAdjacentSeq(const int, const unsigned char) {}; + unsigned char getAdjacentSeq(int) { + Debug(Debug::ERROR) << "Invalid read attempt at adjacent sequence array"; + return '\0'; + } +}; + +template +struct __attribute__((__packed__))KmerPosition : public AdjacentSeqArray { size_t kmer; unsigned int id; T seqLen; T pos; - static bool compareRepSequenceAndIdAndPos(const KmerPosition &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndPos(const KmerPosition &first, const KmerPosition &second){ if(first.kmer < second.kmer ) return true; if(second.kmer < first.kmer ) @@ -73,7 +96,7 @@ struct __attribute__((__packed__))KmerPosition { return false; } - static bool compareRepSequenceAndIdAndPosReverse(const KmerPosition &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndPosReverse(const KmerPosition &first, const KmerPosition &second){ size_t firstKmer = BIT_SET(first.kmer, 63); size_t secondKmer = BIT_SET(second.kmer, 63); if(firstKmer < secondKmer ) @@ -95,7 +118,7 @@ struct __attribute__((__packed__))KmerPosition { return false; } - static bool compareRepSequenceAndIdAndDiagReverse(const KmerPosition &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndDiagReverse(const KmerPosition &first, const KmerPosition &second){ size_t firstKmer = BIT_SET(first.kmer, 63); size_t secondKmer = BIT_SET(second.kmer, 63); if(firstKmer < secondKmer) @@ -113,7 +136,7 @@ struct __attribute__((__packed__))KmerPosition { return false; } - static bool compareRepSequenceAndIdAndDiag(const KmerPosition &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndDiag(const KmerPosition &first, const KmerPosition &second){ if(first.kmer < second.kmer) return true; if(second.kmer < first.kmer) @@ -193,7 +216,11 @@ class CompareResultBySeqId { template -size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); +size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, + SequenceWeights * sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); +template +size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr, + SequenceWeights * sequenceWeights, float weightThr, BaseMatrix *subMat, float &hashSeqBuffer); template void mergeKmerFilesAndOutput(DBWriter & dbw, std::vector tmpFiles, std::vector &repSequence); @@ -205,23 +232,23 @@ size_t queueNextEntry(KmerPositionQueue &queue, int file, size_t offsetPos, T *e void setKmerLengthAndAlphabet(Parameters ¶meters, size_t aaDbSize, int seqType); -template -void writeKmersToDisk(std::string tmpFile, KmerPosition *kmers, size_t totalKmers); +template +void writeKmersToDisk(std::string tmpFile, KmerPosition *kmers, size_t totalKmers); -template -void writeKmerMatcherResult(DBWriter & dbw, KmerPosition *hashSeqPair, size_t totalKmers, +template +void writeKmerMatcherResult(DBWriter & dbw, KmerPosition *hashSeqPair, size_t totalKmers, std::vector &repSequence, size_t threads); -template -KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std::string splitFile, +template +KmerPosition * doComputation(size_t &totalKmers, size_t split, size_t splits, std::string splitFile, DBReader & seqDbr, Parameters & par, BaseMatrix * subMat, size_t KMER_SIZE, size_t chooseTopKmer, float chooseTopKmerScale = 0.0); -template -KmerPosition *initKmerPositionMemory(size_t size); +template +KmerPosition *initKmerPositionMemory(size_t size); -template -std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, +template +std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution); @@ -229,10 +256,10 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, si void maskSequence(int maskMode, int maskLowerCase, Sequence &seq, int maskLetter, ProbabilityMatrix * probMatrix); -template +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); -template +template std::vector> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader &seqDbr, size_t totalKmers, size_t splits); size_t computeKmerCount(DBReader &reader, size_t KMER_SIZE, size_t chooseTopKmer,