From e396ca4d47fd5877f55bc446c9b4b4d14fbe8749 Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Tue, 26 Dec 2023 23:15:01 +0900 Subject: [PATCH] Carry extended dbtype for complexsearch to work with clustered dbs --- src/strucclustutils/createcomplexreport.h | 1 + src/strucclustutils/expandcomplex.cpp | 4 +- src/strucclustutils/scorecomplex.cpp | 78 +++++++++++++++++------ 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/src/strucclustutils/createcomplexreport.h b/src/strucclustutils/createcomplexreport.h index 5a1939f3..39658228 100644 --- a/src/strucclustutils/createcomplexreport.h +++ b/src/strucclustutils/createcomplexreport.h @@ -1,6 +1,7 @@ #ifndef FOLDSEEK_CREATECOMPLEXREPORT_H #define FOLDSEEK_CREATECOMPLEXREPORT_H #include "Matcher.h" +#include "MemoryMapped.h" const unsigned int NOT_AVAILABLE_CHAIN_KEY = 4294967295; const double MAX_ASSIGNED_CHAIN_RATIO = 1.0; diff --git a/src/strucclustutils/expandcomplex.cpp b/src/strucclustutils/expandcomplex.cpp index 3d0a7545..126a25fa 100644 --- a/src/strucclustutils/expandcomplex.cpp +++ b/src/strucclustutils/expandcomplex.cpp @@ -32,7 +32,9 @@ int expandcomplex(int argc, const char **argv, const Command &command) { std::string dbLookupFile = par.db2 + ".lookup"; DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); alnDbr.open(DBReader::LINEAR_ACCCESS); - DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), static_cast(par.threads), par.compressed, Parameters::DBTYPE_PREFILTER_RES); + int dbType = Parameters::DBTYPE_PREFILTER_RES; + dbType = DBReader::setExtendedDbtype(dbType, Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC); + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), static_cast(par.threads), par.compressed, dbType); resultWriter.open(); std::vector qComplexIndices; std::vector dbComplexIndices; diff --git a/src/strucclustutils/scorecomplex.cpp b/src/strucclustutils/scorecomplex.cpp index 47921e33..88ac9af3 100644 --- a/src/strucclustutils/scorecomplex.cpp +++ b/src/strucclustutils/scorecomplex.cpp @@ -5,11 +5,9 @@ #include "Util.h" #include "LocalParameters.h" #include "Matcher.h" -#include "structureto3diseqdist.h" #include "StructureUtil.h" #include "TMaligner.h" #include "Coordinate16.h" -#include "MemoryMapped.h" #include "createcomplexreport.h" #ifdef OPENMP @@ -606,28 +604,65 @@ class ComplexScorer { int scorecomplex(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_ALIGN); + + DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + alnDbr.open(DBReader::LINEAR_ACCCESS); + uint16_t extended = DBReader::getExtendedDbtype(alnDbr.getDbtype()); + int dbType = Parameters::DBTYPE_ALIGNMENT_RES; + bool needSrc = false; + if (extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC) { + needSrc = true; + dbType = DBReader::setExtendedDbtype(dbType, Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC); + } + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), static_cast(par.threads), par.compressed, dbType); + resultWriter.open(); + const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); - IndexReader q3DiDbr(StructureUtil::getIndexWithSuffix(par.db1, "_ss"), par.threads, IndexReader::SEQUENCES, touch ? IndexReader::PRELOAD_INDEX : 0); - IndexReader *t3DiDbr = NULL; - auto *qCaDbr = new IndexReader(par.db1, par.threads, IndexReader::makeUserDatabaseType(LocalParameters::INDEX_DB_CA_KEY_DB1), touch ? IndexReader::PRELOAD_INDEX : 0, DBReader::USE_INDEX | DBReader::USE_DATA, "_ca" ); - IndexReader *tCaDbr = NULL; + + std::string t3DiDbrName = StructureUtil::getIndexWithSuffix(par.db2, "_ss"); + bool is3DiIdx = Parameters::isEqualDbtype(FileUtil::parseDbType(t3DiDbrName.c_str()), Parameters::DBTYPE_INDEX_DB); + IndexReader t3DiDbr( + is3DiIdx ? t3DiDbrName : par.db2, + par.threads, + needSrc ? IndexReader::SRC_SEQUENCES : IndexReader::SEQUENCES, + touch ? IndexReader::PRELOAD_INDEX : 0, + DBReader::USE_INDEX | DBReader::USE_DATA, + needSrc ? "_seq_ss" : "_ss" + ); + IndexReader tCaDbr( + par.db2, + par.threads, + needSrc + ? IndexReader::makeUserDatabaseType(LocalParameters::INDEX_DB_CA_KEY_DB2) + : IndexReader::makeUserDatabaseType(LocalParameters::INDEX_DB_CA_KEY_DB1), + touch ? IndexReader::PRELOAD_INDEX : 0, + DBReader::USE_INDEX | DBReader::USE_DATA, + needSrc ? "_seq_ca" : "_ca" + ); + IndexReader* q3DiDbr = NULL; + IndexReader* qCaDbr = NULL; bool sameDB = false; if (par.db1 == par.db2) { sameDB = true; - t3DiDbr = &q3DiDbr; - tCaDbr = qCaDbr; + q3DiDbr = &t3DiDbr; + qCaDbr = &tCaDbr; } else { - t3DiDbr = new IndexReader(StructureUtil::getIndexWithSuffix(par.db2, "_ss"), par.threads, IndexReader::SEQUENCES, touch ? IndexReader::PRELOAD_INDEX : 0); - tCaDbr = new IndexReader(par.db2, par.threads, IndexReader::makeUserDatabaseType(LocalParameters::INDEX_DB_CA_KEY_DB1), touch ? IndexReader::PRELOAD_INDEX : 0, DBReader::USE_INDEX | DBReader::USE_DATA, "_ca"); + q3DiDbr = new IndexReader( + StructureUtil::getIndexWithSuffix(par.db1, "_ss"), + par.threads, IndexReader::SEQUENCES, + touch ? IndexReader::PRELOAD_INDEX : 0, + DBReader::USE_INDEX | DBReader::USE_DATA + ); + qCaDbr = new IndexReader( + par.db1, + par.threads, + IndexReader::makeUserDatabaseType(LocalParameters::INDEX_DB_CA_KEY_DB1), + touch ? IndexReader::PRELOAD_INDEX : 0, + DBReader::USE_INDEX | DBReader::USE_DATA, + "_ca" + ); } - std::string qLookupFile = par.db1 + ".lookup"; - std::string dbLookupFile = par.db2 + ".lookup"; - - DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - alnDbr.open(DBReader::LINEAR_ACCCESS); - DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), static_cast(par.threads), par.compressed, Parameters::DBTYPE_ALIGNMENT_RES); - resultWriter.open(); double minAssignedChainsRatio = par.minAssignedChainsThreshold > MAX_ASSIGNED_CHAIN_RATIO ? MAX_ASSIGNED_CHAIN_RATIO: par.minAssignedChainsThreshold; std::vector qComplexIndices; @@ -636,6 +671,8 @@ int scorecomplex(int argc, const char **argv, const Command &command) { chainKeyToComplexId_t dbChainKeyToComplexIdMap; complexIdToChainKeys_t dbComplexIdToChainKeysMap; complexIdToChainKeys_t qComplexIdToChainKeysMap; + std::string qLookupFile = par.db1 + ".lookup"; + std::string dbLookupFile = par.db2 + ".lookup"; getKeyToIdMapIdToKeysMapIdVec(qLookupFile, qChainKeyToComplexIdMap, qComplexIdToChainKeysMap, qComplexIndices); getKeyToIdMapIdToKeysMapIdVec(dbLookupFile, dbChainKeyToComplexIdMap, dbComplexIdToChainKeysMap, dbComplexIndices); qChainKeyToComplexIdMap.clear(); @@ -652,7 +689,7 @@ int scorecomplex(int argc, const char **argv, const Command &command) { std::vector searchResults; std::vector assignments; std::vector resultToWriteLines; - ComplexScorer complexScorer(&q3DiDbr, t3DiDbr, alnDbr, qCaDbr, tCaDbr, thread_idx, minAssignedChainsRatio); + ComplexScorer complexScorer(q3DiDbr, &t3DiDbr, alnDbr, qCaDbr, &tCaDbr, thread_idx, minAssignedChainsRatio); #pragma omp for schedule(dynamic, 1) // for each q complex for (size_t qCompIdx = 0; qCompIdx < qComplexIndices.size(); qCompIdx++) { @@ -698,10 +735,9 @@ int scorecomplex(int argc, const char **argv, const Command &command) { dbComplexIdToChainKeysMap.clear(); qComplexIdToChainKeysMap.clear(); alnDbr.close(); - delete qCaDbr; if (!sameDB) { - delete t3DiDbr; - delete tCaDbr; + delete q3DiDbr; + delete qCaDbr; } resultWriter.close(true); return EXIT_SUCCESS;