Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
03ab50c
don't show progress bar while extracting contigs from the query graph
karasikov May 16, 2025
2ad67fd
compile on MacOS
karasikov May 16, 2025
977199e
keep column positions in small vectors
karasikov May 16, 2025
3ec88ce
always compile htslib
karasikov May 16, 2025
5688029
revert CMakeLists.txt
karasikov May 16, 2025
3b8e67d
revert
karasikov May 16, 2025
7cac361
back to Vector
karasikov May 16, 2025
4b68370
pass verbose without changing the global variable
karasikov May 16, 2025
11d6002
keep rows in small vectors
karasikov May 16, 2025
f5d7fee
use uint32_t for column indexes
karasikov May 16, 2025
c6bd124
minor
karasikov May 17, 2025
30102c0
minor
karasikov May 18, 2025
1b72092
minor
karasikov May 18, 2025
6c4a94d
up
karasikov May 18, 2025
04bbca2
Merge branch 'master' into mk/query
adamant-pwn May 18, 2025
993ed04
minor
karasikov May 18, 2025
ae7f467
minor
karasikov May 18, 2025
acf74f4
report the number of query k-mer matches for each batch
karasikov May 18, 2025
f40c097
batch accumulation in query
karasikov May 19, 2025
f46682a
Merge remote-tracking branch 'origin/master' into mk/query
karasikov May 19, 2025
b35a2de
deduplicate rows in aggregated batches
karasikov May 19, 2025
27b0171
Merge remote-tracking branch 'origin/master' into mk/query
karasikov May 19, 2025
1c16215
fix
karasikov May 19, 2025
2f8048a
fix: aggregate the number of bp in aggregated batches
karasikov May 19, 2025
014b3fe
print size of aggregated batches in logs
karasikov May 19, 2025
44e4719
Merge remote-tracking branch 'origin/master' into mk/query
karasikov May 19, 2025
5166c2d
added flag --batch-min-matches to control the batch aggregation
karasikov May 20, 2025
9a1b2b6
refactoring: decoupled graph and annotation in query.cpp
karasikov May 20, 2025
22a47a3
don't pass the entire anno_graph where unnecessary
karasikov May 20, 2025
2a8baca
fix typo
karasikov May 20, 2025
d717f0c
cleanup/refactoring
karasikov May 20, 2025
6960c5f
refactoring: removed QueryExecutor for simplification
karasikov May 21, 2025
a5cef5a
load annotation in a separate thread
karasikov May 22, 2025
1775c3f
fix
karasikov May 22, 2025
926d2df
fix wrapping
karasikov May 22, 2025
8e8e0ef
minor
karasikov May 22, 2025
df5e05d
last part of batch query (fetch annotations) in a single worker
karasikov May 22, 2025
5e7c969
minor
karasikov May 24, 2025
fdebe57
move lock
karasikov May 24, 2025
17ec5c9
change threading
karasikov May 25, 2025
6944b07
change
karasikov May 25, 2025
696fe57
minor
karasikov May 25, 2025
dff0f7a
cleanup logs
karasikov May 25, 2025
5ab5410
use query sequences without extracting contigs
karasikov May 26, 2025
4a33b93
cleanup
karasikov May 26, 2025
4eb9be1
fix
karasikov May 26, 2025
ebd3c12
optimization
karasikov May 26, 2025
5ca0e71
canonical mode
karasikov May 26, 2025
e5078b9
fix
karasikov May 26, 2025
2af7fa7
minor
karasikov May 26, 2025
4849833
mapping single thread
karasikov May 26, 2025
947f88f
parallel mapping
karasikov May 26, 2025
fc60ad0
log query graph construction and mapping separately
karasikov May 26, 2025
0f4cdb6
build primary batch graph instead of canonical
karasikov May 26, 2025
7b6047d
map with all threads
karasikov May 26, 2025
5d9bb14
revert
karasikov May 26, 2025
39e8a8b
revert
karasikov May 27, 2025
4cee813
fix for short sequences: check length
karasikov May 28, 2025
5e03993
cleanup
karasikov May 28, 2025
273f83a
load graph and annotation in parallel
karasikov May 28, 2025
a92dab7
fix
karasikov May 28, 2025
1225412
Merge branch 'master' into mk/query
karasikov Sep 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,9 @@ double VectorRowBinMat<RowType>::density() const {
return static_cast<double>(num_relations()) / num_columns() / num_rows();
}

template class VectorRowBinMat<SmallVector<uint32_t>>;
template class VectorRowBinMat<Vector<uint64_t>>;
template class VectorRowBinMat<Vector<uint32_t>>;
template class VectorRowBinMat<SmallVector<uint32_t>>;

} // namespace matrix
} // namespace annot
Expand Down
3 changes: 3 additions & 0 deletions metagraph/src/cli/config/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ Config::Config(int argc, char *argv[]) {
num_columns_cached = atoi(get_value(i++));
} else if (!strcmp(argv[i], "--batch-size")) {
query_batch_size = atoll(get_value(i++));
} else if (!strcmp(argv[i], "--batch-min-matches")) {
batch_min_matches = std::stod(get_value(i++));
} else if (!strcmp(argv[i], "-p") || !strcmp(argv[i], "--parallel")) {
set_num_threads(atoi(get_value(i++)));
} else if (!strcmp(argv[i], "--parallel-nodes")) {
Expand Down Expand Up @@ -1325,6 +1327,7 @@ if (advanced) {
fprintf(stderr, "\t-p --parallel [INT] \tuse multiple threads for computation [1]\n");
// fprintf(stderr, "\t --cache-size [INT] \tnumber of uncompressed rows to store in the cache [0]\n");
fprintf(stderr, "\t --batch-size [INT] \tquery batch size in bp (0 to disable batch query) [100'000'000]\n");
// Help line for --batch-min-matches (parsed into Config::batch_min_matches).
fprintf(stderr, "\t --batch-min-matches [FLOAT] \taggregate batches unless they have this ratio of k-mer matches [0.0]\n");
if (advanced) {
fprintf(stderr, "\t --threads-each [INT]\tnumber of parallel batches [1]\n");
fprintf(stderr, "\t --RA-ivbuff-size [INT] \tsize (in bytes) of int_vector_buffer used in random access mode (e.g. by row disk annotator) [16384]\n");
Expand Down
1 change: 1 addition & 0 deletions metagraph/src/cli/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ class Config {
double min_fraction = 0.0;
double max_fraction = 1.0;
double cleaning_threshold_percentile = 0.001;
double batch_min_matches = 0.0;
std::vector<double> count_slice_quantiles;
std::vector<double> count_quantiles;

Expand Down
51 changes: 41 additions & 10 deletions metagraph/src/cli/load/load_annotated_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,39 @@ using namespace mtg::graph;
using mtg::common::logger;


std::unique_ptr<AnnotatedDBG> initialize_annotated_dbg(std::shared_ptr<DeBruijnGraph> graph,
const Config &config,
size_t max_chunks_open) {
std::unique_ptr<annot::MultiLabelAnnotation<std::string>>
load_annotation(std::shared_ptr<DeBruijnGraph> graph,
const Config &config,
size_t max_chunks_open) {
std::shared_future<std::shared_ptr<DeBruijnGraph>> graph_future = std::async(std::launch::async, [graph]{ return graph; });
return load_annotation(graph_future, config, max_chunks_open);
}

std::unique_ptr<annot::MultiLabelAnnotation<std::string>>
load_annotation(std::shared_future<std::shared_ptr<DeBruijnGraph>> graph_future,
const Config &config,
size_t max_chunks_open) {

std::unique_ptr<annot::MultiLabelAnnotation<std::string>> annotation_temp;
if (config.infbase_annotators.size())
annotation_temp = initialize_annotation(config.infbase_annotators.at(0), config, 0, max_chunks_open);

std::shared_ptr<DeBruijnGraph> graph = graph_future.get();
uint64_t max_index = graph->max_index();

auto base_graph = graph;
const auto *base_graph_ptr = graph.get();
if (graph->get_mode() == DeBruijnGraph::PRIMARY) {
graph = std::make_shared<CanonicalDBG>(graph);
logger->trace("Primary graph wrapped into canonical");
} else if (const auto *canonical = dynamic_cast<const CanonicalDBG *>(graph.get())) {
base_graph_ptr = &canonical->get_graph();
max_index = base_graph_ptr->max_index();
}

auto annotation_temp = config.infbase_annotators.size()
? initialize_annotation(config.infbase_annotators.at(0), config, 0, max_chunks_open)
: initialize_annotation(config.anno_type, config, max_index, max_chunks_open);
if (!config.infbase_annotators.size())
annotation_temp = initialize_annotation(config.anno_type, config, max_index, max_chunks_open);

assert(annotation_temp);

if (config.infbase_annotators.size()) {
bool loaded = false;
Expand All @@ -56,7 +75,7 @@ std::unique_ptr<AnnotatedDBG> initialize_annotated_dbg(std::shared_ptr<DeBruijnG
using namespace annot::matrix;
BinaryMatrix &matrix = const_cast<BinaryMatrix &>(annotation_temp->get_matrix());
if (IRowDiff *row_diff = dynamic_cast<IRowDiff*>(&matrix)) {
row_diff->set_graph(base_graph.get());
row_diff->set_graph(base_graph_ptr);

if (auto *row_diff_column = dynamic_cast<RowDiff<ColumnMajor> *>(&matrix)) {
row_diff_column->load_anchor(config.infbase + kRowDiffAnchorExt);
Expand All @@ -65,9 +84,21 @@ std::unique_ptr<AnnotatedDBG> initialize_annotated_dbg(std::shared_ptr<DeBruijnG
}
}

// load graph
return annotation_temp;
}

std::unique_ptr<AnnotatedDBG> initialize_annotated_dbg(std::shared_ptr<DeBruijnGraph> graph,
const Config &config,
size_t max_chunks_open) {
if (graph->get_mode() == DeBruijnGraph::PRIMARY) {
graph = std::make_shared<CanonicalDBG>(graph);
logger->trace("Primary graph wrapped into canonical");
}

auto annotation = load_annotation(graph, config, max_chunks_open);

auto anno_graph
= std::make_unique<AnnotatedDBG>(std::move(graph), std::move(annotation_temp));
= std::make_unique<AnnotatedDBG>(std::move(graph), std::move(annotation));

if (!anno_graph->check_compatibility()) {
logger->error("Graph and annotation are not compatible");
Expand Down
19 changes: 16 additions & 3 deletions metagraph/src/cli/load/load_annotated_graph.hpp
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
#ifndef __LOAD_ANNOTATED_GRAPH_HPP__
#define __LOAD_ANNOTATED_GRAPH_HPP__


#include <future>
#include <memory>

namespace mtg {

namespace graph {

class DeBruijnGraph;
class AnnotatedDBG;

} // namespace graph

namespace annot {
template <typename LabelType>
class MultiLabelAnnotation;
} // namespace annot

namespace cli {

class Config;
Expand All @@ -22,6 +25,16 @@ initialize_annotated_dbg(std::shared_ptr<graph::DeBruijnGraph> graph,
const Config &config,
size_t max_chunks_open = 2000);

// Load the annotation described by `config` for a graph that is already
// available. The definition in load_annotated_graph.cpp wraps `graph` in a
// std::shared_future and forwards to the shared_future overload below,
// passing `config` and `max_chunks_open` through unchanged.
std::unique_ptr<annot::MultiLabelAnnotation<std::string>>
load_annotation(std::shared_ptr<graph::DeBruijnGraph> graph,
const Config &config,
size_t max_chunks_open = 2000);

// Load the annotation described by `config`, taking the graph as a future.
// The definition calls get() on the future only after the annotation object
// for config.infbase_annotators (when that list is non-empty) has been
// initialized, so that step does not wait for the graph.
// NOTE(review): presumably intended for loading the graph and the annotation
// in parallel — confirm against the callers.
std::unique_ptr<annot::MultiLabelAnnotation<std::string>>
load_annotation(std::shared_future<std::shared_ptr<graph::DeBruijnGraph>> graph,
const Config &config,
size_t max_chunks_open = 2000);

std::unique_ptr<graph::AnnotatedDBG> initialize_annotated_dbg(const Config &config);

} // namespace cli
Expand Down
Loading
Loading