Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions metagraph/src/cli/annotate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ void annotate_data(std::shared_ptr<graph::DeBruijnGraph> graph,

thread_pool.join();

if (config.count_kmers) {
if (config.count_kmers || config.enumerate_headers) {
// add k-mer counts to existing binary annotations
for (const auto &file : files) {
logger->trace("Annotating k-mer counts for file {}", file);
Expand All @@ -358,7 +358,7 @@ void annotate_data(std::shared_ptr<graph::DeBruijnGraph> graph,
const std::string &counts_fname
= utils::remove_suffix(file, ".gz", ".fasta") + ".kmer_counts.gz";

if (fs::exists(counts_fname)) {
if (!config.enumerate_headers && fs::exists(counts_fname)) {
add_kmer_counts(
file,
anno_graph->get_graph(),
Expand All @@ -379,9 +379,17 @@ void annotate_data(std::shared_ptr<graph::DeBruijnGraph> graph,
}
);
} else {
logger->warn("No k-mer counts found at '{}', "
"will try reading counts from headers",
counts_fname);
if (config.enumerate_headers) {
logger->warn("Indexing headers in file '{}', "
"will take indexes of sequence headers as counts. "
"Make sure all k-mers in each file are unique! (This will not be checked)",
counts_fname);
} else {
logger->warn("No k-mer counts found at '{}', "
"will try reading counts from headers",
counts_fname);
}
size_t num_sequences = 0;
call_annotations(
file,
config.refpath,
Expand All @@ -391,18 +399,32 @@ void annotate_data(std::shared_ptr<graph::DeBruijnGraph> graph,
config.max_count,
config.filename_anno,
config.annotate_sequence_headers,
/*parse_counts_from_headers*/true,
/*parse_counts_from_headers*/ !config.enumerate_headers,
config.fasta_anno_comment_delim,
config.fasta_header_delimiter,
config.anno_labels,
[&](std::string sequence, auto labels, uint64_t kmer_count) {
if (config.enumerate_headers) {
// index sequence IDs as counts
kmer_count = num_sequences++ / (1 + config.forward_and_reverse);
if (kmer_count > sdsl::bits::lo_set[config.count_width]) {
logger->error("Number of sequences exceeds the maximum "
"representable with {} bits. Increase the value --count-width",
config.count_width);
exit(1);
}
}
if (sequence.size() >= k) {
//logger->info("{}: {}, {}", fmt::join(labels, ", "), kmer_count, sequence.size() - k + 1);
batcher.push_and_pay(sequence.size(),
std::move(sequence), std::move(labels),
std::vector<uint64_t>(sequence.size() - k + 1, kmer_count));
}
}
);
if (config.enumerate_headers)
logger->info("Indexed {} header IDs in file '{}' as counts",
num_sequences++ / (1 + config.forward_and_reverse), counts_fname);
}
}
}
Expand Down
5 changes: 4 additions & 1 deletion metagraph/src/cli/config/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ Config::Config(int argc, char *argv[]) {
print_counts_hist = true;
} else if (!strcmp(argv[i], "--coordinates")) {
coordinates = true;
} else if (!strcmp(argv[i], "--enum-headers")) {
enumerate_headers = true;
} else if (!strcmp(argv[i], "--num-kmers-in-seq")) {
// FYI: experimental
std::cerr << "WARNING: Flag --num-kmers-in-seq is experimental and"
Expand Down Expand Up @@ -455,7 +457,7 @@ Config::Config(int argc, char *argv[]) {
" to represent k-mer abundance" << std::endl;
print_usage_and_exit = true;
}
if (!count_kmers)
if (!count_kmers && !enumerate_headers)
count_width = 0;

if (count_width > 32) {
Expand Down Expand Up @@ -1225,6 +1227,7 @@ if (advanced) {
fprintf(stderr, "\n");
fprintf(stderr, "\t --count-kmers \tadd k-mer counts to the annotation [off]\n");
fprintf(stderr, "\t --count-width \tnumber of bits used to represent k-mer abundance [8]\n");
fprintf(stderr, "\t --enum-headers \tenumerate headers and index sequence IDs as counts [off]\n");
fprintf(stderr, "\t --coordinates \tannotate coordinates as multi-integer attributes [off]\n");
fprintf(stderr, "\n");
fprintf(stderr, "\t-p --parallel [INT] \tuse multiple threads for computation [1]\n");
Expand Down
1 change: 1 addition & 0 deletions metagraph/src/cli/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ class Config {
unsigned long long int num_chars = 0;

uint8_t count_width = 8;
bool enumerate_headers = false;

// Alignment options
bool alignment_edit_distance = false;
Expand Down
2 changes: 1 addition & 1 deletion metagraph/src/cli/stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ void print_annotation_stats(const std::string &fname, const Config &config) {
std::cout << "\t";
if (count_hist_v.size())
std::cout << fmt::format("{}:{}", count_hist_v[0].first, count_hist_v[0].second);
for (size_t i = 2; i < count_hist_v.size(); i++) {
for (size_t i = 1; i < count_hist_v.size(); i++) {
std::cout << fmt::format(",{}:{}", count_hist_v[i].first, count_hist_v[i].second);
}
}
Expand Down
1 change: 0 additions & 1 deletion metagraph/src/seq_io/sequence_io.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "sequence_io.hpp"

#include <algorithm>
#include <thread>
#include <sstream>

#include <unistd.h>
Expand Down
Loading