diff --git a/CMakeLists.txt b/CMakeLists.txt index c6e1641..71f7e17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,8 +17,8 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) MESSAGE(STATUS "Compiling for processor: " ${CMAKE_HOST_SYSTEM_PROCESSOR}) if (UNIX AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")) - MESSAGE(STATUS "Compiling with flags: -march=native -mbmi2 -msse4.2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + MESSAGE(STATUS "Compiling with flags: -march=core-avx2 -mbmi2 -msse4.2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=core-avx2") # Flags for PTHash: set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mbmi2 -msse4.2") # for hardware popcount and pdep endif() diff --git a/include/buckets.hpp b/include/buckets.hpp index 743c0d0..6954124 100644 --- a/include/buckets.hpp +++ b/include/buckets.hpp @@ -5,6 +5,11 @@ #include "ef_sequence.hpp" namespace sshash { +struct superkmer_result { + uint64_t kmer_idx; + uint64_t superkmer_idx; + uint64_t superkmer_id; +}; template struct buckets { @@ -108,6 +113,26 @@ struct buckets { } return lookup_result(); } + // superkmer annotation + lookup_result superkmer_id_to_kmer_id(uint64_t super_kmer_id, uint64_t k) const { + uint64_t offset = offsets.access(super_kmer_id); + auto [res, contig_end] = offset_to_id(offset, k); + return res; + } + + + superkmer_result lookup_superkmer_start(uint64_t begin, uint64_t end, kmer_t target_kmer, + uint64_t k, uint64_t m) const { + for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { + auto res = lookup_in_super_kmer(super_kmer_id, target_kmer, k, m); + if(res.kmer_id != constants::invalid_uint64){ + assert(is_valid(res)); + return {res.kmer_id, superkmer_id_to_kmer_id(super_kmer_id, k).kmer_id, super_kmer_id}; + } + } + return {constants::invalid_uint64, constants::invalid_uint64, constants::invalid_uint64}; + } + lookup_result lookup_canonical(uint64_t bucket_id, kmer_t target_kmer, kmer_t target_kmer_rc, uint64_t k, uint64_t m) const { diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 59cedd3..80201e5 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -219,7 +219,10 @@ struct minimizers_tuples { } std::string get_minimizers_filename() const { - assert(m_num_files_to_merge > 0); + //assert(m_num_files_to_merge > 0); + if(!(m_num_files_to_merge > 0)){ + throw std::invalid_argument("m_num_files_to_merge > 0"); + } if (m_num_files_to_merge == 1) return get_tmp_output_filename(0); std::stringstream filename; filename << m_tmp_dirname << "/sshash.tmp.run_" << m_run_identifier << ".minimizers.bin"; diff --git a/include/dictionary.hpp b/include/dictionary.hpp index 82a68cd..e1f5863 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -1,10 +1,13 @@ #pragma once +#include + #include "util.hpp" #include "minimizers.hpp" #include "buckets.hpp" #include "skew_index.hpp" #include "weights.hpp" +#include namespace sshash { @@ -30,6 +33,7 @@ struct dictionary { uint64_t k() const { return m_k; } uint64_t m() const { return m_m; } uint64_t num_contigs() const { return m_buckets.pieces.size() - 1; } + uint64_t num_superkmers() const { return m_buckets.offsets.size(); } bool canonicalized() const { return m_canonical_parsing; } bool weighted() const { return !m_weights.empty(); } @@ -127,6 +131,17 @@ struct dictionary { void print_space_breakdown() const; void compute_statistics() const; + // pre: monochromatic_labels returns true iff same labels are found for all k-mers in given string + // post: returns indeces of superkmers that are not monochromatic + std::vector build_superkmer_bv(const std::function &monochromatic_labels) const; + // pre: get_annotation_labels returns vector of labels for given k-mer + // post: number of k-mers and color changes for every super-k-mer are written to files + void make_superkmer_stats(const std::function (std::string_view)> &get_annotation_labels) const; + + superkmer_result kmer_to_superkmer_idx(char const* kmer_str, bool check_reverse_complement) const ; + superkmer_result kmer_to_superkmer_idx_helper(kmer_t uint_kmer) const ; + uint64_t look_up_from_superkmer_id(uint64_t superkmer_id, char const* kmer_str, bool check_reverse_complement) const ; + template void visit(Visitor& visitor) const { visit_impl(visitor, *this); diff --git a/include/dictionary.impl b/include/dictionary.impl index 72d41a9..e3ddabf 100644 --- a/include/dictionary.impl +++ b/include/dictionary.impl @@ -1,5 +1,47 @@ namespace sshash { +template +superkmer_result dictionary::kmer_to_superkmer_idx_helper(kmer_t uint_kmer) const { + uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); + uint64_t bucket_id = m_minimizers.lookup(minimizer); + auto [begin, end] = m_buckets.locate_bucket(bucket_id); + + if (m_skew_index.empty()) return m_buckets.lookup_superkmer_start(begin, end, uint_kmer, m_k, m_m); + + uint64_t num_super_kmers_in_bucket = end - begin; + uint64_t log2_bucket_size = util::ceil_log2_uint32(num_super_kmers_in_bucket); + // superkmer in skew index + if (log2_bucket_size > m_skew_index.min_log2) { + uint64_t pos = m_skew_index.lookup(uint_kmer, log2_bucket_size); + /* It must hold pos < num_super_kmers_in_bucket for the kmer to exist. */ + if (pos < num_super_kmers_in_bucket) { + auto res_kmer = m_buckets.lookup_in_super_kmer(begin + pos, uint_kmer, m_k, m_m).kmer_id; + if(res_kmer != sshash::constants::invalid_uint64){ + uint64_t superkmer_id = begin + pos; + return {res_kmer, m_buckets.superkmer_id_to_kmer_id(superkmer_id, m_k).kmer_id, superkmer_id}; + } + } + return {constants::invalid_uint64, constants::invalid_uint64, constants::invalid_uint64}; + } + // superkmer not in skew index + return m_buckets.lookup_superkmer_start(begin, end, uint_kmer, m_k, m_m); + +} + +template +uint64_t dictionary::look_up_from_superkmer_id(uint64_t superkmer_id, char const* kmer_str, bool check_reverse_complement) const { + kmer_t uint_kmer = util::string_to_uint_kmer(kmer_str, m_k); + + auto res = m_buckets.lookup_in_super_kmer(superkmer_id, uint_kmer, m_k, m_m).kmer_id; + if (check_reverse_complement and res == constants::invalid_uint64) { + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(m_k); + res = m_buckets.lookup_in_super_kmer(superkmer_id, uint_kmer_rc, m_k, m_m).kmer_id; + } + return res; +} + + template lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) const { auto minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); @@ -67,6 +109,17 @@ uint64_t dictionary::lookup_uint(kmer_t uint_kmer, bool check_reverse_co return res.kmer_id; } +template +superkmer_result dictionary::kmer_to_superkmer_idx(char const* kmer_str, bool check_reverse_complement) const { + kmer_t uint_kmer = util::string_to_uint_kmer(kmer_str, m_k); + superkmer_result sk_res = kmer_to_superkmer_idx_helper(uint_kmer); + if(sk_res.kmer_idx == constants::invalid_uint64 && check_reverse_complement){ + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(m_k); + sk_res = kmer_to_superkmer_idx_helper(uint_kmer_rc); + } + return sk_res; +} template lookup_result dictionary::lookup_advanced(char const* string_kmer, bool check_reverse_complement) const { diff --git a/include/statistics.impl b/include/statistics.impl index c9fef8e..e5b7518 100644 --- a/include/statistics.impl +++ b/include/statistics.impl @@ -1,5 +1,9 @@ +#include +#include +#include +#include "dictionary.hpp" #include "buckets_statistics.hpp" - +#include namespace sshash { template @@ -11,8 +15,7 @@ void dictionary::compute_statistics() const { buckets_statistics buckets_stats(num_minimizers, num_kmers, num_super_kmers); std::cout << "computing buckets statistics..." << std::endl; - - for (uint64_t bucket_id = 0; bucket_id != num_minimizers; ++bucket_id) { + for (uint64_t bucket_id = 0; bucket_id < num_minimizers; ++bucket_id) { auto [begin, end] = m_buckets.locate_bucket(bucket_id); uint64_t num_super_kmers_in_bucket = end - begin; buckets_stats.add_num_super_kmers_in_bucket(num_super_kmers_in_bucket); @@ -48,5 +51,185 @@ void dictionary::compute_statistics() const { buckets_stats.print_full(); std::cout << "DONE" << std::endl; } +inline bool equal(const std::vector& input1, const std::vector& input2) { + if(input1.size() != input2.size()){ + return false; + } + for(size_t i = 0; i < input1.size(); i++){ + if(input1[i] != input2[i]){ + return false; + } + } + return true; +} + +template +void uint_kmer_to_last_char(kmer_t x, char* str, uint64_t k) { + x.drop_chars(k - 1); + str[0] = kmer_t::uint64_to_char(x.pop_char()); +} + +template +std::vector dictionary::build_superkmer_bv(const std::function &monochromatic_labels) const { + //uint64_t num_kmers = size(); + uint64_t num_minimizers = m_minimizers.size(); + //uint64_t num_super_kmers = m_buckets.offsets.size(); + + std::cout << "building super kmer mask..." << std::endl; + //std::vector non_mono_superkmer (num_super_kmers, false); + //size_t superkmer_idx = 0; + uint64_t one_pc_buckets = std::ceil(num_minimizers/100.0); + std::cout<<"iterating through buckets :\n"; + uint64_t num_threads = std::thread::hardware_concurrency(); + if (num_minimizers < num_threads) num_threads = num_minimizers; + std::vector> indeces (num_threads); + time_t my_time = time(NULL); + printf("%s", ctime(&my_time)); + #pragma omp parallel for num_threads(num_threads) schedule(dynamic) + for (uint64_t bucket_id = 0; bucket_id < num_minimizers; ++bucket_id) { + if(bucket_id%one_pc_buckets==0){ + my_time = time(NULL); + printf("%s", ctime(&my_time)); + std::cout <<" "<< bucket_id/one_pc_buckets <<"%" << std::endl; + } + auto [begin, end] = m_buckets.locate_bucket(bucket_id); + //uint64_t num_super_kmers_in_bucket = end - begin; + for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { + uint64_t offset = m_buckets.offsets.access(super_kmer_id); + auto [_, contig_end] = m_buckets.offset_to_id(offset, m_k); + (void)_; + bit_vector_iterator bv_it(m_buckets.strings, kmer_t::bits_per_char * offset); + uint64_t window_size = std::min(m_k - m_m + 1, contig_end - offset - m_k + 1); + kmer_t prev_minimizer = constants::invalid_uint64; + std::vector prev_labels = {}; + std::string superkmer = ""; + size_t num_chars_to_add = m_k; + for (uint64_t w = 0; w != window_size; ++w) { + kmer_t kmer = bv_it.read_and_advance_by_char(m_k); + auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); + + //check if superkmer has ended + if (prev_minimizer != constants::invalid_uint64 and minimizer != prev_minimizer) { + break; + } + prev_minimizer = minimizer; + + //get kmers into superkmer + std::string chars_to_add(num_chars_to_add, '_'); + if(num_chars_to_add > 1){ + util::uint_kmer_to_string(kmer, &chars_to_add[0], m_k); + num_chars_to_add = 1; + }else{ + uint_kmer_to_last_char(kmer, &chars_to_add[0], m_k); + } + superkmer += chars_to_add; + + } + // get labels and compare them to previous ones + if(!monochromatic_labels(superkmer)){ + indeces[omp_get_thread_num()].push_back(super_kmer_id); + } + + } + } + std::cout << "DONE" << std::endl; + my_time = time(NULL); + printf("%s", ctime(&my_time)); + std::cout << " Merging index vectors...\n"; + std::sort(indeces.begin(),indeces.end(),[](const std::vector& first, const std::vector& second) {return first.size() < second.size(); }); + std::vector merged = std::move(indeces[0]); + for(uint64_t i = 1; i < num_threads; i++){ + std::vector dest_aux; + auto& src = indeces[i]; + std::merge(merged.begin(),merged.end(),src.begin(), src.end(),std::back_inserter(dest_aux)); + merged = std::move(dest_aux); + } + my_time = time(NULL); + printf("%s", ctime(&my_time)); + std::cout<< " merged! \n"; + return merged; +} +static void write_vec(const std::vector& vec, std::string output_path){ + std::ofstream outputFile(output_path); + if (!outputFile.is_open()) { + std::cerr << "Error opening the file." << std::endl; + } + for (const auto& element : vec) { + outputFile << element << " "; + } + outputFile.close(); +} +template +void dictionary::make_superkmer_stats(const std::function (std::string_view)> &get_annotation_labels) const { + uint64_t num_minimizers = m_minimizers.size(); + uint64_t num_super_kmers = m_buckets.offsets.size(); + + std::cout << "gathering super-k-mer stats..." << std::endl; + std::vector kmer_per_superkmer (num_super_kmers, 0); + std::vector color_changes_superkmer (num_super_kmers, 0); + + uint64_t num_threads = std::thread::hardware_concurrency(); + if (num_minimizers < num_threads) num_threads = num_minimizers; + std::mutex color_lock; + std::mutex kmer_lock; + + uint64_t one_pc_buckets = std::ceil(num_minimizers/100.0); + time_t my_time = time(NULL); + std::cout<<"iterating through buckets :\n"; + #pragma omp parallel for num_threads(num_threads) schedule(dynamic) + for (uint64_t bucket_id = 0; bucket_id < num_minimizers; ++bucket_id) { + if(bucket_id%one_pc_buckets==0){ + my_time = time(NULL); + printf("%s", ctime(&my_time)); + std::cout <<" "<< bucket_id/one_pc_buckets <<"%" << std::endl; + } + auto [begin, end] = m_buckets.locate_bucket(bucket_id); + for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { + uint64_t offset = m_buckets.offsets.access(super_kmer_id); + auto [_, contig_end] = m_buckets.offset_to_id(offset, m_k); + (void)_; + bit_vector_iterator bv_it(m_buckets.strings, kmer_t::bits_per_char * offset); + uint64_t window_size = std::min(m_k - m_m + 1, contig_end - offset - m_k + 1); + kmer_t prev_minimizer = constants::invalid_uint64; + std::vector prev_labels = {}; + + unsigned num_kmers = 0; + unsigned num_col = 0; + for (uint64_t w = 0; w != window_size; ++w) { + kmer_t kmer = bv_it.read_and_advance_by_char(m_k); + auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); + + //check if superkmer has ended + if (prev_minimizer != constants::invalid_uint64 and minimizer != prev_minimizer) { + break; + } + prev_minimizer = minimizer; + num_kmers++; + + // get labels and compare them to previous ones + std::string kmer_str(m_k,'_'); + util::uint_kmer_to_string(kmer, &kmer_str[0], m_k); + std::vector labels = get_annotation_labels(kmer_str); + + if(prev_labels.size() == 0){ + prev_labels = labels; + } else if(!equal(labels, prev_labels)){ + num_col++; + } + } + color_lock.lock(); + color_changes_superkmer[super_kmer_id] = num_col; + color_lock.unlock(); + kmer_lock.lock(); + kmer_per_superkmer[super_kmer_id] = num_kmers; + kmer_lock.unlock(); + } + } + std::cout << "DONE" << std::endl; + + std::cout<<"writing kmer and color stats to files...\n"; + write_vec(color_changes_superkmer,"final_color_stats.txt"); + write_vec(kmer_per_superkmer,"final_kmer_stats.txt"); +} } // namespace sshash