Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
MESSAGE(STATUS "Compiling for processor: " ${CMAKE_HOST_SYSTEM_PROCESSOR})

if (UNIX AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64"))
MESSAGE(STATUS "Compiling with flags: -march=native -mbmi2 -msse4.2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
MESSAGE(STATUS "Compiling with flags: -march=core-avx2 -mbmi2 -msse4.2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=core-avx2")
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I understand why we need this...

# Flags for PTHash:
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mbmi2 -msse4.2") # for hardware popcount and pdep
endif()
Expand Down
25 changes: 25 additions & 0 deletions include/buckets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
#include "ef_sequence.hpp"

namespace sshash {
struct superkmer_result {
uint64_t kmer_idx;
uint64_t superkmer_idx;
uint64_t superkmer_id;
};

template <class kmer_t>
struct buckets {
Expand Down Expand Up @@ -108,6 +113,26 @@ struct buckets {
}
return lookup_result();
}
// superkmer annotation
lookup_result superkmer_id_to_kmer_id(uint64_t super_kmer_id, uint64_t k) const {
uint64_t offset = offsets.access(super_kmer_id);
auto [res, contig_end] = offset_to_id(offset, k);
return res;
}


superkmer_result lookup_superkmer_start(uint64_t begin, uint64_t end, kmer_t target_kmer,
uint64_t k, uint64_t m) const {
for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) {
auto res = lookup_in_super_kmer(super_kmer_id, target_kmer, k, m);
if(res.kmer_id != constants::invalid_uint64){
assert(is_valid(res));
return {res.kmer_id, superkmer_id_to_kmer_id(super_kmer_id, k).kmer_id, super_kmer_id};
}
}
return {constants::invalid_uint64, constants::invalid_uint64, constants::invalid_uint64};
}


lookup_result lookup_canonical(uint64_t bucket_id, kmer_t target_kmer, kmer_t target_kmer_rc,
uint64_t k, uint64_t m) const {
Expand Down
5 changes: 4 additions & 1 deletion include/builder/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,10 @@ struct minimizers_tuples {
}

std::string get_minimizers_filename() const {
assert(m_num_files_to_merge > 0);
//assert(m_num_files_to_merge > 0);
if(!(m_num_files_to_merge > 0)){
throw std::invalid_argument("m_num_files_to_merge > 0");
}
if (m_num_files_to_merge == 1) return get_tmp_output_filename(0);
std::stringstream filename;
filename << m_tmp_dirname << "/sshash.tmp.run_" << m_run_identifier << ".minimizers.bin";
Expand Down
15 changes: 15 additions & 0 deletions include/dictionary.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#pragma once

#include <functional>

#include "util.hpp"
#include "minimizers.hpp"
#include "buckets.hpp"
#include "skew_index.hpp"
#include "weights.hpp"
#include <mutex>

namespace sshash {

Expand All @@ -30,6 +33,7 @@ struct dictionary {
uint64_t k() const { return m_k; }
uint64_t m() const { return m_m; }
uint64_t num_contigs() const { return m_buckets.pieces.size() - 1; }
uint64_t num_superkmers() const { return m_buckets.offsets.size(); }
bool canonicalized() const { return m_canonical_parsing; }
bool weighted() const { return !m_weights.empty(); }

Expand Down Expand Up @@ -127,6 +131,17 @@ struct dictionary {
void print_space_breakdown() const;
void compute_statistics() const;

// pre: monochromatic_labels returns true iff same labels are found for all k-mers in given string
// post: returns indeces of superkmers that are not monochromatic
std::vector<uint64_t> build_superkmer_bv(const std::function<bool (std::string_view)> &monochromatic_labels) const;
// pre: get_annotation_labels returns vector of labels for given k-mer
// post: number of k-mers and color changes for every super-k-mer are written to files
void make_superkmer_stats(const std::function<std::vector<std::string> (std::string_view)> &get_annotation_labels) const;

superkmer_result kmer_to_superkmer_idx(char const* kmer_str, bool check_reverse_complement) const ;
superkmer_result kmer_to_superkmer_idx_helper(kmer_t uint_kmer) const ;
uint64_t look_up_from_superkmer_id(uint64_t superkmer_id, char const* kmer_str, bool check_reverse_complement) const ;

template <typename Visitor>
void visit(Visitor& visitor) const {
visit_impl(visitor, *this);
Expand Down
53 changes: 53 additions & 0 deletions include/dictionary.impl
Original file line number Diff line number Diff line change
@@ -1,5 +1,47 @@
namespace sshash {

template <class kmer_t>
superkmer_result dictionary<kmer_t>::kmer_to_superkmer_idx_helper(kmer_t uint_kmer) const {
uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed);
uint64_t bucket_id = m_minimizers.lookup(minimizer);
auto [begin, end] = m_buckets.locate_bucket(bucket_id);

if (m_skew_index.empty()) return m_buckets.lookup_superkmer_start(begin, end, uint_kmer, m_k, m_m);

uint64_t num_super_kmers_in_bucket = end - begin;
uint64_t log2_bucket_size = util::ceil_log2_uint32(num_super_kmers_in_bucket);
// superkmer in skew index
if (log2_bucket_size > m_skew_index.min_log2) {
uint64_t pos = m_skew_index.lookup(uint_kmer, log2_bucket_size);
/* It must hold pos < num_super_kmers_in_bucket for the kmer to exist. */
if (pos < num_super_kmers_in_bucket) {
auto res_kmer = m_buckets.lookup_in_super_kmer(begin + pos, uint_kmer, m_k, m_m).kmer_id;
if(res_kmer != sshash::constants::invalid_uint64){
uint64_t superkmer_id = begin + pos;
return {res_kmer, m_buckets.superkmer_id_to_kmer_id(superkmer_id, m_k).kmer_id, superkmer_id};
}
}
return {constants::invalid_uint64, constants::invalid_uint64, constants::invalid_uint64};
}
// superkmer not in skew index
return m_buckets.lookup_superkmer_start(begin, end, uint_kmer, m_k, m_m);

}

template <class kmer_t>
uint64_t dictionary<kmer_t>::look_up_from_superkmer_id(uint64_t superkmer_id, char const* kmer_str, bool check_reverse_complement) const {
kmer_t uint_kmer = util::string_to_uint_kmer<kmer_t>(kmer_str, m_k);

auto res = m_buckets.lookup_in_super_kmer(superkmer_id, uint_kmer, m_k, m_m).kmer_id;
if (check_reverse_complement and res == constants::invalid_uint64) {
kmer_t uint_kmer_rc = uint_kmer;
uint_kmer_rc.reverse_complement_inplace(m_k);
res = m_buckets.lookup_in_super_kmer(superkmer_id, uint_kmer_rc, m_k, m_m).kmer_id;
}
return res;
}


template <class kmer_t>
lookup_result dictionary<kmer_t>::lookup_uint_regular_parsing(kmer_t uint_kmer) const {
auto minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed);
Expand Down Expand Up @@ -67,6 +109,17 @@ uint64_t dictionary<kmer_t>::lookup_uint(kmer_t uint_kmer, bool check_reverse_co
return res.kmer_id;
}

template <class kmer_t>
superkmer_result dictionary<kmer_t>::kmer_to_superkmer_idx(char const* kmer_str, bool check_reverse_complement) const {
kmer_t uint_kmer = util::string_to_uint_kmer<kmer_t>(kmer_str, m_k);
superkmer_result sk_res = kmer_to_superkmer_idx_helper(uint_kmer);
if(sk_res.kmer_idx == constants::invalid_uint64 && check_reverse_complement){
kmer_t uint_kmer_rc = uint_kmer;
uint_kmer_rc.reverse_complement_inplace(m_k);
sk_res = kmer_to_superkmer_idx_helper(uint_kmer_rc);
}
return sk_res;
}
template <class kmer_t>
lookup_result dictionary<kmer_t>::lookup_advanced(char const* string_kmer,
bool check_reverse_complement) const {
Expand Down
189 changes: 186 additions & 3 deletions include/statistics.impl
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#include <algorithm>
#include <functional>
#include <omp.h>
#include "dictionary.hpp"
#include "buckets_statistics.hpp"

#include <time.h>
namespace sshash {

template <class kmer_t>
Expand All @@ -11,8 +15,7 @@ void dictionary<kmer_t>::compute_statistics() const {
buckets_statistics buckets_stats(num_minimizers, num_kmers, num_super_kmers);

std::cout << "computing buckets statistics..." << std::endl;

for (uint64_t bucket_id = 0; bucket_id != num_minimizers; ++bucket_id) {
for (uint64_t bucket_id = 0; bucket_id < num_minimizers; ++bucket_id) {
auto [begin, end] = m_buckets.locate_bucket(bucket_id);
uint64_t num_super_kmers_in_bucket = end - begin;
buckets_stats.add_num_super_kmers_in_bucket(num_super_kmers_in_bucket);
Expand Down Expand Up @@ -48,5 +51,185 @@ void dictionary<kmer_t>::compute_statistics() const {
buckets_stats.print_full();
std::cout << "DONE" << std::endl;
}
inline bool equal(const std::vector<std::string>& input1, const std::vector<std::string>& input2) {
if(input1.size() != input2.size()){
return false;
}
for(size_t i = 0; i < input1.size(); i++){
if(input1[i] != input2[i]){
return false;
}
}
return true;
}

template <class kmer_t>
void uint_kmer_to_last_char(kmer_t x, char* str, uint64_t k) {
x.drop_chars(k - 1);
str[0] = kmer_t::uint64_to_char(x.pop_char());
}

template <class kmer_t>
std::vector<uint64_t> dictionary<kmer_t>::build_superkmer_bv(const std::function <bool (std::string_view)> &monochromatic_labels) const {
//uint64_t num_kmers = size();
uint64_t num_minimizers = m_minimizers.size();
//uint64_t num_super_kmers = m_buckets.offsets.size();

std::cout << "building super kmer mask..." << std::endl;
//std::vector<bool> non_mono_superkmer (num_super_kmers, false);
//size_t superkmer_idx = 0;
uint64_t one_pc_buckets = std::ceil(num_minimizers/100.0);
std::cout<<"iterating through buckets :\n";
uint64_t num_threads = std::thread::hardware_concurrency();
if (num_minimizers < num_threads) num_threads = num_minimizers;
std::vector<std::vector<uint64_t>> indeces (num_threads);
time_t my_time = time(NULL);
printf("%s", ctime(&my_time));
#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
for (uint64_t bucket_id = 0; bucket_id < num_minimizers; ++bucket_id) {
if(bucket_id%one_pc_buckets==0){
my_time = time(NULL);
printf("%s", ctime(&my_time));
std::cout <<" "<< bucket_id/one_pc_buckets <<"%" << std::endl;
}
auto [begin, end] = m_buckets.locate_bucket(bucket_id);
//uint64_t num_super_kmers_in_bucket = end - begin;
for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) {
uint64_t offset = m_buckets.offsets.access(super_kmer_id);
auto [_, contig_end] = m_buckets.offset_to_id(offset, m_k);
(void)_;
bit_vector_iterator<kmer_t> bv_it(m_buckets.strings, kmer_t::bits_per_char * offset);
uint64_t window_size = std::min<uint64_t>(m_k - m_m + 1, contig_end - offset - m_k + 1);
kmer_t prev_minimizer = constants::invalid_uint64;
std::vector<std::string> prev_labels = {};
std::string superkmer = "";
size_t num_chars_to_add = m_k;
for (uint64_t w = 0; w != window_size; ++w) {
kmer_t kmer = bv_it.read_and_advance_by_char(m_k);
auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed);

//check if superkmer has ended
if (prev_minimizer != constants::invalid_uint64 and minimizer != prev_minimizer) {
break;
}
prev_minimizer = minimizer;

//get kmers into superkmer
std::string chars_to_add(num_chars_to_add, '_');
if(num_chars_to_add > 1){
util::uint_kmer_to_string(kmer, &chars_to_add[0], m_k);
num_chars_to_add = 1;
}else{
uint_kmer_to_last_char(kmer, &chars_to_add[0], m_k);
}
superkmer += chars_to_add;

}
// get labels and compare them to previous ones
if(!monochromatic_labels(superkmer)){
indeces[omp_get_thread_num()].push_back(super_kmer_id);
}

}
}
std::cout << "DONE" << std::endl;
my_time = time(NULL);
printf("%s", ctime(&my_time));
std::cout << " Merging index vectors...\n";
std::sort(indeces.begin(),indeces.end(),[](const std::vector<uint64_t>& first, const std::vector<uint64_t>& second) {return first.size() < second.size(); });
std::vector<uint64_t> merged = std::move(indeces[0]);
for(uint64_t i = 1; i < num_threads; i++){
std::vector<uint64_t> dest_aux;
auto& src = indeces[i];
std::merge(merged.begin(),merged.end(),src.begin(), src.end(),std::back_inserter(dest_aux));
merged = std::move(dest_aux);
}
my_time = time(NULL);
printf("%s", ctime(&my_time));
std::cout<< " merged! \n";
return merged;
}
static void write_vec(const std::vector<unsigned int>& vec, std::string output_path){
std::ofstream outputFile(output_path);
if (!outputFile.is_open()) {
std::cerr << "Error opening the file." << std::endl;
}
for (const auto& element : vec) {
outputFile << element << " ";
}
outputFile.close();
}
template <class kmer_t>
void dictionary<kmer_t>::make_superkmer_stats(const std::function<std::vector<std::string> (std::string_view)> &get_annotation_labels) const {
uint64_t num_minimizers = m_minimizers.size();
uint64_t num_super_kmers = m_buckets.offsets.size();

std::cout << "gathering super-k-mer stats..." << std::endl;
std::vector<unsigned> kmer_per_superkmer (num_super_kmers, 0);
std::vector<unsigned> color_changes_superkmer (num_super_kmers, 0);

uint64_t num_threads = std::thread::hardware_concurrency();
if (num_minimizers < num_threads) num_threads = num_minimizers;
std::mutex color_lock;
std::mutex kmer_lock;

uint64_t one_pc_buckets = std::ceil(num_minimizers/100.0);
time_t my_time = time(NULL);
std::cout<<"iterating through buckets :\n";
#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
for (uint64_t bucket_id = 0; bucket_id < num_minimizers; ++bucket_id) {
if(bucket_id%one_pc_buckets==0){
my_time = time(NULL);
printf("%s", ctime(&my_time));
std::cout <<" "<< bucket_id/one_pc_buckets <<"%" << std::endl;
}
auto [begin, end] = m_buckets.locate_bucket(bucket_id);
for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) {
uint64_t offset = m_buckets.offsets.access(super_kmer_id);
auto [_, contig_end] = m_buckets.offset_to_id(offset, m_k);
(void)_;
bit_vector_iterator<kmer_t> bv_it(m_buckets.strings, kmer_t::bits_per_char * offset);
uint64_t window_size = std::min<uint64_t>(m_k - m_m + 1, contig_end - offset - m_k + 1);
kmer_t prev_minimizer = constants::invalid_uint64;
std::vector<std::string> prev_labels = {};

unsigned num_kmers = 0;
unsigned num_col = 0;
for (uint64_t w = 0; w != window_size; ++w) {
kmer_t kmer = bv_it.read_and_advance_by_char(m_k);
auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed);

//check if superkmer has ended
if (prev_minimizer != constants::invalid_uint64 and minimizer != prev_minimizer) {
break;
}
prev_minimizer = minimizer;
num_kmers++;

// get labels and compare them to previous ones
std::string kmer_str(m_k,'_');
util::uint_kmer_to_string(kmer, &kmer_str[0], m_k);
std::vector<std::string> labels = get_annotation_labels(kmer_str);

if(prev_labels.size() == 0){
prev_labels = labels;
} else if(!equal(labels, prev_labels)){
num_col++;
}
}
color_lock.lock();
color_changes_superkmer[super_kmer_id] = num_col;
color_lock.unlock();
kmer_lock.lock();
kmer_per_superkmer[super_kmer_id] = num_kmers;
kmer_lock.unlock();
}
}
std::cout << "DONE" << std::endl;

std::cout<<"writing kmer and color stats to files...\n";
write_vec(color_changes_superkmer,"final_color_stats.txt");
write_vec(kmer_per_superkmer,"final_kmer_stats.txt");
}

} // namespace sshash