@@ -42,6 +42,7 @@ void get_include_set_table(min_arguments const & args, std::filesystem::path con
4242 }
4343}
4444
45+ // Check if file has fasta format to estimate cutoffs.
4546inline bool check_for_fasta_format (std::vector<std::string> const & valid_extensions, std::string const & file_path)
4647{
4748
@@ -68,7 +69,7 @@ inline bool check_for_fasta_format(std::vector<std::string> const & valid_extens
6869// Determine cutoff for one experiment
6970uint8_t calculate_cutoff (std::filesystem::path sequence_file, int samples)
7071{
71- // Cutoff according to Mantis paper, divided by two because we store expression levels and
72+ // Cutoff according to Mantis paper, divided by two because we store expression thresholds and
7273 // -1 because we use "<" and not "<="
7374 uint16_t const default_cutoff{24 };
7475 uint8_t cutoff{default_cutoff};
@@ -123,7 +124,8 @@ void fill_hash_table(min_arguments const & args,
123124 hash_table[minHash] = cutoff_table[minHash] + 1 ;
124125 cutoff_table.erase (minHash);
125126 }
126- // If none of the above, increase count in cutoff table.
127+ // If none of the above, increase count in cutoff table. Cutoff Table increases RAM usage by storing
128+ // minimisers with a low occurrence in a smaller hash table.
127129 else
128130 {
129131 cutoff_table[minHash]++;
@@ -320,13 +322,13 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector<double> &
320322 }
321323}
322324
323- // Calculate expression levels and sizes
325+ // Calculate expression thresholds and sizes
324326void get_expression_thresholds (uint8_t const number_expression_thresholds,
325327 robin_hood::unordered_node_map<uint64_t , uint16_t > const & hash_table,
326328 std::vector<uint16_t > & expression_thresholds, std::vector<uint64_t > & sizes,
327329 robin_hood::unordered_set<uint64_t > const & genome, bool all = true )
328330{
329- // Calculate expression levels by taking median recursively
331+ // Calculate expression thresholds by taking median recursively
330332 std::vector<uint16_t > counts;
331333 for (auto && elem : hash_table)
332334 {
@@ -354,21 +356,23 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
354356 prev_pos = prev_pos + counts.size ()/dev;
355357 dev = dev*2 ;
356358
359+ // If expression does not change compared to previous one, do not store it again as an expression threshold.
357360 if ((exp - prev_exp) > 1 )
358361 {
359362 expression_thresholds.push_back (exp);
360363 sizes.push_back (prev_pos);
361-
362364 }
363365
364366 prev_exp = exp;
365367 }
368+ // In case not all levels have a threshold, give the last levels a maximal threshold, which can not be met by any minimiser.
366369 while (expression_thresholds.size () < number_expression_thresholds)
367370 expression_thresholds.push_back (max_elem + 1 );
368371 counts.clear ();
369372}
370373
371- // Estimate the file size for every expression level, necessary when samplewise=false
374+ // Estimate the file size for every expression level, necessary when samplewise=false, because then it is completely
375+ // unclear how many minimisers are to be stored per file.
372376void get_filsize_per_expression_level (std::filesystem::path filename, uint8_t const number_expression_thresholds,
373377 std::vector<uint16_t > const & expression_thresholds, std::vector<uint64_t > & sizes,
374378 robin_hood::unordered_set<uint64_t > const & genome, bool all = true )
@@ -398,6 +402,8 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
398402 fin.read ((char *)&minimiser_count, sizeof (minimiser_count));
399403 if (all | genome.contains (minimiser))
400404 {
405+ // Find the first expression threshold that is greater than the minimiser occurrence; the minimiser is counted
406+ // for the level directly before that threshold.
401407 auto p = std::upper_bound (expression_thresholds.begin (), expression_thresholds.end (), minimiser_count);
402408 if (p != expression_thresholds.begin ())
403409 sizes[(p-expression_thresholds.begin ())-1 ]++;
@@ -469,7 +475,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
469475 else
470476 {
471477 // Estimate sizes on filesize, assuming every byte translates to one letter (which is obviously not true,
472- // because ids contain letters as well), so size might be overestimated
478+ // because ids contain letters as well), so size might be overestimated. TODO: Find a better estimation!
473479 unsigned file_iterator = std::accumulate (minimiser_args.samples .begin (), minimiser_args.samples .begin () + i, 0 );
474480
475481 // Determine cutoffs
@@ -515,7 +521,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
515521 }
516522
517523 std::ofstream outfile_fpr;
518- outfile_fpr.open (std::string{ibf_args.path_out } + " IBF_FPRs.fprs" );
524+ outfile_fpr.open (std::string{ibf_args.path_out } + " IBF_FPRs.fprs" ); // File to store actual false positive rates per experiment.
519525 // Create IBFs
520526 std::vector<seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>> ibfs;
521527 for (unsigned j = 0 ; j < ibf_args.number_expression_thresholds ; j++)
@@ -539,7 +545,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
539545
540546 for (unsigned i = 0 ; i < num_files; i++)
541547 {
542- double fpr = std::pow (1.0 - std::pow (1.0 -(1.0 /size), num_hash*sizes[i][j]), num_hash);// std::pow((1.0-(std::exp((-1.0*num_hash*sizes[i][j])/size))), num_hash);
548+ double fpr = std::pow (1.0 - std::pow (1.0 -(1.0 /size), num_hash*sizes[i][j]), num_hash);
543549 outfile_fpr << fpr << " " ;
544550 }
545551 outfile_fpr << " \n " ;
@@ -637,6 +643,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
637643
638644 }
639645
646+ // Store all expression thresholds per level.
640647 if constexpr (samplewise)
641648 {
642649 std::ofstream outfile;
0 commit comments