Merge pull request #117 from MitraDarja/improve_doc

MitraDarja · web-flow · commit 4d4ce9b4647d · 2021-09-20T12:18:04.000-05:00
Improve doc
diff --git a/src/estimate.cpp b/src/estimate.cpp
@@ -28,8 +28,10 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
                seqan3::dna4_vector const seq, std::vector<uint32_t> & prev_counts,
                exp_t const & expressions, uint16_t const k, std::vector<double> const fprs)
 {
+    // Check, if one expression threshold for all or individual thresholds
     static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;
 
+    // Count minimisers in ibf of current level
     std::vector<uint32_t> counter;
     counter.assign(ibf.bin_count(), 0);
     uint64_t minimiser_length = 0;
@@ -41,15 +43,20 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
         ++minimiser_length;
     }
 
+    // Defines, where the median should be
     float minimiser_pos = minimiser_length/2.0;
 
+    // Check every experiment by going over the number of bins in the ibf.
     for(int j = 0; j < counter.size(); j++)
     {
         // Correction by subtracting the expected number of false positives
         counter[j] = std::max((double) 0.0, (double) ((counter[j]-(minimiser_length*fprs[j]))/(1.0-fprs[j])));
+        // Check whether the previously seen minimisers plus the minimisers found at the current level are equal to or
+        // greater than minimiser_pos, which gives the median position.
+        // If an estimation has already taken place (estimations_i[j]!=0), a second estimation is not performed.
         if (((prev_counts[j] + counter[j]) >= minimiser_pos) & (estimations_i[j] == 0))
         {
-            // If there was nothing previous
+            // If there was no previous level, because we are looking at the last level.
             if constexpr(last_exp)
             {
                 if constexpr (multiple_expressions)
@@ -68,15 +75,15 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
                    estimations_i[j] = std::max(expressions[k][j] * 1.0, expressions[k+1][j] - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (expressions[k+1][j]-expressions[k][j])));
                else
                    estimations_i[j] = std::max(expressions * 1.0, k - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (k-expressions)));
-               // Make sure, every transcript is only estimated once
-               prev_counts[j] = 0;
             }
 
+            // Perform normalization by dividing through the threshold of the first level. Only works, if multiple expressions were used.
             if constexpr (normalization & multiple_expressions)
                 estimations_i[j] = estimations_i[j]/expressions[0][j];
         }
         else
         {
+            // If not found at this level, add to previous count.
             prev_counts[j] = prev_counts[j] + counter[j];
         }
     }
@@ -160,7 +167,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
     // Make sure expression levels are sorted.
     sort(args.expression_thresholds.begin(), args.expression_thresholds.end());
 
-    // Initialse last expression
+    // Initialise last expression.
     if constexpr (samplewise)
         load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(args.number_expression_thresholds-1));
     else
@@ -199,6 +206,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
 
     for (int j = args.number_expression_thresholds - 2; j >= 0; j--)
     {
+        // Load the next ibf that should be considered.
         if constexpr (samplewise)
             load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(j));
         else
@@ -223,6 +231,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
             prev_expression = args.expression_thresholds[j];
     }
 
+    // Write output file.
     std::ofstream outfile;
     outfile.open(std::string{file_out});
     for (int i = 0; i <  seqs.size(); ++i)
diff --git a/src/ibf.cpp b/src/ibf.cpp
@@ -42,6 +42,7 @@ void get_include_set_table(min_arguments const & args, std::filesystem::path con
     }
 }
 
+// Check if file has fasta format to estimate cutoffs.
 inline bool check_for_fasta_format(std::vector<std::string> const & valid_extensions, std::string const & file_path)
 {
 
@@ -68,7 +69,7 @@ inline bool check_for_fasta_format(std::vector<std::string> const & valid_extens
 // Determine cutoff for one experiment
 uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples)
 {
-    // Cutoff according to Mantis paper, divided by two because we store expression levels and
+    // Cutoff according to Mantis paper, divided by two because we store expression thresholds and
     // -1 because we use "<" and not "<="
     uint16_t const default_cutoff{24};
     uint8_t cutoff{default_cutoff};
@@ -123,7 +124,8 @@ void fill_hash_table(min_arguments const & args,
                     hash_table[minHash] = cutoff_table[minHash] + 1;
                     cutoff_table.erase(minHash);
                 }
-                // If none of the above, increase count in cutoff table.
+                // If none of the above, increase count in cutoff table. Cutoff Table increases RAM usage by storing
+                // minimisers with a low occurrence in a smaller hash table.
                 else
                 {
                     cutoff_table[minHash]++;
@@ -320,13 +322,13 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector<double> &
     }
 }
 
-// Calculate expression levels and sizes
+// Calculate expression thresholds and sizes
 void get_expression_thresholds(uint8_t const number_expression_thresholds,
                            robin_hood::unordered_node_map<uint64_t, uint16_t> const & hash_table,
                            std::vector<uint16_t> & expression_thresholds, std::vector<uint64_t> & sizes,
                            robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
 {
-    // Calculate expression levels by taking median recursively
+    // Calculate expression thresholds by taking median recursively
     std::vector<uint16_t> counts;
     for (auto && elem : hash_table)
     {
@@ -354,21 +356,23 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
         prev_pos = prev_pos + counts.size()/dev;
         dev = dev*2;
 
+        // If expression does not change compared to previous one, do not store it again as an expression threshold.
         if ((exp - prev_exp) > 1)
         {
             expression_thresholds.push_back(exp);
             sizes.push_back(prev_pos);
-
         }
 
         prev_exp = exp;
     }
+    // In case not all levels have a threshold, give the last levels a maximal threshold, which can not be met by any minimiser.
     while(expression_thresholds.size() < number_expression_thresholds)
         expression_thresholds.push_back(max_elem + 1);
     counts.clear();
 }
 
-// Estimate the file size for every expression level, necessary when samplewise=false
+// Estimate the file size for every expression level, necessary when samplewise=false, because then it is completely
+// unclear how many minimisers have to be stored per file.
 void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t const number_expression_thresholds,
                                       std::vector<uint16_t> const & expression_thresholds, std::vector<uint64_t> & sizes,
                                       robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
@@ -398,6 +402,8 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
         fin.read((char*)&minimiser_count, sizeof(minimiser_count));
         if (all | genome.contains(minimiser))
         {
+            // Find the first level whose threshold is greater than the minimiser occurrence; the minimiser is going
+            // to be stored in the level before that one.
             auto p = std::upper_bound(expression_thresholds.begin(), expression_thresholds.end(), minimiser_count);
             if(p != expression_thresholds.begin())
                 sizes[(p-expression_thresholds.begin())-1]++;
@@ -469,7 +475,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
         else
         {
             // Estimate sizes on filesize, assuming every byte translates to one letter (which is obviously not true,
-            // because ids contain letters as well), so size might be overestimated
+            // because ids contain letters as well), so size might be overestimated. TODO: Find a better estimation!
             unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0);
 
             // Determine cutoffs
@@ -515,7 +521,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
     }
 
     std::ofstream outfile_fpr;
-    outfile_fpr.open(std::string{ibf_args.path_out} +  "IBF_FPRs.fprs");
+    outfile_fpr.open(std::string{ibf_args.path_out} +  "IBF_FPRs.fprs"); // File to store actual false positive rates per experiment.
     // Create IBFs
     std::vector<seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>> ibfs;
     for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++)
@@ -539,7 +545,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
 
         for (unsigned i = 0; i < num_files; i++)
         {
-            double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);//std::pow((1.0-(std::exp((-1.0*num_hash*sizes[i][j])/size))), num_hash);
+            double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);
             outfile_fpr << fpr << " ";
         }
         outfile_fpr << "\n";
@@ -637,6 +643,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
 
     }
 
+    // Store all expression thresholds per level.
     if constexpr(samplewise)
     {
         std::ofstream outfile;

Original file line number	Diff line number	Diff line change
`@@ -28,8 +28,10 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint`
`28`	`28`	`seqan3::dna4_vector const seq, std::vector<uint32_t> & prev_counts,`
`29`	`29`	`exp_t const & expressions, uint16_t const k, std::vector<double> const fprs)`
`30`	`30`	`{`
	`31`	`+ // Check, if one expression threshold for all or individual thresholds`
`31`	`32`	`static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;`
`32`	`33`
	`34`	`+ // Count minimisers in ibf of current level`
`33`	`35`	`std::vector<uint32_t> counter;`
`34`	`36`	`counter.assign(ibf.bin_count(), 0);`
`35`	`37`	`uint64_t minimiser_length = 0;`
`@@ -41,15 +43,20 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint`
`41`	`43`	`++minimiser_length;`
`42`	`44`	`}`
`43`	`45`
	`46`	`+ // Defines, where the median should be`
`44`	`47`	`float minimiser_pos = minimiser_length/2.0;`
`45`	`48`
	`49`	`+ // Check every experiment by going over the number of bins in the ibf.`
`46`	`50`	`for(int j = 0; j < counter.size(); j++)`
`47`	`51`	`{`
`48`	`52`	`// Correction by subtracting the expected number of false positives`
`49`	`53`	`counter[j] = std::max((double) 0.0, (double) ((counter[j]-(minimiser_length*fprs[j]))/(1.0-fprs[j])));`
	`54`	`+ // Check whether the previously seen minimisers plus the minimisers found at the current level are equal to or`
	`55`	`+ // greater than minimiser_pos, which gives the median position.`
	`56`	`+ // If an estimation has already taken place (estimations_i[j]!=0), a second estimation is not performed.`
`50`	`57`	`if (((prev_counts[j] + counter[j]) >= minimiser_pos) & (estimations_i[j] == 0))`
`51`	`58`	`{`
`52`		`- // If there was nothing previous`
	`59`	`+ // If there was no previous level, because we are looking at the last level.`
`53`	`60`	`if constexpr(last_exp)`
`54`	`61`	`{`
`55`	`62`	`if constexpr (multiple_expressions)`
`@@ -68,15 +75,15 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint`
`68`	`75`	`estimations_i[j] = std::max(expressions[k][j] * 1.0, expressions[k+1][j] - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (expressions[k+1][j]-expressions[k][j])));`
`69`	`76`	`else`
`70`	`77`	`estimations_i[j] = std::max(expressions * 1.0, k - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (k-expressions)));`
`71`		`- // Make sure, every transcript is only estimated once`
`72`		`- prev_counts[j] = 0;`
`73`	`78`	`}`
`74`	`79`
	`80`	`+ // Perform normalization by dividing through the threshold of the first level. Only works, if multiple expressions were used.`
`75`	`81`	`if constexpr (normalization & multiple_expressions)`
`76`	`82`	`estimations_i[j] = estimations_i[j]/expressions[0][j];`
`77`	`83`	`}`
`78`	`84`	`else`
`79`	`85`	`{`
	`86`	`+ // If not found at this level, add to previous count.`
`80`	`87`	`prev_counts[j] = prev_counts[j] + counter[j];`
`81`	`88`	`}`
`82`	`89`	`}`
`@@ -160,7 +167,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat`
`160`	`167`	`// Make sure expression levels are sorted.`
`161`	`168`	`sort(args.expression_thresholds.begin(), args.expression_thresholds.end());`
`162`	`169`
`163`		`- // Initialse last expression`
	`170`	`+ // Initialise last expression.`
`164`	`171`	`if constexpr (samplewise)`
`165`	`172`	`load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(args.number_expression_thresholds-1));`
`166`	`173`	`else`
`@@ -199,6 +206,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat`
`199`	`206`
`200`	`207`	`for (int j = args.number_expression_thresholds - 2; j >= 0; j--)`
`201`	`208`	`{`
	`209`	`+ // Load the next ibf that should be considered.`
`202`	`210`	`if constexpr (samplewise)`
`203`	`211`	`load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(j));`
`204`	`212`	`else`
`@@ -223,6 +231,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat`
`223`	`231`	`prev_expression = args.expression_thresholds[j];`
`224`	`232`	`}`
`225`	`233`
	`234`	`+ // Write output file.`
`226`	`235`	`std::ofstream outfile;`
`227`	`236`	`outfile.open(std::string{file_out});`
`228`	`237`	`for (int i = 0; i < seqs.size(); ++i)`
Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,7 @@ void get_include_set_table(min_arguments const & args, std::filesystem::path con`
`42`	`42`	`}`
`43`	`43`	`}`
`44`	`44`
	`45`	`+// Check if file has fasta format to estimate cutoffs.`
`45`	`46`	`inline bool check_for_fasta_format(std::vector<std::string> const & valid_extensions, std::string const & file_path)`
`46`	`47`	`{`
`47`	`48`
`@@ -68,7 +69,7 @@ inline bool check_for_fasta_format(std::vector<std::string> const & valid_extens`
`68`	`69`	`// Determine cutoff for one experiment`
`69`	`70`	`uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples)`
`70`	`71`	`{`
`71`		`- // Cutoff according to Mantis paper, divided by two because we store expression levels and`
	`72`	`+ // Cutoff according to Mantis paper, divided by two because we store expression thresholds and`
`72`	`73`	`// -1 because we use "<" and not "<="`
`73`	`74`	`uint16_t const default_cutoff{24};`
`74`	`75`	`uint8_t cutoff{default_cutoff};`
`@@ -123,7 +124,8 @@ void fill_hash_table(min_arguments const & args,`
`123`	`124`	`hash_table[minHash] = cutoff_table[minHash] + 1;`
`124`	`125`	`cutoff_table.erase(minHash);`
`125`	`126`	`}`
`126`		`- // If none of the above, increase count in cutoff table.`
	`127`	`+ // If none of the above, increase count in cutoff table. Cutoff Table increases RAM usage by storing`
	`128`	`+ // minimisers with a low occurrence in a smaller hash table.`
`127`	`129`	`else`
`128`	`130`	`{`
`129`	`131`	`cutoff_table[minHash]++;`
`@@ -320,13 +322,13 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector<double> &`
`320`	`322`	`}`
`321`	`323`	`}`
`322`	`324`
`323`		`-// Calculate expression levels and sizes`
	`325`	`+// Calculate expression thresholds and sizes`
`324`	`326`	`void get_expression_thresholds(uint8_t const number_expression_thresholds,`
`325`	`327`	`robin_hood::unordered_node_map<uint64_t, uint16_t> const & hash_table,`
`326`	`328`	`std::vector<uint16_t> & expression_thresholds, std::vector<uint64_t> & sizes,`
`327`	`329`	`robin_hood::unordered_set<uint64_t> const & genome, bool all = true)`
`328`	`330`	`{`
`329`		`- // Calculate expression levels by taking median recursively`
	`331`	`+ // Calculate expression thresholds by taking median recursively`
`330`	`332`	`std::vector<uint16_t> counts;`
`331`	`333`	`for (auto && elem : hash_table)`
`332`	`334`	`{`
`@@ -354,21 +356,23 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,`
`354`	`356`	`prev_pos = prev_pos + counts.size()/dev;`
`355`	`357`	`dev = dev*2;`
`356`	`358`
	`359`	`+ // If expression does not change compared to previous one, do not store it again as an expression threshold.`
`357`	`360`	`if ((exp - prev_exp) > 1)`
`358`	`361`	`{`
`359`	`362`	`expression_thresholds.push_back(exp);`
`360`	`363`	`sizes.push_back(prev_pos);`
`361`		`-`
`362`	`364`	`}`
`363`	`365`
`364`	`366`	`prev_exp = exp;`
`365`	`367`	`}`
	`368`	`+ // In case not all levels have a threshold, give the last levels a maximal threshold, which can not be met by any minimiser.`
`366`	`369`	`while(expression_thresholds.size() < number_expression_thresholds)`
`367`	`370`	`expression_thresholds.push_back(max_elem + 1);`
`368`	`371`	`counts.clear();`
`369`	`372`	`}`
`370`	`373`
`371`		`-// Estimate the file size for every expression level, necessary when samplewise=false`
	`374`	`+// Estimate the file size for every expression level, necessary when samplewise=false, because then it is completely`
	`375`	`+// unclear how many minimisers have to be stored per file.`
`372`	`376`	`void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t const number_expression_thresholds,`
`373`	`377`	`std::vector<uint16_t> const & expression_thresholds, std::vector<uint64_t> & sizes,`
`374`	`378`	`robin_hood::unordered_set<uint64_t> const & genome, bool all = true)`
`@@ -398,6 +402,8 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co`
`398`	`402`	`fin.read((char*)&minimiser_count, sizeof(minimiser_count));`
`399`	`403`	`if (all \| genome.contains(minimiser))`
`400`	`404`	`{`
	`405`	`+ // Find the first level whose threshold is greater than the minimiser occurrence; the minimiser is going`
	`406`	`+ // to be stored in the level before that one.`
`401`	`407`	`auto p = std::upper_bound(expression_thresholds.begin(), expression_thresholds.end(), minimiser_count);`
`402`	`408`	`if(p != expression_thresholds.begin())`
`403`	`409`	`sizes[(p-expression_thresholds.begin())-1]++;`
`@@ -469,7 +475,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,`
`469`	`475`	`else`
`470`	`476`	`{`
`471`	`477`	`// Estimate sizes on filesize, assuming every byte translates to one letter (which is obviously not true,`
`472`		`- // because ids contain letters as well), so size might be overestimated`
	`478`	`+ // because ids contain letters as well), so size might be overestimated. TODO: Find a better estimation!`
`473`	`479`	`unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0);`
`474`	`480`
`475`	`481`	`// Determine cutoffs`
`@@ -515,7 +521,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,`
`515`	`521`	`}`
`516`	`522`
`517`	`523`	`std::ofstream outfile_fpr;`
`518`		`- outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs");`
	`524`	`+ outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); // File to store actual false positive rates per experiment.`
`519`	`525`	`// Create IBFs`
`520`	`526`	`std::vector<seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>> ibfs;`
`521`	`527`	`for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++)`
`@@ -539,7 +545,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,`
`539`	`545`
`540`	`546`	`for (unsigned i = 0; i < num_files; i++)`
`541`	`547`	`{`
`542`		`- double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);//std::pow((1.0-(std::exp((-1.0*num_hash*sizes[i][j])/size))), num_hash);`
	`548`	`+ double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);`
`543`	`549`	`outfile_fpr << fpr << " ";`
`544`	`550`	`}`
`545`	`551`	`outfile_fpr << "\n";`
`@@ -637,6 +643,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,`
`637`	`643`
`638`	`644`	`}`
`639`	`645`
	`646`	`+ // Store all expression thresholds per level.`
`640`	`647`	`if constexpr(samplewise)`
`641`	`648`	`{`
`642`	`649`	`std::ofstream outfile;`