Skip to content

Commit 4d4ce9b

Browse files
authored
Merge pull request #117 from MitraDarja/improve_doc
Improve doc
2 parents f5b53e5 + 41fbed0 commit 4d4ce9b

File tree

2 files changed

+29
-13
lines changed

2 files changed

+29
-13
lines changed

src/estimate.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,10 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
2828
seqan3::dna4_vector const seq, std::vector<uint32_t> & prev_counts,
2929
exp_t const & expressions, uint16_t const k, std::vector<double> const fprs)
3030
{
31+
// Check, if one expression threshold for all or individual thresholds
3132
static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;
3233

34+
// Count minimisers in ibf of current level
3335
std::vector<uint32_t> counter;
3436
counter.assign(ibf.bin_count(), 0);
3537
uint64_t minimiser_length = 0;
@@ -41,15 +43,20 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
4143
++minimiser_length;
4244
}
4345

46+
// Defines, where the median should be
4447
float minimiser_pos = minimiser_length/2.0;
4548

49+
// Check every experiment by going over the number of bins in the ibf.
4650
for(int j = 0; j < counter.size(); j++)
4751
{
4852
// Correction by subtracting the expected number of false positives
4953
counter[j] = std::max((double) 0.0, (double) ((counter[j]-(minimiser_length*fprs[j]))/(1.0-fprs[j])));
54+
// Check whether the previously seen minimisers plus the minimisers found at the current level are equal to or
55+
// greater than minimiser_pos, which gives the median position.
56+
// If an estimation has already taken place (estimations_i[j]!=0), a second estimation is not performed.
5057
if (((prev_counts[j] + counter[j]) >= minimiser_pos) & (estimations_i[j] == 0))
5158
{
52-
// If there was nothing previous
59+
// If there was no previous level, because we are looking at the last level.
5360
if constexpr(last_exp)
5461
{
5562
if constexpr (multiple_expressions)
@@ -68,15 +75,15 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
6875
estimations_i[j] = std::max(expressions[k][j] * 1.0, expressions[k+1][j] - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (expressions[k+1][j]-expressions[k][j])));
6976
else
7077
estimations_i[j] = std::max(expressions * 1.0, k - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (k-expressions)));
71-
// Make sure, every transcript is only estimated once
72-
prev_counts[j] = 0;
7378
}
7479

80+
// Perform normalization by dividing through the threshold of the first level. Only works, if multiple expressions were used.
7581
if constexpr (normalization & multiple_expressions)
7682
estimations_i[j] = estimations_i[j]/expressions[0][j];
7783
}
7884
else
7985
{
86+
// If not found at this level, add to previous count.
8087
prev_counts[j] = prev_counts[j] + counter[j];
8188
}
8289
}
@@ -160,7 +167,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
160167
// Make sure expression levels are sorted.
161168
sort(args.expression_thresholds.begin(), args.expression_thresholds.end());
162169

163-
// Initialse last expression
170+
// Initialise last expression.
164171
if constexpr (samplewise)
165172
load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(args.number_expression_thresholds-1));
166173
else
@@ -199,6 +206,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
199206

200207
for (int j = args.number_expression_thresholds - 2; j >= 0; j--)
201208
{
209+
// Load the next ibf that should be considered.
202210
if constexpr (samplewise)
203211
load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(j));
204212
else
@@ -223,6 +231,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
223231
prev_expression = args.expression_thresholds[j];
224232
}
225233

234+
// Write output file.
226235
std::ofstream outfile;
227236
outfile.open(std::string{file_out});
228237
for (int i = 0; i < seqs.size(); ++i)

src/ibf.cpp

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ void get_include_set_table(min_arguments const & args, std::filesystem::path con
4242
}
4343
}
4444

45+
// Check if file has fasta format to estimate cutoffs.
4546
inline bool check_for_fasta_format(std::vector<std::string> const & valid_extensions, std::string const & file_path)
4647
{
4748

@@ -68,7 +69,7 @@ inline bool check_for_fasta_format(std::vector<std::string> const & valid_extens
6869
// Determine cutoff for one experiment
6970
uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples)
7071
{
71-
// Cutoff according to Mantis paper, divided by two because we store expression levels and
72+
// Cutoff according to Mantis paper, divided by two because we store expression thresholds and
7273
// -1 because we use "<" and not "<="
7374
uint16_t const default_cutoff{24};
7475
uint8_t cutoff{default_cutoff};
@@ -123,7 +124,8 @@ void fill_hash_table(min_arguments const & args,
123124
hash_table[minHash] = cutoff_table[minHash] + 1;
124125
cutoff_table.erase(minHash);
125126
}
126-
// If none of the above, increase count in cutoff table.
127+
// If none of the above, increase count in cutoff table. Cutoff Table increases RAM usage by storing
128+
// minimisers with a low occurrence in a smaller hash table.
127129
else
128130
{
129131
cutoff_table[minHash]++;
@@ -320,13 +322,13 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector<double> &
320322
}
321323
}
322324

323-
// Calculate expression levels and sizes
325+
// Calculate expression thresholds and sizes
324326
void get_expression_thresholds(uint8_t const number_expression_thresholds,
325327
robin_hood::unordered_node_map<uint64_t, uint16_t> const & hash_table,
326328
std::vector<uint16_t> & expression_thresholds, std::vector<uint64_t> & sizes,
327329
robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
328330
{
329-
// Calculate expression levels by taking median recursively
331+
// Calculate expression thresholds by taking median recursively
330332
std::vector<uint16_t> counts;
331333
for (auto && elem : hash_table)
332334
{
@@ -354,21 +356,23 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
354356
prev_pos = prev_pos + counts.size()/dev;
355357
dev = dev*2;
356358

359+
// If expression does not change compared to previous one, do not store it again as an expression threshold.
357360
if ((exp - prev_exp) > 1)
358361
{
359362
expression_thresholds.push_back(exp);
360363
sizes.push_back(prev_pos);
361-
362364
}
363365

364366
prev_exp = exp;
365367
}
368+
// In case not all levels have a threshold, give the last levels a maximal threshold, which can not be met by any minimiser.
366369
while(expression_thresholds.size() < number_expression_thresholds)
367370
expression_thresholds.push_back(max_elem + 1);
368371
counts.clear();
369372
}
370373

371-
// Estimate the file size for every expression level, necessary when samplewise=false
374+
// Estimate the file size for every expression level, necessary when samplewise=false, because then it is completely
375+
// unclear how many minimisers have to be stored per file.
372376
void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t const number_expression_thresholds,
373377
std::vector<uint16_t> const & expression_thresholds, std::vector<uint64_t> & sizes,
374378
robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
@@ -398,6 +402,8 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
398402
fin.read((char*)&minimiser_count, sizeof(minimiser_count));
399403
if (all | genome.contains(minimiser))
400404
{
405+
// Find the first level whose threshold is greater than the minimiser occurrence; the minimiser is going to be
406+
// stored in the level before that one.
401407
auto p = std::upper_bound(expression_thresholds.begin(), expression_thresholds.end(), minimiser_count);
402408
if(p != expression_thresholds.begin())
403409
sizes[(p-expression_thresholds.begin())-1]++;
@@ -469,7 +475,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
469475
else
470476
{
471477
// Estimate sizes on filesize, assuming every byte translates to one letter (which is obviously not true,
472-
// because ids contain letters as well), so size might be overestimated
478+
// because ids contain letters as well), so size might be overestimated. TODO: Find a better estimation!
473479
unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0);
474480

475481
// Determine cutoffs
@@ -515,7 +521,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
515521
}
516522

517523
std::ofstream outfile_fpr;
518-
outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs");
524+
outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); // File to store actual false positive rates per experiment.
519525
// Create IBFs
520526
std::vector<seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>> ibfs;
521527
for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++)
@@ -539,7 +545,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
539545

540546
for (unsigned i = 0; i < num_files; i++)
541547
{
542-
double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);//std::pow((1.0-(std::exp((-1.0*num_hash*sizes[i][j])/size))), num_hash);
548+
double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);
543549
outfile_fpr << fpr << " ";
544550
}
545551
outfile_fpr << "\n";
@@ -637,6 +643,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
637643

638644
}
639645

646+
// Store all expression thresholds per level.
640647
if constexpr(samplewise)
641648
{
642649
std::ofstream outfile;

0 commit comments

Comments
 (0)