Skip to content

Commit 0297d7c

Browse files
authored
new error model (#2)
* update documentation * check if initialized to NaN * fix output bug * fix initialization bug * change false positive model * track variances * fix adaptive sampling bug * make eps scale with number of alleles * implement constrained sampling and remove exact sampling * remove complexity limit * update simulator to take allele_freq specification * fix bug in simulator * parameterize epsilon as draw from beta * missed semicolon * fix sampling bounds * update simulator and documentation
1 parent c5f9936 commit 0297d7c

27 files changed

+489
-306
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,7 @@ src/Makevars
2727
*.tmp
2828

2929
compile_flags.txt
30+
31+
manuscript/
32+
33+
*.rds

NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,12 @@ export(simulate_allele_frequencies)
1313
export(simulate_data)
1414
export(simulate_observed_genotype)
1515
export(simulate_sample_coi)
16+
export(simulate_sample_genotype)
1617
export(summarize_allele_freq_fn)
1718
export(summarize_allele_freqs)
1819
export(summarize_coi)
20+
export(summarize_epsilon_neg)
21+
export(summarize_epsilon_pos)
1922
export(summarize_he)
2023
importFrom(Rcpp,sourceCpp)
2124
importFrom(magrittr,"%>%")

R/mcmc.R

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,23 +17,23 @@
1717
#' discard as burnin
1818
#' @param samples Positive Integer. Number of samples to take
1919
#' after burnin
20-
#' @param complexity_limit Limit on the number of alleles possible
21-
#' before augmenting with a latent genetic state representation.
2220
#' @param verbose Logical indicating if progress is printed
23-
#' @param eps_pos_0 0-1 Numeric. Initial eps_pos value
21+
#' @param allele_freq_threshold 0-1 Numeric. Lowest allowed value for an
22+
#' allele frequency.
23+
#' @param eps_pos_0 Numeric. Initial eps_pos value
2424
#' @param eps_pos_var Numeric. Variance used in sampling eps_pos
25-
#' @param eps_pos_alpha Positive Numeric. Alpha parameter in
26-
#' Beta distribution for eps_pos prior
27-
#' @param eps_pos_beta Positive Numeric. Beta parameter in
28-
#' Beta distribution for eps_pos prior
29-
#' @param eps_neg_0 0-1 Numeric. Initial eps_neg value
25+
#' @param eps_pos_shape Positive Numeric. Shape parameter in
26+
#' Gamma distribution for eps_pos prior
27+
#' @param eps_pos_scale Positive Numeric. Scale parameter in
28+
#' Gamma distribution for eps_pos prior
29+
#' @param eps_neg_0 Numeric. Initial eps_neg value
3030
#' @param eps_neg_var Numeric. Variance used in sampling eps_neg
31-
#' @param eps_neg_alpha Positive Numeric. Alpha parameter in
32-
#' Beta distribution for eps_neg prior
33-
#' @param eps_neg_beta Positive Numeric. Beta parameter in
34-
#' Beta distribution for eps_neg prior
35-
#' @param max_eps_pos 0-1 Numeric. Maximum allowed value for eps_pos
36-
#' @param max_eps_neg 0-1 Numeric. Maximum allowed value for eps_neg
31+
#' @param eps_neg_shape Positive Numeric. Shape parameter in
32+
#' Gamma distribution for eps_neg prior
33+
#' @param eps_neg_scale Positive Numeric. Scale parameter in
34+
#' Gamma distribution for eps_neg prior
35+
#' @param max_eps_pos Numeric. Maximum allowed value for eps_pos
36+
#' @param max_eps_neg Numeric. Maximum allowed value for eps_neg
3737
#' @param max_coi Positive Numeric. Maximum allowed complexity of infection
3838
#' @param allele_freq_vars Positive Numeric. Variance used in sampling allele
3939
#' frequencies
@@ -47,21 +47,21 @@ run_mcmc <-
4747
thin = 1,
4848
burnin = 1e4,
4949
samples = 1e4,
50-
complexity_limit = 5,
5150
verbose = TRUE,
52-
eps_pos_0 = .01,
53-
eps_pos_var = .001,
54-
eps_pos_alpha = 1,
55-
eps_pos_beta = 99,
56-
eps_neg_0 = .05,
57-
eps_neg_var = .005,
58-
eps_neg_alpha = 5,
59-
eps_neg_beta = 95,
60-
max_eps_pos = .5,
61-
max_eps_neg = .5,
51+
allele_freq_threshold = 1e-5,
52+
eps_pos_0 = .005,
53+
eps_pos_var = 1,
54+
eps_pos_alpha = .5,
55+
eps_pos_beta = 99.5,
56+
eps_neg_0 = .005,
57+
eps_neg_var = 1,
58+
eps_neg_alpha = .5,
59+
eps_neg_beta = 99.5,
60+
max_eps_pos = 2,
61+
max_eps_neg = 2,
6262
max_coi = 20,
63-
allele_freq_vars = .1,
64-
adapt_allele_freq_vars = FALSE) {
63+
allele_freq_vars = 1,
64+
adapt_allele_freq_vars = TRUE) {
6565
args <- as.list(environment())
6666

6767
## if is_missing == FALSE, then generate a default FALSE matrix

R/simulator.R

Lines changed: 60 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ rdirichlet <- function(n, alpha) {
2525
#' @param num_loci total number of loci to draw
2626
simulate_allele_frequencies <- function(alpha, num_loci) {
2727
dists <- rdirichlet(num_loci, alpha)
28-
lapply(seq_len(num_loci), function(x) {
28+
sapply(seq_len(num_loci), function(x) {
2929
dists[x, ]
3030
})
3131
}
@@ -44,15 +44,32 @@ simulate_sample_coi <- function(num_samples, mean_coi) {
4444

4545
#' Simulate sample genotype
4646
#' @details Simulates sampling the genetics at a single locus given an allele
47-
#' frequency distribution and a vector of sample COIs
47+
#' frequency distribution and a vector of sample COIs
4848
#'
4949
#' @param sample_cois Numeric vector indicating the multiplicity of infection
50-
#' for each biological sample
50+
#' for each biological sample
5151
#' @param locus_allele_dist Allele frequencies -- simplex parameter of a
52-
#' multinomial distribution
53-
simulate_sample_genotype <- function(sample_cois, locus_allele_dist) {
52+
#' multinomial distribution
53+
#' @param internal_relatedness numeric 0-1 indicating the probability for a
54+
#' strain's allele to come from an existing lineage within host
55+
#' @export
56+
simulate_sample_genotype <- function(sample_cois, locus_allele_dist, internal_relatedness) {
5457
lapply(sample_cois, function(coi) {
55-
rmultinom(1, coi, locus_allele_dist)
58+
genotypes <- matrix(nrow = coi, ncol = length(locus_allele_dist))
59+
for (i in 1:coi) {
60+
if (i == 1) {
61+
genotypes[i,] = rmultinom(1, 1, locus_allele_dist)
62+
} else {
63+
sample_internal = as.logical(rbinom(1, 1, internal_relatedness))
64+
if (sample_internal) {
65+
genotypes[i,] <- genotypes[sample(1:(i - 1), 1),]
66+
} else {
67+
genotypes[i,] <- rmultinom(1, 1, locus_allele_dist)
68+
}
69+
}
70+
}
71+
g <- colSums(genotypes)
72+
g
5673
})
5774
}
5875

@@ -64,16 +81,27 @@ simulate_sample_genotype <- function(sample_cois, locus_allele_dist) {
6481
#'
6582
#' @param alleles A numeric vector representing the number of strains
6683
#' contributing each allele
67-
#' @param epsilon_pos false positive rate
68-
#' @param epsilon_neg false negative rate
84+
#' @param epsilon_pos expected number of false negatives
85+
#' @param epsilon_neg expected number of false positives
6986
simulate_observed_allele <- function(alleles, epsilon_pos, epsilon_neg) {
70-
sapply(alleles, function(allele) {
87+
positive_indices <- which(as.logical(alleles)) # True Positives
88+
negative_indices <- which(!as.logical(alleles)) # True Negatives
89+
90+
# scale eps to the number of alleles so that given a fixed COI, there is a fixed
91+
# number of expected false positives or negatives across loci of varying
92+
# diversity.
93+
eps_pos_prob = epsilon_pos / length(alleles)
94+
eps_neg_prob = epsilon_neg / length(alleles)
95+
96+
alleles <- sapply(alleles, function(allele) {
7197
if (allele > 0) {
72-
rbinom(1, 1, prob = 1 - epsilon_neg)
98+
rbinom(1, 1, prob = 1 - eps_neg_prob)
7399
} else {
74-
rbinom(1, 1, epsilon_pos)
100+
rbinom(1, 1, prob = eps_pos_prob)
75101
}
76102
})
103+
104+
return(alleles)
77105
}
78106

79107
#' Simulate observed genotypes
@@ -85,8 +113,8 @@ simulate_observed_allele <- function(alleles, epsilon_pos, epsilon_neg) {
85113
#'
86114
#' @param true_genotypes a list of numeric vectors that are input
87115
#' to sim_observed_allele
88-
#' @param epsilon_pos false positive rate
89-
#' @param epsilon_neg false negative rate
116+
#' @param epsilon_pos expected number of false positives
117+
#' @param epsilon_neg expected number of false negatives
90118
simulate_observed_genotype <- function(true_genotypes,
91119
epsilon_pos,
92120
epsilon_neg) {
@@ -103,28 +131,31 @@ simulate_observed_genotype <- function(true_genotypes,
103131
#' @param locus_freq_alphas List of alpha vectors to be used to simulate
104132
#' from a Dirichlet distribution to generate allele frequencies.
105133
#' @param num_samples Total number of biological samples to simulate
106-
#' @param epsilon_pos False positive rate, between 0 and 1
107-
#' @param epsilon_neg False negative rate, between 0 and 1
134+
#' @param epsilon_pos False positive rate, expected number of false positives
135+
#' @param epsilon_neg False negative rate, expected number of false negatives
136+
#' @param allele_freqs List of allele frequencies to be used instead of
137+
#' simulating allele frequencies
108138
#' @return Simulated data that is structured to go into the MCMC sampler
109139
#'
110140
simulate_data <- function(mean_coi,
111-
locus_freq_alphas,
112141
num_samples,
113142
epsilon_pos,
114-
epsilon_neg) {
115-
allele_freq_dists <- c()
116-
117-
for (alpha in locus_freq_alphas) {
118-
allele_freq_dists <- c(
119-
allele_freq_dists,
120-
simulate_allele_frequencies(alpha, 1)
121-
)
143+
epsilon_neg,
144+
locus_freq_alphas = NULL,
145+
allele_freqs = NULL,
146+
internal_relatedness = 0) {
147+
if(is.null(allele_freqs)) {
148+
allele_freqs <- list()
149+
for (i in 1:length(locus_freq_alphas)) {
150+
allele_freqs[[i]] <- simulate_allele_frequencies(locus_freq_alphas[[i]], 1)
151+
}
122152
}
123153

154+
124155
sample_cois <- simulate_sample_coi(num_samples, mean_coi)
125156

126-
true_sample_genotypes <- lapply(allele_freq_dists, function(dist) {
127-
simulate_sample_genotype(sample_cois, dist)
157+
true_sample_genotypes <- lapply(allele_freqs, function(dist) {
158+
simulate_sample_genotype(sample_cois, dist, internal_relatedness)
128159
})
129160

130161
observed_sample_genotypes <- lapply(
@@ -136,13 +167,14 @@ simulate_data <- function(mean_coi,
136167
list(
137168
data = observed_sample_genotypes,
138169
sample_ids = paste0("S", seq.int(1, num_samples)),
139-
loci = paste0("L", seq.int(1, length(locus_freq_alphas))),
140-
allele_freqs = allele_freq_dists,
170+
loci = paste0("L", seq.int(1, length(allele_freqs))),
171+
allele_freqs = allele_freqs,
141172
sample_cois = sample_cois,
142173
true_genotypes = true_sample_genotypes,
143174
input = list(
144175
mean_coi = mean_coi,
145176
locus_freq_alphas = locus_freq_alphas,
177+
allele_freqs = allele_freqs,
146178
num_samples = num_samples,
147179
epsilon_pos = epsilon_pos,
148180
epsilon_neg = epsilon_neg

R/summary.R

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,76 @@ summarize_coi <- function(mcmc_results, lower_quantile = .025,
122122
return(coi_data)
123123
}
124124

125+
#' Summarize epsilon_neg
126+
#'
127+
#' @details Summarize epsilon negative results from MCMC. Returns
128+
#' a dataframe that contains summaries of the posterior
129+
#' distribution of epsilon negative for each biological sample.
130+
#'
131+
#' @export
132+
#'
133+
#' @param mcmc_results Result of calling run_mcmc()
134+
#' @param lower_quantile The lower quantile of the posterior
135+
#' distribution to return
136+
#' @param upper_quantile The upper quantile of the posterior
137+
#' distribution to return
138+
summarize_epsilon_neg <- function(mcmc_results, lower_quantile = .025, upper_quantile = .975) {
139+
epsilon_neg <- mcmc_results$eps_neg
140+
post_eps_neg_lower <- sapply(epsilon_neg, function(x) {
141+
quantile(x, lower_quantile)
142+
})
143+
post_eps_neg_med <- sapply(epsilon_neg, function(x) {
144+
quantile(x, .5)
145+
})
146+
post_eps_neg_upper <- sapply(epsilon_neg, function(x) {
147+
quantile(x, upper_quantile)
148+
})
149+
post_eps_neg_mean <- sapply(epsilon_neg, mean)
150+
151+
eps_neg_data <- data.frame(
152+
sample_id = mcmc_results$args$sample_ids,
153+
post_eps_neg_lower, post_eps_neg_med, post_eps_neg_upper, post_eps_neg_mean
154+
)
155+
156+
return(eps_neg_data)
157+
}
158+
159+
160+
#' Summarize epsilon_pos
161+
#'
162+
#' @details Summarize epsilon positive results from MCMC. Returns
163+
#' a dataframe that contains summaries of the posterior
164+
#' distribution of epsilon positive for each biological sample.
165+
#'
166+
#' @export
167+
#'
168+
#' @param mcmc_results Result of calling run_mcmc()
169+
#' @param lower_quantile The lower quantile of the posterior
170+
#' distribution to return
171+
#' @param upper_quantile The upper quantile of the posterior
172+
#' distribution to return
173+
summarize_epsilon_pos <- function(mcmc_results, lower_quantile = .025, upper_quantile = .975) {
174+
epsilon_pos <- mcmc_results$eps_pos
175+
post_eps_pos_lower <- sapply(epsilon_pos, function(x) {
176+
quantile(x, lower_quantile)
177+
})
178+
post_eps_pos_med <- sapply(epsilon_pos, function(x) {
179+
quantile(x, .5)
180+
})
181+
post_eps_pos_upper <- sapply(epsilon_pos, function(x) {
182+
quantile(x, upper_quantile)
183+
})
184+
post_eps_pos_mean <- sapply(epsilon_pos, mean)
185+
186+
eps_pos_data <- data.frame(
187+
sample_id = mcmc_results$args$sample_ids,
188+
post_eps_pos_lower, post_eps_pos_med, post_eps_pos_upper, post_eps_pos_mean
189+
)
190+
191+
return(eps_pos_data)
192+
}
193+
194+
125195
#' Summarize Function of Allele Frequencies
126196
#'
127197
#' @details General function to summarize the posterior distribution of
@@ -226,7 +296,8 @@ summarize_allele_freqs <- function(mcmc_results,
226296
post_allele_freqs_lower = post_allele_freqs_lower,
227297
post_allele_freqs_med = post_allele_freqs_med,
228298
post_allele_freqs_upper = post_allele_freqs_upper,
229-
post_allele_freqs_mean = post_allele_freqs_mean
299+
post_allele_freqs_mean = post_allele_freqs_mean,
300+
num_alleles = num_alleles
230301
)
231302
}
232303
)

data-raw/mcmc_results.R

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,34 @@
1-
## ----simulate_data
1+
## ----load_settings
22
set.seed(17325)
33

4-
mean_moi <- 5
4+
mean_moi <- 4
55
num_biological_samples <- 100
6-
epsilon_pos <- .01
7-
epsilon_neg <- .03
6+
epsilon_pos <- .05
7+
epsilon_neg <- .05
88

99
# Generate the number of alleles at each locus
10-
allele_counts <- c(rep(5, 15), rep(10, 15), rep(25, 15), rep(50, 15))
10+
allele_counts <- c(rep(3, 15), rep(5, 15), rep(10, 15))
11+
1112

1213
# We'll use flat alpha vectors for our draws from the Dirichlet
1314
locus_freq_alphas <- lapply(allele_counts, function(allele) rep(1, allele))
1415

16+
## ----simulate_data
1517
simulated_data <- moire::simulate_data(
16-
mean_moi, locus_freq_alphas,
18+
mean_moi,
1719
num_biological_samples,
18-
epsilon_pos, epsilon_neg
20+
epsilon_pos, epsilon_neg,
21+
locus_freq_alphas = locus_freq_alphas
1922
)
2023

2124

2225
## ----run_mcmc
23-
burnin <- 1e3
26+
burnin <- 1e4
2427
num_samples <- 1e3
2528

2629
mcmc_results <- moire::run_mcmc(
2730
simulated_data$data, simulated_data$sample_ids, simulated_data$loci,
28-
verbose = T, burnin = burnin, samples = num_samples, thin = 1,
29-
eps_pos_alpha = 1, eps_pos_beta = 99, complexity_limit = 5,
30-
eps_neg_alpha = 10, eps_neg_beta = 990, allele_freq_vars = 1,
31+
verbose = T, burnin = burnin, samples = num_samples,
3132
adapt_allele_freq_vars = TRUE
3233
)
3334

data/mcmc_results.rda

-6.7 MB
Binary file not shown.

data/simulated_data.rda

-30.7 KB
Binary file not shown.

0 commit comments

Comments
 (0)