Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
217a06c
remove scran dependency
soroorh Nov 30, 2023
f6dbb50
add relax_min_obs to remove min 4 obs requirement (issue #4), change …
soroorh Feb 28, 2025
d8c5a0e
fix #8
soroorh Feb 28, 2025
bc544bf
fixes #5
soroorh Feb 28, 2025
a73af45
fix y_critical_obs definition
soroorh Mar 3, 2025
7e1944a
fix y_critical_obs definition #8
soroorh Mar 3, 2025
31f7b60
Merge branch '1.17.0_devel' of https://github.com/DavisLaboratory/msI…
soroorh Mar 3, 2025
ec0e8f2
revert back to the original implementation in the paper, using the eb…
soroorh Mar 9, 2025
beaa445
revert back to the original implementation in the paper, using the eb…
soroorh Mar 9, 2025
664bd48
Merge branch '1.17.0_devel' of https://github.com/DavisLaboratory/msI…
soroorh Mar 9, 2025
897a7b1
fix error critical obs issue #8
soroorh Mar 15, 2025
35c4a29
update docs and fix conditional in msImpute
soroorh Mar 15, 2025
43313bb
change to mixture of normals
soroorh Mar 18, 2025
3040942
update docs, make setting seed optional for a deterministic lower nor…
soroorh Mar 18, 2025
a586a93
fix row orders #8
soroorh Mar 18, 2025
abbeec5
dirichlet prior over missing value class mixture weights #8
soroorh Mar 19, 2025
a6975c9
replace group with design to accomodate complex MV mechanism due to e…
soroorh Mar 19, 2025
4d5bfc1
update references to the journal paper
soroorh Mar 19, 2025
e864a04
update vignette, depricate computeStructuralMetrics
soroorh Mar 20, 2025
20933da
depricate group and use design #8
soroorh Mar 20, 2025
de5a822
update docs
soroorh Mar 20, 2025
03c2c1a
upadte readme
soroorh Mar 22, 2025
1c812c7
upadte readme
soroorh Mar 22, 2025
2f73b29
upadte version to bioc dev version
soroorh Mar 22, 2025
cda7f4d
Merge branch 'master' into 1.17.0_devel
soroorh Mar 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: msImpute
Type: Package
Title: Imputation of label-free mass spectrometry peptides
Version: 1.7.0
Version: 1.17.0
Authors@R:
person(given = "Soroor",
family = "Hediyeh-zadeh",
Expand All @@ -15,14 +15,14 @@ Description: MsImpute is a package for imputation of peptide intensity in proteo
MNAR ("v2-mnar"), or by Peptide Identity Propagation (PIP).
Depends: R (> 4.1.0)
SystemRequirements: python
Imports: softImpute, methods, stats, graphics, pdist, reticulate,
scran, data.table, FNN, matrixStats, limma, mvtnorm,
Imports: softImpute, methods, stats, graphics, pdist, LaplacesDemon,
data.table, FNN, matrixStats, limma, mvtnorm,
tidyr, dplyr
License: GPL (>=2)
Encoding: UTF-8
LazyData: true
BugReports: https://github.com/DavisLaboratory/msImpute/issues
RoxygenNote: 7.1.1
RoxygenNote: 7.3.2
Suggests: BiocStyle, knitr, rmarkdown, ComplexHeatmap, imputeLCMD
VignetteBuilder: knitr
biocViews: MassSpectrometry, Proteomics, Software
3 changes: 0 additions & 3 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
export(CPD)
export(KNC)
export(KNN)
export(computeStructuralMetrics)
export(evidenceToMatrix)
export(findVariableFeatures)
export(msImpute)
export(mspip)
export(plotCV2)
Expand All @@ -22,7 +20,6 @@ importFrom(limma,loessFit)
importFrom(matrixStats,rowSds)
importFrom(methods,is)
importFrom(methods,new)
importFrom(scran,modelGeneVar)
importFrom(stats,aggregate)
importFrom(stats,complete.cases)
importFrom(stats,cor)
Expand Down
10 changes: 5 additions & 5 deletions R/computeStructuralMetrics.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#' Metrics for the assessment of post-imputation structural preservation
#' Metrics for the assessment of post-imputation structural preservation
#'
#' For an imputed dataset, it computes within phenotype/experimental condition similarity
#' DEPRECATED. For an imputed dataset, it computes within phenotype/experimental condition similarity
#' (i.e. preservation of local structures), between phenotype distances
#' (preservation of global structures), and the Gromov-Wasserstein (GW)
#' distance between original (source) and imputed data.
Expand Down Expand Up @@ -52,7 +52,7 @@
#' group <- as.factor(gsub("_[1234]", "", colnames(y)))
#' computeStructuralMetrics(y, group, y=NULL)
#'
#' @export
#'
computeStructuralMetrics <- function(x, group=NULL, y = NULL, k=2){
if(!is.null(group)){
out <- list(withinness = log(withinness(x, group)),
Expand Down Expand Up @@ -114,8 +114,8 @@ gromov_wasserstein <- function(x, y, k, min.mean = 0.1){


cat("Computing GW distance using k=", k, "Principal Components\n")
reticulate::source_python(system.file("python", "gw.py", package = "msImpute"))
return(gw(C1,C2, ncol(x)))
# reticulate::source_python(system.file("python", "gw.py", package = "msImpute"))
# return(gw(C1,C2, ncol(x)))
}


Expand Down
4 changes: 2 additions & 2 deletions R/evidenceToMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ evidenceToMatrix <- function(evidence, run_id = "Raw.file", peptide_id = "Peptid
evidence_colnames <- tolower(colnames(evidence))

# genes <- evidence[,match(tolower(meta_attrs), evidence_colnames)]
genes <- evidence[, evidence_colnames %in% tolower(meta_attrs)]
genes <- genes[!duplicated(genes),]
genes <- evidence[, evidence_colnames %in% tolower(meta_attrs),drop=FALSE]
genes <- genes[!duplicated(genes),,drop=FALSE]
genes <- genes[match(rownames(E), genes[,peptide_id]),]


Expand Down
30 changes: 0 additions & 30 deletions R/findVariableFeatures.R

This file was deleted.

122 changes: 89 additions & 33 deletions R/msImpute.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@
#'
#'
#' @param y Numeric matrix giving log-intensity where missing values are denoted by NA. Rows are peptides, columns are samples.
#' @param method character. Allowed values are \code{"v2"} for \code{msImputev2} imputation (enhanced version) for MAR.
#' \code{method="v2-mnar"} (modified low-rank approx for MNAR), and \code{"v1"} initial release of \code{msImpute}
#' @param group character or factor vector of length \code{ncol(y)}
#' @param a numeric. the weight parameter. default to 0.2. Weights the MAR-imputed distribution in the imputation scheme.
#' @param method Character. Allowed values are \code{"v2"} for \code{msImputev2} imputation (enhanced version) for MAR.
#' \code{method="v2-mnar"} (modified low-rank approx for MNAR), and \code{"v1"} initial release of \code{msImpute}.
#' @param group Character or factor vector of length \code{ncol(y)}. DEPRECATED. Please specify the \code{design} argument.
#' @param design Object from model.matrix(); A zero-intercept design matrix (see example).
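#' @param alpha Numeric. DEPRECATED. The weight parameter; weights the MAR-imputed distribution in the imputation scheme. Defaults to \code{NULL} (previously 0.2).
#' @param relax_min_obs Logical. If \code{TRUE} (default), peptides with fewer than 4 non-missing measurements are allowed; they are treated as MNAR and imputed separately by Gaussian down-shift. If \code{FALSE}, an error is raised when such peptides are present.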
#' @param rank.max Numeric. This restricts the rank of the solution. is set to min(dim(\code{y})-1) by default in "v1".
#' @param lambda Numeric. Nuclear-norm regularization parameter. Controls the low-rank property of the solution
#' to the matrix completion problem. By default, it is determined at the scaling step. If set to zero
Expand All @@ -44,48 +45,77 @@
#' @param final.svd Logical. Shall final SVD object be saved?
#' The solutions to the matrix completion problems are computed from U, D and V components of final SVD.
#' Applicable to "v1" only.
#' @param biScale_maxit number of iteration for the scaling algorithm to converge . See \code{scaleData}. You may need to change this
#' @param biScale_maxit Number of iterations for the scaling algorithm to converge. See \code{scaleData}. You may need to change this
#' parameter only if you're running \code{method=v1}. Applicable to "v1" only.
#' @param gauss_width numeric. The width parameter of the Gaussian distribution to impute the MNAR peptides (features). This the width parameter in the down-shift imputation method.
#' @param gauss_shift numeric. The shift parameter of the Gaussian distribution to impute the MNAR peptides (features). This the width parameter in the down-shift imputation method.
#' @param gauss_width Numeric. The width parameter of the Gaussian distribution to impute the MNAR peptides (features). This is the width parameter in the down-shift imputation method.
#' @param gauss_shift Numeric. The shift parameter of the Gaussian distribution to impute the MNAR peptides (features). This is the shift parameter in the down-shift imputation method.
#' @param use_seed Logical. Makes random draw from the lower Normal component of the mixture (corresponding to imputation by down-shift) deterministic, so that results are reproducible.
#' @return Missing values are imputed by low-rank approximation of the input matrix. If input is a numeric matrix,
#' a numeric matrix of identical dimensions is returned.
#'
#'
#' @examples
#' data(pxd010943)
#' y <- log2(data.matrix(pxd010943))
#' group <- gsub("_[1234]","", colnames(y))
#' yimp <- msImpute(y, method="v2-mnar", group=group)
#' group <- as.factor(gsub("_[1234]","", colnames(y)))
#' design <- model.matrix(~0+group)
#' yimp <- msImpute(y, method="v2-mnar", design=design, rank.max=2)
#' @seealso selectFeatures
#' @author Soroor Hediyeh-zadeh
#' @references
#' Hastie, T., Mazumder, R., Lee, J. D., & Zadeh, R. (2015). Matrix completion and low-rank SVD via fast alternating least squares. The Journal of Machine Learning Research, 16(1), 3367-3402.
#' @references
#' Hediyeh-zadeh, S., Webb, A. I., & Davis, M. J. (2020). MSImpute: Imputation of label-free mass spectrometry peptides by low-rank approximation. bioRxiv.
#' Hediyeh-Zadeh, S., Webb, A. I., & Davis, M. J. (2023). MsImpute: Estimation of missing peptide intensity data in label-free quantitative mass spectrometry. Molecular & Cellular Proteomics, 22(8).
#' @importFrom methods is
#' @export
msImpute <- function(y, method=c("v2-mnar", "v2", "v1"),
group = NULL,
a = 0.2,
design = NULL,
alpha = NULL,
relax_min_obs=TRUE,
rank.max = NULL, lambda = NULL, thresh = 1e-05,
maxit = 100, trace.it = FALSE, warm.start = NULL,
final.svd = TRUE, biScale_maxit=20, gauss_width = 0.3, gauss_shift = 1.8) {
final.svd = TRUE, biScale_maxit=20, gauss_width = 0.3,
gauss_shift = 1.8, use_seed = TRUE) {

method <- match.arg(method, c("v2-mnar","v2", "v1"))
if (use_seed){
set.seed(123)
}

if (is.null(rownames(y))){
stop("Input row names are null. Please assign row names")
}else{
roworder <- rownames(y)
}


if(any(is.nan(y) | is.infinite(y))) stop("Inf or NaN values encountered.")
if(any(rowSums(!is.na(y)) <= 3)) stop("Peptides with excessive NAs are detected. Please revisit your fitering step. At least 4 non-missing measurements are required for any peptide.")

if(!relax_min_obs & any(rowSums(!is.na(y)) <= 3)) {

stop("Peptides with excessive NAs are detected. Please revisit your fitering step (at least 4 non-missing measurements are required for any peptide) or set relax_min_obs=TRUE.")
}
else if(relax_min_obs & any(rowSums(!is.na(y)) <= 3)){
critical_obs <- which(rowSums(!is.na(y)) <= 3)
message("Features with less than 4 non-missing measurements detected. These will be treated as MNAR.")
}else{
critical_obs <- NULL
}

if(any(y < 0, na.rm = TRUE)){
warning("Negative values encountered in imputed data. Please consider revising filtering and/or normalisation steps.")
}



if(!is.null(critical_obs)){
y_critical_obs <- y[critical_obs,, drop=FALSE]
y <- y[-critical_obs,, drop=FALSE]
}

if(method=="v1"){
message(paste("Running msImpute version", method))

yimp <- scaleData(y, maxit = biScale_maxit)
yimp <- msImputev1(yimp,
rank.max = rank.max, lambda = lambda, thresh = thresh,
Expand All @@ -100,17 +130,28 @@ msImpute <- function(y, method=c("v2-mnar", "v2", "v1"),
yimp <- msImputev1(y, rank.max = rank.max , lambda = estimateLambda(y, rank = rank.max)) #
if (method == "v2-mnar"){
message(paste("Compute barycenter of MAR and NMAR distributions", method))
if (is.null(group)) stop("Please specify the 'group' argument. This is required for the 'v2-mnar' method.")
if (!is.null(group) & is.null(design)) stop("'group' argument is deprecated. Please specify the 'design' argument.")
if (is.null(group) & is.null(design)) stop("Please specify the 'design' argument. This is required for the 'v2-mnar' method.")
ygauss <- gaussimpute(y, width = gauss_width, shift = gauss_shift)
yimp <- l2bary(y=y, ygauss = ygauss, yerank = yimp, group = group, a=a)
# yimp <- l2bary(y=y, ygauss = ygauss, yerank = yimp, group = group, a=alpha)
yimp <- l2bary(y=y, ygauss = ygauss, yerank = yimp, design = design, a=alpha)

}


}

}

yimp[!is.na(y)] <- y[!is.na(y)]
if (!is.null(critical_obs)){
yimp_critical_obs <- gaussimpute(y_critical_obs, width = gauss_width, shift = gauss_shift)
yimp_critical_obs[!is.na(y_critical_obs)] <- y_critical_obs[!is.na(y_critical_obs)]
yimp <- rbind(yimp,yimp_critical_obs)
yimp <- yimp[match(roworder, rownames(yimp)),]
}



return(yimp)


Expand Down Expand Up @@ -177,7 +218,7 @@ eigenpdf <- function(y, rank=NULL){
#' @importFrom stats var sd
#' @keywords internal
estimateS0 <- function(y, rank=NULL){
set.seed(123)
# set.seed(123)
s0 <- vector(length = 100L)
for(i in seq_len(100)){
s0[i] <- var(eigenpdf(y, rank=rank))
Expand All @@ -198,23 +239,38 @@ estimateLambda <- function(y, rank=NULL) mean(matrixStats::colSds(y, na.rm = TRU

#' @importFrom stats quantile
#' @keywords internal
l2bary <- function(y, ygauss, yerank, group, a=0.2){
l2bary <- function(y, ygauss, yerank, group, design = NULL, a=0.2){

pepVars <- matrixStats::rowSds(y, na.rm = TRUE)
varq75 <- quantile(pepVars, p = 0.75, na.rm=TRUE)
pepSds <- matrixStats::rowSds(y, na.rm = TRUE)
pepMeans <- rowMeans(y, na.rm = TRUE)
pepCVs <- pepSds/pepMeans
CV_cutoff <- min(0.2, median(pepCVs))
varq75 <- quantile(pepSds, p = 0.75, na.rm=TRUE)
#varq75 <- mean(pepVars)
EBM <- ebm(y, group)

# if entropy is nan and variance is high, it is most likely detection limit missing
w1 <- ifelse(is.nan(EBM) & (pepVars > varq75), 1-a, a)
w2 <- 1-w1

yl2 <- list()
for(j in colnames(y)){
yl2[[j]] <- rowSums(cbind(w1*ygauss[,j], w2*yerank[,j]))
}

yl2 <- do.call(cbind, yl2)
# EBM <- ebm(y, group)
mv_design <- apply(design, 2, FUN=function(x) ebm(y, as.factor(x)))
dirich_alpha_1 <- rowSums(!is.nan(mv_design))
dirich_alpha_2 <- ncol(mv_design) - dirich_alpha_1
dirich_alpha <- cbind(dirich_alpha_1, dirich_alpha_2)


# if entropy is nan and variance is low, it is most likely detection limit missing
# w1 <- ifelse(is.nan(EBM) & (pepCVs < CV_cutoff), 1-a, a)
# w1 <- ifelse(is.nan(EBM), 1-a, a)
# w2 <- 1-w1

w <- apply(dirich_alpha, 1, FUN= function(alpha) LaplacesDemon::rdirichlet(1, alpha))
w <- t(w)
w1 <- w[,2]
w2 <- w[,1]

# yl2 <- list()
# for(j in colnames(y)){
# yl2[[j]] <- rowSums(cbind(w1*ygauss[,j], w2*yerank[,j]))
# }

# yl2 <- do.call(cbind, yl2)
yl2 <- w1*ygauss + w2*yerank
yl2[!is.na(y)] <- y[!is.na(y)]
return(yl2)

Expand Down
4 changes: 2 additions & 2 deletions R/mspip.R
Original file line number Diff line number Diff line change
Expand Up @@ -547,8 +547,8 @@ mspip <- function(path_txt, k = 10, thresh = 0, skip_weights = TRUE, tims_ms = F
evidence_colnames <- tolower(colnames(evidence))

# genes <- evidence[,match(tolower(meta_anchors), evidence_colnames)]
genes <- evidence[, evidence_colnames %in% tolower(meta_anchors)]
genes <- genes[!duplicated(genes),]
genes <- evidence[, evidence_colnames %in% tolower(meta_anchors), drop=FALSE]
genes <- genes[!duplicated(genes),, drop=FALSE]
evidence_pip <- cbind(evidence_pip, genes[match(evidence_pip$PeptideID, genes$PeptideID),
grep("PeptideID", colnames(genes), invert=TRUE)])
}
Expand Down
2 changes: 1 addition & 1 deletion R/scaleData.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
#' @references
#' Hastie, T., Mazumder, R., Lee, J. D., & Zadeh, R. (2015). Matrix completion and low-rank SVD via fast alternating least squares. The Journal of Machine Learning Research, 16(1), 3367-3402.
#' @references
#' Hediyeh-zadeh, S., Webb, A. I., & Davis, M. J. (2020). MSImpute: Imputation of label-free mass spectrometry peptides by low-rank approximation. bioRxiv.
#' Hediyeh-Zadeh, S., Webb, A. I., & Davis, M. J. (2023). MsImpute: Estimation of missing peptide intensity data in label-free quantitative mass spectrometry. Molecular & Cellular Proteomics, 22(8).
#' @importFrom methods is
#' @export
scaleData <- function(object, maxit = 20, thresh = 1e-09, row.center = TRUE, row.scale =TRUE,
Expand Down
2 changes: 1 addition & 1 deletion R/selectFeatures.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
#' @author Soroor Hediyeh-zadeh
#' @seealso msImpute
#' @references
#' Hediyeh-zadeh, S., Webb, A. I., & Davis, M. J. (2020). MSImpute: Imputation of label-free mass spectrometry peptides by low-rank approximation. bioRxiv.
#' Hediyeh-Zadeh, S., Webb, A. I., & Davis, M. J. (2023). MsImpute: Estimation of missing peptide intensity data in label-free quantitative mass spectrometry. Molecular & Cellular Proteomics, 22(8).
#' @importFrom stats lm residuals
#' @importFrom methods is
#' @importFrom graphics abline plot
Expand Down
Loading
Loading