Skip to content

Commit 065646e

Browse files
authored
Merge pull request #216 from stemangiola/dev
Dev
2 parents 820b0b8 + 14974e4 commit 065646e

10 files changed

+475
-28
lines changed

DESCRIPTION

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ Suggests:
7070
functional,
7171
survminer,
7272
tidySummarizedExperiment,
73-
markdown
73+
markdown,
74+
uwot
7475
VignetteBuilder:
7576
knitr
7677
RdMacros:

R/functions.R

Lines changed: 125 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1825,7 +1825,7 @@ we suggest to partition the dataset for sample clusters.
18251825

18261826
}
18271827

1828-
#' Get principal component information to a tibble using tSNE
1828+
#' Get tSNE
18291829
#'
18301830
#' @keywords internal
18311831
#' @noRd
@@ -1939,6 +1939,130 @@ get_reduced_dimensions_TSNE_bulk <-
19391939

19401940
}
19411941

1942+
#' Get UMAP
1943+
#'
1944+
#' @keywords internal
1945+
#'
1946+
#' @import dplyr
1947+
#' @import tidyr
1948+
#' @import tibble
1949+
#' @importFrom rlang :=
1950+
#' @importFrom stats setNames
1951+
#' @importFrom utils install.packages
1952+
#'
1953+
#' @param .data A tibble
1954+
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
1955+
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
1956+
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
1957+
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
1958+
#' @param top An integer. How many top genes to select
1959+
#' @param of_samples A boolean
1960+
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
1961+
#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
1962+
#' @param ... Further parameters passed to the function uwot
1963+
#'
1964+
#' @return A tibble with additional columns
1965+
#'
1966+
get_reduced_dimensions_UMAP_bulk <-
1967+
function(.data,
1968+
.element = NULL,
1969+
.feature = NULL,
1970+
1971+
.abundance = NULL,
1972+
.dims = 2,
1973+
top = 500,
1974+
of_samples = TRUE,
1975+
log_transform = TRUE,
1976+
scale = TRUE,
1977+
calculate_for_pca_dimensions = 20,
1978+
...) {
1979+
1980+
if(!is.null(calculate_for_pca_dimensions) & (
1981+
!is(calculate_for_pca_dimensions, "numeric") |
1982+
length(calculate_for_pca_dimensions) > 1
1983+
))
1984+
stop("tidybulk says: the argument calculate_for_pca_dimensions should be NULL or an integer of size 1")
1985+
1986+
# Comply with CRAN NOTES
1987+
. = NULL
1988+
1989+
# Get column names
1990+
.element = enquo(.element)
1991+
.feature = enquo(.feature)
1992+
.abundance = enquo(.abundance)
1993+
1994+
# Evaluate ...
1995+
arguments <- list(...)
1996+
# if (!"check_duplicates" %in% names(arguments))
1997+
# arguments = arguments %>% c(check_duplicates = FALSE)
1998+
if (!"dims" %in% names(arguments))
1999+
arguments = arguments %>% c(n_components = .dims)
2000+
if (!"init" %in% names(arguments))
2001+
arguments = arguments %>% c(init = "spca")
2002+
2003+
# Check if package is installed, otherwise install
2004+
if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) {
2005+
message("tidybulk says: Installing uwot")
2006+
install.packages("uwot", repos = "https://cloud.r-project.org")
2007+
}
2008+
2009+
df_source =
2010+
.data %>%
2011+
2012+
# Filter NA symbol
2013+
filter(!!.feature %>% is.na %>% not()) %>%
2014+
2015+
# Prepare data frame
2016+
distinct(!!.feature,!!.element,!!.abundance) %>%
2017+
2018+
# Check if data rectangular
2019+
when(
2020+
check_if_data_rectangular(., !!.element,!!.feature,!!.abundance) ~
2021+
eliminate_sparse_transcripts(., !!.feature),
2022+
~ (.)
2023+
) %>%
2024+
2025+
# Check if log transform is needed
2026+
when(log_transform ~ dplyr::mutate(., !!.abundance := !!.abundance %>% log1p), ~ (.)) %>%
2027+
2028+
# Filter most variable genes
2029+
keep_variable_transcripts(!!.element,!!.feature,!!.abundance, top)
2030+
2031+
# Calculate based on PCA
2032+
if(!is.null(calculate_for_pca_dimensions))
2033+
df_UMAP =
2034+
df_source %>%
2035+
reduce_dimensions(
2036+
!!.element,!!.feature,!!.abundance,
2037+
method="PCA",
2038+
.dims = calculate_for_pca_dimensions,
2039+
action="get",
2040+
scale = scale
2041+
) %>%
2042+
suppressMessages() %>%
2043+
as_matrix(rownames = quo_name(.element))
2044+
2045+
# Calculate based on all features
2046+
else
2047+
df_UMAP =
2048+
df_source %>%
2049+
spread(!!.feature,!!.abundance) %>%
2050+
as_matrix(rownames = quo_name(.element))
2051+
2052+
do.call(uwot::tumap, c(list(df_UMAP), arguments)) %>%
2053+
as_tibble(.name_repair = "minimal") %>%
2054+
setNames(c("UMAP1", "UMAP2")) %>%
2055+
2056+
# add element name
2057+
dplyr::mutate(!!.element := df_UMAP %>% rownames) %>%
2058+
select(!!.element, everything()) %>%
2059+
2060+
# Attach attributes
2061+
reattach_internals(.data) %>%
2062+
memorise_methods_used("uwot")
2063+
2064+
}
2065+
19422066
#' Get rotated dimensions of two principal components or MDS dimension of choice, of an angle
19432067
#'
19442068
#' @keywords internal

R/functions_SE.R

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -361,17 +361,16 @@ get_reduced_dimensions_TSNE_bulk_SE <-
361361

362362
# Set perprexity to not be too high
363363
if (!"perplexity" %in% names(arguments))
364-
arguments = arguments %>% c(perplexity = ((
365-
.data %>% distinct(!!.element) %>% nrow %>% sum(-1)
366-
) / 3 / 2) %>% floor() %>% min(30))
364+
arguments = arguments %>% c(perplexity = ((
365+
.data %>% ncol() %>% sum(-1)
366+
) / 3 / 2) %>% floor() %>% min(30))
367367

368368
# If not enough samples stop
369369
if (arguments$perplexity <= 2)
370370
stop("tidybulk says: You don't have enough samples to run tSNE")
371371

372372
# Calculate the most variable genes, from plotMDS Limma
373-
tsne_obj =
374-
do.call(Rtsne::Rtsne, c(list(t(.data)), arguments))
373+
tsne_obj = do.call(Rtsne::Rtsne, c(list(t(.data)), arguments))
375374

376375

377376

@@ -389,6 +388,93 @@ get_reduced_dimensions_TSNE_bulk_SE <-
389388

390389
}
391390

391+
#' Get UMAP
392+
#'
393+
#' @keywords internal
394+
#'
395+
#' @import dplyr
396+
#' @import tidyr
397+
#' @import tibble
398+
#' @importFrom rlang :=
399+
#' @importFrom stats setNames
400+
#' @importFrom utils install.packages
401+
#'
402+
#' @param .data A tibble
403+
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
404+
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
405+
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
406+
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
407+
#' @param top An integer. How many top genes to select
408+
#' @param of_samples A boolean
409+
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
410+
#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
411+
#' @param ... Further parameters passed to the function uwot
412+
#'
413+
#' @return A tibble with additional columns
414+
#'
415+
get_reduced_dimensions_UMAP_bulk_SE <-
416+
function(.data,
417+
.dims = 2,
418+
top = 500,
419+
of_samples = TRUE,
420+
log_transform = TRUE,
421+
scale = NULL, # This is only a dummy argument for making it compatibble with PCA
422+
calculate_for_pca_dimensions = 20,
423+
...) {
424+
# Comply with CRAN NOTES
425+
. = NULL
426+
427+
# To avoid dplyr complications
428+
429+
# Evaluate ...
430+
arguments <- list(...)
431+
# if (!"check_duplicates" %in% names(arguments))
432+
# arguments = arguments %>% c(check_duplicates = FALSE)
433+
if (!"dims" %in% names(arguments))
434+
arguments = arguments %>% c(n_components = .dims)
435+
if (!"init" %in% names(arguments))
436+
arguments = arguments %>% c(init = "spca")
437+
438+
439+
# Check if package is installed, otherwise install
440+
if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) {
441+
message("tidybulk says: Installing uwot")
442+
install.packages("uwot", repos = "https://cloud.r-project.org")
443+
}
444+
445+
446+
# Calculate based on PCA
447+
if(!is.null(calculate_for_pca_dimensions))
448+
df_UMAP =
449+
.data %>%
450+
451+
t() %>%
452+
453+
# Calculate principal components
454+
prcomp(scale = scale) %$%
455+
456+
# Parse the PCA results to a tibble
457+
x %>%
458+
.[,1:calculate_for_pca_dimensions]
459+
460+
# Calculate based on all features
461+
else
462+
df_UMAP = .data
463+
464+
umap_obj = do.call(uwot::tumap, c(list(df_UMAP), arguments))
465+
466+
list(
467+
raw_result = umap_obj,
468+
result = umap_obj %>%
469+
as_tibble(.name_repair = "minimal") %>%
470+
setNames(c("UMAP1", "UMAP2")) %>%
471+
472+
# add element name
473+
dplyr::mutate(sample = !!.data %>% colnames) %>%
474+
select(-sample)
475+
)
476+
477+
}
392478

393479
counts_scaled_exist_SE = function(.data){
394480

R/methods.R

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,23 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements)
725725
#' Underlying method for tSNE:
726726
#' Rtsne::Rtsne(data, ...)
727727
#'
728+
#' Underlying method for UMAP:
729+
#'
730+
#' df_source =
731+
#' .data %>%
732+
#'
733+
#' # Filter NA symbol
734+
#' filter(!!.feature %>% is.na %>% not()) %>%
735+
#'
736+
#' # Prepare data frame
737+
#' distinct(!!.feature,!!.element,!!.abundance) %>%
738+
#'
739+
#' # Filter most variable genes
740+
#' keep_variable_transcripts(top) %>%
741+
#' reduce_dimensions(method="PCA", .dims = calculate_for_pca_dimensions, action="get" ) %>%
742+
#' as_matrix(rownames = quo_name(.element)) %>%
743+
#' uwot::tumap(...)
744+
#'
728745
#'
729746
#' @return A tbl object with additional columns for the reduced dimensions
730747
#'
@@ -814,7 +831,7 @@ setGeneric("reduce_dimensions", function(.data,
814831
) %>%
815832

816833
when(
817-
method == "MDS" ~ get_reduced_dimensions_MDS_bulk(.,
834+
tolower(method) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk(.,
818835
.abundance = !!.abundance,
819836
.dims = .dims,
820837
.element = !!.element,
@@ -824,7 +841,7 @@ setGeneric("reduce_dimensions", function(.data,
824841
log_transform = log_transform,
825842
...
826843
),
827-
method == "PCA" ~ get_reduced_dimensions_PCA_bulk(.,
844+
tolower(method) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk(.,
828845
.abundance = !!.abundance,
829846
.dims = .dims,
830847
.element = !!.element,
@@ -835,7 +852,7 @@ setGeneric("reduce_dimensions", function(.data,
835852
scale = scale,
836853
...
837854
),
838-
method == "tSNE" ~ get_reduced_dimensions_TSNE_bulk(.,
855+
tolower(method) == tolower("tSNE") ~ get_reduced_dimensions_TSNE_bulk(.,
839856
.abundance = !!.abundance,
840857
.dims = .dims,
841858
.element = !!.element,
@@ -845,6 +862,17 @@ setGeneric("reduce_dimensions", function(.data,
845862
log_transform = log_transform,
846863
...
847864
),
865+
tolower(method) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk(.,
866+
.abundance = !!.abundance,
867+
.dims = .dims,
868+
.element = !!.element,
869+
.feature = !!.feature,
870+
top = top,
871+
of_samples = of_samples,
872+
log_transform = log_transform,
873+
scale = scale,
874+
...
875+
),
848876
TRUE ~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"")
849877
)
850878

R/methods_SE.R

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -350,10 +350,11 @@ setMethod("cluster_elements",
350350
my_reduction_function =
351351
method %>%
352352
when(
353-
(.) == "MDS" ~ get_reduced_dimensions_MDS_bulk_SE,
354-
(.) == "PCA" ~ get_reduced_dimensions_PCA_bulk_SE,
355-
(.) == "tSNE" ~ get_reduced_dimensions_TSNE_bulk_SE,
356-
~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"")
353+
tolower(.) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk_SE,
354+
tolower(.) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk_SE,
355+
tolower(.) == tolower("tSNE") ~ get_reduced_dimensions_TSNE_bulk_SE,
356+
tolower(.) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk_SE,
357+
~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ")
357358
)
358359

359360
# Both dataframe and raw result object are returned
@@ -378,10 +379,11 @@ setMethod("cluster_elements",
378379

379380
# Add bibliography
380381
when(
381-
method == "MDS" ~ memorise_methods_used(., "limma"),
382-
method == "PCA" ~ memorise_methods_used(., "stats"),
383-
method == "tSNE" ~ memorise_methods_used(., "rtsne"),
384-
~ stop("tidybulk says: the only supported methods are \"kmeans\" or \"SNN\" ")
382+
tolower(method) == tolower("MDS") ~ memorise_methods_used(., "limma"),
383+
tolower(method) == tolower("PCA") ~ memorise_methods_used(., "stats"),
384+
tolower(method) == tolower("tSNE") ~ memorise_methods_used(., "rtsne"),
385+
tolower(method) == tolower("UMAP") ~ memorise_methods_used(., "uwot"),
386+
~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ")
385387
) %>%
386388

387389
# Attach edgeR for keep variable filtering

0 commit comments

Comments
 (0)