5 diffusionMaps

5.1 Introduction

In this tutorial, we explore Diffusion Maps and PHATE, two powerful nonlinear dimensionality reduction techniques that excel in capturing the geometric structure of complex datasets, particularly in life sciences. They are especially useful for continuous processes, such as cell development or biological trajectories.

5.2 Diffusion Maps

Diffusion maps leverage the relationship between heat diffusion and a random walk Markov chain on the dataset. The basic idea is that in a random walk, you’re more likely to step to a nearby point than one farther away.
The connectivity between two points is defined as the probability of transitioning from one to the other in one step, typically via a kernel function. This defines the local geometry and leads to construction of a transition matrix (M) for the Markov chain.
Raising M to higher powers simulates a diffusion process over time, revealing the geometric structure at increasing scales. The parameter t acts as both a time and scale parameter.
The diffusion distance between two points at time t reflects their similarity based on how many short paths connect them. It is robust to noise and integrates all indirect connections—making it suitable for inference tasks.
This distance can be computed from the eigenvectors and eigenvalues of the diffusion matrix.
Because the eigenvalues decay quickly (spectral decay), keeping only the first k eigenvectors and their eigenvalues is enough to embed the data in a k-dimensional space. The resulting diffusion map is a nonlinear embedding where Euclidean distances approximate diffusion distances, capturing the intrinsic geometry of the original data.

5.3 Data

library(tidyverse)
library(mixOmics)

# load data
# Stack the train and test partitions of the miRNA block into one
# samples x features matrix, and concatenate the matching subtype labels.
data(breast.TCGA)
x <- rbind(breast.TCGA$data.train$mirna, breast.TCGA$data.test$mirna)
labels <- c(breast.TCGA$data.train$subtype, breast.TCGA$data.test$subtype)

# every sample (row of x) must have exactly one subtype label
stopifnot(nrow(x) == length(labels))

# scale data (scale() centers every column and divides it by its sd)
x_scaled <- scale(x)

# preview data
# data dimensions
x |> dim() |> print() # dimensions of the data matrix (samples x features)
## [1] 220 184
labels |> as.factor() |> summary() # samples per group
## Basal  Her2  LumA 
##    66    44   110

# box plots
# Stack the two box plots vertically; par() returns the previous settings so
# the graphics layout can be restored afterwards instead of leaking into
# later base-graphics plots.
old_par <- par(mfrow = c(2, 1))
boxplot(t(x), main = "distribution per sample", las = 2, cex.axis = 0.7,
        col = rainbow(10), outline = FALSE, cex.main = 0.8)
boxplot(x, main = "distribution per miRNA", las = 2, cex.axis = 0.7,
        col = rainbow(10), outline = FALSE, cex.main = 0.8)
par(old_par)

# perform PCA on the centered, unscaled data matrix
pca <- prcomp(x, center = TRUE, scale. = FALSE)

# proportion of the total variance carried by each principal component
eigs <- pca$sdev^2
var_exp <- eigs / sum(eigs)

# scores of the first five PCs, one row per sample
res_pca <- as.data.frame(pca$x[, 1:5]) |>
    rownames_to_column("sample") |>
    as_tibble()

res_pca_loadings <- pca$rotation

# Axis label for principal component `i`, including its explained variance.
# sprintf() avoids the stray spaces that paste() inserts between its pieces.
pc_axis_label <- function(i) {
    sprintf("PC%d (Var: %.2f%%)", i, var_exp[i] * 100)
}

# show PCA scores plots
# `labels` is in the same row order as `x`, and therefore as `res_pca`
res_pca |>
    ggplot(aes(x = PC1, y = PC2, color = labels)) +
    geom_point() +
    labs(
        title = "PCA of miRNA data",
        x = pc_axis_label(1L),
        y = pc_axis_label(2L)
    ) +
    theme_minimal() +
    scale_color_manual(
        values = c("Basal" = "#FF0000", "Her2" = "#00FF00", "LumA" = "#0000FF")
    ) +
    theme(legend.title = element_blank())

# show top 10 loadings along PC1 (ranked by absolute loading)
res_pca_loadings |>
    as.data.frame() |>
    rownames_to_column("miRNA") |>
    arrange(desc(abs(PC1))) |>
    head(10) |>
    ggplot(aes(x = reorder(miRNA, PC1), y = PC1)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    coord_flip() +
    labs(title = "Top 10 miRNAs contributing to PC1", x = "miRNA", y = "Loading") +
    theme_minimal()

5.4 Run diffusionMap

library(destiny) # main package for diffusion maps

# Key parameters of DiffusionMap()
# - sigma:  Diffusion scale parameter of the Gaussian kernel
# - k:  Number of neighbors
# - n_eigs: Number of diffusion components to calculate
# - density_norm: Density normalization (helps manifold discovery)
#   (spelled `density.norm` in older destiny releases - TODO confirm against
#   the installed version)

# Fit the diffusion map on the scaled matrix; sigma = "local" requests an
# adaptive kernel width.
dm <- DiffusionMap(data = x_scaled, sigma = "local")

# Collect the first two diffusion components together with the subtype labels
df_dm <- data.frame(
  DC1 = eigenvectors(dm)[, 1],
  DC2 = eigenvectors(dm)[, 2],
  Subtype = labels
)

# Visualize First Two Diffusion Components
df_dm |>
  ggplot(aes(x = DC1, y = DC2, color = Subtype)) +
  geom_point(size = 2) +
  labs(title = "Diffusion Map of miRNA Data") +
  theme_minimal()

#  Explore Diffusion Pseudotime (Optional)
# Here the first diffusion component itself is used as the pseudotime value.
df_dm[["Pseudotime"]] <- eigenvectors(dm)[, 1]

df_dm |>
  ggplot(aes(x = DC1, y = DC2, color = Pseudotime)) +
  geom_point(size = 2) +
  labs(title = "Diffusion Pseudotime", color = "Pseudotime") +
  scale_color_viridis_c() +
  theme_minimal()

5.5 Exercise

We have simulated a dataset with 100 cells and 20 genes, where 3 genes have a signal related to pseudotime. Load the data diffmap-sim-gene-expr.csv and run a diffusion map analysis. Could you identify the genes that correlate with the first diffusion component?

Hint:

There should be one gene that is highly positively correlated with the first diffusion component and one that is highly negatively correlated with it.

Example code

library(tidyverse)
library(destiny)
library(viridis)

# Load the simulated data (one column per gene)
expr_data <- read_csv("data/diffmap-sim-gene-expr.csv")

# scale data (center every gene and divide it by its sd)
x <- scale(expr_data)

# Run Diffusion Map
dm <- DiffusionMap(data = x, sigma = "local")  # adaptive kernel width

# Keep the first two diffusion components for plotting
df_dm <- data.frame(
  DC1 = eigenvectors(dm)[, 1],
  DC2 = eigenvectors(dm)[, 2]
)

# Visualize First Two Diffusion Components
# (title names the simulated data set, not the miRNA data of the section above)
ggplot(df_dm, aes(x = DC1, y = DC2)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "Diffusion Map of Simulated Gene Expression Data")

#  Explore Diffusion Pseudotime (Optional)
# The first diffusion component is used directly as the pseudotime value.
df_dm$Pseudotime <- eigenvectors(dm)[, 1]

ggplot(df_dm, aes(x = DC1, y = DC2, color = Pseudotime)) +
  geom_point(size = 2) +
  scale_color_viridis_c() +
  theme_minimal() +
  labs(title = "Diffusion Pseudotime", color = "Pseudotime")

# find genes positively and negatively correlated with DC1
dc1 <- eigenvectors(dm)[, 1]

# Correlate each gene (column of the scaled matrix) with DC1.
# cor() on a matrix and a vector returns a one-column matrix whose row names
# are the gene names; `[, 1]` turns it into a named vector.
cor_with_dc1 <- cor(x, dc1)[, 1]

# Draw the diffusion map with points colored and sized by the expression of
# `gene`, a column name of `expression`. Returns the ggplot object.
plot_dm_by_gene <- function(embedding, expression, gene) {
  embedding$Expression <- expression[[gene]]
  ggplot(embedding, aes(x = DC1, y = DC2, color = Expression, size = Expression)) +
    geom_point(alpha = 0.9) +
    scale_color_viridis_c(option = "plasma", name = "Expression") +
    scale_size(range = c(1, 4), guide = "none") +
    theme_minimal() +
    labs(
      title = "Diffusion Map Colored by Gene Expression",
      subtitle = gene,
      x = "DC1", y = "DC2"
    )
}

# top genes positively correlated with DC1
cor_with_dc1_sorted <- sort(cor_with_dc1, decreasing = TRUE)
head(cor_with_dc1_sorted, 5)
##     Gene18     Gene15      Gene9      Gene1     Gene20 
## 0.91409842 0.31069956 0.18258365 0.09695313 0.08421495

# color code the diffusion map by the expression of the most positively
# correlated gene. The gene is looked up from the sorted correlations rather
# than hard-coded, so the plot always matches the table above, even if the
# sign of DC1 differs between runs.
library(viridis)

top_pos_gene <- names(cor_with_dc1_sorted)[1]
plot_dm_by_gene(df_dm, expr_data, top_pos_gene)

# top genes negatively correlated with DC1
cor_with_dc1_sorted <- sort(cor_with_dc1, decreasing = FALSE)
head(cor_with_dc1_sorted, 5)
##      Gene7     Gene12     Gene19      Gene2      Gene8 
## -0.9155005 -0.2076297 -0.1958369 -0.1670577 -0.1650107

# color code the diffusion map by the expression of the most negatively
# correlated gene
top_neg_gene <- names(cor_with_dc1_sorted)[1]
plot_dm_by_gene(df_dm, expr_data, top_neg_gene)

5.6 Additional resources

PHATE (Potential of Heat-diffusion for Affinity-based Transition Embedding) aims to capture both local and global nonlinear structure by using an information-geometric distance derived from a heat-diffusion process. PHATE builds on diffusion maps by modeling data as a diffusion process but introduces key innovations like automatic selection of diffusion time and a log-transformed “potential distance” to better preserve both local and global structure. Unlike diffusion maps, which use eigenvectors for embedding, PHATE applies non-metric multidimensional scaling for improved visualization of trajectories and branching structures.