.rds
format.In order to speed up the computations during the exercises, we will be selecting every fith cell of the dataset.
umi_counts <- readRDS("data/GSE72857_umitab.rds")
umi_counts <- umi_counts[, c(T, F, F, F, F)]
dim(umi_counts)
## [1] 27297 2074
If you have been using the scran/scater
pipeline so far. Below you can find a summary code for data processing until getting:
suppressPackageStartupMessages({
library(scran)
library(scater)
library(igraph)
})
# Data analysis with Scran pipeline
data <- SingleCellExperiment(assays = list(counts = umi_counts))
data <- computeSumFactors(data, sizes = c(20, 40, 60, 80))
data <- logNormCounts(data)
var.fit <- modelGeneVar(data)
top.hvgs <- getTopHVGs(var.fit, n = 2000)
data <- runPCA(data, subset_row = top.hvgs)
g <- buildSNNGraph(data, k = 5, use.dimred = "PCA", assay.type = "RNA")
data$louvain_SNNk5 <- factor(igraph::cluster_louvain(g)$membership)
data <- runUMAP(data, dimred = "PCA", n_dimred = 50, ncomponents = 2, spread = 2, min_dist = 0.1, n_neighbors = 10, metric = "cosine")
# Plot the clusters
plotReducedDim(data, dimred = "UMAP", colour_by = "louvain_SNNk5")
If you have been using the Seurat
pipeline so far. Below you can find a summary code for data processing until getting:
suppressPackageStartupMessages({
library(Seurat)
library(cowplot)
})
# Data analysis with Seurat pipeline
data <- CreateSeuratObject(counts = umi_counts)
data <- NormalizeData(data)
data <- FindVariableFeatures(data, nfeatures = 2000)
data <- ScaleData(data)
data <- RunPCA(data)
## PC_ 1
## Positive: Fam132a, Ermap, Cpox, Rhd, Mt2, Sphk1, Add2, Blvrb, Klf1, Epor
## Pklr, Abcb4, Mns1, Slc38a5, Myh10;mKIAA3005, Car2, Ctse, Hmbs, Mthfd1, Car1
## Spire1;AK053505, Atp1b2, Tnfaip2, Alad, 2210023G05Rik;Ces2g, Lmna, Gclm, Mt1, Cldn13, Aqp1
## Negative: Coro1a, Cd52, Vim, Irf8, Pkm2, Lsp1, Gm2a, Napsa, Tyrobp, AK156292;V00821;AI324046;Ighg1;AK135975;Igh;Ighg;J00476;Igh-A (1g2);Gm16844;AK036494;M34473;abParts;M13680;M12570;V00757;M36767;X73024
## Cnn2, Lgals1, Limd2, Tmsb10, H2-Aa, Cd74, H2-Eb1, Igfbp4, Id2, Cst3
## Anxa2, Plbd1, Flt3, H2-DMa, Lcp1, Tmem176b, Prtn3, Ctsg, Lgals3, Fcer1g
## PC_ 2
## Positive: Tmsb4x, Ctsg, Coro1a, Cst3, H2afy, Cd52, Napsa, Emb, Mpo, Tyrobp
## Lcp1, Hsp90b1, Prtn3, Arhgdib, Gm2a, Dbi, Calr, Irf8, Psap, Anxa2
## Lyz1;Lyz2, Lgals1, MRPL33;Mrpl33, Pkm2, Ly6c2, Vim, Csde1;mKIAA0885, BC035044, Alas1, 2810417H13Rik
## Negative: Car2, Car1, Aqp1, Vamp5, Blvrb, Klf1, Zfpm1, Mfsd2b, Apoe, Atp1b2
## Mt2, Abcb4, Tspo2, Samd14, Rhd, Sphk1, Gata1, Isg20, Gfi1b, Hemgn
## Sdsl, Ermap, Fam132a, 1190007F08Rik, Pdzk1ip1, Epor, Lmna, Ctse, Trfr2, Hba-a2
## PC_ 3
## Positive: Cd74, H2-Eb1, H2-Aa, Plbd1, H2-Ab1, Id2, Lgals3, NAAA;Naaa, Ahnak, Lsp1
## Slamf7, 4930506M07Rik, Myadm, Batf3, Aif1, Crip1, Psap, Ifi30, Ckb, H2-DMb1;H2-DMb2
## Gpr35, Rnase6, Ctss, Olfm1, Ece1, Irf5, Lilrb3;Pira1;Gm14548;Pira11;Pira2;Pira4;Pira7;Lilra6;Pirb, AK217941, Xcr1, Klf2
## Negative: Elane, Hp, Gstm1, Prtn3, Ly6c2, Ctsg, C3, Alas1, Fcgr3, 1100001G20Rik
## Lbp, Mpo, Gpc1, Nkg7, Trem3, 1190002H23Rik, B4galt6, Lcn2, Igsf6, Pgd
## Hdc, Cldn15, Emilin1, Prdx5, Dmkn, Igfbp4, Calr, Gsr, Ckap4, Cst7
## PC_ 4
## Positive: Gpr56, Tmem176b, Nrgn, Ifitm1, Cd27, Sox4, Cd34, Spns2, Eef1a1, Aff3
## AK152437, H2afy, Ifitm3, Mef2c;AK043267, Eltd1, Gata2, Rbpms;Rbpms2, Slc22a3, Serpina3f;Serpina3g, Paics
## BC035044, Lgals9, Elf1, Apoe, Gas5, Pabpc1;Pabpc2, Mir703, Rpl23, Rps24, Dntt
## Negative: 1100001G20Rik, C3, Elane, 1190002H23Rik, Fcnb, Pglyrp1, Lcn2, Gstm1, Hp, Gpc1
## Fcgr3, Mgl2, Ly6c2, B4galt6, Plbd1, Chi3l3, Fam101b, Mt1, Sema4a, Dmkn
## Id2, Chd7, Lrg1, Dgat2, Nucb2, Cldn15, Lgals3, S100a9, NAAA;Naaa, Gsr
## PC_ 5
## Positive: Rpl22, Cox7b, Rpl23, Eef1a1, Cox7a2, Cox6c, Sub1, Ndufa4, Chi3l3, Dbi
## 1100001G20Rik, S100a9, Nkg7, Lcn2, Gas5, Evi2b;Evi2a, Psmb7, Malat1, Csde1;mKIAA0885, 1810006K21Rik
## Snrpe, Tmsb4x, Mrpl42, S100a8, Rps24, Mgst2, Agps, Apoe, Bcap31, Slc22a3
## Negative: Cd34, Irf8, AK156292;V00821;AI324046;Ighg1;AK135975;Igh;Ighg;J00476;Igh-A (1g2);Gm16844;AK036494;M34473;abParts;M13680;M12570;V00757;M36767;X73024, Igfbp4, Lgals1, Cd27, H2afy, Tmsb10, Csf1r, Serpinf1
## Emb, Tmem176b, Mpo, Limd2, Prtn3, Pkm2, Plxnb2, Ctsg, Flt3, Bex6
## Coro1a, Phb2, Calr, Plac8, FKBP12-T1;Fkbp1a, H2afx, Fth1, Il6st, Tifab, Slc29a1;Ent1
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2074
## Number of edges: 64612
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8111
## Number of communities: 12
## Elapsed time: 0 seconds
data <- RunUMAP(data, n.neighbors = 10, dims = 1:50, spread = 2, min.dist = 0.3)
# Plot the clusters
DimPlot(data, group.by = "RNA_snn_res.1")
# Save the objects as separate matrices for input in slingshot
dimred <- data@reductions$umap@cell.embeddings
clustering <- data$RNA_snn_res.1
counts <- as.matrix(data@assays$RNA@counts[data@assays$RNA@var.features, ])
Until up to this point, the steps above have been covered in the previous lectures. From now on, we will start using that clustering and data reduction techniques for trajectory inference. The whole process can be done using a single function named slingshot
, which is simply a wrapper for the 2 main steps for trajectory inference. The first step of the process is to define the lineages and then fit a curve through the data that defines a trajectory. These steps are break donw below for clarity.
suppressPackageStartupMessages({
library(slingshot)
})
# Run default Slingshot lineage identification
set.seed(1)
lineages <- getLineages(data = dimred, clusterLabels = clustering)
lineages
## class: SlingshotDataSet
##
## Samples Dimensions
## 2074 2
##
## lineages: 5
## Lineage1: 5 3 1 0 11 6 2 10
## Lineage2: 5 3 1 0 11 6 4
## Lineage3: 5 3 1 0 11 6 9
## Lineage4: 5 3 1 0 11 7
## Lineage5: 5 3 1 8
##
## curves: 0
# Plot the lineages
par(mfrow = c(1, 2))
plot(dimred[, 1:2], col = pal[clustering], cex = 0.5, pch = 16)
for (i in levels(clustering)) {
text(mean(dimred[clustering == i, 1]), mean(dimred[clustering == i, 2]), labels = i, font = 2)
}
plot(dimred[, 1:2], col = pal[clustering], cex = 0.5, pch = 16)
lines(lineages, lwd = 3, col = "black")
Here we see one central issue with trajectory analysis: where does the trajectory begin? Without any extra information, this is nearly an impossible task for a TI method. We need prior biological information to be able to define where the trajectory starts and where it should end.
#Run default Slingshot
set.seed(1)
lineages <- getLineages(data = dimred,
clusterLabels = clustering,
#end.clus = c("11","7","10","9","5"), #define how many branches/lineages to consider
start.clus = "0") #define where to start the trajectories
lineages
## class: SlingshotDataSet
##
## Samples Dimensions
## 2074 2
##
## lineages: 6
## Lineage1: 0 11 6 2 10
## Lineage2: 0 11 6 4
## Lineage3: 0 1 3 5
## Lineage4: 0 11 6 9
## Lineage5: 0 1 8
## Lineage6: 0 11 7
##
## curves: 0
#Plot the lineages
par(mfrow=c(1,2))
plot(dimred[,1:2], col = pal[clustering], cex=.5,pch = 16)
for(i in levels(clustering)){
text( mean(dimred[clustering==i,1]),
mean(dimred[clustering==i,2]), labels = i,font = 2) }
plot(dimred, col = pal[clustering], pch = 16)
lines(lineages, lwd = 3, col = 'black')
Once the clusters are connected, Slingshot allows you to transform them to a smooth trajectory using principal curves. This is an algorithm that iteratively changes an initial curve to better match the data points. It was developed for linear data. To apply it to single-cell data, slingshot adds two enhancements:
Since the function getCurves()
takes some time to run, we can speed up the convergence of the curve fitting process by reducing the amount of cells to use in each lineage. Ideally you could all cells, but here we had set approx_points
to 300 to speed up. Feel free to adjust that for your dataset.
curves <- getCurves(lineages, approx_points = 300, thresh = 0.01, stretch = 0.8, allow.breaks = FALSE, shrink = 0.99)
curves
## class: SlingshotDataSet
##
## Samples Dimensions
## 2074 2
##
## lineages: 6
## Lineage1: 0 11 6 2 10
## Lineage2: 0 11 6 4
## Lineage3: 0 1 3 5
## Lineage4: 0 11 6 9
## Lineage5: 0 1 8
## Lineage6: 0 11 7
##
## curves: 6
## Curve1: Length: 23.409 Samples: 784.19
## Curve2: Length: 30.512 Samples: 693.17
## Curve3: Length: 21.643 Samples: 948.24
## Curve4: Length: 19.246 Samples: 553.65
## Curve5: Length: 25.157 Samples: 719.84
## Curve6: Length: 19.054 Samples: 477.24
The main way to interpret a trajectory is to find genes that change along the trajectory. There are many ways to define differential expression along a trajectory:
tradeSeq
is a recently proposed algorithm to find trajectory differentially expressed genes. It works by smoothing the gene expression along the trajectory by fitting a smoother using generalized additive models (GAMs), and testing whether certain coefficients are statstically different between points in the trajectory.
The fitting of GAMs can take quite a while, so for demonstration purposes we first do a very stringent filtering of the genes. In an ideal experiment, you would use all the genes, or at least those defined as being variable.
library(tradeSeq)
# Removing some genes to speed up the computations for this tutorial
filt_counts <- counts[rowSums(counts > 5) > ncol(counts)/100, ]
dim(filt_counts)
## [1] 215 2074
sce <- fitGAM(counts = as.matrix(filt_counts), sds = curves)
plotGeneCount(curves, filt_counts, clusters = clustering, models = sce)
# Define function to plot
library(dplyr)
plot_differential_expression <- function(feature_id) {
feature_id <- pseudotime_association %>% filter(pvalue < 0.05) %>% top_n(1, -waldStat) %>% pull(feature_id)
cowplot::plot_grid(plotGeneCount(curves, filt_counts, gene = feature_id[1], clusters = clustering, models = sce) + ggplot2::theme(legend.position = "none"),
plotSmoothers(sce, as.matrix(counts), gene = feature_id[1]))
}
We can define custom pseudotime values of interest if we’re interested in genes that change between particular point in pseudotime. By default, we can look at differences between start and end:
pseudotime_start_end_association <- startVsEndTest(sce, pseudotimeValues = c(0, 1))
pseudotime_start_end_association$feature_id <- rownames(pseudotime_start_end_association)
feature_id <- pseudotime_start_end_association %>% filter(pvalue < 0.05) %>% top_n(1, waldStat) %>% pull(feature_id)
plot_differential_expression(feature_id)
More interesting are genes that are different between two branches. We may have seen some of these genes already pop up in previous analyses of pseudotime. There are several ways to define “different between branches”, and each have their own functions:
diffEndTest
earlyDETest
patternTest
different_end_association <- diffEndTest(sce)
different_end_association$feature_id <- rownames(different_end_association)
feature_id <- different_end_association %>% filter(pvalue < 0.05) %>% arrange(desc(waldStat)) %>% dplyr::slice(1) %>% pull(feature_id)
plot_differential_expression(feature_id)
branch_point_association <- earlyDETest(sce)
branch_point_association$feature_id <- rownames(branch_point_association)
feature_id <- branch_point_association %>% filter(pvalue < 0.05) %>% arrange(desc(waldStat)) %>% dplyr::slice(1) %>% pull(feature_id)
plot_differential_expression(feature_id)
Check out this vignette for a more in-depth overview of tradeSeq
Cannoodt, Robrecht, Wouter Saelens, and Yvan Saeys. 2016. “Computational Methods for Trajectory Inference from Single-Cell Transcriptomics.” European Journal of Immunology 46 (11): 2496–2506. doi.
Saelens, Wouter, Robrecht Cannoodt, Helena Todorov, and Yvan Saeys. 2019. “A Comparison of Single-Cell Trajectory Inference Methods.” Nature Biotechnology 37 (5): 547–54. doi.