suppressPackageStartupMessages({
library(Seurat)
library(dplyr)
library(patchwork)
library(ggplot2)
library(pheatmap)
library(enrichR)
library(Matrix)
library(edgeR)
library(MAST)
})
Code chunks run R commands unless otherwise specified.
In this tutorial we will cover differential gene expression, which comprises an extensive range of topics and methods. In single cell, differential expresison can have multiple functionalities such as identifying marker genes for cell populations, as well as identifying differentially regulated genes across conditions (healthy vs control). We will also cover controlling batch effect in your test.
We can first load the data from the clustering session. Moreover, we can already decide which clustering resolution to use. First let’s define using the louvain
clustering to identifying differentially expressed genes.
# download pre-computed data if missing or long compute
<- TRUE
fetch_data
# url for source and intermediate data
<- "https://export.uppmax.uu.se/naiss2023-23-3/workshops/workshop-scrnaseq"
path_data <- "data/covid/results/seurat_covid_qc_dr_int_cl.rds"
path_file if (!dir.exists(dirname(path_file))) dir.create(dirname(path_file), recursive = TRUE)
if (fetch_data && !file.exists(path_file)) download.file(url = file.path(path_data, "covid/results/seurat_covid_qc_dr_int_cl.rds"), destfile = path_file)
<- readRDS(path_file) alldata
# Set the identity as louvain with resolution 0.5
<- "RNA_snn_res.0.5"
sel.clust
<- SetIdent(alldata, value = sel.clust)
alldata table(alldata@active.ident)
0 1 2 3 4 5 6 7 8 9
2138 1245 1086 646 532 357 357 334 263 176
# plot this clustering
wrap_plots(
DimPlot(alldata, reduction = "umap_cca", label = T) + NoAxes(),
DimPlot(alldata, reduction = "umap_cca", group.by = "orig.ident") + NoAxes(),
DimPlot(alldata, reduction = "umap_cca", group.by = "type") + NoAxes(),
ncol = 3
)
1 Cell marker genes
Let us first compute a ranking for the highly differential genes in each cluster. There are many different tests and parameters to be chosen that can be used to refine your results. When looking for marker genes, we want genes that are positively expressed in a cell type and possibly not expressed in others.
For differential expression it is important to use the RNA assay, for most tests we will use the logtransformed counts in the data slot. With Seurat v5 the data may be split into layers depending on what you did with the data beforehand. So first, lets check what we have in our object:
Layers(alldata@assays$RNA)
[1] "counts.covid_1" "counts.covid_15" "counts.covid_16" "counts.covid_17"
[5] "counts.ctrl_5" "counts.ctrl_13" "counts.ctrl_14" "counts.ctrl_19"
[9] "scale.data" "data.covid_1" "data.covid_15" "data.covid_16"
[13] "data.covid_17" "data.ctrl_5" "data.ctrl_13" "data.ctrl_14"
[17] "data.ctrl_19"
As you can see, we have the data split by each sample, so we need to merge them into one single matrix to run the differential expression.
@active.assay = "RNA"
alldata<- JoinLayers(object = alldata, layers = c("data","counts"))
alldata
Layers(alldata@assays$RNA)
[1] "counts" "data" "scale.data"
Now we can run the function FindAllMarkers
that will run each of the clusters vs the rest. As you can see, there are some filtering criteria to remov genes that do not have certain log2FC or percent expressed. Here we only test for upregulated genes, so the only.pos
parameter is set to TRUE
.
# Compute differentiall expression
<- FindAllMarkers(
markers_genes
alldata,log2FC.threshold = 0.2,
test.use = "wilcox",
min.pct = 0.1,
min.diff.pct = 0.2,
only.pos = TRUE,
max.cells.per.ident = 50,
assay = "RNA"
)
We can now select the top 25 overexpressed genes for plotting.
%>%
markers_genes group_by(cluster) %>%
top_n(-25, p_val_adj) -> top25
head(top25)
par(mfrow = c(2, 5), mar = c(4, 6, 3, 1))
for (i in unique(top25$cluster)) {
barplot(sort(setNames(top25$avg_log2FC, top25$gene)[top25$cluster == i], F),
horiz = T, las = 1, main = paste0(i, " vs. rest"), border = "white", yaxs = "i"
)abline(v = c(0, 0.25), lty = c(1, 2))
}
We can visualize them as a heatmap. Here we are selecting the top 5.
%>%
markers_genes group_by(cluster) %>%
slice_min(p_val_adj, n = 5, with_ties = FALSE) -> top5
# create a scale.data slot for the selected genes
<- ScaleData(alldata, features = as.character(unique(top5$gene)), assay = "RNA")
alldata DoHeatmap(alldata, features = as.character(unique(top5$gene)), group.by = sel.clust, assay = "RNA")
Another way is by representing the overall group expression and detection rates in a dot-plot.
DotPlot(alldata, features = rev(as.character(unique(top5$gene))), group.by = sel.clust, assay = "RNA") + coord_flip()
We can also plot a violin plot for each gene.
# take top 3 genes per cluster/
%>%
top5 group_by(cluster) %>%
top_n(-3, p_val) -> top3
# set pt.size to zero if you do not want all the points to hide the violin shapes, or to a small value like 0.1
VlnPlot(alldata, features = as.character(unique(top3$gene)), ncol = 5, group.by = sel.clust, assay = "RNA", pt.size = 0)
Take a screenshot of those results and re-run the same code above with another test: “wilcox” (Wilcoxon Rank Sum test), “bimod” (Likelihood-ratio test), “roc” (Identifies ‘markers’ of gene expression using ROC analysis),“t” (Student’s t-test),“negbinom” (negative binomial generalized linear model),“poisson” (poisson generalized linear model), “LR” (logistic regression), “MAST” (hurdle model), “DESeq2” (negative binomial distribution).
1.1 DGE with equal amount of cells
The number of cells per cluster differ quite a bit in this data
table(alldata@active.ident)
0 1 2 3 4 5 6 7 8 9
2138 1245 1086 646 532 357 357 334 263 176
Hence when we run FindAllMarkers
one cluster vs rest, the largest cluster (cluster 0) will dominate the “rest” and influence the results the most. So it is often a good idea to subsample the clusters to an equal number of cells before running differential expression for one vs rest. We can select a fixed number of cells per cluster with the function WhichCells
and the argument downsample
.
<- subset(alldata, cells = WhichCells(alldata, downsample = 300))
sub
table(sub@active.ident)
0 1 2 3 4 5 6 7 8 9
300 300 300 300 300 300 300 300 263 176
Now rerun FindAllMarkers
with this set and compare the results.
<- FindAllMarkers(
markers_genes_sub
sub,log2FC.threshold = 0.2,
test.use = "wilcox",
min.pct = 0.1,
min.diff.pct = 0.2,
only.pos = TRUE,
max.cells.per.ident = 50,
assay = "RNA"
)
The number of significant genes per cluster has changed, with more for some clusters and less for others.
table(markers_genes$cluster)
0 1 2 3 4 5 6 7 8 9
727 118 114 154 97 776 198 69 69 17
table(markers_genes_sub$cluster)
0 1 2 3 4 5 6 7 8 9
700 157 114 140 122 991 195 57 112 13
%>%
markers_genes_sub group_by(cluster) %>%
slice_min(p_val_adj, n = 5, with_ties = FALSE) -> top5_sub
DotPlot(alldata, features = rev(as.character(unique(top5_sub$gene))), group.by = sel.clust, assay = "RNA") + coord_flip()
2 DGE across conditions
The second way of computing differential expression is to answer which genes are differentially expressed within a cluster. For example, in our case we have libraries comming from patients and controls and we would like to know which genes are influenced the most in a particular cell type. For this end, we will first subset our data for the desired cell cluster, then change the cell identities to the variable of comparison (which now in our case is the type, e.g. Covid/Ctrl).
# select all cells in cluster 3
<- subset(alldata, cells = colnames(alldata)[alldata@meta.data[, sel.clust] == 3])
cell_selection <- SetIdent(cell_selection, value = "type")
cell_selection # Compute differentiall expression
<- FindAllMarkers(cell_selection,
DGE_cell_selection log2FC.threshold = 0.2,
test.use = "wilcox",
min.pct = 0.1,
min.diff.pct = 0.2,
only.pos = TRUE,
max.cells.per.ident = 50,
assay = "RNA"
)
We can now plot the expression across the type.
%>%
DGE_cell_selection group_by(cluster) %>%
top_n(-5, p_val) -> top5_cell_selection
VlnPlot(cell_selection, features = as.character(unique(top5_cell_selection$gene)), ncol = 5, group.by = "type", assay = "RNA", pt.size = .1)
We can also plot these genes across all clusters, but split by type, to check if the genes are also over/under expressed in other celltypes.
VlnPlot(alldata,
features = as.character(unique(top5_cell_selection$gene)),
ncol = 4, split.by = "type", assay = "RNA", pt.size = 0
)
As you can see, we have many sex chromosome related genes among the top DE genes. And if you remember from the QC lab, we have unbalanced sex distribution among our subjects, so this may not be related to covid at all.
2.1 Remove sex chromosome genes
To remove some of the bias due to unbalanced sex in the subjects, we can remove the sex chromosome related genes.
<- file.path("data/covid/results/genes_table.csv")
genes_file if (!file.exists(genes_file)) download.file(file.path(path_data, "covid/results/genes_table.csv"), destfile = genes_file)
<- read.csv(genes_file) # was created in the QC exercise
gene.info
<- gene.info$external_gene_name[!(gene.info$chromosome_name %in% c("X", "Y"))]
auto.genes
@active.assay <- "RNA"
cell_selection<- intersect(rownames(cell_selection), auto.genes)
keep.genes <- cell_selection[keep.genes, ]
cell_selection
# then renormalize the data
<- NormalizeData(cell_selection) cell_selection
Rerun differential expression:
# Compute differential expression
<- FindMarkers(cell_selection,
DGE_cell_selection ident.1 = "Covid", ident.2 = "Ctrl",
logfc.threshold = 0.2, test.use = "wilcox", min.pct = 0.1,
min.diff.pct = 0.2, assay = "RNA"
)
# Define as Covid or Ctrl in the df and add a gene column
$direction <- ifelse(DGE_cell_selection$avg_log2FC > 0, "Covid", "Ctrl")
DGE_cell_selection$gene <- rownames(DGE_cell_selection)
DGE_cell_selection
%>%
DGE_cell_selection group_by(direction) %>%
top_n(-5, p_val) %>%
arrange(direction) -> top5_cell_selection
VlnPlot(cell_selection,
features = as.character(unique(top5_cell_selection$gene)),
ncol = 5, group.by = "type", assay = "RNA", pt.size = .1
)
We can also plot these genes across all clusters, but split by type, to check if the genes are also over/under expressed in other celltypes/clusters.
VlnPlot(alldata,
features = as.character(unique(top5_cell_selection$gene)),
ncol = 4, split.by = "type", assay = "RNA", pt.size = 0
)
3 Patient Batch effects
When we are testing for Covid vs Control, we are running a DGE test for 4 vs 4 individuals. That will be very sensitive to sample differences unless we find a way to control for it. So first, let’s check how the top DEGs are expressed across the individuals within cluster 3:
VlnPlot(cell_selection, group.by = "orig.ident", features = as.character(unique(top5_cell_selection$gene)), ncol = 4, assay = "RNA", pt.size = 0)
As you can see, many of the genes detected as DGE in Covid are unique to one or 2 patients.
We can examine more genes with a DotPlot:
%>%
DGE_cell_selection group_by(direction) %>%
top_n(-20, p_val) -> top20_cell_selection
DotPlot(cell_selection, features = rev(as.character(unique(top20_cell_selection$gene))), group.by = "orig.ident", assay = "RNA") + coord_flip() + RotatedAxis()
As you can see, most of the DGEs are driven by the covid_17
patient. It is also a sample with very high number of cells:
table(cell_selection$orig.ident)
covid_1 covid_15 covid_16 covid_17 ctrl_13 ctrl_14 ctrl_19 ctrl_5
94 32 37 170 65 60 35 153
4 Subsample
So one obvious thing to consider is an equal amount of cells per individual so that the DGE results are not dominated by a single sample.
We will use the downsample
option in the Seurat function WhichCells()
to select 30 cells per cluster:
<- SetIdent(cell_selection, value = "orig.ident")
cell_selection <- subset(cell_selection, cells = WhichCells(cell_selection, downsample = 30))
sub_data
table(sub_data$orig.ident)
covid_1 covid_15 covid_16 covid_17 ctrl_13 ctrl_14 ctrl_19 ctrl_5
30 30 30 30 30 30 30 30
And now we run DGE analysis again:
<- SetIdent(sub_data, value = "type")
sub_data
# Compute differentiall expression
<- FindMarkers(sub_data,
DGE_sub ident.1 = "Covid", ident.2 = "Ctrl",
logfc.threshold = 0.2, test.use = "wilcox", min.pct = 0.1,
min.diff.pct = 0.2, assay = "RNA"
)
# Define as Covid or Ctrl in the df and add a gene column
$direction <- ifelse(DGE_sub$avg_log2FC > 0, "Covid", "Ctrl")
DGE_sub$gene <- rownames(DGE_sub)
DGE_sub
%>%
DGE_sub group_by(direction) %>%
top_n(-5, p_val) %>%
arrange(direction) -> top5_sub
VlnPlot(sub_data,
features = as.character(unique(top5_sub$gene)),
ncol = 5, group.by = "type", assay = "RNA", pt.size = .1
)
Plot as dotplot, but in the full (not subsampled) data, still only showing cluster 3:
%>%
DGE_sub group_by(direction) %>%
top_n(-20, p_val) -> top20_sub
DotPlot(cell_selection, features = rev(as.character(unique(top20_sub$gene))), group.by = "orig.ident", assay = "RNA") +
coord_flip() + RotatedAxis()
It looks much better now. But if we look per patient you can see that we still have some genes that are dominated by a single patient.
Why do you think this is?
5 Pseudobulk
One option is to treat the samples as pseudobulks and do differential expression for the 4 patients vs 4 controls. You do lose some information about cell variability within each patient, but instead you gain the advantage of mainly looking for effects that are seen in multiple patients.
However, having only 4 patients is perhaps too low, with many more patients it will work better to run pseudobulk analysis.
For a fair comparison we should have equal number of cells per sample when we create the pseudobulk with AggregateExpression
. For pseudobulk it is recommended to use the subsampled object so that you have similar amount of cells into each pseudobulk calculation. A biased number of cells will strongly influence the differential expression you run with the pseudobulk.
= AggregateExpression(sub_data, group.by = "orig.ident", assays = "RNA")$RNA pseudobulk
Then run edgeR:
# define the groups
<- c("Covid", "Covid", "Covid", "Covid", "Ctrl", "Ctrl", "Ctrl", "Ctrl")
bulk.labels
<- DGEList(counts = pseudobulk, group = factor(bulk.labels))
dge.list <- filterByExpr(dge.list)
keep <- dge.list[keep, , keep.lib.sizes = FALSE]
dge.list
<- calcNormFactors(dge.list)
dge.list <- model.matrix(~bulk.labels)
design
<- estimateDisp(dge.list, design)
dge.list
<- glmQLFit(dge.list, design)
fit <- glmQLFTest(fit, coef = 2)
qlf topTags(qlf)
Coefficient: bulk.labelsCtrl
logFC logCPM F PValue FDR
WDFY2 1.471776 7.175742 15.099461 0.0009581845 0.4683637
AHNAK 1.402522 7.845133 14.393278 0.0011846521 0.4683637
SKIL -1.570543 6.781652 14.206374 0.0012541140 0.4683637
CD69 -1.594180 10.341748 13.395898 0.0016123421 0.4683637
DDIT3 -1.429081 7.306436 13.285757 0.0016692453 0.4683637
STAG3 -1.886657 7.459503 13.061054 0.0017923761 0.4683637
NUDT3 1.311554 6.898319 12.851803 0.0019161578 0.4683637
SLIRP -1.193814 6.686989 10.669825 0.0039659763 0.7858307
IFITM2 -1.252358 8.090788 10.018526 0.0049872285 0.7858307
SNHG15 -1.273079 6.840242 9.962369 0.0050881698 0.7858307
As you can see, we have very few significant genes. Since we only have 4 vs 4 samples, we should not expect to find many genes with this method.
Again as dotplot including top 10 genes:
<- topTags(qlf, 100)$table
res.edgeR $dir <- ifelse(res.edgeR$logFC > 0, "Covid", "Ctrl")
res.edgeR$gene <- rownames(res.edgeR)
res.edgeR
%>%
res.edgeR group_by(dir) %>%
top_n(-10, PValue) %>%
arrange(dir) -> top.edgeR
DotPlot(cell_selection,
features = as.character(unique(top.edgeR$gene)), group.by = "orig.ident",
assay = "RNA"
+ coord_flip() + ggtitle("EdgeR pseudobulk") + RotatedAxis() )
As you can see, even if we find few genes, they seem to make sense across all the patients.
6 MAST random effect
MAST has the option to add a random effect for the patient when running DGE analysis. It is quite slow, even with this small dataset, so it may not be practical for a larger dataset unless you have access to a compute cluster.
We will run MAST with and without patient info as random effect and compare the results
First, filter genes in part to speed up the process but also to avoid too many warnings in the model fitting step of MAST. We will keep genes that are expressed with at least 2 reads in 2 covid patients or 2 controls.
# select genes that are expressed in at least 2 patients or 2 ctrls with > 2 reads.
<- sapply(unique(cell_selection$orig.ident), function(x) {
nPatient rowSums(cell_selection@assays$RNA@layers$counts[, cell_selection$orig.ident == x] > 2)
})<- rowSums(nPatient[, 1:4] > 2)
nCovid <- rowSums(nPatient[, 5:8] > 2)
nCtrl
<- nCovid >= 2 | nCtrl >= 2
sel <- cell_selection[sel, ] cell_selection_sub
Set up the MAST object.
# create the feature data
<- data.frame(primerid = rownames(cell_selection_sub))
fData <- cell_selection_sub@meta.data
m $wellKey <- rownames(m)
m
# make sure type and orig.ident are factors
$orig.ident <- factor(m$orig.ident)
m$type <- factor(m$type)
m
<- MAST::FromMatrix(
sca exprsArray = as.matrix(x = cell_selection_sub@assays$RNA@layers$data),
check_sanity = FALSE, cData = m, fData = fData
)
First, run the regular MAST analysis without random effects
# takes a while to run, so save a file to tmpdir in case you have to rerun the code
<- "data/covid/results/tmp_dge"
tmpdir dir.create(tmpdir, showWarnings = F)
<- file.path(tmpdir, "mast_bayesglm_cl3.Rds")
tmpfile1 if (file.exists(tmpfile1)) {
<- readRDS(tmpfile1)
fcHurdle1 else {
} <- suppressMessages(MAST::zlm(~ type + nFeature_RNA, sca, method = "bayesglm", ebayes = T))
zlmCond <- suppressMessages(MAST::summary(zlmCond, doLRT = "typeCtrl"))
summaryCond <- summaryCond$datatable
summaryDt <- merge(summaryDt[summaryDt$contrast == "typeCtrl" & summaryDt$component ==
fcHurdle "logFC", c(1, 7, 5, 6, 8)], summaryDt[summaryDt$contrast == "typeCtrl" &
$component == "H", c(1, 4)], by = "primerid")
summaryDt<- stats::na.omit(as.data.frame(fcHurdle))
fcHurdle1 saveRDS(fcHurdle1, tmpfile1)
}
Then run MAST with glmer and random effect.
library(lme4)
<- file.path(tmpdir, "mast_glme_cl3.Rds")
tmpfile2 if (file.exists(tmpfile2)) {
<- readRDS(tmpfile2)
fcHurdle2 else {
} <- suppressMessages(MAST::zlm(~ type + nFeature_RNA + (1 | orig.ident), sca,
zlmCond method = "glmer",
ebayes = F, strictConvergence = FALSE
))
<- suppressMessages(MAST::summary(zlmCond, doLRT = "typeCtrl"))
summaryCond <- summaryCond$datatable
summaryDt <- merge(summaryDt[summaryDt$contrast == "typeCtrl" & summaryDt$component ==
fcHurdle "logFC", c(1, 7, 5, 6, 8)], summaryDt[summaryDt$contrast == "typeCtrl" &
$component == "H", c(1, 4)], by = "primerid")
summaryDt<- stats::na.omit(as.data.frame(fcHurdle))
fcHurdle2 saveRDS(fcHurdle2, tmpfile2)
}
Top genes with normal MAST:
<- head(fcHurdle1[order(fcHurdle1$`Pr(>Chisq)`), ], 10)
top1 top1
$pval <- fcHurdle1$`Pr(>Chisq)`
fcHurdle1$dir <- ifelse(fcHurdle1$z > 0, "Ctrl", "Covid")
fcHurdle1%>%
fcHurdle1 group_by(dir) %>%
top_n(-10, pval) %>%
arrange(z) -> mastN
<- mastN$primerid mastN
Top genes with random effect:
<- head(fcHurdle2[order(fcHurdle2$`Pr(>Chisq)`), ], 10)
top2 top2
$pval <- fcHurdle2$`Pr(>Chisq)`
fcHurdle2$dir <- ifelse(fcHurdle2$z > 0, "Ctrl", "Covid")
fcHurdle2%>%
fcHurdle2 group_by(dir) %>%
top_n(-10, pval) %>%
arrange(z) -> mastR
<- mastR$primerid mastR
As you can see, we have lower significance for the genes with the random effect added.
Dotplot for top 10 genes in each direction:
<- DotPlot(cell_selection, features = mastN, group.by = "orig.ident", assay = "RNA") +
p1 coord_flip() + RotatedAxis() + ggtitle("Regular MAST")
<- DotPlot(cell_selection, features = mastR, group.by = "orig.ident", assay = "RNA") +
p2 coord_flip() + RotatedAxis() + ggtitle("With random effect")
+ p2 p1
You have now run DGE analysis for Covid vs Ctrl in cluster 3 with several diffent methods. Have a look at the different results. Where did you get more/less significant genes? Which results would you like to present in a paper? Discuss with a neighbor which one you think looks best and why.
7 Gene Set Analysis (GSA)
7.1 Hypergeometric enrichment test
Having a defined list of differentially expressed genes, you can now look for their combined function using hypergeometric test.
In this case we will use the DGE from MAST with random effect to run enrichment analysis.
# Load additional packages
library(enrichR)
# Check available databases to perform enrichment (then choose one)
::listEnrichrDbs() enrichR
# Perform enrichment
<- enrichr(
enrich_results genes = fcHurdle2$primerid[fcHurdle2$z < 0 & fcHurdle2$pval < 0.05],
databases = "GO_Biological_Process_2017b"
1]] )[[
Uploading data to Enrichr... Done.
Querying GO_Biological_Process_2017b... Done.
Parsing results... Done.
Some databases of interest:
GO_Biological_Process_2017b
KEGG_2019_Human
KEGG_2019_Mouse
WikiPathways_2019_Human
WikiPathways_2019_Mouse
You visualize your results using a simple barplot, for example:
par(mfrow = c(1, 1), mar = c(3, 25, 2, 1))
barplot(
height = -log10(enrich_results$P.value)[10:1],
names.arg = enrich_results$Term[10:1],
horiz = TRUE,
las = 1,
border = FALSE,
cex.names = .6
)abline(v = c(-log10(0.05)), lty = 2)
abline(v = 0, lty = 1)
8 Gene Set Enrichment Analysis (GSEA)
Besides the enrichment using hypergeometric test, we can also perform gene set enrichment analysis (GSEA), which scores ranked genes list (usually based on fold changes) and computes permutation test to check if a particular gene set is more present in the Up-regulated genes, among the DOWN_regulated genes or not differentially regulated.
Before, we ran FindMarkers()
with the default settings for reporting only significantly up/down regulated genes, but now we need statistics on a larger set of genes, so we will have to rerun the test with more lenient cutoffs. We will not use it now but this is how it should be run:
## OBS! No need to run
<- SetIdent(sub_data, value = "type")
sub_data
<- FindMarkers(
DGE_cell_selection2
sub_data,ident.1 = "Covid",
log2FC.threshold = -Inf,
test.use = "wilcox",
min.pct = 0.05,
min.diff.pct = 0,
only.pos = FALSE,
max.cells.per.ident = 50,
assay = "RNA"
)
# Create a gene rank based on the gene expression fold change
<- setNames(DGE_cell_selection2$avg_log2FC, casefold(rownames(DGE_cell_selection2), upper = T)) gene_rank
In this case we will use the results from MAST with random effect to run GSEA, and we will use the Z-score from MAST to sort the genes.
<- setNames(fcHurdle2$z, casefold(fcHurdle2$primerid, upper = T)) gene_rank
Once our list of genes are sorted, we can proceed with the enrichment itself. We can use the package to get gene set from the Molecular Signature Database (MSigDB) and select KEGG pathways as an example.
library(msigdbr)
# Download gene sets
<- msigdbr::msigdbr("Homo sapiens")
msigdbgmt <- as.data.frame(msigdbgmt)
msigdbgmt
# List available gene sets
unique(msigdbgmt$gs_subcat)
[1] "MIR:MIR_Legacy" "TFT:TFT_Legacy" "CGP" "TFT:GTRD"
[5] "" "VAX" "CP:BIOCARTA" "CGN"
[9] "GO:BP" "GO:CC" "IMMUNESIGDB" "GO:MF"
[13] "HPO" "CP:KEGG" "MIR:MIRDB" "CM"
[17] "CP" "CP:PID" "CP:REACTOME" "CP:WIKIPATHWAYS"
# Subset which gene set you want to use.
<- msigdbgmt[msigdbgmt$gs_subcat == "CP:WIKIPATHWAYS", ]
msigdbgmt_subset <- lapply(unique(msigdbgmt_subset$gs_name), function(x) {
gmt $gs_name == x, "gene_symbol"]
msigdbgmt_subset[msigdbgmt_subset
})names(gmt) <- unique(paste0(msigdbgmt_subset$gs_name, "_", msigdbgmt_subset$gs_exact_source))
Next, we will run GSEA. This will result in a table containing information for several pathways. We can then sort and filter those pathways to visualize only the top ones. You can select/filter them by either p-value
or normalized enrichment score (NES
).
library(fgsea)
# Perform enrichemnt analysis
<- fgsea(pathways = gmt, stats = gene_rank, minSize = 15, maxSize = 500)
fgseaRes <- fgseaRes[order(fgseaRes$pval, decreasing = F), ]
fgseaRes
# Filter the results table to show only the top 10 UP or DOWN regulated processes (optional)
<- fgseaRes$pathway[1:10]
top10_UP
# Nice summary table (shown as a plot)
plotGseaTable(gmt[top10_UP], gene_rank, fgseaRes, gseaParam = 0.5)
Which KEGG pathways are upregulated in this cluster? Which KEGG pathways are dowregulated in this cluster? Change the pathway source to another gene set (e.g. CP:WIKIPATHWAYS or CP:REACTOME or CP:BIOCARTA or GO:BP) and check the if you get similar results?
9 Session info
Click here
sessionInfo()
R version 4.3.3 (2024-02-29)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Big Sur ... 10.16
Matrix products: default
BLAS/LAPACK: /Users/asabjor/miniconda3/envs/seurat5/lib/libopenblasp-r0.3.27.dylib; LAPACK version 3.12.0
locale:
[1] sv_SE.UTF-8/sv_SE.UTF-8/sv_SE.UTF-8/C/sv_SE.UTF-8/sv_SE.UTF-8
time zone: Europe/Stockholm
tzcode source: system (macOS)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] fgsea_1.28.0 msigdbr_7.5.1
[3] lme4_1.1-35.5 MAST_1.28.0
[5] SingleCellExperiment_1.24.0 SummarizedExperiment_1.32.0
[7] Biobase_2.62.0 GenomicRanges_1.54.1
[9] GenomeInfoDb_1.38.1 IRanges_2.36.0
[11] S4Vectors_0.40.2 BiocGenerics_0.48.1
[13] MatrixGenerics_1.14.0 matrixStats_1.4.1
[15] edgeR_4.0.16 limma_3.58.1
[17] Matrix_1.6-5 enrichR_3.2
[19] pheatmap_1.0.12 ggplot2_3.5.1
[21] patchwork_1.2.0 dplyr_1.1.4
[23] Seurat_5.1.0 SeuratObject_5.0.2
[25] sp_2.1-4
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.3.3 later_1.3.2
[4] bitops_1.0-8 tibble_3.2.1 polyclip_1.10-7
[7] fastDummies_1.7.4 lifecycle_1.0.4 globals_0.16.3
[10] lattice_0.22-6 MASS_7.3-60.0.1 magrittr_2.0.3
[13] plotly_4.10.4 rmarkdown_2.28 yaml_2.3.10
[16] httpuv_1.6.15 sctransform_0.4.1 spam_2.10-0
[19] spatstat.sparse_3.1-0 reticulate_1.39.0 minqa_1.2.8
[22] cowplot_1.1.3 pbapply_1.7-2 RColorBrewer_1.1-3
[25] abind_1.4-5 zlibbioc_1.48.0 Rtsne_0.17
[28] purrr_1.0.2 presto_1.0.0 RCurl_1.98-1.16
[31] WriteXLS_6.7.0 GenomeInfoDbData_1.2.11 ggrepel_0.9.6
[34] irlba_2.3.5.1 listenv_0.9.1 spatstat.utils_3.1-0
[37] goftest_1.2-3 RSpectra_0.16-2 spatstat.random_3.2-3
[40] fitdistrplus_1.2-1 parallelly_1.38.0 leiden_0.4.3.1
[43] codetools_0.2-20 DelayedArray_0.28.0 tidyselect_1.2.1
[46] farver_2.1.2 spatstat.explore_3.2-6 jsonlite_1.8.8
[49] progressr_0.14.0 ggridges_0.5.6 survival_3.7-0
[52] tools_4.3.3 ica_1.0-3 Rcpp_1.0.13
[55] glue_1.7.0 gridExtra_2.3 SparseArray_1.2.2
[58] xfun_0.47 withr_3.0.1 fastmap_1.2.0
[61] boot_1.3-31 fansi_1.0.6 digest_0.6.37
[64] R6_2.5.1 mime_0.12 colorspace_2.1-1
[67] scattermore_1.2 tensor_1.5 spatstat.data_3.1-2
[70] utf8_1.2.4 tidyr_1.3.1 generics_0.1.3
[73] data.table_1.15.4 httr_1.4.7 htmlwidgets_1.6.4
[76] S4Arrays_1.2.0 uwot_0.1.16 pkgconfig_2.0.3
[79] gtable_0.3.5 lmtest_0.9-40 XVector_0.42.0
[82] htmltools_0.5.8.1 dotCall64_1.1-1 scales_1.3.0
[85] png_0.1-8 knitr_1.48 reshape2_1.4.4
[88] rjson_0.2.21 nloptr_2.1.1 nlme_3.1-165
[91] curl_5.2.1 zoo_1.8-12 stringr_1.5.1
[94] KernSmooth_2.23-24 parallel_4.3.3 miniUI_0.1.1.1
[97] vipor_0.4.7 ggrastr_1.0.2 pillar_1.9.0
[100] grid_4.3.3 vctrs_0.6.5 RANN_2.6.2
[103] promises_1.3.0 xtable_1.8-4 cluster_2.1.6
[106] beeswarm_0.4.0 evaluate_0.24.0 cli_3.6.3
[109] locfit_1.5-9.9 compiler_4.3.3 rlang_1.1.4
[112] crayon_1.5.3 future.apply_1.11.2 labeling_0.4.3
[115] plyr_1.8.9 ggbeeswarm_0.7.2 stringi_1.8.4
[118] BiocParallel_1.36.0 viridisLite_0.4.2 deldir_2.0-4
[121] babelgene_22.9 munsell_0.5.1 lazyeval_0.2.2
[124] spatstat.geom_3.2-9 RcppHNSW_0.6.0 future_1.34.0
[127] statmod_1.5.0 shiny_1.9.1 ROCR_1.0-11
[130] igraph_2.0.3 fastmatch_1.1-4