suppressPackageStartupMessages({
library(Seurat)
library(zellkonverter)
library(Matrix)
library(ggplot2)
library(patchwork)
library(scran)
library(ComplexHeatmap)
library(basilisk)
})
::source_url("https://raw.githubusercontent.com/asabjorklund/single_cell_R_scripts/main/overlap_phyper_v2.R") devtools
With data in place, now we can start loading libraries we will use in this tutorial.
1 Load data
OBS! Zellkonverter installs conda env with basilisk! Takes a while to run first time!!
<- "data/covid/results"
path_results if (!dir.exists(path_results)) dir.create(path_results, recursive = T)
= "../seurat/data/covid/results/"
path_seurat = "../bioc/data/covid/results/"
path_bioc = "../scanpy/data/covid/results/"
path_scanpy
<- "data/covid/results"
path_results if (!dir.exists(path_results)) dir.create(path_results, recursive = T)
= "../seurat/data/covid/results/"
path_seurat = "../bioc/data/covid/results/"
path_bioc = "../scanpy/data/covid/results/"
path_scanpy
# fetch the files with qc and dimred for each
# seurat
= readRDS(file.path(path_seurat,"seurat_covid_qc_dr_int_cl.rds"))
sobj
# bioc
= readRDS(file.path(path_bioc,"bioc_covid_qc_dr_int_cl.rds"))
sce = as.Seurat(sce)
bioc
# scanpy
= readH5AD(file.path(path_scanpy, "scanpy_covid_qc_dr_scanorama_cl.h5ad"))
scanpy.sce = as.Seurat(scanpy.sce, counts = NULL, data = "X") # only have the var.genes data that is scaled. scanpy
2 Umaps
wrap_plots(
DimPlot(sobj, group.by = "orig.ident") + NoAxes() + ggtitle("Seurat"),
DimPlot(bioc, group.by = "sample") + NoAxes() + ggtitle("Bioc"),
DimPlot(scanpy, group.by = "sample", reduction = "X_umap_uncorr") + NoAxes() + ggtitle("Scanpy"),
ncol = 3
)
Create one dataset with the cells that are present in all samples. Also add in umap from all 3 pipelines.
= sobj@meta.data
meta.seurat = scanpy@meta.data
meta.scanpy = bioc@meta.data
meta.bioc
$cell = rownames(meta.bioc)
meta.bioc$cell = sapply(rownames(meta.scanpy), function(x) substr(x,1,nchar(x)-2))
meta.scanpy$cell = unlist(lapply(strsplit(rownames(meta.seurat),"_"), function(x) x[3])) meta.seurat
= intersect(intersect(meta.scanpy$cell, meta.seurat$cell), meta.bioc$cell)
in.all
= meta.bioc[match(in.all, meta.bioc$cell),]
tmp1 colnames(tmp1) = paste0(colnames(tmp1),"_bioc")
= meta.scanpy[match(in.all, meta.scanpy$cell),]
tmp2 colnames(tmp2) = paste0(colnames(tmp2),"_scpy")
= sobj[,match(in.all, meta.seurat$cell)]
all
= cbind(all@meta.data, tmp1,tmp2)
meta.all @meta.data = meta.all
all
Reductions(all)
[1] "pca" "umap" "tsne"
[4] "UMAP10_on_PCA" "UMAP_on_ScaleData" "integrated_cca"
[7] "umap_cca" "tsne_cca" "harmony"
[10] "umap_harmony" "scanorama" "scanoramaC"
[13] "umap_scanorama" "umap_scanoramaC"
= bioc@reductions$UMAP_on_PCA@cell.embeddings[match(in.all, meta.bioc$cell),]
tmp rownames(tmp) = colnames(all)
"umap_bioc"]] = CreateDimReducObject(tmp, key = "umapbioc_", assay = "RNA")
all[[= scanpy@reductions$X_umap_uncorr@cell.embeddings[match(in.all, meta.scanpy$cell),]
tmp rownames(tmp) = colnames(all)
"umap_scpy"]] = CreateDimReducObject(tmp, key = "umapscpy_", assay = "RNA")
all[[
Reductions(all)
[1] "pca" "umap" "tsne"
[4] "UMAP10_on_PCA" "UMAP_on_ScaleData" "integrated_cca"
[7] "umap_cca" "tsne_cca" "harmony"
[10] "umap_harmony" "scanorama" "scanoramaC"
[13] "umap_scanorama" "umap_scanoramaC" "umap_bioc"
[16] "umap_scpy"
3 Variable features
In Seurat, FindVariableFeatures
is not batch aware unless the data is split into layers by samples, here we have the variable genes created with layers. In Bioc modelGeneVar
we used sample as a blocking parameter, e.g calculates the variable genes per sample and combines the variances. Similar in scanpy we used the samples as batch_key
.
= list()
hvgs $seurat = VariableFeatures(sobj)
hvgs$bioc = sce@metadata$hvgs
hvgs# scanpy has no strict number on selection, instead uses dispersion cutoff. So select instead top 2000 dispersion genes.
= rowData(scanpy.sce)
scpy.hvg = rownames(scpy.hvg)[scpy.hvg$highly_variable]
hvgs_scanpy $scanpy = rownames(scpy.hvg)[order(scpy.hvg$dispersions_norm, decreasing = T)][1:2000]
hvgs
= make_comb_mat(hvgs)
cmat print(cmat)
A combination matrix with 3 sets and 7 combinations.
ranges of combination set size: c(245, 869).
mode for the combination size: distinct.
sets are on rows.
Combination sets are:
seurat bioc scanpy code size
x x x 111 607
x x 110 279
x x 101 245
x x 011 344
x 100 869
x 010 770
x 001 804
Sets are:
set size
seurat 2000
bioc 2000
scanpy 2000
UpSet(cmat)
Surprisingly low overlap between the methods and many genes that are unique to one pipeline. With discrepancies in the doublet filtering the cells used differ to some extent, but otherwise the variation should be similar. Even if it is estimated in different ways.
Is the differences more due to the combination of ranks/dispersions or also found within a single dataset?
Only Seurat have the dispersions for each individual dataset stored in the object.
3.1 Compare dispersions
Recalculate for bioc.
<- modelGeneVar(sce, block = sce$sample)
var.out <- getTopHVGs(var.out, n = 2000)
hvgs_bioc <- rownames(var.out) %in% hvgs_bioc cutoff
# Merge all hvg info
= intersect(rownames(var.out), rownames(scpy.hvg))
all.genes
= rowData(scanpy.sce)
scpy.hvg colnames(scpy.hvg) = paste0(colnames(scpy.hvg), "_scpy")
colnames(var.out) = paste0(colnames(var.out), "_bioc")
= HVFInfo(all)
seu.hvg
= cbind(seu.hvg[all.genes,], scpy.hvg[all.genes,], var.out[all.genes,])
all.hvg
par(mfrow=c(2,2))
plot(all.hvg$means_scpy, all.hvg$dispersions_norm_scpy, main = "Scanpy",pch = 16, cex = 0.4)
points(all.hvg$means_scpy[all.hvg$highly_variable_scpy], all.hvg$dispersions_norm_scpy[all.hvg$highly_variable_scpy], col="red",pch = 16, cex = 0.4)
plot(all.hvg$mean_bioc, all.hvg$bio_bioc, pch = 16, cex = 0.4, main = "Bioc")
points(all.hvg$mean_bioc[rownames(all.hvg) %in% hvgs$bioc], all.hvg$bio_bioc[rownames(all.hvg) %in% hvgs$bioc], col = "red", pch = 16, cex = .6)
plot(log1p(all.hvg$mean), all.hvg$variance.standardized, main = "Seurat",pch = 16, cex = 0.4)
points(log1p(all.hvg$mean)[match(hvgs$seurat, rownames(all.hvg))], all.hvg$variance.standardized[match(hvgs$seurat, rownames(all.hvg))],col="red",pch = 16, cex = 0.4)
Scanpy uses min_disp=0.5, min_mean=0.0125, max_mean=3, so the highly expressed ones are not included.
Seurat uses variance.standardized
to rank the genes. Bioc uses the bio
slot with estimated biological variation.
$mean_log = log1p(all.hvg$mean)
all.hvg= c("mean","mean_log","means_scpy", "mean_bioc")
sel.means pairs(all.hvg[,sel.means])
= c("variance.standardized","dispersions_norm_scpy", "bio_bioc","total_bioc")
sel.disp pairs(all.hvg[,sel.disp])
Difference in what cells were used, and also in how the dispersions across the samples is combined into one value. Do for just one sample instead.
4 For one sample
Run hvg selection for ctrl.13 sample, so that the handling of batches is not influencing results. Use default settings in all the different methods.
4.1 Seurat
Try the different methods implemented in Seurat. From the help section:
* “vst”: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then standardizes the feature values using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).
* “mean.var.plot” (mvp): First, uses a function to calculate average expression (mean.function) and dispersion (dispersion.function) for each feature. Next, divides features into num.bin (deafult 20) bins based on their average expression, and calculates z-scores for dispersion within each bin. The purpose of this is to identify variable features while controlling for the strong relationship between variability and average expression
* “dispersion” (disp): selects the genes with the highest dispersion values
Feature selection for individual datasets
In each dataset, we next aimed to identify a subset of features (e.g., genes) exhibiting high variability across cells, and therefore represent heterogeneous features to prioritize for downstream analysis. Choosing genes solely based on their log-normalized single-cell variance fails to account for the mean-variance relationship that is inherent to single-cell RNA-seq. Therefore, we first applied a variance-stabilizing transformation to correct for this [Mayer et al., 2018, Hafemeister and Satija, 2019]. To learn the mean-variance relationship from the data, we computed the mean and variance of each gene using the unnormalized data (i.e., UMI or counts matrix), and applied -transformation to both. We then fit a curve to predict the variance of each gene as a function of its mean, by calculating a local fitting of polynomials of degree 2 (R function loess, span = 0.3). This global fit provided us with a regularized estimator of variance given the mean of a feature. As such, we could use it to standardize feature counts without removing higher-than-expected variation.
= all[,all$orig.ident == "ctrl_13"]
ctrl @assays$RNA@meta.data[,1:ncol(ctrl@assays$RNA@meta.data)] = NULL
ctrl
= FindVariableFeatures(ctrl)
ctrl = list()
hvg_seu $vst = VariableFeatures(ctrl)
hvg_seu#top20 <- head(hvg_seu$vst, 20)
#LabelPoints(plot = VariableFeaturePlot(ctrl), points = top20, repel = TRUE)
= FindVariableFeatures(ctrl, selection.method = "mean.var.plot")
ctrl $mvp = VariableFeatures(ctrl)
hvg_seu#top20 <- head(hvg_seu$mvp, 20)
#LabelPoints(plot = VariableFeaturePlot(ctrl), points = top20, repel = TRUE)
= FindVariableFeatures(ctrl, selection.method = "dispersion")
ctrl $disp = VariableFeatures(ctrl)
hvg_seu#top20 <- head(hvg_seu$disp, 20)
#LabelPoints(plot = VariableFeaturePlot(ctrl), points = top20, repel = TRUE)
= make_comb_mat(hvg_seu)
cmat cmat
A combination matrix with 3 sets and 7 combinations.
ranges of combination set size: c(31, 1295).
mode for the combination size: distinct.
sets are on rows.
Combination sets are:
vst mvp disp code size
x x x 111 401
x x 110 31
x x 101 273
x x 011 549
x 100 1295
x 010 149
x 001 777
Sets are:
set size
vst 2000
mvp 1130
disp 2000
Very little overlap between the methods in Seurat. Mvp and disp are calculated on data
, while vst is done on counts
.
= ctrl@assays$RNA@meta.data
vinfo.seu rownames(vinfo.seu) = rownames(ctrl)
$vg_vst = rownames(vinfo.seu) %in% hvg_seu$vst
vinfo.seu$vg_disp = rownames(vinfo.seu) %in% hvg_seu$disp
vinfo.seu$vg_mvp = rownames(vinfo.seu) %in% hvg_seu$mvp
vinfo.seu$vg_vst_disp = vinfo.seu$vg_vst + vinfo.seu$vg_disp == 2
vinfo.seu$hvinfo = ifelse(vinfo.seu$vg_vst, "VST",NA)
vinfo.seu$hvinfo[vinfo.seu$vg_mvp] = "MVP"
vinfo.seu$hvinfo[vinfo.seu$vg_disp] = "DISP"
vinfo.seu$hvinfo[vinfo.seu$vg_vst_disp] = "VST/DISP"
vinfo.seu
= colnames(vinfo.seu)[grepl("mean", colnames(vinfo.seu))]
means = c("vf_vst_counts.ctrl_13_variance.standardized","vf_disp_data.ctrl_13_mvp.dispersion.scaled","vf_mvp_data.ctrl_13_mvp.dispersion.scaled")
disp pairs(vinfo.seu[,means])
= function(df, m,d,vg, log=FALSE){
plot_hvg = head(vg,20)
top20 = ggplot(df, aes(x=.data[[m]], y=.data[[d]], color=hvinfo)) + geom_point() + theme_classic()
p if (log) { p = p + scale_x_log10()}
LabelPoints(plot = p, points = top20, repel = TRUE)
}
= plot_hvg(vinfo.seu, "vf_vst_counts.ctrl_13_mean", "vf_vst_counts.ctrl_13_variance.standardized",hvg_seu$vst, log=TRUE)
p1S = plot_hvg(vinfo.seu, "vf_disp_data.ctrl_13_mvp.mean", "vf_disp_data.ctrl_13_mvp.dispersion.scaled",hvg_seu$disp)
p2S = plot_hvg(vinfo.seu, "vf_mvp_data.ctrl_13_mvp.mean", "vf_mvp_data.ctrl_13_mvp.dispersion.scaled",hvg_seu$mvp)
p3S wrap_plots(p1S,p2S,p3S, ncol=2)
4.2 Bioc
Run the bioconductor detection method with modelGeneVar
and getTopHVGs
. Plot both total variance and “bio” variance.
= as.SingleCellExperiment(ctrl)
ctrl.sce <- modelGeneVar(ctrl.sce)
var.out <- getTopHVGs(var.out, n = 2000)
hvgs_bioc
= data.frame(var.out)
var.out.df $hvg = rownames(var.out) %in% hvgs_bioc
var.out.df= ggplot(var.out.df, aes(x=mean, y=total, colour = hvg)) + geom_point() + theme_classic()
p
<- head(hvgs_bioc, 20)
top20 = LabelPoints(plot = p, points = top20, repel = TRUE)
pB pB
= ggplot(var.out.df, aes(x=mean, y=bio, colour = hvg)) + geom_point() + theme_classic()
p2
<- head(hvgs_bioc, 20)
top20 = LabelPoints(plot = p2, points = top20, repel = TRUE)
pB2 pB2
4.3 Scanpy
Run with both seurat
(on lognorm data) and seurat_v3
(on counts, is same as vst)
For the dispersion-based methods (flavor=‘seurat’ Satija et al. [2015] and flavor=‘cell_ranger’ Zheng et al. [2017]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for genes falling into a given bin for mean expression of genes. This means that for each bin of mean expression, highly variable genes are selected.
For flavor=‘seurat_v3’/‘seurat_v3_paper’ [Stuart et al., 2019], a normalized variance for each gene is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each gene after the transformation. Genes are ranked by the normalized variance. Only if batch_key is not None, the two flavors differ: For flavor=‘seurat_v3’, genes are first sorted by the median (across batches) rank, with ties broken by the number of batches a gene is a HVG. For flavor=‘seurat_v3_paper’, genes are first sorted by the number of batches a gene is a HVG, with ties broken by the median (across batches) rank.
= "/Users/asabjor/miniconda3/envs/scanpy_2024_nopip"
penv = basiliskRun(env=penv, fun=function(counts) {
hvg.scanpy <- reticulate::import("scanpy")
scanpy = reticulate::import("anndata")
ad = ad$AnnData(counts)
adata print(adata$X[1:10,1:10])
= scanpy$pp$highly_variable_genes(adata, flavor = "seurat_v3", inplace=FALSE)
var1 $pp$normalize_per_cell(adata, counts_per_cell_after=1e4)
scanpy$pp$log1p(adata)
scanpyprint(adata$X[1:10,1:10])
$pp$highly_variable_genes(adata)
scanpyreturn(list(disp=adata$var, vst=var1))
counts = t(ctrl@assays$RNA@layers$counts.ctrl_13), testload="scanpy")
},
#flavor
#Literal['seurat', 'cell_ranger', 'seurat_v3', 'seurat_v3_paper'] (default: 'seurat')
rownames(hvg.scanpy$disp) = rownames(ctrl)
rownames(hvg.scanpy$vst) = rownames(ctrl)
<- head(rownames(hvg.scanpy$disp)[order(hvg.scanpy$disp$dispersions_norm, decreasing = T)], 20)
top20 = ggplot(hvg.scanpy$disp, aes(x=means, y=dispersions_norm, colour = highly_variable)) + geom_point() + theme_classic() + ggtitle("Scanpy disp")
p
= LabelPoints(plot = p, points = top20, repel = TRUE)
pS pS
<- head(rownames(hvg.scanpy$vst)[order(hvg.scanpy$vst$variances_norm, decreasing = T)], 20)
top20 = ggplot(hvg.scanpy$vst, aes(x=means, y=variances_norm, colour = highly_variable)) + geom_point() + theme_classic() + ggtitle("Scanpy vst") + scale_x_log10()
p2
= LabelPoints(plot = p2, points = top20, repel = TRUE)
pS2 pS2
4.4 All together
colnames(hvg.scanpy$disp) = paste0(colnames(hvg.scanpy$disp), "_scpyD")
colnames(hvg.scanpy$vst) = paste0(colnames(hvg.scanpy$vst), "_scpyV")
colnames(var.out) = paste0(colnames(var.out), "_bioc")
= cbind(vinfo.seu, var.out, hvg.scanpy$disp, hvg.scanpy$vst) ctrl.hvg
wrap_plots(p1S + ggtitle("Seurat vst"),p2S + ggtitle("Seurat disp"),p3S + ggtitle("Seurat mvp"),pB + ggtitle("Bioc total"),pS,pS2, ncol=3)
= c("vf_vst_counts.ctrl_13_variance.standardized","vf_disp_data.ctrl_13_mvp.dispersion.scaled","bio_bioc", "dispersions_norm_scpyD", "variances_norm_scpyV")
sel
pairs(ctrl.hvg[,sel])
= c("vf_vst_counts.ctrl_13_mean_log","vf_disp_data.ctrl_13_mvp.mean", "mean_bioc", "means_scpyD", "means_scpyV_log")
sel
$vf_vst_counts.ctrl_13_mean_log = log1p(ctrl.hvg$vf_vst_counts.ctrl_13_mean)
ctrl.hvg$means_scpyV_log = log1p(ctrl.hvg$means_scpyV)
ctrl.hvg
pairs(ctrl.hvg[,sel])
= hvg_seu
hvgs $scpyD = rownames(hvg.scanpy$disp)[hvg.scanpy$disp$highly_variable_scpyD]
hvgs$scpyV = rownames(hvg.scanpy$disp)[hvg.scanpy$vst$highly_variable_scpyV]
hvgs$bioc = hvgs_bioc
hvgs
= overlap_phyper2(hvgs,hvgs, remove.diag = T) o
Mean Bioc stands out the most.
Vst in seurat and scanpy is identical and very different from the other methods. But it is also based on counts instead of lognorm data.
The variance estimate for the same sample is quite different. Even the top variable genes are not the same and are very different gene groups.
4.5 Top genes
Explore top genes with the different methods.
disp and mvp are identical, remove mvp.
ScanpyV and vst are identical, remove scpyV.
= lapply(hvgs, head, 10)
topG
# same for mvp and disp, remove one of them.
$mvp = NULL
topG$scpyV = NULL
topG
# sort the genes for scanpy.
$scpyD = rownames(ctrl.hvg)[order(ctrl.hvg$dispersions_norm_scpyD, decreasing = T)][1:10]
topG
# rank the genes
= unique(unlist(topG))
allG
= Reduce(rbind, lapply(topG, function(x) {
ranks = 1:10
r names(r) = x
= r[allG]
r2 return(r2)
}))
colnames(ranks) = allG
rownames(ranks) = names(topG)
::pheatmap(ranks, display_numbers = ranks, main="Rank of top 10 gens", cluster_rows = F, cluster_cols = F) pheatmap
Plot onto umap for the unique ones.
$disp = NULL
topG
= table(unlist(topG))
t = names(t)[t==1]
selG = sapply(selG, function(y) names(which(unlist(lapply(topG, function(x) any(x==y))))))
selG = sort(selG)
selG
#small.leg <- theme(legend.text = element_text(size=3), legend.key.size = unit(0.5,"point"))
= list()
plots for (g in names(selG)){
= FeaturePlot(ctrl, reduction = "umap_harmony", features = g, order = T) + ggtitle(paste(g,selG[g], collapse = " - ")) + NoAxes() #+ small.leg
plots[[g]]
}
wrap_plots(plots, ncol = 4)
More clear celltype genes in the Bioc selection, but also higher expressed genes.. More B-cell genes for vst.
4.6 Expression levels
Expression levels of the variable genes. As total counts or mean expression across all cells.
$nC = rowSums(ctrl@assays$RNA@layers$counts.ctrl_13>0)
ctrl.hvg$meanE = rowMeans(ctrl@assays$RNA@layers$data.ctrl_13)
ctrl.hvg$vg_bioc = rownames(ctrl.hvg) %in% hvgs_bioc
ctrl.hvg
wrap_plots(
ggplot(ctrl.hvg, aes(x=nC, fill=vg_vst)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("VST"),
ggplot(ctrl.hvg, aes(x=nC, fill=vg_disp)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Disp"),
ggplot(ctrl.hvg, aes(x=nC, fill=highly_variable_scpyD)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy Disp"),
ggplot(ctrl.hvg, aes(x=nC, fill=highly_variable_scpyV)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy VST"),
ggplot(ctrl.hvg, aes(x=nC, fill=vg_bioc)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Bioc"),
ggplot(ctrl.hvg, aes(x=meanE, fill=vg_vst)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("VST"),
ggplot(ctrl.hvg, aes(x=meanE, fill=vg_disp)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Disp"),
ggplot(ctrl.hvg, aes(x=meanE, fill=highly_variable_scpyD)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy Disp"),
ggplot(ctrl.hvg, aes(x=meanE, fill=highly_variable_scpyV)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy VST"),
ggplot(ctrl.hvg, aes(x=meanE, fill=vg_bioc)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend()+ ggtitle("Bioc"),
ncol =5
)
BioC distibution shifted to more highly expressed genes.
Dispersion gives the very highly expressed genes, but also has a spread among all levels of expression.
5 Discussion
Seurat v3 paper suggests vst on counts, but then uses the variable genes in lognorm space, how is this the best option_
From SC best practices, suggests scry for HVG selection. https://www.sc-best-practices.org/preprocessing_visualization/feature_selection.html
5.1 scry
Run scry deviance function and select top 2k genes.
<-scry::devianceFeatureSelection(ctrl.sce, assay="counts", sorted=TRUE)
ctrl.sceplot(rowData(ctrl.sce)$binomial_deviance, type="l", xlab="ranked genes",
ylab="binomial deviance", main="Feature Selection with Deviance")
abline(v=2000, lty=2, col="red")
head(rowData(ctrl.sce),10)
DataFrame with 10 rows and 1 column
binomial_deviance
<numeric>
S100A9 74343.0
FTL 53573.3
GNLY 52928.0
LYZ 48572.1
S100A8 42210.5
NKG7 36843.2
FTH1 35595.1
B2M 23773.7
CCL5 22340.5
HLA-DRA 21938.7
Top genes are similar to the other methods.
$deviance = rowData(ctrl.sce)$binomial_deviance[match(rownames(ctrl.hvg), rownames(rowData(ctrl.sce)))]
ctrl.hvg$devG = rownames(ctrl.hvg) %in% head(rownames(rowData(ctrl.sce)),2000)
ctrl.hvg
wrap_plots(
ggplot(ctrl.hvg, aes(x=nC, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("nCell vs deviance") + NoLegend(),
ggplot(ctrl.hvg, aes(x=meanE, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("mean expression vs deviance")+ NoLegend(),
ggplot(ctrl.hvg, aes(x=dispersions_norm_scpyD, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("Dispersion vs deviance")+ NoLegend(),
ggplot(ctrl.hvg, aes(x=variances_norm_scpyV, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("VST vs deviance")+ NoLegend(),
ggplot(ctrl.hvg, aes(x=bio_bioc, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("bioc bio vs deviance")+ NoLegend(),
ncol = 3
)
Selects all genes with high expression
6 Session info
Click here
sessionInfo()
R version 4.3.3 (2024-02-29)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Big Sur ... 10.16
Matrix products: default
BLAS/LAPACK: /Users/asabjor/miniconda3/envs/seurat5_u/lib/libopenblasp-r0.3.28.dylib; LAPACK version 3.12.0
locale:
[1] sv_SE.UTF-8/sv_SE.UTF-8/sv_SE.UTF-8/C/sv_SE.UTF-8/sv_SE.UTF-8
time zone: Europe/Stockholm
tzcode source: system (macOS)
attached base packages:
[1] grid stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] pheatmap_1.0.12 basilisk_1.14.1
[3] ComplexHeatmap_2.18.0 scran_1.30.0
[5] scuttle_1.12.0 SingleCellExperiment_1.24.0
[7] SummarizedExperiment_1.32.0 Biobase_2.62.0
[9] GenomicRanges_1.54.1 GenomeInfoDb_1.38.1
[11] IRanges_2.36.0 S4Vectors_0.40.2
[13] BiocGenerics_0.48.1 MatrixGenerics_1.14.0
[15] matrixStats_1.5.0 patchwork_1.3.0
[17] ggplot2_3.5.1 Matrix_1.6-5
[19] zellkonverter_1.12.1 Seurat_5.1.0
[21] SeuratObject_5.0.2 sp_2.1-4
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.3.3
[3] later_1.4.1 bitops_1.0-9
[5] filelock_1.0.3 tibble_3.2.1
[7] polyclip_1.10-7 basilisk.utils_1.14.1
[9] fastDummies_1.7.4 lifecycle_1.0.4
[11] doParallel_1.0.17 edgeR_4.0.16
[13] globals_0.16.3 lattice_0.22-6
[15] MASS_7.3-60.0.1 magrittr_2.0.3
[17] limma_3.58.1 plotly_4.10.4
[19] rmarkdown_2.29 remotes_2.5.0
[21] yaml_2.3.10 metapod_1.10.0
[23] httpuv_1.6.15 sctransform_0.4.1
[25] spam_2.11-0 sessioninfo_1.2.2
[27] pkgbuild_1.4.5 spatstat.sparse_3.1-0
[29] reticulate_1.40.0 cowplot_1.1.3
[31] pbapply_1.7-2 RColorBrewer_1.1-3
[33] pkgload_1.4.0 abind_1.4-5
[35] zlibbioc_1.48.0 Rtsne_0.17
[37] purrr_1.0.2 RCurl_1.98-1.16
[39] circlize_0.4.16 GenomeInfoDbData_1.2.11
[41] ggrepel_0.9.6 scry_1.14.0
[43] irlba_2.3.5.1 listenv_0.9.1
[45] spatstat.utils_3.1-2 goftest_1.2-3
[47] RSpectra_0.16-2 spatstat.random_3.3-2
[49] dqrng_0.3.2 fitdistrplus_1.2-2
[51] parallelly_1.41.0 DelayedMatrixStats_1.24.0
[53] leiden_0.4.3.1 codetools_0.2-20
[55] DelayedArray_0.28.0 shape_1.4.6.1
[57] tidyselect_1.2.1 farver_2.1.2
[59] ScaledMatrix_1.10.0 spatstat.explore_3.3-4
[61] jsonlite_1.8.9 GetoptLong_1.0.5
[63] BiocNeighbors_1.20.0 ellipsis_0.3.2
[65] progressr_0.15.1 iterators_1.0.14
[67] ggridges_0.5.6 survival_3.8-3
[69] foreach_1.5.2 tools_4.3.3
[71] ica_1.0-3 Rcpp_1.0.13-1
[73] glue_1.8.0 gridExtra_2.3
[75] SparseArray_1.2.2 xfun_0.50
[77] usethis_3.1.0 dplyr_1.1.4
[79] withr_3.0.2 fastmap_1.2.0
[81] bluster_1.12.0 digest_0.6.37
[83] rsvd_1.0.5 R6_2.5.1
[85] mime_0.12 colorspace_2.1-1
[87] Cairo_1.6-2 scattermore_1.2
[89] tensor_1.5 spatstat.data_3.1-4
[91] tidyr_1.3.1 generics_0.1.3
[93] data.table_1.15.4 httr_1.4.7
[95] htmlwidgets_1.6.4 S4Arrays_1.2.0
[97] uwot_0.1.16 pkgconfig_2.0.3
[99] gtable_0.3.6 lmtest_0.9-40
[101] XVector_0.42.0 htmltools_0.5.8.1
[103] profvis_0.4.0 dotCall64_1.2
[105] clue_0.3-66 scales_1.3.0
[107] png_0.1-8 spatstat.univar_3.1-1
[109] knitr_1.49 rjson_0.2.23
[111] reshape2_1.4.4 curl_6.0.1
[113] nlme_3.1-165 cachem_1.1.0
[115] GlobalOptions_0.1.2 zoo_1.8-12
[117] stringr_1.5.1 KernSmooth_2.23-26
[119] parallel_4.3.3 miniUI_0.1.1.1
[121] pillar_1.10.1 vctrs_0.6.5
[123] RANN_2.6.2 urlchecker_1.0.1
[125] promises_1.3.2 BiocSingular_1.18.0
[127] beachmat_2.18.0 xtable_1.8-4
[129] cluster_2.1.8 evaluate_1.0.1
[131] cli_3.6.3 locfit_1.5-9.10
[133] compiler_4.3.3 rlang_1.1.4
[135] crayon_1.5.3 future.apply_1.11.2
[137] labeling_0.4.3 fs_1.6.5
[139] plyr_1.8.9 stringi_1.8.4
[141] viridisLite_0.4.2 deldir_2.0-4
[143] BiocParallel_1.36.0 munsell_0.5.1
[145] lazyeval_0.2.2 devtools_2.4.5
[147] spatstat.geom_3.3-4 dir.expiry_1.10.0
[149] RcppHNSW_0.6.0 sparseMatrixStats_1.14.0
[151] future_1.34.0 statmod_1.5.0
[153] shiny_1.10.0 ROCR_1.0-11
[155] memoise_2.0.1 igraph_2.0.3