In this tutorial, we will run all tutorials with a set of 6 PBMC 10x
datasets from 3 covid-19 patients and 3 healthy controls, the samples
have been subsampled to 1500 cells per sample. They are part of the
github repo and if you have cloned the repo they should be available in
folder: labs/data/covid_data_GSE149689
. Instructions on how
to download them can also be found in the Precourse material.
<- "https://raw.githubusercontent.com/NBISweden/workshop-scRNAseq/new_dataset/labs/data/covid_data_GSE149689/sub/"
webpath dir.create("./data/raw", recursive = T)
## Warning in dir.create("./data/raw", recursive = T): './data/raw' already exists
<- c("Normal_PBMC_13.h5", "Normal_PBMC_14.h5", "Normal_PBMC_5.h5", "nCoV_PBMC_15.h5",
file_list "nCoV_PBMC_17.h5", "nCoV_PBMC_1.h5")
for (i in file_list) {
download.file(url = paste0(webpath, i), destfile = paste0("./data/raw/", i))
}
With data in place, now we can start loading libraries we will use in this tutorial.
suppressMessages(require(Seurat))
suppressMessages(require(Matrix))
if (!require(DoubletFinder)) {
::install_github("chris-mcginnis-ucsf/DoubletFinder", upgrade = FALSE,
remotesdependencies = FALSE)
}
## Loading required package: DoubletFinder
suppressMessages(require(DoubletFinder))
We can first load the data individually by reading directly from HDF5 file format (.h5).
.15 <- Seurat::Read10X_h5(filename = "data/raw/nCoV_PBMC_15.h5", use.names = T)
cov.1 <- Seurat::Read10X_h5(filename = "data/raw/nCoV_PBMC_1.h5", use.names = T)
cov.17 <- Seurat::Read10X_h5(filename = "data/raw/nCoV_PBMC_17.h5", use.names = T)
cov
.5 <- Seurat::Read10X_h5(filename = "data/raw/Normal_PBMC_5.h5", use.names = T)
ctrl.13 <- Seurat::Read10X_h5(filename = "data/raw/Normal_PBMC_13.h5", use.names = T)
ctrl.14 <- Seurat::Read10X_h5(filename = "data/raw/Normal_PBMC_14.h5", use.names = T) ctrl
We can now load the expression matricies into objects and then merge
them into a single merged object. Each analysis workflow (Seurat,
Scater, Scranpy, etc) has its own way of storing data. We will add
dataset labels as cell.ids just in case you have overlapping barcodes
between the datasets. After that we add a column Chemistry
in the metadata for plotting later on.
<- CreateSeuratObject(cov.15, project = "covid_15")
sdata.cov15 <- CreateSeuratObject(cov.1, project = "covid_1")
sdata.cov1 <- CreateSeuratObject(cov.17, project = "covid_17")
sdata.cov17 <- CreateSeuratObject(ctrl.5, project = "ctrl_5")
sdata.ctrl5 <- CreateSeuratObject(ctrl.13, project = "ctrl_13")
sdata.ctrl13 <- CreateSeuratObject(ctrl.14, project = "ctrl_14")
sdata.ctrl14
# add metadata
$type = "Covid"
sdata.cov1$type = "Covid"
sdata.cov15$type = "Covid"
sdata.cov17$type = "Ctrl"
sdata.ctrl5$type = "Ctrl"
sdata.ctrl13$type = "Ctrl"
sdata.ctrl14
# Merge datasets into one single seurat object
<- merge(sdata.cov15, c(sdata.cov1, sdata.cov17, sdata.ctrl5, sdata.ctrl13,
alldata add.cell.ids = c("covid_15", "covid_1", "covid_17", "ctrl_5",
sdata.ctrl14), "ctrl_13", "ctrl_14"))
Once you have created the merged Seurat object, the count matrices and individual count matrices and objects are not needed anymore. It is a good idea to remove them and run garbage collect to free up some memory.
# remove all objects that will not be used.
rm(cov.15, cov.1, cov.17, ctrl.5, ctrl.13, ctrl.14, sdata.cov15, sdata.cov1, sdata.cov17,
sdata.ctrl5, sdata.ctrl13, sdata.ctrl14)
# run garbage collect to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3386632 180.9 6555255 350.1 6416109 342.7
## Vcells 44790550 341.8 129023968 984.4 102624970 783.0
Here it is how the count matrix and the metatada look like for every cell.
as.data.frame(alldata@assays$RNA@counts[1:10, 1:2])
head(alldata@meta.data, 10)
Having the data in a suitable format, we can start calculating some quality metrics. We can for example calculate the percentage of mitocondrial and ribosomal genes per cell and add to the metadata. This will be helpfull to visualize them across different metadata parameteres (i.e. datasetID and chemistry version). There are several ways of doing this, and here manually calculate the proportion of mitochondrial reads and add to the metadata table.
Citing from “Simple Single Cell” workflows (Lun, McCarthy & Marioni, 2017): “High proportions are indicative of poor-quality cells (Islam et al. 2014; Ilicic et al. 2016), possibly because of loss of cytoplasmic RNA from perforated cells. The reasoning is that mitochondria are larger than individual transcript molecules and less likely to escape through tears in the cell membrane.”
# Way1: Doing it using Seurat function
<- PercentageFeatureSet(alldata, "^MT-", col.name = "percent_mito")
alldata
# Way2: Doing it manually
<- colSums(alldata@assays$RNA@counts)
total_counts_per_cell <- rownames(alldata)[grep("^MT-", rownames(alldata))]
mito_genes $percent_mito <- colSums(alldata@assays$RNA@counts[mito_genes, ])/total_counts_per_cell
alldata
head(mito_genes, 10)
## [1] "MT-ND1" "MT-ND2" "MT-CO1" "MT-CO2" "MT-ATP8" "MT-ATP6" "MT-CO3"
## [8] "MT-ND3" "MT-ND4L" "MT-ND4"
In the same manner we will calculate the proportion gene expression that comes from ribosomal proteins.
# Way1: Doing it using Seurat function
<- PercentageFeatureSet(alldata, "^RP[SL]", col.name = "percent_ribo")
alldata
# Way2: Doing it manually
<- rownames(alldata)[grep("^RP[SL]", rownames(alldata))]
ribo_genes head(ribo_genes, 10)
$percent_ribo <- colSums(alldata@assays$RNA@counts[ribo_genes, ])/total_counts_per_cell alldata
## [1] "RPL22" "RPL11" "RPS6KA1" "RPS8" "RPL5" "RPS27" "RPS6KC1"
## [8] "RPS7" "RPS27A" "RPL31"
And finally, with the same method we will calculate proportion hemoglobin genes, which can give an indication of red blood cell contamination.
# Percentage hemoglobin genes - includes all genes starting with HB except HBP.
<- PercentageFeatureSet(alldata, "^HB[^(P)]", col.name = "percent_hb")
alldata
<- PercentageFeatureSet(alldata, "PECAM1|PF4", col.name = "percent_plat") alldata
Now we can plot some of the QC-features as violin plots.
<- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
feats VlnPlot(alldata, group.by = "orig.ident", features = feats, pt.size = 0.1, ncol = 3) +
NoLegend()