Datasets
# Transcriptional trajectories will be inferred from data by Nestorowa, Hamey et al. (Blood, 2016):
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5305050/
# The dataset consists in 1600 hematopoietic stem and progenitor cells from mouse bone marrow
# (sequenced using the SMARTseq2 technology). Using flow cytometry and index sorting, 12 HSPC of different phenotypes
# (about 10 cells each) have been included in the dataset, and will be used in this lab as a biological prior for the
# identification of the root and the branches in the transcriptional trajectory models.
# bash:
#
# mkdir data && cd data
# wget -nH -nd http://blood.stemcells.cam.ac.uk/data/nestorowa_corrected_log2_transformed_counts.txt.gz \
# http://blood.stemcells.cam.ac.uk/data/nestorowa_corrected_population_annotation.txt.gz
# wget -nH -nd ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE81nnn/GSE81682/suppl/GSE81682%5FHTSeq%5Fcounts%2Etxt%2Egz
#
# gunzip *
# cd ..
Part I - Monocle2/DDRtree
# Inference done with Monocle2/DDRtree available via Bioconductor
library(monocle)
library(biomaRt)
# data loading
# The authors provide an expression matrix that has been filtered (highly expressed genes, high quality cells)
# scaled and log-normalized. An annotation table is also provided, with each cell type labelled according to
# the immunophenotyping done by flow cytometry.
lognorm <- t(read.table('data/nestorowa_corrected_log2_transformed_counts.txt', sep=" ", header=TRUE))
annotable <- read.table('data/nestorowa_corrected_population_annotation.txt')
# To infer a trajectory with Monocle2/DDRtree, using non-normalized UMI-based counts is highly recommended,
# as Monocle2 will scale and normalize the data internaly and is especting data ditributed according to a negative binomial.
# The count matrix has been downloaded and will be used for Monocle2.
counts <- read.table('data/GSE81682_HTSeq_counts.txt', sep="\t", header=TRUE, row.names='ID')
counts[1:5,1:5]
## HSPC_007 HSPC_013 HSPC_019 HSPC_025 HSPC_031
## ENSMUSG00000000001 0 7 1 185 2
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 4 1 2 4 3
## ENSMUSG00000000031 0 0 0 0 0
## ENSMUSG00000000037 0 0 0 0 0
dim(counts)
## [1] 46175 1920
lognorm[1:5,1:5]
## HSPC_001 HSPC_002 HSPC_003 HSPC_004 HSPC_006
## X1110032F04Rik 0.000000 0.000000 0.000000 0.000000 0.000000
## X1110059E24Rik 0.000000 0.000000 2.795189 1.326478 7.348663
## X1300017J02Rik 0.000000 0.000000 0.000000 0.000000 0.000000
## X1600014C10Rik 0.000000 2.238601 0.000000 1.326478 4.946766
## X1700017B05Rik 1.225439 2.238601 1.989360 2.005685 0.000000
dim(lognorm)
## [1] 3991 1645
# Note that the count matrix is not filtered, and genes are labelled
# according to ensembl gene IDs. We will first filter the matrix according to the authors choices
# and we will map the gene official symbols.
# we filter the counts to keep only high quality cells based on the "lognorm" matrix.
counts <- counts[ , colnames(lognorm) ]
dim(counts)
## [1] 46175 1645
# we create a annotation data frame to label the cell types
# as defined byu the authors
pDat <- data.frame(cell=colnames(counts), celltype='undefined', stringsAsFactors=FALSE)
rownames(pDat) <- pDat$cell
pDat[ rownames(anno), 2] <- as.character(anno$celltype)
head(pDat)
## cell celltype
## HSPC_001 HSPC_001 undefined
## HSPC_002 HSPC_002 undefined
## HSPC_003 HSPC_003 undefined
## HSPC_004 HSPC_004 undefined
## HSPC_006 HSPC_006 undefined
## HSPC_008 HSPC_008 undefined
# we create a feature annotation data frame
# that will contain gene informations and matching symbols and ID
# we now annotate the genes IDs in the counts matrix with biomaRt
mart <- biomaRt::useDataset("mmusculus_gene_ensembl", biomaRt::useMart("ensembl"))
genes_table <- biomaRt::getBM(attributes=c("ensembl_gene_id", "external_gene_name"), values=rownames(counts), mart=mart)
rownames(genes_table) <- genes_table$ensembl_gene_id
head(genes_table)
## ensembl_gene_id external_gene_name
## ENSMUSG00000064372 ENSMUSG00000064372 mt-Tp
## ENSMUSG00000064371 ENSMUSG00000064371 mt-Tt
## ENSMUSG00000064370 ENSMUSG00000064370 mt-Cytb
## ENSMUSG00000064369 ENSMUSG00000064369 mt-Te
## ENSMUSG00000064368 ENSMUSG00000064368 mt-Nd6
## ENSMUSG00000064367 ENSMUSG00000064367 mt-Nd5
fDat <- genes_table[ rownames(counts), ]
# to be consistent with Monocle naming conventions
colnames(fDat) <- c('ensembl_gene_id', 'gene_short_name')
head(fDat)
## ensembl_gene_id gene_short_name
## ENSMUSG00000000001 ENSMUSG00000000001 Gnai3
## ENSMUSG00000000003 ENSMUSG00000000003 Pbsn
## ENSMUSG00000000028 ENSMUSG00000000028 Cdc45
## ENSMUSG00000000031 ENSMUSG00000000031 H19
## ENSMUSG00000000037 ENSMUSG00000000037 Scml2
## ENSMUSG00000000049 ENSMUSG00000000049 Apoh
# we can now use this table to filter the genes
# in the counts matrix that are highly expressed
# according to the quality filters used by the authors
fDat <- fDat[ fDat$gene_short_name %in% rownames(lognorm), ]
# and we finally keep in the counts matrix only these genes
counts <- counts[ rownames(fDat), ]
dim(counts)
## [1] 3823 1645
dim(fDat)
## [1] 3823 2
dim(pDat)
## [1] 1645 2
# we build a cell dataset object
# in an appropriate format for monocle
# default method for modeling the expression values is VGAM::negbinomial.size()
# and adapted to counts
cds <- newCellDataSet(as.matrix(counts), phenoData=Biobase::AnnotatedDataFrame(pDat), featureData=Biobase::AnnotatedDataFrame(fDat))
cds
## CellDataSet (storageMode: environment)
## assayData: 3823 features, 1645 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: HSPC_001 HSPC_002 ... Prog_852 (1645 total)
## varLabels: cell celltype Size_Factor
## varMetadata: labelDescription
## featureData
## featureNames: ENSMUSG00000000001 ENSMUSG00000000028 ...
## ENSMUSG00000107235 (3823 total)
## fvarLabels: ensembl_gene_id gene_short_name
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
# the monocle cds object is done
# and ready for trajectory inference
dir.create('monocle', showWarnings=FALSE)
saveRDS(cds, 'monocle/cds_hematopoiesis.rds')
# Monocle2 preprocess
# normalization and scaling
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)
## Removing 18 outliers
# no need for further filtering, the expression matrix has already be filtered
# we find the genes that are expressed
cds <- detectGenes(cds, min_expr=0.1)
print(head(fData(cds)))
## ensembl_gene_id gene_short_name num_cells_expressed
## ENSMUSG00000000001 ENSMUSG00000000001 Gnai3 1613
## ENSMUSG00000000028 ENSMUSG00000000028 Cdc45 1438
## ENSMUSG00000000056 ENSMUSG00000000056 Narf 1333
## ENSMUSG00000000058 ENSMUSG00000000058 Cav2 577
## ENSMUSG00000000078 ENSMUSG00000000078 Klf6 1560
## ENSMUSG00000000127 ENSMUSG00000000127 Fer 578
# we identify genes that are expressed in at least 10 cells
expressed_genes <- row.names(subset(fData(cds), num_cells_expressed >= 10))
length(expressed_genes)
## [1] 3804
print(head(pData(cds)))
## cell celltype Size_Factor num_genes_expressed
## HSPC_001 HSPC_001 undefined 3.0636805 2046
## HSPC_002 HSPC_002 undefined 0.3118432 1787
## HSPC_003 HSPC_003 undefined 1.3305898 2210
## HSPC_004 HSPC_004 undefined 0.6452131 2097
## HSPC_006 HSPC_006 undefined 0.6569689 2183
## HSPC_008 HSPC_008 undefined 0.7325496 2030
# Identification of the ordering genes by differential testing
# ie. genes that are presumed to be important in the differentiation
# process captured in the sample. We used the cell types identified by the authors
# to define the ordering genes by DE testing. Alternatively, a classical approach
# consist in clustering the cells, then identify markers genes per clusters.
diff_test_res <- differentialGeneTest(cds[ expressed_genes, ], fullModelFormulaStr="~ celltype")
ordering_genes <- row.names(subset(diff_test_res, qval < 0.01))
length(ordering_genes)
## [1] 682
# we mark the genes that will be used for the ordering
cds <- setOrderingFilter(cds, ordering_genes)
# We use the DDRTree algorithm to infer a trajectory with potential branching
# points.
cds <- reduceDimension(cds, max_components = 2, method='DDRTree')
cds <- orderCells(cds)
plot_cell_trajectory(cds, color_by="celltype")

# changing the cell color
cell_colors <- c('lightblue','blue','red','black','orange','yellow','turquoise','lightgrey')
plot_cell_trajectory(cds, color_by="celltype") + scale_color_manual(values=cell_colors)

# The most imamture HSC in the sample express E-Slam
# we will define the root of this model according to this subset of cells
table(pData(cds)$State, pData(cds)$celltype)[,"ESLAM"]
## 1 2 3 4 5
## 0 0 0 10 0
# The state 4 defines the root
# We define the root in the model
cds <- orderCells(cds, root_state = 4)
# The pseudotime is now defined by the distance to the root
plot_cell_trajectory(cds, color_by = "Pseudotime")
# Differential expression testing per branch
# We look at the genes that are differentially expressed
# according to the pseudotime model
diff_test_res <- differentialGeneTest(cds[ ordering_genes, ], fullModelFormulaStr = "~sm.ns(Pseudotime)")
sig_gene_names <- row.names(subset(diff_test_res, qval < 0.1))
plot_pseudotime_heatmap(cds[ sig_gene_names[1:50], ], num_clusters = 3, cores=4, show_rownames=TRUE)
# Differential expression per branch
# let's look for genes involved in the erythroid pathway
BEAM_res <- BEAM(cds, branch_point = 1, cores = 4)
## Warning in if (progenitor_method == "duplicate") {: la condition a une
## longueur > 1 et seul le premier élément est utilisé
## Warning in if (progenitor_method == "sequential_split") {: la condition a
## une longueur > 1 et seul le premier élément est utilisé
BEAM_res <- BEAM_res[order(BEAM_res$qval),]
BEAM_res <- BEAM_res[,c("gene_short_name", "pval", "qval")]
head(BEAM_res)
## gene_short_name pval qval
## ENSMUSG00000004612 Nkg7 0 0
## ENSMUSG00000004655 Aqp1 0 0
## ENSMUSG00000009350 Mpo 0 0
## ENSMUSG00000015937 H2afy 0 0
## ENSMUSG00000016494 Cd34 0 0
## ENSMUSG00000017493 Igfbp4 0 0
plot_genes_branched_heatmap(cds[row.names(BEAM_res)[1:50]], branch_point = 1, num_clusters = 3, cores=4, use_gene_short_name=TRUE, show_rownames=TRUE)
