Datasets

# Transcriptional trajectories will be inferred from data by Nestorowa, Hamey et al. (Blood, 2016):
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5305050/
# The dataset consists in 1600 hematopoietic stem and progenitor cells from mouse bone marrow
# (sequenced using the SMARTseq2 technology). Using flow cytometry and index sorting, 12 HSPC of different phenotypes
# (about 10 cells each) have been included in the dataset, and will be used in this lab as a biological prior for the
# identification of the root and the branches in the transcriptional trajectory models.

# bash:
# 
# mkdir data && cd data
# wget -nH -nd    http://blood.stemcells.cam.ac.uk/data/nestorowa_corrected_log2_transformed_counts.txt.gz \
# http://blood.stemcells.cam.ac.uk/data/nestorowa_corrected_population_annotation.txt.gz
# wget -nH -nd ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE81nnn/GSE81682/suppl/GSE81682%5FHTSeq%5Fcounts%2Etxt%2Egz
# 
# gunzip *
# cd ..

Part I - Monocle2/DDRtree

# Inference done with Monocle2/DDRtree available via Bioconductor

library(monocle)
library(biomaRt)

# data loading

# The authors provide an expression matrix that has been filtered (highly expressed genes, high quality cells)
# scaled and log-normalized. An annotation table is also provided, with each cell type labelled according to
# the immunophenotyping done by flow cytometry.

lognorm <- t(read.table('data/nestorowa_corrected_log2_transformed_counts.txt', sep=" ", header=TRUE))
annotable <- read.table('data/nestorowa_corrected_population_annotation.txt')

# To infer a trajectory with Monocle2/DDRtree, using non-normalized UMI-based counts is highly recommended,
# as Monocle2 will scale and normalize the data internaly and is especting data ditributed according to a negative binomial.

# The count matrix has been downloaded and will be used for Monocle2.

counts <- read.table('data/GSE81682_HTSeq_counts.txt', sep="\t", header=TRUE, row.names='ID')

counts[1:5,1:5]
##                    HSPC_007 HSPC_013 HSPC_019 HSPC_025 HSPC_031
## ENSMUSG00000000001        0        7        1      185        2
## ENSMUSG00000000003        0        0        0        0        0
## ENSMUSG00000000028        4        1        2        4        3
## ENSMUSG00000000031        0        0        0        0        0
## ENSMUSG00000000037        0        0        0        0        0
dim(counts)
## [1] 46175  1920
lognorm[1:5,1:5]
##                HSPC_001 HSPC_002 HSPC_003 HSPC_004 HSPC_006
## X1110032F04Rik 0.000000 0.000000 0.000000 0.000000 0.000000
## X1110059E24Rik 0.000000 0.000000 2.795189 1.326478 7.348663
## X1300017J02Rik 0.000000 0.000000 0.000000 0.000000 0.000000
## X1600014C10Rik 0.000000 2.238601 0.000000 1.326478 4.946766
## X1700017B05Rik 1.225439 2.238601 1.989360 2.005685 0.000000
dim(lognorm)
## [1] 3991 1645
# Note that the count matrix is not filtered, and genes are labelled
# according to ensembl gene IDs. We will first filter the matrix according to the authors choices
# and we will map the gene official symbols.

# we filter the counts to keep only high quality cells based on the "lognorm" matrix.

counts <- counts[ , colnames(lognorm) ]
dim(counts)
## [1] 46175  1645
# we create a annotation data frame to label the cell types
# as defined byu the authors
pDat <- data.frame(cell=colnames(counts), celltype='undefined', stringsAsFactors=FALSE)
rownames(pDat) <- pDat$cell
pDat[ rownames(anno), 2] <- as.character(anno$celltype)
head(pDat)
##              cell  celltype
## HSPC_001 HSPC_001 undefined
## HSPC_002 HSPC_002 undefined
## HSPC_003 HSPC_003 undefined
## HSPC_004 HSPC_004 undefined
## HSPC_006 HSPC_006 undefined
## HSPC_008 HSPC_008 undefined
# we create a feature annotation data frame
# that will contain gene informations and matching symbols and ID
# we now annotate the genes IDs in the counts matrix with biomaRt
mart <- biomaRt::useDataset("mmusculus_gene_ensembl", biomaRt::useMart("ensembl"))
genes_table <- biomaRt::getBM(attributes=c("ensembl_gene_id", "external_gene_name"), values=rownames(counts), mart=mart)
rownames(genes_table) <- genes_table$ensembl_gene_id
head(genes_table)
##                       ensembl_gene_id external_gene_name
## ENSMUSG00000064372 ENSMUSG00000064372              mt-Tp
## ENSMUSG00000064371 ENSMUSG00000064371              mt-Tt
## ENSMUSG00000064370 ENSMUSG00000064370            mt-Cytb
## ENSMUSG00000064369 ENSMUSG00000064369              mt-Te
## ENSMUSG00000064368 ENSMUSG00000064368             mt-Nd6
## ENSMUSG00000064367 ENSMUSG00000064367             mt-Nd5
fDat <- genes_table[ rownames(counts), ]
# to be consistent with Monocle naming conventions
colnames(fDat) <- c('ensembl_gene_id', 'gene_short_name')
head(fDat)
##                       ensembl_gene_id gene_short_name
## ENSMUSG00000000001 ENSMUSG00000000001           Gnai3
## ENSMUSG00000000003 ENSMUSG00000000003            Pbsn
## ENSMUSG00000000028 ENSMUSG00000000028           Cdc45
## ENSMUSG00000000031 ENSMUSG00000000031             H19
## ENSMUSG00000000037 ENSMUSG00000000037           Scml2
## ENSMUSG00000000049 ENSMUSG00000000049            Apoh
# we can now use this table to filter the genes
# in the counts matrix that are highly expressed
# according to the quality filters used by the authors

fDat <- fDat[ fDat$gene_short_name %in% rownames(lognorm), ]

# and we finally keep in the counts matrix only these genes

counts <- counts[ rownames(fDat), ]
dim(counts)
## [1] 3823 1645
dim(fDat)
## [1] 3823    2
dim(pDat)
## [1] 1645    2
# we build a cell dataset object
# in an appropriate format for monocle
# default method for modeling the expression values is VGAM::negbinomial.size()
# and adapted to counts

cds <- newCellDataSet(as.matrix(counts), phenoData=Biobase::AnnotatedDataFrame(pDat), featureData=Biobase::AnnotatedDataFrame(fDat))
cds
## CellDataSet (storageMode: environment)
## assayData: 3823 features, 1645 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: HSPC_001 HSPC_002 ... Prog_852 (1645 total)
##   varLabels: cell celltype Size_Factor
##   varMetadata: labelDescription
## featureData
##   featureNames: ENSMUSG00000000001 ENSMUSG00000000028 ...
##     ENSMUSG00000107235 (3823 total)
##   fvarLabels: ensembl_gene_id gene_short_name
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
# the monocle cds object is done
# and ready for trajectory inference
dir.create('monocle', showWarnings=FALSE)
saveRDS(cds, 'monocle/cds_hematopoiesis.rds')

# Monocle2 preprocess
# normalization and scaling
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)
## Removing 18 outliers
# no need for further filtering, the expression matrix has already be filtered
# we find the genes that are expressed
cds <- detectGenes(cds, min_expr=0.1)
print(head(fData(cds)))
##                       ensembl_gene_id gene_short_name num_cells_expressed
## ENSMUSG00000000001 ENSMUSG00000000001           Gnai3                1613
## ENSMUSG00000000028 ENSMUSG00000000028           Cdc45                1438
## ENSMUSG00000000056 ENSMUSG00000000056            Narf                1333
## ENSMUSG00000000058 ENSMUSG00000000058            Cav2                 577
## ENSMUSG00000000078 ENSMUSG00000000078            Klf6                1560
## ENSMUSG00000000127 ENSMUSG00000000127             Fer                 578
# we identify genes that are expressed in at least 10 cells
expressed_genes <- row.names(subset(fData(cds), num_cells_expressed >= 10))
length(expressed_genes)
## [1] 3804
print(head(pData(cds)))
##              cell  celltype Size_Factor num_genes_expressed
## HSPC_001 HSPC_001 undefined   3.0636805                2046
## HSPC_002 HSPC_002 undefined   0.3118432                1787
## HSPC_003 HSPC_003 undefined   1.3305898                2210
## HSPC_004 HSPC_004 undefined   0.6452131                2097
## HSPC_006 HSPC_006 undefined   0.6569689                2183
## HSPC_008 HSPC_008 undefined   0.7325496                2030
# Identification of the ordering genes by differential testing
# ie. genes that are presumed to be important in the differentiation
# process captured in the sample. We used the cell types identified by the authors
# to define the ordering genes by DE testing. Alternatively, a classical approach
# consist in clustering the cells, then identify markers genes per clusters.

diff_test_res <- differentialGeneTest(cds[ expressed_genes, ], fullModelFormulaStr="~ celltype")
ordering_genes <- row.names(subset(diff_test_res, qval < 0.01))
length(ordering_genes)
## [1] 682
# we mark the genes that will be used for the ordering 
cds <- setOrderingFilter(cds, ordering_genes)

# We use the DDRTree algorithm to infer a trajectory with potential branching
# points.
cds <- reduceDimension(cds, max_components = 2, method='DDRTree')
cds <- orderCells(cds)
plot_cell_trajectory(cds, color_by="celltype")

# changing the cell color
cell_colors <- c('lightblue','blue','red','black','orange','yellow','turquoise','lightgrey')
plot_cell_trajectory(cds, color_by="celltype") + scale_color_manual(values=cell_colors)

# The most imamture HSC in the sample express E-Slam
# we will define the root of this model according to this subset of cells
table(pData(cds)$State, pData(cds)$celltype)[,"ESLAM"]
##  1  2  3  4  5 
##  0  0  0 10  0
# The state 4 defines the root
# We define the root in the model
cds <- orderCells(cds, root_state = 4)
# The pseudotime is now defined by the distance to the root
plot_cell_trajectory(cds, color_by = "Pseudotime")

# Differential expression testing per branch
# We look at the genes that are differentially expressed
# according to the pseudotime model
diff_test_res <- differentialGeneTest(cds[ ordering_genes, ], fullModelFormulaStr = "~sm.ns(Pseudotime)")
sig_gene_names <- row.names(subset(diff_test_res, qval < 0.1))
plot_pseudotime_heatmap(cds[ sig_gene_names[1:50], ], num_clusters = 3, cores=4, show_rownames=TRUE)

# Differential expression per branch 
# let's look for genes involved in the erythroid pathway
BEAM_res <- BEAM(cds, branch_point = 1, cores = 4)
## Warning in if (progenitor_method == "duplicate") {: la condition a une
## longueur > 1 et seul le premier élément est utilisé
## Warning in if (progenitor_method == "sequential_split") {: la condition a
## une longueur > 1 et seul le premier élément est utilisé
BEAM_res <- BEAM_res[order(BEAM_res$qval),]
BEAM_res <- BEAM_res[,c("gene_short_name", "pval", "qval")]
head(BEAM_res)
##                    gene_short_name pval qval
## ENSMUSG00000004612            Nkg7    0    0
## ENSMUSG00000004655            Aqp1    0    0
## ENSMUSG00000009350             Mpo    0    0
## ENSMUSG00000015937           H2afy    0    0
## ENSMUSG00000016494            Cd34    0    0
## ENSMUSG00000017493          Igfbp4    0    0
plot_genes_branched_heatmap(cds[row.names(BEAM_res)[1:50]], branch_point = 1, num_clusters = 3, cores=4, use_gene_short_name=TRUE, show_rownames=TRUE)