Trajectory inference

Datasets

# Transcriptional trajectories will be inferred from data by Nestorowa, Hamey et al. (Blood, 2016):
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5305050/
# The dataset consists in 1600 hematopoietic stem and progenitor cells from mouse bone marrow
# (sequenced using the SMARTseq2 technology). Using flow cytometry and index sorting, 12 HSPC of different phenotypes
# (about 10 cells each) have been included in the dataset, and will be used in this lab as a biological prior for the
# identification of the root and the branches in the transcriptional trajectory models.

# bash:
# 
# mkdir data && cd data
# wget -nH -nd    http://blood.stemcells.cam.ac.uk/data/nestorowa_corrected_log2_transformed_counts.txt.gz \
# http://blood.stemcells.cam.ac.uk/data/nestorowa_corrected_population_annotation.txt.gz
# wget -nH -nd ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE81nnn/GSE81682/suppl/GSE81682%5FHTSeq%5Fcounts%2Etxt%2Egz
# 
# gunzip *
# cd ..

Part I - Monocle2/DDRtree

# Inference done with Monocle2/DDRtree available via Bioconductor

library(monocle)
library(biomaRt)

# data loading

# The authors provide an expression matrix that has been filtered (highly expressed genes, high quality cells)
# scaled and log-normalized. An annotation table is also provided, with each cell type labelled according to
# the immunophenotyping done by flow cytometry.

lognorm <- t(read.table('data/nestorowa_corrected_log2_transformed_counts.txt', sep=" ", header=TRUE))
annotable <- read.table('data/nestorowa_corrected_population_annotation.txt')

# To infer a trajectory with Monocle2/DDRtree, using non-normalized UMI-based counts is highly recommended,
# as Monocle2 will scale and normalize the data internaly and is especting data ditributed according to a negative binomial.

# The count matrix has been downloaded and will be used for Monocle2.

counts <- read.table('data/GSE81682_HTSeq_counts.txt', sep="\t", header=TRUE, row.names='ID')

counts[1:5,1:5]

##                    HSPC_007 HSPC_013 HSPC_019 HSPC_025 HSPC_031
## ENSMUSG00000000001        0        7        1      185        2
## ENSMUSG00000000003        0        0        0        0        0
## ENSMUSG00000000028        4        1        2        4        3
## ENSMUSG00000000031        0        0        0        0        0
## ENSMUSG00000000037        0        0        0        0        0

dim(counts)

## [1] 46175  1920

lognorm[1:5,1:5]

##                HSPC_001 HSPC_002 HSPC_003 HSPC_004 HSPC_006
## X1110032F04Rik 0.000000 0.000000 0.000000 0.000000 0.000000
## X1110059E24Rik 0.000000 0.000000 2.795189 1.326478 7.348663
## X1300017J02Rik 0.000000 0.000000 0.000000 0.000000 0.000000
## X1600014C10Rik 0.000000 2.238601 0.000000 1.326478 4.946766
## X1700017B05Rik 1.225439 2.238601 1.989360 2.005685 0.000000

dim(lognorm)

## [1] 3991 1645

# Note that the count matrix is not filtered, and genes are labelled
# according to ensembl gene IDs. We will first filter the matrix according to the authors choices
# and we will map the gene official symbols.

# we filter the counts to keep only high quality cells based on the "lognorm" matrix.

counts <- counts[ , colnames(lognorm) ]
dim(counts)

## [1] 46175  1645

# we create a annotation data frame to label the cell types
# as defined byu the authors
pDat <- data.frame(cell=colnames(counts), celltype='undefined', stringsAsFactors=FALSE)
rownames(pDat) <- pDat$cell
pDat[ rownames(anno), 2] <- as.character(anno$celltype)
head(pDat)

##              cell  celltype
## HSPC_001 HSPC_001 undefined
## HSPC_002 HSPC_002 undefined
## HSPC_003 HSPC_003 undefined
## HSPC_004 HSPC_004 undefined
## HSPC_006 HSPC_006 undefined
## HSPC_008 HSPC_008 undefined

# we create a feature annotation data frame
# that will contain gene informations and matching symbols and ID
# we now annotate the genes IDs in the counts matrix with biomaRt
mart <- biomaRt::useDataset("mmusculus_gene_ensembl", biomaRt::useMart("ensembl"))
genes_table <- biomaRt::getBM(attributes=c("ensembl_gene_id", "external_gene_name"), values=rownames(counts), mart=mart)
rownames(genes_table) <- genes_table$ensembl_gene_id
head(genes_table)

##                       ensembl_gene_id external_gene_name
## ENSMUSG00000064372 ENSMUSG00000064372              mt-Tp
## ENSMUSG00000064371 ENSMUSG00000064371              mt-Tt
## ENSMUSG00000064370 ENSMUSG00000064370            mt-Cytb
## ENSMUSG00000064369 ENSMUSG00000064369              mt-Te
## ENSMUSG00000064368 ENSMUSG00000064368             mt-Nd6
## ENSMUSG00000064367 ENSMUSG00000064367             mt-Nd5

fDat <- genes_table[ rownames(counts), ]
# to be consistent with Monocle naming conventions
colnames(fDat) <- c('ensembl_gene_id', 'gene_short_name')
head(fDat)

##                       ensembl_gene_id gene_short_name
## ENSMUSG00000000001 ENSMUSG00000000001           Gnai3
## ENSMUSG00000000003 ENSMUSG00000000003            Pbsn
## ENSMUSG00000000028 ENSMUSG00000000028           Cdc45
## ENSMUSG00000000031 ENSMUSG00000000031             H19
## ENSMUSG00000000037 ENSMUSG00000000037           Scml2
## ENSMUSG00000000049 ENSMUSG00000000049            Apoh

# we can now use this table to filter the genes
# in the counts matrix that are highly expressed
# according to the quality filters used by the authors

fDat <- fDat[ fDat$gene_short_name %in% rownames(lognorm), ]

# and we finally keep in the counts matrix only these genes

counts <- counts[ rownames(fDat), ]
dim(counts)

## [1] 3823 1645

dim(fDat)

## [1] 3823    2

dim(pDat)

## [1] 1645    2

# we build a cell dataset object
# in an appropriate format for monocle
# default method for modeling the expression values is VGAM::negbinomial.size()
# and adapted to counts

cds <- newCellDataSet(as.matrix(counts), phenoData=Biobase::AnnotatedDataFrame(pDat), featureData=Biobase::AnnotatedDataFrame(fDat))
cds

## CellDataSet (storageMode: environment)
## assayData: 3823 features, 1645 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: HSPC_001 HSPC_002 ... Prog_852 (1645 total)
##   varLabels: cell celltype Size_Factor
##   varMetadata: labelDescription
## featureData
##   featureNames: ENSMUSG00000000001 ENSMUSG00000000028 ...
##     ENSMUSG00000107235 (3823 total)
##   fvarLabels: ensembl_gene_id gene_short_name
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

# the monocle cds object is done
# and ready for trajectory inference
dir.create('monocle', showWarnings=FALSE)
saveRDS(cds, 'monocle/cds_hematopoiesis.rds')

# Monocle2 preprocess
# normalization and scaling
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

## Removing 18 outliers

# no need for further filtering, the expression matrix has already be filtered
# we find the genes that are expressed
cds <- detectGenes(cds, min_expr=0.1)
print(head(fData(cds)))

##                       ensembl_gene_id gene_short_name num_cells_expressed
## ENSMUSG00000000001 ENSMUSG00000000001           Gnai3                1613
## ENSMUSG00000000028 ENSMUSG00000000028           Cdc45                1438
## ENSMUSG00000000056 ENSMUSG00000000056            Narf                1333
## ENSMUSG00000000058 ENSMUSG00000000058            Cav2                 577
## ENSMUSG00000000078 ENSMUSG00000000078            Klf6                1560
## ENSMUSG00000000127 ENSMUSG00000000127             Fer                 578

# we identify genes that are expressed in at least 10 cells
expressed_genes <- row.names(subset(fData(cds), num_cells_expressed >= 10))
length(expressed_genes)

## [1] 3804

print(head(pData(cds)))

##              cell  celltype Size_Factor num_genes_expressed
## HSPC_001 HSPC_001 undefined   3.0636805                2046
## HSPC_002 HSPC_002 undefined   0.3118432                1787
## HSPC_003 HSPC_003 undefined   1.3305898                2210
## HSPC_004 HSPC_004 undefined   0.6452131                2097
## HSPC_006 HSPC_006 undefined   0.6569689                2183
## HSPC_008 HSPC_008 undefined   0.7325496                2030

# Identification of the ordering genes by differential testing
# ie. genes that are presumed to be important in the differentiation
# process captured in the sample. We used the cell types identified by the authors
# to define the ordering genes by DE testing. Alternatively, a classical approach
# consist in clustering the cells, then identify markers genes per clusters.

diff_test_res <- differentialGeneTest(cds[ expressed_genes, ], fullModelFormulaStr="~ celltype")
ordering_genes <- row.names(subset(diff_test_res, qval < 0.01))
length(ordering_genes)

## [1] 682

# we mark the genes that will be used for the ordering 
cds <- setOrderingFilter(cds, ordering_genes)

# We use the DDRTree algorithm to infer a trajectory with potential branching
# points.
cds <- reduceDimension(cds, max_components = 2, method='DDRTree')
cds <- orderCells(cds)
plot_cell_trajectory(cds, color_by="celltype")

# changing the cell color
cell_colors <- c('lightblue','blue','red','black','orange','yellow','turquoise','lightgrey')
plot_cell_trajectory(cds, color_by="celltype") + scale_color_manual(values=cell_colors)

# The most imamture HSC in the sample express E-Slam
# we will define the root of this model according to this subset of cells
table(pData(cds)$State, pData(cds)$celltype)[,"ESLAM"]

##  1  2  3  4  5 
##  0  0  0 10  0

# The state 4 defines the root
# We define the root in the model
cds <- orderCells(cds, root_state = 4)
# The pseudotime is now defined by the distance to the root
plot_cell_trajectory(cds, color_by = "Pseudotime")

# Differential expression testing per branch
# We look at the genes that are differentially expressed
# according to the pseudotime model
diff_test_res <- differentialGeneTest(cds[ ordering_genes, ], fullModelFormulaStr = "~sm.ns(Pseudotime)")
sig_gene_names <- row.names(subset(diff_test_res, qval < 0.1))
plot_pseudotime_heatmap(cds[ sig_gene_names[1:50], ], num_clusters = 3, cores=4, show_rownames=TRUE)

# Differential expression per branch 
# let's look for genes involved in the erythroid pathway
BEAM_res <- BEAM(cds, branch_point = 1, cores = 4)

## Warning in if (progenitor_method == "duplicate") {: la condition a une
## longueur > 1 et seul le premier élément est utilisé

## Warning in if (progenitor_method == "sequential_split") {: la condition a
## une longueur > 1 et seul le premier élément est utilisé

BEAM_res <- BEAM_res[order(BEAM_res$qval),]
BEAM_res <- BEAM_res[,c("gene_short_name", "pval", "qval")]
head(BEAM_res)

##                    gene_short_name pval qval
## ENSMUSG00000004612            Nkg7    0    0
## ENSMUSG00000004655            Aqp1    0    0
## ENSMUSG00000009350             Mpo    0    0
## ENSMUSG00000015937           H2afy    0    0
## ENSMUSG00000016494            Cd34    0    0
## ENSMUSG00000017493          Igfbp4    0    0

plot_genes_branched_heatmap(cds[row.names(BEAM_res)[1:50]], branch_point = 1, num_clusters = 3, cores=4, use_gene_short_name=TRUE, show_rownames=TRUE)

# There is a clear separation between genes that are involved in the erythroid
# differentiation (eg. Gata1) on the left (cell fate1)
# with genes involved in the leukocyte differentiation (eg. Sell, Ccl9)

plot_genes_branched_pseudotime(cds[row.names(BEAM_res)[1:5]], branch_point = 1, color_by = "celltype", ncol = 1)  + scale_color_manual(values=cell_colors)

Part II - Diffusion map

# Analysis and inference done with the destiny package available via Bioconductor

# Trajectory inference by diffusion map an diffusion pseudotime

library(destiny)
library(ggplot2)

# data loading
# we now will directly use the filtered, scaled, log-normalised expression matrix
# provided by the author of the article
lognorm <- t(read.table('data/nestorowa_corrected_log2_transformed_counts.txt', sep=" ", header=TRUE))
lognorm[1:5,1:5]

##                HSPC_001 HSPC_002 HSPC_003 HSPC_004 HSPC_006
## X1110032F04Rik 0.000000 0.000000 0.000000 0.000000 0.000000
## X1110059E24Rik 0.000000 0.000000 2.795189 1.326478 7.348663
## X1300017J02Rik 0.000000 0.000000 0.000000 0.000000 0.000000
## X1600014C10Rik 0.000000 2.238601 0.000000 1.326478 4.946766
## X1700017B05Rik 1.225439 2.238601 1.989360 2.005685 0.000000

# We load the annotation of cell types that has been defined
# using flow cytometry and index sorting.
# The cell subsets (final differentiation stages) will be used
# to validate the trajectory model
anno <- read.table('data/nestorowa_corrected_population_annotation.txt')
pDat <- data.frame(cell=colnames(lognorm), celltype='undefined', stringsAsFactors=FALSE)
rownames(pDat) <- pDat$cell
pDat[ rownames(anno), 2] <- as.character(anno$celltype)

# we build an expression set object for an easier integration with destiny
eset <- Biobase::ExpressionSet(lognorm, phenoData=Biobase::AnnotatedDataFrame(pDat))
eset

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 3991 features, 1645 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: HSPC_001 HSPC_002 ... Prog_852 (1645 total)
##   varLabels: cell celltype
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:

# the expression set is ready for inference with destiny
dir.create('destiny', showWarnings=FALSE)
saveRDS(cds, 'destiny/eset_hematopoiesis.rds')

# diffusion map
dmap <- DiffusionMap(eset)
# the process takes less than 60 seconds

# We look at the global model

plot.DiffusionMap(dmap)

plot.DiffusionMap(dmap, dims=c(1,2))

plot.DiffusionMap(dmap, dims=c(2,3))

plot.DiffusionMap(dmap, dims=c(1,3))

# components 1-2 describe well the branching process between erythroid (red) and myeloid/lymphoid (white) lineages

# we use ggplot2 to have a better rendering and project the cell labels
# as defined by flow cytometry experiment and index sorting

qplot(DC1, DC2, data=dmap, colour=celltype) + scale_color_manual(values=c('lightblue','brown','red','black','orange','yellow','blue','lightgrey')) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"))

# Pseudotime inference by diffusion:
# the transcriptional distance between cells is estimated by
# random walk along a neighborhood graph.
# The resulting "transcriptional" transition probability
# between cells is used to infer a pseudo-time scale of the differentiation process.

# we first define a root cell (origin) for the model
# we find the index of a ESLAM positive cells
which(anno$celltype=="ESLAM")

##  [1] 19 20 21 22 23 24 25 26 27 28

# we use this cell as a starting point

dpt <- DPT(dmap, tips=19)
plot(dpt)

# We can project the level of expression of known marker genes on
# the trajectory model

# Procr / Endothelial protein C is a marker of HSC subsets
plot(dpt, col_by='Procr', pal=viridis::magma)

# Gata1 is a key TF of the erythroid lineage
plot(dpt, col_by='Gata1', pal=viridis::magma)

# cathepsin G is a marker of neutrophils
plot(dpt, col_by='Ctsg', pal=viridis::magma)

sessionInfo()

## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.2
## 
## Matrix products: default
## BLAS:   /home/main/.local/R/R-3.6.0/lib/R/lib/libRblas.so
## LAPACK: /home/main/.local/R/R-3.6.0/lib/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=fr_FR.UTF-8    
##  [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8   
##  [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] splines   stats4    parallel  stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] destiny_2.14.0      biomaRt_2.40.0      monocle_2.12.0     
##  [4] DDRTree_0.1.5       irlba_2.3.3         VGAM_1.1-1         
##  [7] ggplot2_3.1.1       Biobase_2.44.0      BiocGenerics_0.30.0
## [10] Matrix_1.2-17      
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15                  colorspace_1.4-1           
##   [3] RcppEigen_0.3.3.5.0         class_7.3-15               
##   [5] rio_0.5.16                  XVector_0.24.0             
##   [7] GenomicRanges_1.36.0        proxy_0.4-23               
##   [9] ggrepel_0.8.0               bit64_0.9-7                
##  [11] AnnotationDbi_1.46.0        ranger_0.11.2              
##  [13] docopt_0.6.1                robustbase_0.93-4          
##  [15] knitr_1.22                  cluster_2.0.9              
##  [17] pheatmap_1.0.12             BiocManager_1.30.4         
##  [19] compiler_3.6.0              httr_1.4.0                 
##  [21] assertthat_0.2.1            lazyeval_0.2.2             
##  [23] limma_3.40.0                htmltools_0.3.6            
##  [25] prettyunits_1.0.2           tools_3.6.0                
##  [27] igraph_1.2.4.1              gtable_0.3.0               
##  [29] glue_1.3.1                  GenomeInfoDbData_1.2.1     
##  [31] RANN_2.6.1                  reshape2_1.4.3             
##  [33] dplyr_0.8.0.1               ggthemes_4.1.1             
##  [35] Rcpp_1.0.1                  carData_3.0-2              
##  [37] slam_0.1-45                 cellranger_1.1.0           
##  [39] lmtest_0.9-37               xfun_0.6                   
##  [41] laeken_0.5.0                stringr_1.4.0              
##  [43] openxlsx_4.1.0              XML_3.98-1.19              
##  [45] DEoptimR_1.0-8              zoo_1.8-5                  
##  [47] zlibbioc_1.30.0             MASS_7.3-51.4              
##  [49] scales_1.0.0                VIM_4.8.0                  
##  [51] hms_0.4.2                   SummarizedExperiment_1.14.0
##  [53] RColorBrewer_1.1-2          yaml_2.2.0                 
##  [55] curl_3.3                    memoise_1.1.0              
##  [57] gridExtra_2.3               fastICA_1.2-1              
##  [59] stringi_1.4.3               RSQLite_2.1.1              
##  [61] highr_0.8                   S4Vectors_0.22.0           
##  [63] e1071_1.7-1                 TTR_0.23-4                 
##  [65] densityClust_0.3            boot_1.3-22                
##  [67] zip_2.0.1                   BiocParallel_1.18.0        
##  [69] GenomeInfoDb_1.20.0         rlang_0.3.4                
##  [71] pkgconfig_2.0.2             matrixStats_0.54.0         
##  [73] bitops_1.0-6                qlcMatrix_0.9.7            
##  [75] evaluate_0.13               lattice_0.20-38            
##  [77] purrr_0.3.2                 labeling_0.3               
##  [79] bit_1.1-14                  tidyselect_0.2.5           
##  [81] plyr_1.8.4                  magrittr_1.5               
##  [83] R6_2.4.0                    IRanges_2.18.0             
##  [85] combinat_0.0-8              DelayedArray_0.10.0        
##  [87] DBI_1.0.0                   pillar_1.3.1               
##  [89] haven_2.1.0                 foreign_0.8-71             
##  [91] withr_2.1.2                 xts_0.11-2                 
##  [93] scatterplot3d_0.3-41        abind_1.4-5                
##  [95] RCurl_1.95-4.12             sp_1.3-1                   
##  [97] nnet_7.3-12                 tibble_2.1.1               
##  [99] crayon_1.3.4                car_3.0-2                  
## [101] rmarkdown_1.12              viridis_0.5.1              
## [103] progress_1.2.0              grid_3.6.0                 
## [105] readxl_1.3.1                data.table_1.12.2          
## [107] blob_1.1.1                  FNN_1.1.3                  
## [109] forcats_0.4.0               vcd_1.4-4                  
## [111] HSMMSingleCell_1.3.0        sparsesvd_0.1-4            
## [113] digest_0.6.18               munsell_0.5.0              
## [115] viridisLite_0.3.0           smoother_1.1

Trajectory inference

Jules GILET

2019-05-05

Datasets

Part I - Monocle2/DDRtree

Part II - Diffusion map