Loading the data
Data can be downloaded from here
# we will read both counts and rpkms as different method are more adopted for different type of data
R<-read.table("data/mouse_embryo/rpkms_Deng2014_preinplantation.txt")
C<-read.table("data/mouse_embryo/counts_Deng2014_preinplantation.txt")
M <- read.table("data/mouse_embryo/Metadata_mouse_embryo.csv",sep=",",header=T)
# select only 8 and 16 stage embryos.
selection <- c(grep("8cell",M$Stage),grep("16cell",M$Stage))
# check number of cells
table(M$Stage[selection])
# select those cells only from the data frames
M<-M[selection,]
R <- R[,selection]
C <- C[,selection]
Run SCDE
require(scde)
# make a count dataset and clean up
cd <- clean.counts(C, min.lib.size=1000, min.reads = 1, min.detected = 1)
# make factor for stage
stages <- factor(M$Stage,levels=c("X8cell","X16cell"))
names(stages) <- colnames(C)
# fit error models - takes a while to run (~20 mins).
# you can skip this step and load the precomputed file
# found [in here](https://github.com/NBISweden/excelerate-scRNAseq/blob/master/session-de/scde_error_models.Rdata?raw=true)
# Make sure to fix the file path in the next line.
savefile<-"data/mouse_embryo/DE/scde_error_models.Rdata"
if (file.exists(savefile)){
load(savefile)
}else{
#This works only if flexmix 2.3-13 (not 2.3-15) is installed, cf: http://hms-dbmi.github.io/scde/package.html
#check your environment file to change this.
o.ifm <- scde.error.models(counts = cd, groups = stages, n.cores = 1,
threshold.segmentation = TRUE, save.crossfit.plots = FALSE,
save.model.plots = FALSE, verbose = 0)
save(o.ifm,file=savefile)
}
# estimate gene expression prior
o.prior <- scde.expression.prior(models = o.ifm, counts = cd, length.out = 400, show.plot = FALSE)
# run differential expression test
ediff <- scde.expression.difference(o.ifm, cd, o.prior, groups = stages,
n.randomizations = 100, n.cores = 1, verbose = 1)
# convert Z-score and corrected Z-score to 2-sided p-values
ediff$pvalue <- 2*pnorm(-abs(ediff$Z))
ediff$p.adjust <- 2*pnorm(-abs(ediff$cZ))
# sort and look at top results
ediff <- ediff[order(abs(ediff$Z), decreasing = TRUE), ]
head(ediff)
# write output to a file with top DE genes on top.
write.table(ediff,file="data/mouse_embryo/DE/scde_8cell_vs_16_cell.tab",sep="\t",quote=F)
Run SCDE with batch info
# include also batch information in the test
# in this case we will consider each embryo as a batch just for testing
# OBS! in this case there are no batch that belongs to both groups so the correction may be pointless.
batch <- factor(M$Embryo, levels <- unique(M$Embryo))
ediff.batch <- scde.expression.difference(o.ifm, cd, o.prior, groups = stages,
n.randomizations = 100, n.cores = 1, batch=batch)
# now scde.expression.difference returns a list with the corrected values as well as the DE results.
de.batch <- ediff.batch$results
de.batch$pvalue <- 2*pnorm(-abs(de.batch$Z))
de.batch$p.adjust <- 2*pnorm(-abs(de.batch$cZ))
# look at top results
head(de.batch[order(abs(de.batch$Z), decreasing = TRUE), ])