Abstract
In this work we will integrate gene expression, methylation, mutation and drug response data from 200 human individuals with Chronic Lymphocytic Leukemia (CLL). The phenotype of interest for demonstration purpose will be Gender.
We will start with reading and imputing missing values using median imputation:
#Uncomment this in case you can not change your working directory
#current_dir<-getwd()
#setwd(paste0(current_dir,"/session_ml/SupervisedOMICsIntegration"))
expr <- as.data.frame(t(read.delim("CLL_mRNA.txt", header = TRUE, sep="\t")))
for(i in 1:ncol(expr)){expr[,i][is.na(expr[,i])]<-median(expr[,i],na.rm=TRUE)}
expr[1:5,1:5]
## ENSG00000244734 ENSG00000158528 ENSG00000198478 ENSG00000175445
## H045 4.558644 11.741854 8.921456 12.686458
## H109 2.721512 13.287432 2.721512 10.925985
## H024 9.938456 2.341006 12.381452 1.528848
## H056 13.278004 3.232874 8.106266 1.528848
## H079 6.086874 11.940820 4.889503 13.340588
## ENSG00000174469
## H045 2.644946
## H109 12.648355
## H024 1.528848
## H056 13.565210
## H079 5.476914
mut <- as.data.frame(t(read.delim("CLL_Mutations.txt", header = TRUE, sep="\t")))
for(i in 1:ncol(mut)){mut[,i][is.na(mut[,i])]<-median(mut[,i],na.rm=TRUE)}
mut[1:5,1:5]
## gain2p25.3 gain3q26 del6p21.2 del6q21 del8p12
## H045 0 0 0 0 0
## H109 0 0 0 0 0
## H024 0 0 0 0 0
## H056 0 0 0 0 0
## H079 1 0 0 0 0
meth <- as.data.frame(t(read.delim("CLL_Methylation.txt", header = TRUE, sep="\t")))
for(i in 1:ncol(meth)){meth[,i][is.na(meth[,i])]<-median(meth[,i],na.rm=TRUE)}
meth[1:5,1:5]
## cg10146935 cg26837773 cg17801765 cg13244315 cg06181703
## H045 1.81108585 -5.1725723 5.4115263 -0.1188251 5.1203838
## H109 -3.99750846 1.5948702 5.4126925 1.0438706 1.2794803
## H024 -2.84431298 0.1611705 0.3657059 -4.2192362 0.7211004
## H056 -3.33865611 -2.0934326 0.3736342 -1.5921965 4.0470594
## H079 -0.01936203 3.7489796 5.4120096 1.4164183 5.2374225
drug <- as.data.frame(t(read.delim("CLL_Drugs.txt", header = TRUE, sep="\t")))
for(i in 1:ncol(drug)){drug[,i][is.na(drug[,i])]<-median(drug[,i],na.rm=TRUE)}
drug[1:5,1:5]
## D_001_1 D_001_2 D_001_3 D_001_4 D_001_5
## H045 0.02363938 0.04623274 0.3187471 0.8237027 0.8962777
## H109 0.07359900 0.10623002 0.2732891 0.7171379 0.8850003
## H024 0.04426905 0.08425647 0.3187606 0.7797544 0.8974880
## H056 0.05813930 0.09022028 0.2322145 0.7225736 0.7957497
## H079 0.02042077 0.04750543 0.3638962 0.8073907 0.8794886
Now let us have a look at the phenotypic data, extract Gender and convert it to a variable Y that is going to be used later when running PLS-DA analysis:
phen <- read.delim("CLL_Covariates.txt", header = TRUE, sep="\t")
head(phen)
## Gender Diagnosis
## H045 m CLL
## H109 m CLL
## H024 m CLL
## H056 m CLL
## H079 m CLL
## H164 f CLL
Y<-factor(phen$Gender)
summary(Y)
## f m
## 79 121
Let us split the data set into train and test sub-sets. We select 60 samples (30%) for testing. Since we have more males than females (fraction of females is approximately 0.4) we select 36 males and 24 females.
set.seed(1234)
female_sample<-rownames(expr)[as.character(phen$Gender)=="f"][sample(1:length(rownames(expr)[as.character(phen$Gender)=="f"]),24)]
female_sample
## [1] "H102" "H040" "H165" "H018" "H101" "H096" "H033" "H210" "H225" "H170"
## [11] "H181" "H182" "H204" "H023" "H051" "H031" "H209" "H035" "H081" "H029"
## [21] "H014" "H186" "H166" "H167"
male_sample<-rownames(expr)[as.character(phen$Gender)=="m"][sample(1:length(rownames(expr)[as.character(phen$Gender)=="m"]),36)]
male_sample
## [1] "H016" "H044" "H032" "H055" "H254" "H024" "H183" "H030" "H252" "H272"
## [11] "H231" "H107" "H094" "H109" "H260" "H078" "H080" "H208" "H072" "H020"
## [21] "H059" "H235" "H206" "H202" "H106" "H087" "H049" "H104" "H038" "H037"
## [31] "H250" "H191" "H017" "H176" "H070" "H178"
expr_test<-expr[match(c(female_sample,male_sample),rownames(expr)),]
expr_test[1:5,1:5]
## ENSG00000244734 ENSG00000158528 ENSG00000198478 ENSG00000175445
## H102 7.320581 12.434167 6.783195 11.336763
## H040 4.769423 2.381844 2.381844 5.459589
## H165 2.529650 2.529650 4.073655 10.346617
## H018 4.184367 1.528848 8.195382 2.911501
## H101 7.049108 11.335080 5.101373 9.256292
## ENSG00000174469
## H102 8.629901
## H040 10.952705
## H165 8.363989
## H018 12.530168
## H101 7.540800
meth_test<-meth[match(c(female_sample,male_sample),rownames(meth)),]
meth_test[1:5,1:5]
## cg10146935 cg26837773 cg17801765 cg13244315 cg06181703
## H102 0.02408192 -2.66149768 4.7975506 -2.0793298 -0.09300816
## H040 -4.40691148 -5.22192499 5.3746764 3.2587619 3.01769727
## H165 0.59337171 -0.03963969 0.5020684 3.8726517 3.49487818
## H018 -3.67883824 -3.52009661 2.0600774 -1.0636796 0.37493793
## H101 -2.91823259 4.62748491 1.7367315 0.1692539 5.36798425
mut_test<-mut[match(c(female_sample,male_sample),rownames(mut)),]
mut_test[1:5,1:5]
## gain2p25.3 gain3q26 del6p21.2 del6q21 del8p12
## H102 0 0 0 0 0
## H040 0 0 0 0 0
## H165 1 0 0 0 0
## H018 0 0 0 0 0
## H101 0 0 0 0 0
drug_test<-drug[match(c(female_sample,male_sample),rownames(drug)),]
drug_test[1:5,1:5]
## D_001_1 D_001_2 D_001_3 D_001_4 D_001_5
## H102 0.08906634 0.13920632 0.2337110 0.7513949 0.7672126
## H040 0.01908389 0.02544977 0.4947984 0.9673132 1.0000992
## H165 0.03234895 0.04840093 0.2723969 0.7585817 0.9079429
## H018 0.04426905 0.08425647 0.3187606 0.7797544 0.8974880
## H101 0.07350703 0.12265018 0.2543330 0.7255191 0.7361780
Y.test<-Y[match(c(female_sample,male_sample),rownames(phen))]
Y.test
## [1] f f f f f f f f f f f f f f f f f f f f f f f f m m m m m m m m m m m m m m
## [39] m m m m m m m m m m m m m m m m m m m m m m
## Levels: f m
summary(Y.test)
## f m
## 24 36
length(Y.test)
## [1] 60
expr<-expr[!rownames(expr)%in%rownames(expr_test),]
meth<-meth[!rownames(meth)%in%rownames(meth_test),]
mut<-mut[!rownames(mut)%in%rownames(mut_test),]
drug<-drug[!rownames(drug)%in%rownames(drug_test),]
Y<-Y[!rownames(phen)%in%c(female_sample,male_sample)]
Y
## [1] m m m f m m m m m m m m m f f f f m f m m m m m m f m f m m f m m m f f f
## [38] m m m f m m f f m m m m f f m m f m f f f m f m m m m f f f m f m f m f m
## [75] f f m f f m m f m f m m m f m m m f m f f m m m m f m m m m m f m f m m f
## [112] f m f f f m m f f f m m m f m m m f m f m f m m m m f m f
## Levels: f m
length(Y)
## [1] 140
summary(Y)
## f m
## 55 85
Since mutations represent a binary data, there is always a lack of variation due to coding with 0 and 1. Therefore, we will prefilter the mutation matrix by excluding sites with variance across individuals close to zero:
library("mixOmics")
## Loading required package: MASS
## Loading required package: lattice
## Loading required package: ggplot2
##
## Loaded mixOmics 6.10.1
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us: citation('mixOmics')
library("matrixStats")
my_nearZeroVar<-nearZeroVar(mut)
head(my_nearZeroVar$Metrics)
## freqRatio percentUnique
## gain2p25.3 27.00000 1.428571
## gain3q26 69.00000 1.428571
## del6p21.2 69.00000 1.428571
## del6q21 22.33333 1.428571
## del8p12 45.66667 1.428571
## gain8q24 27.00000 1.428571
dim(my_nearZeroVar$Metrics)
## [1] 59 2
mut <- mut[,-which(colnames(mut)%in%rownames(my_nearZeroVar$Metrics))]
mut[1:5,1:5]
## del11q22.3 trisomy12 del13q14_any del17p13 BRAF
## H045 1 0 0 0 0
## H056 0 0 1 0 0
## H079 1 0 0 0 0
## H164 1 0 1 0 0
## H113 0 1 1 0 0
dim(mut)
## [1] 140 10
mut_test<-subset(mut_test,select=colnames(mut))
Therefore we end up with just a few mutations which have enough variation for the PLS-DA model. Later perhaps it makes sense to include all of them and not select informative ones using the sparse PLS-DA algorithm.
Right now we can not integrate the OMICs data sets since at least expression and methylation data are high-dimensional so we need to perform a feature selection for those two OMICs. Here we use LASSO for doing feature selection for gene expression data:
library("glmnet")
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-18
lasso_fit <- cv.glmnet(as.matrix(expr), Y, family = "binomial", alpha = 1)
plot(lasso_fit)
coef <- predict(lasso_fit, s = "lambda.min", type = "nonzero")
colnames(expr)[unlist(coef)]
## [1] "ENSG00000198822" "ENSG00000176769" "ENSG00000150457" "ENSG00000184368"
## [5] "ENSG00000133134" "ENSG00000235268" "ENSG00000001626" "ENSG00000083290"
## [9] "ENSG00000172139" "ENSG00000140403" "ENSG00000136205" "ENSG00000186871"
## [13] "ENSG00000106633" "ENSG00000147113" "ENSG00000204291" "ENSG00000067798"
## [17] "ENSG00000162551" "ENSG00000049246" "ENSG00000196739" "ENSG00000088053"
## [21] "ENSG00000149150" "ENSG00000162627" "ENSG00000185666" "ENSG00000167701"
## [25] "ENSG00000187608" "ENSG00000169330" "ENSG00000106025" "ENSG00000240403"
## [29] "ENSG00000130508" "ENSG00000142515" "ENSG00000213160" "ENSG00000078114"
result_expr <- data.frame(GENE = names(as.matrix(coef(lasso_fit, s = "lambda.min"))
[as.matrix(coef(lasso_fit, s = "lambda.min"))[,1]!=0, 1])[-1],
SCORE = as.numeric(as.matrix(coef(lasso_fit, s = "lambda.min"))
[as.matrix(coef(lasso_fit,
s = "lambda.min"))[,1]!=0, 1])[-1])
result_expr <- result_expr[order(-abs(result_expr$SCORE)),]
print(head(result_expr,10))
## GENE SCORE
## 14 ENSG00000147113 -1.4122016
## 4 ENSG00000184368 -1.0724494
## 12 ENSG00000186871 -0.8856169
## 29 ENSG00000130508 0.3822835
## 21 ENSG00000149150 -0.3387733
## 23 ENSG00000185666 0.3002490
## 2 ENSG00000176769 0.2667967
## 30 ENSG00000142515 0.2633780
## 20 ENSG00000088053 -0.2163359
## 25 ENSG00000187608 -0.1967797
print(as.character(result_expr$GENE))
## [1] "ENSG00000147113" "ENSG00000184368" "ENSG00000186871" "ENSG00000130508"
## [5] "ENSG00000149150" "ENSG00000185666" "ENSG00000176769" "ENSG00000142515"
## [9] "ENSG00000088053" "ENSG00000187608" "ENSG00000162627" "ENSG00000240403"
## [13] "ENSG00000235268" "ENSG00000150457" "ENSG00000169330" "ENSG00000001626"
## [17] "ENSG00000213160" "ENSG00000140403" "ENSG00000198822" "ENSG00000106025"
## [21] "ENSG00000136205" "ENSG00000083290" "ENSG00000167701" "ENSG00000204291"
## [25] "ENSG00000162551" "ENSG00000196739" "ENSG00000078114" "ENSG00000172139"
## [29] "ENSG00000049246" "ENSG00000067798" "ENSG00000133134" "ENSG00000106633"
expr <- subset(expr, select = as.character(result_expr$GENE))
expr_test<-subset(expr_test,select=colnames(expr))
ens2gs<-read.delim("ENSEMBLE_TO_GENE_SYMBOL_COORD.txt",header=TRUE,sep="\t")
ens2gs<-ens2gs[match(colnames(expr),as.character(ens2gs$ensembl_gene_id)),]
colnames(expr)<-ens2gs$external_gene_name
colnames(expr_test)<-ens2gs$external_gene_name
We have also converted the Ensembl gene IDs into gene names on the fly for clarity. Next we proceed with LASSO feature selection for methylation data:
library("glmnet")
lasso_fit <- cv.glmnet(as.matrix(meth), Y, family = "binomial", alpha = 1)
plot(lasso_fit)
coef <- predict(lasso_fit, s = "lambda.min", type = "nonzero")
colnames(meth)[unlist(coef)]
## [1] "cg10219816" "cg08715595" "cg03441844" "cg09557387" "cg17277199"
## [6] "cg20408276" "cg11669516" "cg06193597" "cg24576535" "cg00335715"
## [11] "cg05495546" "cg06847624" "cg00642460" "cg18136963" "cg15013257"
## [16] "cg10409981" "cg11218091" "cg11216554" "cg08214808" "cg27373604"
## [21] "cg03126946" "cg02299497" "cg01517680" "cg02399249" "cg12744859"
## [26] "cg08065231" "cg22195627" "cg14815891"
result_meth <- data.frame(CPG = names(as.matrix(coef(lasso_fit, s = "lambda.min"))
[as.matrix(coef(lasso_fit, s = "lambda.min"))[,1]!=0, 1])[-1],
SCORE = as.numeric(as.matrix(coef(lasso_fit, s = "lambda.min"))
[as.matrix(coef(lasso_fit, s = "lambda.min"))[,1]!=0, 1])[-1])
result_meth <- result_meth[order(-abs(result_meth$SCORE)),]
print(head(result_meth,10))
## CPG SCORE
## 10 cg00335715 -0.06256381
## 13 cg00642460 -0.05333048
## 19 cg08214808 -0.04765747
## 4 cg09557387 -0.04195541
## 11 cg05495546 -0.03922580
## 12 cg06847624 -0.03801790
## 27 cg22195627 0.03788744
## 2 cg08715595 -0.03788461
## 25 cg12744859 -0.03677530
## 28 cg14815891 -0.03604474
print(as.character(result_meth$CPG))
## [1] "cg00335715" "cg00642460" "cg08214808" "cg09557387" "cg05495546"
## [6] "cg06847624" "cg22195627" "cg08715595" "cg12744859" "cg14815891"
## [11] "cg20408276" "cg10409981" "cg03126946" "cg10219816" "cg18136963"
## [16] "cg11669516" "cg11218091" "cg15013257" "cg03441844" "cg17277199"
## [21] "cg01517680" "cg11216554" "cg06193597" "cg02299497" "cg02399249"
## [26] "cg27373604" "cg24576535" "cg08065231"
meth <- subset(meth, select = as.character(result_meth$CPG))
meth_test <- subset(meth_test, select = as.character(result_meth$CPG))
We can see that we dramatically decreased the number of dimensions / features in the gene expression and methylation data sets. Now we can proceed with OMICs integration via PLS-DA algorithm.
Now we will start integrating the four OMICs: 1) gene expression, 2) methylation and 3) mutations and 4) drug response. For this purpose we will concatenate gene expression, methylation, mutation and drug response matrices into X matrix and use the Gender as Y variable, so it is a typical Machine Learning setup: y=f(x), where x is the input, y is the class labels of individuals and the f-function is learnt from the data. Note that the f-function is a-priori linear for PLS-DA and non-linear for e.g. artificial neural networks.
data<-list(expr=expr,mut=mut,meth=meth,drug=drug)
names(data)
## [1] "expr" "mut" "meth" "drug"
lapply(data, dim)
## $expr
## [1] 140 32
##
## $mut
## [1] 140 10
##
## $meth
## [1] 140 28
##
## $drug
## [1] 140 310
First, we fit a DIABLO model without variable selection to assess the global performance and choose the number of components for the final DIABLO model. The function “perf” will perform M-fold cross validation (number of folds is specified by “folds” parameter) repeated “nrepeat” times. The design matrix sets expected correlation between the OMICs. The values may range between 0 (no correlation) to 1 (strong correlation), the design can be chosen based on prior knowledge inferred by e.g. MOFA (see Unsupervised OMICs integration session). Here due to the lack of prior knowledge we assume a strong correlation 1 between the OMICs.
library("mixOmics")
design=matrix(1,ncol=length(data),nrow=length(data),dimnames=list(names(data),names(data)))
diag(design)=0
design
## expr mut meth drug
## expr 0 1 1 1
## mut 1 0 1 1
## meth 1 1 0 1
## drug 1 1 1 0
splsda.res = block.splsda(X = data, Y = Y, ncomp = 8, design = design)
## Design matrix has changed to include Y; each block will be
## linked to Y.
perf.diablo = perf(splsda.res, validation = 'Mfold', folds = 2, nrepeat = 5, progressBar=FALSE, cpus=4)
perf.diablo
##
## Call:
## perf.sgccda(object = splsda.res, validation = "Mfold", folds = 2, nrepeat = 5, cpus = 4, progressBar = FALSE)
##
## Main numerical outputs:
## --------------------
## Error rate (overall or BER) for each component and for each distance: see object$error.rate
## Error rate per class, for each component and for each distance: see object$error.rate.class
## Prediction values for each component: see object$predict
## Classification of each sample, for each component and for each distance: see object$class
## Stable features on each component: see object$features$stable
## AUC values: see object$auc if auc = TRUE
##
## Visualisation Functions:
## --------------------
## plot
plot(perf.diablo,overlay='dist',sd=TRUE)
Here all BER (balanced error rate, which we should use in our case since we have more males than females) distances seem to reach their minimum / plateau at ncomp=2, and do not seem to change further. Therefore, we will use ncomp=2 as an optimal number of PLS components to keep in the further downstream analysis. To keep more PLS components would lead to a risk of overfitting the model, therefore we should keep as few PLS components as possible. In the BER figure, different colors correspond to different distances between the reference train samples and the test samples projected on the reference during the cross-validation procedure. In every cross-validation iteration we decide whether a test sample belongs to the Male or Female clusters, and the distance measuring the proximity to a cluster can be max.dist, mahalanobis.dist and centroids.dist. After a test sample has been assigned to one of the reference clusters (Males or Females), and got a predicted label (Male or Female), it is validated against its true label, and the accuracy (or error rate in this case) demonstrate how generalizable the model is.
After we have selected the optimal number of PLS components, let us now perform tuning of the model which implies selecting most informative variables in all layers of the data by LASSO algorithm. For this purpose, we will again need to provide the design matrix which shows a-priori correlation between the phenotype of interest and the OMICs. We assume strong correlation, but one can in principle play with this parameter and find out how it influences the final result. Further, we will concentrate on the first two PLS components based on the output of “perf”-function above.
library("mixOmics")
test.keepX=list("expr"=c(1:5),"mut"=c(1:5),"meth"=c(1:5),"drug"=c(1:5))
ptm<-proc.time()
tune.omics=tune.block.splsda(X=data,Y=Y,ncomp=2,test.keepX=test.keepX,design=design,cpus=4,progressBar=FALSE,validation="Mfold",folds=2,nrepeat=5,near.zero.var=FALSE,dist = "mahalanobis.dist")
## You have provided a sequence of keepX of length: 5 for block expr and 5 for block mut and 5 for block meth and 5 for block drug.
## This results in 625 models being fitted for each component and each nrepeat, this may take some time to run, be patient!
my_time<-proc.time()-ptm
print(paste0("Thus it takes ",as.numeric(my_time["elapsed"])/3600,"h to run this piece of code"))
## [1] "Thus it takes 0.501896944444445h to run this piece of code"
LASSO selected the following numbers of most informative variables from the loading vectors for each OMICs data set for each of the two principal components:
tune.omics$choice.keepX
## $expr
## [1] 5 5
##
## $mut
## [1] 1 1
##
## $meth
## [1] 1 4
##
## $drug
## [1] 1 3
Now let us perform the final sPLS-DA modelling and display PCA plots and loadings. Here, however, for simplicity we will use all available mutations due to its special binary structure. Genetic variation data is notoriously difficult to handle by traditional statistical approaches that assume normality, therefore PLS (as well as many statistical other tools) has unfortunately problems working with this type of data.
#list.keepX=list("expr"=tune.omics$choice.keepX$expr,"mut"=tune.omics$choice.keepX$mut,"meth"=tune.omics$choice.keepX$meth,"drug"=tune.omics$choice.keepX$drug)
list.keepX=list("expr"=tune.omics$choice.keepX$expr,"mut"=c(dim(mut)[2],dim(mut)[2]),"meth"=tune.omics$choice.keepX$meth,"drug"=tune.omics$choice.keepX$drug)
res=block.splsda(X=data,Y=Y,ncomp=2,keepX=list.keepX,design=design,near.zero.var=FALSE)
## Design matrix has changed to include Y; each block will be
## linked to Y.
selectVar(res,block='expr',comp=1)$expr$name
## [1] "CXorf36" "LATS2" "MAP7D2" "ERCC6L" "ULK2"
plotIndiv(res,legend=TRUE,title="CLL Omics",ellipse=FALSE,ind.names=FALSE,cex=2)
Here we can visualize the samples for the 4 OMICs using their features selected via the integration procedure. Male and female samples seem to be clearly separated for gene expression and perhaps methylation, while no obvious separation is present for the mutation and drug OMICs. Let us look at the loadings in order to see what features drive the scattering of the samples in the plots above:
plotLoadings(res,comp=1,contrib='max',method='mean')
plotLoadings(res,comp=2,contrib='max',method='mean')
Now, we will diplay the data points using PCA principal components from individual OMICs. This can be an informative way to to visualise correlation between components from different data sets. In our case mutation and drug OMICs seem to have the strongest correlation.
plotDiablo(res,ncomp=1)
plotDiablo(res,ncomp=2)
Now let us display so-called “arrow plot” which demonstrates the samples (individuals) for all the OMICS superimposed. Here, each sample will be indicated using arrows. The start of the arrow indicates the centroid / consensus between all OMICS for a given individual, and the tips of the arrows the location of that individual in each individual OMICs. Short arrows indicate a strong agreement between the matching OMICs, long arrows a disagreement between the matching OMICs.
plotArrow(res,ind.names=FALSE,legend=TRUE,title="CLL Omics Integration")
The Arrow Plot can be viewed as a Consensus Plot between all the OMICs on the sample level, the X and Y coordiantes correspond to common latent variables between the OMICs, i.e. we projected all the OMICs into some common latent space where they loose information about their technological origin and hence can be superimposed onto each other.
Now we will display the so-called Correlation Circle Plot, where the top loadings variables from each of the OMICs are superimposed. Clustering of variables around the poles of the circle implies strong correlation between the variables from different OMICs. Variables on the opposite poles of the correlation circle plot imply strong anti-correlation. While the Arrow Plot serves as a main integrative OMICs visualization on sample level, the Correlation Circle Plots can be considered as a main visualization of OMICs integration on the feature level.
plotVar(res,var.names=TRUE,style='ggplot2',legend=TRUE,cex=c(3,3,3,3),col=c('blue','red2',"darkgreen","darkorange"))
For further visualization of the results of OMICs integration, we will calculate the so-called Circos Plot that diaplays correlations between features from different OMICs dat sets. Please note that here for the Circos Plot, as well as for the Correlation Circle Plot above, the features were selected simultaneously from all the OMICs when performing integration, i.e. they are not equavivalent to those obtained from each individual OMIC separately.
circosPlot(res,cutoff=0.7,line=FALSE,size.variables=0.5)
Correlation network is another handy way to demostrate correlations between top loadings of the OMICs data sets in a pairwise fashion. Here we can choose a pair of OMICs and make a network comprising most informative features (selected via the integration) from both OMICs. The color of the edges corresponds to the strength of the correlation between the OMICs. One can use a “cutoff” paremeter to display one the the edges above a specified threshold.
network(res,blocks=c(1,2),cex.node.name=0.6,color.node=c('blue','red2'),breaks=NULL)
network(res,blocks=c(1,3),cex.node.name=0.6,color.node=c('blue','darkgreen'),breaks=NULL)
network(res,blocks=c(1,4),cex.node.name=0.6,color.node=c('blue','darkorange'),breaks=NULL)
network(res,blocks=c(2,3),cex.node.name=0.6,color.node=c('red2','darkgreen'),breaks=NULL)
network(res,blocks=c(2,4),cex.node.name=0.6,color.node=c('red2','darkorange'),breaks=NULL)
network(res,blocks=c(3,4),cex.node.name=0.6,color.node=c('darkgreen','darkorange'),breaks=NULL)
Finally, the correlation heatmap displays strongly correlated blocks of gene expression, methylation, mutations and drug markers. This is an unsupervised visualization equivalent to hierarchical clustering. So if the DIABLO supervised integration worked well, one should ideally see females and males clusters on the y-axis and the corresponding “mixtures” of features from different OMICs that provide the clustering of the samples.
cimDiablo(res,margins=c(11,18))
Here the color of a cell on a heatmap demonstrates abundance of the feature in the sample that are “coordinates” of the cell. In our case we can see somewhat noticable separation between males and females and can clearly see that it is not a single OMIC that drives this separation but the features from different OMICs are well mixed on the x-axis.
Now it is time for prediction. Once we have trained the PLS-DA model, we can use it and utilize the 60 test samples for making prediction of their gender and accessing the accuracy of the prediction:
data.test<-list(expr=expr_test,mut=mut_test,meth=meth_test,drug=drug_test)
lapply(data.test, dim)
## $expr
## [1] 60 32
##
## $mut
## [1] 60 10
##
## $meth
## [1] 60 28
##
## $drug
## [1] 60 310
predict.diablo=predict(res,newdata=data.test,dist='mahalanobis.dist')
#auroc.diablo=auroc(res,newdata=data.test,outcome.test=Y.test,plot=TRUE,roc.comp=c(1),roc.block=c(1,2,3,4))
data.frame(predict.diablo$class,Truth=Y.test)
## mahalanobis.dist.expr.comp1 mahalanobis.dist.expr.comp2
## H102 f f
## H040 f f
## H165 f f
## H018 f f
## H101 f f
## H096 f f
## H033 f f
## H210 m m
## H225 f f
## H170 m m
## H181 m m
## H182 m m
## H204 m m
## H023 f f
## H051 f f
## H031 f f
## H209 m m
## H035 f f
## H081 f f
## H029 f f
## H014 f f
## H186 m m
## H166 m m
## H167 f f
## H016 m m
## H044 m m
## H032 m m
## H055 m m
## H254 m m
## H024 m m
## H183 m m
## H030 m m
## H252 m m
## H272 m m
## H231 m m
## H107 m m
## H094 m m
## H109 m m
## H260 f f
## H078 m m
## H080 m m
## H208 m m
## H072 m m
## H020 m m
## H059 m m
## H235 m m
## H206 m m
## H202 m m
## H106 m m
## H087 m m
## H049 m m
## H104 m m
## H038 m m
## H037 m m
## H250 m m
## H191 m m
## H017 m m
## H176 m m
## H070 m m
## H178 m m
## mahalanobis.dist.mut.comp1 mahalanobis.dist.mut.comp2
## H102 f f
## H040 m m
## H165 m m
## H018 m m
## H101 m m
## H096 m m
## H033 m m
## H210 m m
## H225 m m
## H170 m m
## H181 m m
## H182 m m
## H204 m m
## H023 f f
## H051 f m
## H031 m m
## H209 m m
## H035 m f
## H081 m m
## H029 f f
## H014 f f
## H186 m m
## H166 m m
## H167 m m
## H016 m f
## H044 f f
## H032 m m
## H055 m m
## H254 m f
## H024 m m
## H183 f f
## H030 f f
## H252 f f
## H272 f f
## H231 f f
## H107 f m
## H094 m m
## H109 f m
## H260 f f
## H078 f f
## H080 f f
## H208 m m
## H072 f f
## H020 m m
## H059 m m
## H235 m f
## H206 m m
## H202 m m
## H106 m m
## H087 m m
## H049 m m
## H104 f m
## H038 m m
## H037 m m
## H250 m m
## H191 m m
## H017 f f
## H176 m m
## H070 m m
## H178 f f
## mahalanobis.dist.meth.comp1 mahalanobis.dist.meth.comp2
## H102 f f
## H040 m m
## H165 f m
## H018 m m
## H101 f f
## H096 m m
## H033 m m
## H210 m m
## H225 m m
## H170 m m
## H181 m f
## H182 m f
## H204 m f
## H023 f f
## H051 m m
## H031 f f
## H209 f f
## H035 f f
## H081 m f
## H029 f m
## H014 m m
## H186 m f
## H166 f f
## H167 f m
## H016 m m
## H044 m m
## H032 m m
## H055 m m
## H254 m f
## H024 m m
## H183 m m
## H030 m m
## H252 f f
## H272 f f
## H231 f f
## H107 m m
## H094 m m
## H109 m m
## H260 f f
## H078 f m
## H080 m f
## H208 m m
## H072 f m
## H020 m m
## H059 f f
## H235 m m
## H206 f f
## H202 f m
## H106 f f
## H087 m m
## H049 f m
## H104 f m
## H038 m m
## H037 m m
## H250 m m
## H191 f f
## H017 m m
## H176 m f
## H070 f m
## H178 m m
## mahalanobis.dist.drug.comp1 mahalanobis.dist.drug.comp2 Truth
## H102 f f f
## H040 m m f
## H165 f f f
## H018 m m f
## H101 m m f
## H096 m m f
## H033 m m f
## H210 m m f
## H225 m m f
## H170 m m f
## H181 f f f
## H182 m m f
## H204 m m f
## H023 f f f
## H051 f f f
## H031 f m f
## H209 f m f
## H035 m f f
## H081 m m f
## H029 f f f
## H014 f f f
## H186 m m f
## H166 m m f
## H167 f f f
## H016 m m m
## H044 f f m
## H032 m m m
## H055 m m m
## H254 m f m
## H024 m m m
## H183 f f m
## H030 f f m
## H252 f f m
## H272 f f m
## H231 f f m
## H107 f f m
## H094 m m m
## H109 f f m
## H260 f f m
## H078 f f m
## H080 f f m
## H208 f m m
## H072 m m m
## H020 m m m
## H059 m m m
## H235 m m m
## H206 m m m
## H202 m m m
## H106 m m m
## H087 m m m
## H049 m m m
## H104 m m m
## H038 f f m
## H037 m m m
## H250 m m m
## H191 f f m
## H017 f f m
## H176 m m m
## H070 m m m
## H178 f f m
table(predict.diablo$MajorityVote$mahalanobis.dist[,1],Y.test)
## Y.test
## f m
## f 8 5
## m 13 20
round((sum(diag(table(predict.diablo$MajorityVote$mahalanobis.dist[,1],Y.test)))/sum(table(predict.diablo$MajorityVote$mahalanobis.dist[,1],Y.test)))*100)
## [1] 61
Therefore the the success rate of the first predictive component is 61%, it is quite high and hopefully provides new candidate bio-markers (see the variable plot) from different OMICs and connections between them for understanding of mechanisms of CLL pathogenesis.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/nikolay/miniconda3/envs/envday1/lib/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=sv_SE.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=sv_SE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=sv_SE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=sv_SE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] glmnet_2.0-18 foreach_1.4.7 Matrix_1.2-17 matrixStats_0.54.0
## [5] mixOmics_6.10.1 ggplot2_3.2.1 lattice_0.20-38 MASS_7.3-51.4
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 RSpectra_0.15-0 highr_0.9 plyr_1.8.4
## [5] RColorBrewer_1.1-2 pillar_1.4.6 compiler_3.6.1 iterators_1.0.12
## [9] tools_3.6.1 digest_0.6.20 evaluate_0.14 lifecycle_0.2.0
## [13] tibble_3.0.3 gtable_0.3.0 pkgconfig_2.0.2 rlang_0.4.7
## [17] igraph_1.2.4.1 yaml_2.2.1 parallel_3.6.1 xfun_0.20
## [21] gridExtra_2.3 withr_2.1.2 stringr_1.4.0 dplyr_1.0.2
## [25] knitr_1.31 generics_0.1.0 vctrs_0.3.2 grid_3.6.1
## [29] tidyselect_1.1.0 ellipse_0.4.1 glue_1.4.1 R6_2.4.0
## [33] snow_0.4-3 rARPACK_0.11-0 rmarkdown_2.7 reshape2_1.4.3
## [37] tidyr_1.1.0 purrr_0.3.4 corpcor_1.6.9 magrittr_1.5
## [41] codetools_0.2-16 scales_1.0.0 ellipsis_0.2.0.1 htmltools_0.5.1.1
## [45] colorspace_1.4-1 labeling_0.3 stringi_1.4.3 lazyeval_0.2.2
## [49] munsell_0.5.0 crayon_1.3.4