Exploratory Data Analysis (EDA)

We will start with reading and imputing missing values using median imputation:

#Uncomment this in case you can not change your working directory
#current_dir<-getwd()
#setwd(paste0(current_dir,"/session_ml/SupervisedOMICsIntegration"))

expr <- as.data.frame(t(read.delim("CLL_mRNA.txt", header = TRUE, sep="\t")))
for(i in 1:ncol(expr)){expr[,i][is.na(expr[,i])]<-median(expr[,i],na.rm=TRUE)}
expr[1:5,1:5]
##      ENSG00000244734 ENSG00000158528 ENSG00000198478 ENSG00000175445
## H045        4.558644       11.741854        8.921456       12.686458
## H109        2.721512       13.287432        2.721512       10.925985
## H024        9.938456        2.341006       12.381452        1.528848
## H056       13.278004        3.232874        8.106266        1.528848
## H079        6.086874       11.940820        4.889503       13.340588
##      ENSG00000174469
## H045        2.644946
## H109       12.648355
## H024        1.528848
## H056       13.565210
## H079        5.476914
mut <- as.data.frame(t(read.delim("CLL_Mutations.txt", header = TRUE, sep="\t")))
for(i in 1:ncol(mut)){mut[,i][is.na(mut[,i])]<-median(mut[,i],na.rm=TRUE)}
mut[1:5,1:5]
##      gain2p25.3 gain3q26 del6p21.2 del6q21 del8p12
## H045          0        0         0       0       0
## H109          0        0         0       0       0
## H024          0        0         0       0       0
## H056          0        0         0       0       0
## H079          1        0         0       0       0
meth <- as.data.frame(t(read.delim("CLL_Methylation.txt", header = TRUE, sep="\t")))
for(i in 1:ncol(meth)){meth[,i][is.na(meth[,i])]<-median(meth[,i],na.rm=TRUE)}
meth[1:5,1:5]
##       cg10146935 cg26837773 cg17801765 cg13244315 cg06181703
## H045  1.81108585 -5.1725723  5.4115263 -0.1188251  5.1203838
## H109 -3.99750846  1.5948702  5.4126925  1.0438706  1.2794803
## H024 -2.84431298  0.1611705  0.3657059 -4.2192362  0.7211004
## H056 -3.33865611 -2.0934326  0.3736342 -1.5921965  4.0470594
## H079 -0.01936203  3.7489796  5.4120096  1.4164183  5.2374225
drug <- as.data.frame(t(read.delim("CLL_Drugs.txt", header = TRUE, sep="\t")))
for(i in 1:ncol(drug)){drug[,i][is.na(drug[,i])]<-median(drug[,i],na.rm=TRUE)}
drug[1:5,1:5]
##         D_001_1    D_001_2   D_001_3   D_001_4   D_001_5
## H045 0.02363938 0.04623274 0.3187471 0.8237027 0.8962777
## H109 0.07359900 0.10623002 0.2732891 0.7171379 0.8850003
## H024 0.04426905 0.08425647 0.3187606 0.7797544 0.8974880
## H056 0.05813930 0.09022028 0.2322145 0.7225736 0.7957497
## H079 0.02042077 0.04750543 0.3638962 0.8073907 0.8794886

Now let us have a look at the phenotypic data, extract Gender and convert it to a variable Y that is going to be used later when running PLS-DA analysis:

phen <- read.delim("CLL_Covariates.txt", header = TRUE, sep="\t")
head(phen)
##      Gender Diagnosis
## H045      m       CLL
## H109      m       CLL
## H024      m       CLL
## H056      m       CLL
## H079      m       CLL
## H164      f       CLL
Y<-factor(phen$Gender)
summary(Y)
##   f   m 
##  79 121

Let us split the data set into train and test sub-sets. We select 60 samples (30%) for testing. Since we have more males than females (fraction of females is approximately 0.4) we select 36 males and 24 females.

set.seed(1234)
female_sample<-rownames(expr)[as.character(phen$Gender)=="f"][sample(1:length(rownames(expr)[as.character(phen$Gender)=="f"]),24)]
female_sample
##  [1] "H102" "H040" "H165" "H018" "H101" "H096" "H033" "H210" "H225" "H170"
## [11] "H181" "H182" "H204" "H023" "H051" "H031" "H209" "H035" "H081" "H029"
## [21] "H014" "H186" "H166" "H167"
male_sample<-rownames(expr)[as.character(phen$Gender)=="m"][sample(1:length(rownames(expr)[as.character(phen$Gender)=="m"]),36)]
male_sample
##  [1] "H016" "H044" "H032" "H055" "H254" "H024" "H183" "H030" "H252" "H272"
## [11] "H231" "H107" "H094" "H109" "H260" "H078" "H080" "H208" "H072" "H020"
## [21] "H059" "H235" "H206" "H202" "H106" "H087" "H049" "H104" "H038" "H037"
## [31] "H250" "H191" "H017" "H176" "H070" "H178"
expr_test<-expr[match(c(female_sample,male_sample),rownames(expr)),]
expr_test[1:5,1:5]
##      ENSG00000244734 ENSG00000158528 ENSG00000198478 ENSG00000175445
## H102        7.320581       12.434167        6.783195       11.336763
## H040        4.769423        2.381844        2.381844        5.459589
## H165        2.529650        2.529650        4.073655       10.346617
## H018        4.184367        1.528848        8.195382        2.911501
## H101        7.049108       11.335080        5.101373        9.256292
##      ENSG00000174469
## H102        8.629901
## H040       10.952705
## H165        8.363989
## H018       12.530168
## H101        7.540800
meth_test<-meth[match(c(female_sample,male_sample),rownames(meth)),]
meth_test[1:5,1:5]
##       cg10146935  cg26837773 cg17801765 cg13244315  cg06181703
## H102  0.02408192 -2.66149768  4.7975506 -2.0793298 -0.09300816
## H040 -4.40691148 -5.22192499  5.3746764  3.2587619  3.01769727
## H165  0.59337171 -0.03963969  0.5020684  3.8726517  3.49487818
## H018 -3.67883824 -3.52009661  2.0600774 -1.0636796  0.37493793
## H101 -2.91823259  4.62748491  1.7367315  0.1692539  5.36798425
mut_test<-mut[match(c(female_sample,male_sample),rownames(mut)),]
mut_test[1:5,1:5]
##      gain2p25.3 gain3q26 del6p21.2 del6q21 del8p12
## H102          0        0         0       0       0
## H040          0        0         0       0       0
## H165          1        0         0       0       0
## H018          0        0         0       0       0
## H101          0        0         0       0       0
drug_test<-drug[match(c(female_sample,male_sample),rownames(drug)),]
drug_test[1:5,1:5]
##         D_001_1    D_001_2   D_001_3   D_001_4   D_001_5
## H102 0.08906634 0.13920632 0.2337110 0.7513949 0.7672126
## H040 0.01908389 0.02544977 0.4947984 0.9673132 1.0000992
## H165 0.03234895 0.04840093 0.2723969 0.7585817 0.9079429
## H018 0.04426905 0.08425647 0.3187606 0.7797544 0.8974880
## H101 0.07350703 0.12265018 0.2543330 0.7255191 0.7361780
Y.test<-Y[match(c(female_sample,male_sample),rownames(phen))]
Y.test
##  [1] f f f f f f f f f f f f f f f f f f f f f f f f m m m m m m m m m m m m m m
## [39] m m m m m m m m m m m m m m m m m m m m m m
## Levels: f m
summary(Y.test)
##  f  m 
## 24 36
length(Y.test)
## [1] 60
expr<-expr[!rownames(expr)%in%rownames(expr_test),]
meth<-meth[!rownames(meth)%in%rownames(meth_test),]
mut<-mut[!rownames(mut)%in%rownames(mut_test),]
drug<-drug[!rownames(drug)%in%rownames(drug_test),]

Y<-Y[!rownames(phen)%in%c(female_sample,male_sample)]
Y
##   [1] m m m f m m m m m m m m m f f f f m f m m m m m m f m f m m f m m m f f f
##  [38] m m m f m m f f m m m m f f m m f m f f f m f m m m m f f f m f m f m f m
##  [75] f f m f f m m f m f m m m f m m m f m f f m m m m f m m m m m f m f m m f
## [112] f m f f f m m f f f m m m f m m m f m f m f m m m m f m f
## Levels: f m
length(Y)
## [1] 140
summary(Y)
##  f  m 
## 55 85

Since mutations represent a binary data, there is always a lack of variation due to coding with 0 and 1. Therefore, we will prefilter the mutation matrix by excluding sites with variance across individuals close to zero:

library("mixOmics")
## Loading required package: MASS
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Loaded mixOmics 6.10.1
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us:  citation('mixOmics')
library("matrixStats")
my_nearZeroVar<-nearZeroVar(mut)
head(my_nearZeroVar$Metrics)
##            freqRatio percentUnique
## gain2p25.3  27.00000      1.428571
## gain3q26    69.00000      1.428571
## del6p21.2   69.00000      1.428571
## del6q21     22.33333      1.428571
## del8p12     45.66667      1.428571
## gain8q24    27.00000      1.428571
dim(my_nearZeroVar$Metrics)
## [1] 59  2
mut <- mut[,-which(colnames(mut)%in%rownames(my_nearZeroVar$Metrics))]
mut[1:5,1:5]
##      del11q22.3 trisomy12 del13q14_any del17p13 BRAF
## H045          1         0            0        0    0
## H056          0         0            1        0    0
## H079          1         0            0        0    0
## H164          1         0            1        0    0
## H113          0         1            1        0    0
dim(mut)
## [1] 140  10
mut_test<-subset(mut_test,select=colnames(mut))

Therefore we end up with just a few mutations which have enough variation for the PLS-DA model. Later perhaps it makes sense to include all of them and not select informative ones using the sparse PLS-DA algorithm.

Feature Selection for OMICs Integration

Right now we can not integrate the OMICs data sets since at least expression and methylation data are high-dimensional so we need to perform a feature selection for those two OMICs. Here we use LASSO for doing feature selection for gene expression data:

library("glmnet")
## Loading required package: Matrix
## Loaded glmnet 4.1-1
lasso_fit <- cv.glmnet(as.matrix(expr), Y, family = "binomial", alpha = 1)
plot(lasso_fit)

coef <- predict(lasso_fit, s = "lambda.min", type = "nonzero")
colnames(expr)[unlist(coef)]
##  [1] "ENSG00000198822" "ENSG00000176769" "ENSG00000150457" "ENSG00000184368"
##  [5] "ENSG00000133134" "ENSG00000235268" "ENSG00000001626" "ENSG00000083290"
##  [9] "ENSG00000172139" "ENSG00000140403" "ENSG00000136205" "ENSG00000186871"
## [13] "ENSG00000106633" "ENSG00000147113" "ENSG00000204291" "ENSG00000067798"
## [17] "ENSG00000162551" "ENSG00000049246" "ENSG00000196739" "ENSG00000088053"
## [21] "ENSG00000149150" "ENSG00000162627" "ENSG00000185666" "ENSG00000167701"
## [25] "ENSG00000187608" "ENSG00000169330" "ENSG00000106025" "ENSG00000240403"
## [29] "ENSG00000130508" "ENSG00000142515" "ENSG00000213160" "ENSG00000078114"
result_expr <- data.frame(GENE = names(as.matrix(coef(lasso_fit, s = "lambda.min"))
                                [as.matrix(coef(lasso_fit, s = "lambda.min"))[,1]!=0, 1])[-1], 
                   SCORE = as.numeric(as.matrix(coef(lasso_fit, s = "lambda.min"))
                                      [as.matrix(coef(lasso_fit, 
                                                      s = "lambda.min"))[,1]!=0, 1])[-1])
result_expr <- result_expr[order(-abs(result_expr$SCORE)),]
print(head(result_expr,10))
##               GENE      SCORE
## 14 ENSG00000147113 -1.4122016
## 4  ENSG00000184368 -1.0724494
## 12 ENSG00000186871 -0.8856169
## 29 ENSG00000130508  0.3822835
## 21 ENSG00000149150 -0.3387733
## 23 ENSG00000185666  0.3002490
## 2  ENSG00000176769  0.2667967
## 30 ENSG00000142515  0.2633780
## 20 ENSG00000088053 -0.2163359
## 25 ENSG00000187608 -0.1967797
print(as.character(result_expr$GENE))
##  [1] "ENSG00000147113" "ENSG00000184368" "ENSG00000186871" "ENSG00000130508"
##  [5] "ENSG00000149150" "ENSG00000185666" "ENSG00000176769" "ENSG00000142515"
##  [9] "ENSG00000088053" "ENSG00000187608" "ENSG00000162627" "ENSG00000240403"
## [13] "ENSG00000235268" "ENSG00000150457" "ENSG00000169330" "ENSG00000001626"
## [17] "ENSG00000213160" "ENSG00000140403" "ENSG00000198822" "ENSG00000106025"
## [21] "ENSG00000136205" "ENSG00000083290" "ENSG00000167701" "ENSG00000204291"
## [25] "ENSG00000162551" "ENSG00000196739" "ENSG00000078114" "ENSG00000172139"
## [29] "ENSG00000049246" "ENSG00000067798" "ENSG00000133134" "ENSG00000106633"
expr <- subset(expr, select = as.character(result_expr$GENE))
expr_test<-subset(expr_test,select=colnames(expr))

ens2gs<-read.delim("ENSEMBLE_TO_GENE_SYMBOL_COORD.txt",header=TRUE,sep="\t")
ens2gs<-ens2gs[match(colnames(expr),as.character(ens2gs$ensembl_gene_id)),]
colnames(expr)<-ens2gs$external_gene_name
colnames(expr_test)<-ens2gs$external_gene_name

We have also converted the Ensembl gene IDs into gene names on the fly for clarity. Next we proceed with LASSO feature selection for methylation data:

library("glmnet")
lasso_fit <- cv.glmnet(as.matrix(meth), Y, family = "binomial", alpha = 1)
plot(lasso_fit)