<- read.table("../../data/counts_vst.txt", header = T, row.names = 1, sep = "\t")
gc_vst <- prcomp(t(gc_vst)) vst_pca
1 PCA
Let us first make a PCA object. For this, we will use the VST
data, because it makes sense to use the normalized data for building the PCA. To run PCA, we use the R function prcomp()
. It takes in a matrix where samples are rows and variables are columns. Therefore, we transpose our count matrix using the function t()
. If we do not transpose, then PCA is run on the genes rather than the samples.
After you computer the PCA, if you type the object vst_pca$
and press TAB
, you will notice that this R object has multiple vecors and data.frames within it. Some of the important ones are
sdev:
the standard deviations of the principal componentsx:
the coordinates of the samples (observations) on the principal components.rotation:
the matrix of variable loadings (columns are eigenvectors).
There are quite a few functions in R from different packages that can run PCA. So, one should look into the structure of the PCA object and import it into ggplot
accordingly!
1.1 Variance of components (Scree plot)
First, let us look into plotting the variance explained by the top PCs.
<- function(x) x^2/sum(x^2)
frac_var library(scales)
$sdev %>%
vst_pcaas_tibble() %>%
frac_var() %>%
mutate(Comp = colnames(vst_pca$x)) %>%
slice(1:9) %>%
ggplot(aes(x=Comp, y = value)) +
geom_bar(stat = "identity", fill = "#4DC5F9") +
geom_hline(yintercept = 0.03, linetype=2) +
xlab("Principal Components") +
scale_y_continuous(name = "Variance Explained", breaks = seq(0,0.8,0.1), labels = percent_format(accuracy = 5L)) +
theme_classic(base_size = 14)
1.2 PCA plot
So, looks like the first two components explain almost 85% of the data. Now, let us look into building a plot out of these components. From the above object, to get the scatter plot for the samples, we need to look into vst_pca$x
. Then, we combine this data (as shown below) with the metadata to use different aesthetics and colors on the plot.
$x vst_pca
PC1 PC2 PC3 PC4 PC5 PC6
Sample_1 -34.641623 -13.435531 6.486032 0.2742477 6.406502 5.6703279
Sample_2 -35.431333 -13.277983 5.171377 0.3540945 3.953216 6.9974341
Sample_3 -35.938795 -14.544994 2.885351 11.3414829 -5.950082 -10.8497916
Sample_4 -20.672358 3.962013 -1.414301 -10.4819194 0.195882 -2.8003847
Sample_5 -21.155503 1.390981 -6.644132 -7.4002617 -6.505502 0.9771040
Sample_6 -22.662075 1.115504 -9.801356 -6.3107519 -7.431833 0.6592987
Sample_7 1.862762 24.449057 12.865650 4.4029501 -5.822040 4.1873901
Sample_8 -5.909698 13.992629 -14.775686 10.9789567 8.967097 2.2612207
Sample_9 -3.233544 17.321871 5.196746 -2.5142966 6.983077 -7.2580550
Sample_10 60.630406 -5.930071 6.993097 -5.7393184 4.254238 -2.6905179
Sample_11 56.669696 -10.638879 -5.860598 -0.4646609 5.216763 -1.3748865
Sample_12 60.482067 -4.404596 -1.102179 5.5594769 -10.267317 4.2208603
PC7 PC8 PC9 PC10 PC11 PC12
Sample_1 -7.6896364 -7.5508580 1.1936551 0.2382806 0.7901002 4.899206e-14
Sample_2 5.8962070 9.0662471 1.0449651 -0.2284545 -1.3982703 5.084128e-14
Sample_3 1.4353811 -0.7461021 -1.4769949 -0.2114041 0.5247079 4.830164e-14
Sample_4 1.2992058 -2.3863551 -6.1089574 0.1510797 -9.1851796 4.973973e-14
Sample_5 -2.7487548 2.7353990 -4.5686381 6.6024251 6.9911571 4.892961e-14
Sample_6 2.1391551 -2.3567136 7.7925047 -6.9400619 1.3499685 5.057196e-14
Sample_7 5.2166866 -4.0805978 -0.2292610 1.4574679 1.2200771 5.066433e-14
Sample_8 -0.1684735 0.1448735 -3.9822133 -2.5610601 0.5182664 4.997044e-14
Sample_9 -5.4663572 5.2056738 6.4295279 1.6062665 -0.5133496 4.986636e-14
Sample_10 0.8277618 0.5767198 -4.9156730 -6.8596253 4.8748004 4.900247e-14
Sample_11 5.8886122 -3.9566331 4.4374522 6.9835771 -0.4457707 4.726081e-14
Sample_12 -6.6297877 3.3483466 0.3836326 -0.2384911 -4.7265074 5.044576e-14
And, if you check the class()
of this object, you will realize that this is a matrix
. To be able to comfortably use tidyverse
on this object, we must first convert this to a data.frame
.
<- vst_pca$x %>%
vst_pca_all as.data.frame() %>%
rownames_to_column(var = "Sample_ID") %>%
full_join(md, by = "Sample_ID")
# Just to keep the order the right way.
$Sample_Name <- factor(vst_pca_all$Sample_Name, levels = c("t0_A","t0_B","t0_C","t2_A","t2_B","t2_C","t6_A","t6_B","t6_C","t24_A","t24_B","t24_C"))
vst_pca_all$Time <- factor(vst_pca_all$Time, levels = c("t0","t2","t6","t24"))
vst_pca_all$Replicate <- factor(vst_pca_all$Replicate, levels = c("A","B","C"))
vst_pca_all
ggplot(vst_pca_all, aes(x=PC1, y=PC2, color = Time)) +
geom_point(size = 3, aes(shape = Replicate)) +
geom_vline(xintercept = 0, linetype=2) +
geom_hline(yintercept = 0, linetype=2) +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
1.3 Loading plot
Now, let us say you want to plot the four genes that contribute the most to the four directions in the PCA plot. We could obtain them by looking at the vst_pca$rotation
matrix. We could get those genes and their respective coordinates as follows.
=vst_pca$rotation[c(which.max(vst_pca$rotation[,"PC1"]), which.min(vst_pca$rotation[,"PC1"]), which.max(vst_pca$rotation[,"PC2"]), which.min(vst_pca$rotation[,"PC2"])),c("PC1","PC2")]
genes.selected<- genes.selected %>%
genes.selected as.data.frame() %>%
rownames_to_column(var = "Gene_ID")
genes.selected
A loading plot shows how strongly each variable (gene) influences a principal component. As an example, we could plot the four genes we selected.
2 PCA bi-plot
By merging the PCA plot with the loadings plot one can create a so-called PCA bi-plot.
=500
scaleggplot(data=vst_pca_all, mapping=aes(x=PC1, y=PC2)) +
geom_point(size = 3, aes(shape = Replicate, color = Time)) +
geom_vline(xintercept = 0, linetype=2) +
geom_hline(yintercept = 0, linetype=2) +
geom_segment(data=genes.selected, mapping=aes(xend=scale*PC1,yend=scale*PC2), x=0, y=0, arrow=arrow(), color="grey") +
geom_label(data=genes.selected, mapping=aes(x=scale*PC1,y=scale*PC2, label=Gene_ID), size=2, hjust="outward", vjust="outward") +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
Similarly, let us say, you have environmental variables (continous variables like pH and so on) from the same samples and you would like to see how they would fit this bi-plot. One can use envfit()
function from the vegan
package, this function would return both the p-value
and the coordinates
of each of the variables in your environmental matrix. Then you could subset the significant variables and plot them in the same way as above.