Set ~/RNAseq/labs/ as your working directory (i.e. your working directory should be the directory where you saved this .Rmd file).

Load R packages.

# BiocManager::install("GEOquery")
library(GEOquery)
library(Biobase)
library(rafalib)
rafalib::mypar(mar=c(6,2.5,2.5,1)) #sets nice arrangement for the whole document

# source download function
source("https://raw.githubusercontent.com/NBISweden/workshop-RNAseq/master/assets/scripts.R")

0.1 Loading a dataset from GEO using GEOquery

For this exercise, let’s try getting the full count matrix from the original GSE131032 dataset.

In some cases the simple getGEO will download the data, metadata and gene annotation into a ExpressionSet format, where you can then extract the information about the gene counts, metadata and gene annotations.

#Get the GSE object from it
gset <- GEOquery::getGEO("GSE131032")[[1]]
gset

#Get the count matrix from the GSE objetc
data <- gset@assayData$exprs
dim(data)

#Get the sample metadata / phenotypic from the GSE object 
metadata <- gset@phenoData@data
dim(metadata)

#Get the gene annotation data from the GSE object 
annot <- gset@featureData@data
dim(annot)

However, in most cases (in our experience) that is not the case since every dataset is a bit unique. So we recommend checking the files deposited individually and download what is most relevant for you. We can first list the files deposited:

file_list <- GEOquery::getGEOSuppFiles("GSE131032",fetch_files = F)
print(file_list)

You can see the RAW.tar file, which contains the original count file deposited individually for every sample (under the GSM). The command below will dowlaod the file into a folder with the GSE name. This is a .tar compressed file that can be extracted.

GEOquery::getGEOSuppFiles("GSE131032",fetch_files = T,filter_regex = "_RAW.tar")
list.files("GSE131032")

#extract tar using bash (within R, but can be done outside too)
system("cd GSE131032; tar -xvf GSE131032_RAW.tar")
system("cd GSE131032; rm GSE131032_RAW.tar")

list.files("GSE131032")

system("cd GSE131032; gunzip *.gz")

Having the individual files, we can combine them using a for loop or apply function:

#list the files
list.files("GSE131032")

#Check the column names from the 1st file
head( read.delim(paste0("GSE131032/",list.files("GSE131032")[1])) )

#Do an apply function to get all the estimated counts from each file and compile into a single matrix
data <- sapply(list.files("GSE131032",pattern = ".tsv"), function(x){
  x <- read.delim(paste0("GSE131032/",x))[,"est_counts"]
  return(x)
})
rownames(data)
colnames(data)

#Add the gene names and sample names
rownames(data) <- read.delim(paste0("GSE131032/",list.files("GSE131032")[1]))[,"target_id"]
colnames(data) <- sub("_.*","",colnames(data))

#Check the column names from the 1st file
rownames(metadata) == colnames(data)

#Rename data
colnames(data) <- metadata$title
head(data)

Alternativelly, in the file_list above we could also simply load the count matrix.

#list files available
print(file_list)

#Download  and extract the full matrix
GEOquery::getGEOSuppFiles("GSE131032",fetch_files = T,filter_regex = "GSE131032_kallisto_counts.csv.gz")
system("cd GSE131032; gunzip GSE131032_kallisto_counts.csv.gz")

#Read the file (note that the sample names here are not the GSM accesccion, but the investigator's IDs)
data <- read.delim(paste0("GSE131032/GSE131032_kallisto_counts.csv"),sep = ",",row.names = 1)
colnames(data)

#Those sample IDs are stored in one of the metadata columns
metadata$description
metadata$description == colnames(data)

#Rename data
colnames(data) <- metadata$title
head(data)