import numpy as np
import pandas as pd
import scanpy as sc
import gseapy
import matplotlib.pyplot as plt
import warnings
import os
import urllib.request
="ignore", category=Warning)
warnings.simplefilter(action
# verbosity: errors (0), warnings (1), info (2), hints (3)
= 2
sc.settings.verbosity
=80) sc.settings.set_figure_params(dpi
Code chunks run Python commands unless it starts with %%bash
, in which case, those chunks run shell commands.
In this tutorial we will cover differential gene expression, which comprises an extensive range of topics and methods. In single cell, differential expresison can have multiple functionalities such as identifying marker genes for cell populations, as well as identifying differentially regulated genes across conditions (healthy vs control). We will also cover controlling batch effect in your test.
Differential expression is performed with the function rank_genes_group. The default method to compute differential expression is the t-test_overestim_var. Other implemented methods are: logreg, t-test and wilcoxon.
By default, the .raw attribute of AnnData is used in case it has been initialized, it can be changed by setting use_raw=False.
The clustering with resolution 0.6 seems to give a reasonable number of clusters, so we will use that clustering for all DE tests.
First, let’s import libraries and fetch the clustered data from the previous lab.
Read in the clustered data object.
# download pre-computed data if missing or long compute
= True
fetch_data
# url for source and intermediate data
= "https://export.uppmax.uu.se/naiss2023-23-3/workshops/workshop-scrnaseq"
path_data
= "data/covid/results"
path_results if not os.path.exists(path_results):
=True)
os.makedirs(path_results, exist_ok
# path_file = "data/covid/results/scanpy_covid_qc_dr_scanorama_cl.h5ad"
= "data/covid/results/scanpy_covid_qc_dr_scanorama_cl.h5ad"
path_file if fetch_data and not os.path.exists(path_file):
urllib.request.urlretrieve(os.path.join('covid/results/scanpy_covid_qc_dr_scanorama_cl.h5ad'), path_file)
path_data,
= sc.read_h5ad(path_file)
adata adata
AnnData object with n_obs × n_vars = 7222 × 2626
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'phase', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'leiden_1.0', 'leiden_0.6', 'leiden_0.4', 'leiden_1.4', 'louvain_1.0', 'louvain_0.6', 'louvain_0.4', 'louvain_1.4', 'kmeans5', 'kmeans10', 'kmeans15', 'hclust_5', 'hclust_10', 'hclust_15'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'dendrogram_leiden_0.6', 'dendrogram_louvain_0.6', 'doublet_info_colors', 'hclust_10_colors', 'hclust_15_colors', 'hclust_5_colors', 'hvg', 'kmeans10_colors', 'kmeans15_colors', 'kmeans5_colors', 'leiden', 'leiden_0.4_colors', 'leiden_0.6_colors', 'leiden_1.0_colors', 'leiden_1.4_colors', 'log1p', 'louvain', 'louvain_0.4_colors', 'louvain_0.6_colors', 'louvain_1.0_colors', 'louvain_1.4_colors', 'neighbors', 'pca', 'phase_colors', 'sample_colors', 'tsne', 'umap'
obsm: 'Scanorama', 'X_pca', 'X_tsne', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
print(adata.X.shape)
print(adata.raw.X.shape)
print(adata.raw.X[:10,:10])
(7222, 2626)
(7222, 19468)
(1, 4) 0.7825693876867097
(8, 7) 1.1311041336746985
As you can see, the X matrix only contains the variable genes, while the raw matrix contains all genes.
Printing a few of the values in adata.raw.X shows that the raw matrix is normalized.
For DGE analysis we would like to run with all genes, on normalized values, so we will have to revert back to the raw matrix. In case you have raw counts in the matrix you also have to renormalize and logtransform.
= adata.raw.to_adata() adata
Now lets look at the clustering of the object we loaded in the umap. We will use louvain_0.6 clustering in this exercise.
='louvain_0.6') sc.pl.umap(adata, color
1 T-test
'louvain_0.6', method='t-test', key_added = "t-test")
sc.tl.rank_genes_groups(adata, =25, sharey=False, key = "t-test")
sc.pl.rank_genes_groups(adata, n_genes
# results are stored in the adata.uns["t-test"] slot
adata
ranking genes
finished (0:00:02)
AnnData object with n_obs × n_vars = 7222 × 19468
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'phase', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'leiden_1.0', 'leiden_0.6', 'leiden_0.4', 'leiden_1.4', 'louvain_1.0', 'louvain_0.6', 'louvain_0.4', 'louvain_1.4', 'kmeans5', 'kmeans10', 'kmeans15', 'hclust_5', 'hclust_10', 'hclust_15'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'
uns: 'dendrogram_leiden_0.6', 'dendrogram_louvain_0.6', 'doublet_info_colors', 'hclust_10_colors', 'hclust_15_colors', 'hclust_5_colors', 'hvg', 'kmeans10_colors', 'kmeans15_colors', 'kmeans5_colors', 'leiden', 'leiden_0.4_colors', 'leiden_0.6_colors', 'leiden_1.0_colors', 'leiden_1.4_colors', 'log1p', 'louvain', 'louvain_0.4_colors', 'louvain_0.6_colors', 'louvain_1.0_colors', 'louvain_1.4_colors', 'neighbors', 'pca', 'phase_colors', 'sample_colors', 'tsne', 'umap', 't-test'
obsm: 'Scanorama', 'X_pca', 'X_tsne', 'X_umap'
obsp: 'connectivities', 'distances'
2 T-test overestimated_variance
'louvain_0.6', method='t-test_overestim_var', key_added = "t-test_ov")
sc.tl.rank_genes_groups(adata, =25, sharey=False, key = "t-test_ov") sc.pl.rank_genes_groups(adata, n_genes
ranking genes
finished (0:00:01)
3 Wilcoxon rank-sum
The result of a Wilcoxon rank-sum (Mann-Whitney-U) test is very similar. We recommend using the latter in publications, see e.g., Sonison & Robinson (2018). You might also consider much more powerful differential testing packages like MAST, limma, DESeq2 and, for python, the recent diffxpy.
'louvain_0.6', method='wilcoxon', key_added = "wilcoxon")
sc.tl.rank_genes_groups(adata, =25, sharey=False, key="wilcoxon") sc.pl.rank_genes_groups(adata, n_genes
ranking genes
finished (0:00:08)
4 Logistic regression test
As an alternative, let us rank genes using logistic regression. For instance, this has been suggested by Natranos et al. (2018). The essential difference is that here, we use a multi-variate appraoch whereas conventional differential tests are uni-variate. Clark et al. (2014) has more details.
'louvain_0.6', method='logreg',key_added = "logreg")
sc.tl.rank_genes_groups(adata, =25, sharey=False, key = "logreg") sc.pl.rank_genes_groups(adata, n_genes
ranking genes
finished (0:00:30)
5 Compare genes
Take all significant DE genes for cluster0 with each test and compare the overlap.
#compare cluster1 genes, only stores top 100 by default
= sc.get.rank_genes_groups_df(adata, group='0', key='wilcoxon', pval_cutoff=0.01, log2fc_min=0)['names']
wc = sc.get.rank_genes_groups_df(adata, group='0', key='t-test', pval_cutoff=0.01, log2fc_min=0)['names']
tt = sc.get.rank_genes_groups_df(adata, group='0', key='t-test_ov', pval_cutoff=0.01, log2fc_min=0)['names']
tt_ov
from matplotlib_venn import venn3
set(wc),set(tt),set(tt_ov)], ('Wilcox','T-test','T-test_ov') )
venn3([ plt.show()
As you can see, the Wilcoxon test and the T-test with overestimated variance gives very similar result. Also the regular T-test has good overlap.
6 Visualization
There are several ways to visualize the expression of top DE genes. Here we will plot top 5 genes per cluster from Wilcoxon test as heatmap, dotplot, violin plot or matrix.
=5, key="wilcoxon", groupby="louvain_0.6", show_gene_labels=True)
sc.pl.rank_genes_groups_heatmap(adata, n_genes=5, key="wilcoxon", groupby="louvain_0.6")
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, key="wilcoxon", groupby="louvain_0.6")
sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=5, key="wilcoxon", groupby="louvain_0.6") sc.pl.rank_genes_groups_matrixplot(adata, n_genes
7 Compare specific clusters
We can also do pairwise comparisons of individual clusters on one vs many clusters. For instance, clusters 1 & 2 have very similar expression profiles.
'louvain_0.6', groups=['1'], reference='2', method='wilcoxon')
sc.tl.rank_genes_groups(adata, =['1'], n_genes=20) sc.pl.rank_genes_groups(adata, groups
ranking genes
finished (0:00:01)
Plot as violins for those two groups.
='1', n_genes=10)
sc.pl.rank_genes_groups_violin(adata, groups
# plot the same genes as violins across all the datasets.
# convert numpy.recarray to list
= [x[0] for x in adata.uns['rank_genes_groups']['names'][:10]]
mynames = 'louvain_0.6') sc.pl.stacked_violin(adata, mynames, groupby
8 DGE across conditions
The second way of computing differential expression is to answer which genes are differentially expressed within a cluster. For example, in our case we have libraries comming from patients and controls and we would like to know which genes are influenced the most in a particular cell type. For this end, we will first subset our data for the desired cell cluster, then change the cell identities to the variable of comparison (which now in our case is the type, e.g. Covid/Ctrl).
= adata[adata.obs['louvain_0.6'] == '4',:]
cl1 'type'].value_counts()
cl1.obs[
'type', method='wilcoxon', key_added = "wilcoxon")
sc.tl.rank_genes_groups(cl1, =25, sharey=False, key="wilcoxon") sc.pl.rank_genes_groups(cl1, n_genes
ranking genes
finished (0:00:01)
=10, key="wilcoxon") sc.pl.rank_genes_groups_violin(cl1, n_genes
We can also plot these genes across all clusters, but split by “type”, to check if the genes are also up/downregulated in other celltypes.
import seaborn as sns
= sc.get.rank_genes_groups_df(cl1, group='Covid', key='wilcoxon')['names'][:5]
genes1 = sc.get.rank_genes_groups_df(cl1, group='Ctrl', key='wilcoxon')['names'][:5]
genes2 = genes1.tolist() + genes2.tolist()
genes = sc.get.obs_df(adata, genes + ['louvain_0.6','type'], use_raw=False)
df = df.melt(id_vars=["louvain_0.6",'type'], value_vars=genes)
df2
= "louvain_0.6", y = "value", hue = "type", kind = 'violin', col = "variable", data = df2, col_wrap=4, inner=None) sns.catplot(x
As you can see, we have many sex chromosome related genes among the top DE genes. And if you remember from the QC lab, we have inbalanced sex distribution among our subjects, so this may not be related to covid at all.
8.1 Remove sex chromosome genes
To remove some of the bias due to inbalanced sex in the subjects we can remove the sex chromosome related genes.
= sc.queries.biomart_annotations(
annot "hsapiens",
"ensembl_gene_id", "external_gene_name", "start_position", "end_position", "chromosome_name"],
["external_gene_name")
).set_index(
= adata.var_names.intersection(annot.index[annot.chromosome_name == "Y"])
chrY_genes = adata.var_names.intersection(annot.index[annot.chromosome_name == "X"])
chrX_genes
= chrY_genes.union(chrX_genes)
sex_genes print(len(sex_genes))
= cl1.var.index.tolist()
all_genes print(len(all_genes))
= [x for x in all_genes if x not in sex_genes]
keep_genes print(len(keep_genes))
= cl1[:,keep_genes] cl1
551
19468
18917
Rerun differential expression.
'type', method='wilcoxon', key_added = "wilcoxon")
sc.tl.rank_genes_groups(cl1, =25, sharey=False, key="wilcoxon") sc.pl.rank_genes_groups(cl1, n_genes
ranking genes
finished (0:00:01)
8.2 Patient batch effects
When we are testing for Covid vs Control we are running a DGE test for 3 vs 3 individuals. That will be very sensitive to sample differences unless we find a way to control for it. So first, lets check how the top DGEs are expressed across the individuals:
= sc.get.rank_genes_groups_df(cl1, group='Covid', key='wilcoxon')['names'][:5]
genes1 = sc.get.rank_genes_groups_df(cl1, group='Ctrl', key='wilcoxon')['names'][:5]
genes2 = genes1.tolist() + genes2.tolist()
genes
='sample')
sc.pl.violin(cl1, genes1, groupby='sample') sc.pl.violin(cl1, genes2, groupby
As you can see, many of the genes detected as DGE in Covid are unique to one or 2 patients.
We can also plot the top Covid and top Ctrl genes as a dotplot:
= sc.get.rank_genes_groups_df(cl1, group='Covid', key='wilcoxon')['names'][:20]
genes1 = sc.get.rank_genes_groups_df(cl1, group='Ctrl', key='wilcoxon')['names'][:20]
genes2 = genes1.tolist() + genes2.tolist()
genes
='sample') sc.pl.dotplot(cl1,genes, groupby
Clearly many of the top Covid genes are only high in the covid_17 sample, and not a general feature of covid patients.
This is also the patient with the highest number of cells in this cluster:
'sample'].value_counts() cl1.obs[
sample
covid_17 130
ctrl_5 114
covid_1 109
ctrl_13 65
ctrl_14 62
ctrl_19 57
covid_16 38
covid_15 37
Name: count, dtype: int64
8.3 Subsample
So one obvious thing to consider is an equal amount of cells per individual so that the DGE results are not dominated by a single sample.
So we will downsample to an equal number of cells per sample, in this case 34 cells per sample as it is the lowest number among all samples
= 37
target_cells
= [cl1[cl1.obs['sample'] == s] for s in cl1.obs['sample'].cat.categories]
tmp
for dat in tmp:
if dat.n_obs > target_cells:
=target_cells)
sc.pp.subsample(dat, n_obs
= tmp[0].concatenate(*tmp[1:])
cl1_sub
'sample'].value_counts() cl1_sub.obs[
sample
covid_1 37
covid_15 37
covid_16 37
covid_17 37
ctrl_5 37
ctrl_13 37
ctrl_14 37
ctrl_19 37
Name: count, dtype: int64
'type', method='wilcoxon', key_added = "wilcoxon")
sc.tl.rank_genes_groups(cl1_sub, =25, sharey=False, key="wilcoxon") sc.pl.rank_genes_groups(cl1_sub, n_genes
ranking genes
finished (0:00:00)
= sc.get.rank_genes_groups_df(cl1_sub, group='Covid', key='wilcoxon')['names'][:20]
genes1 = sc.get.rank_genes_groups_df(cl1_sub, group='Ctrl', key='wilcoxon')['names'][:20]
genes2 = genes1.tolist() + genes2.tolist()
genes
='sample') sc.pl.dotplot(cl1,genes, groupby
It looks much better now. But if we look per patient you can see that we still have some genes that are dominated by a single patient. Still, it is often a good idea to control the number of cells from each sample when doing differential expression.
There are many different ways to try and resolve the issue of patient batch effects, however most of them require R packages. These can be run via rpy2 as is demonstraded in this compendium: https://www.sc-best-practices.org/conditions/differential_gene_expression.html
However, we have not included it here as of now. So please have a look at the patient batch effect section in the seurat DGE tutorial where we run EdgeR on pseudobulk and MAST with random effect.
9 Gene Set Analysis (GSA)
9.1 Hypergeometric enrichment test
Having a defined list of differentially expressed genes, you can now look for their combined function using hypergeometric test.
#Available databases : ‘Human’, ‘Mouse’, ‘Yeast’, ‘Fly’, ‘Fish’, ‘Worm’
= gseapy.get_library_name(organism='Human')
gene_set_names print(gene_set_names)
['ARCHS4_Cell-lines', 'ARCHS4_IDG_Coexp', 'ARCHS4_Kinases_Coexp', 'ARCHS4_TFs_Coexp', 'ARCHS4_Tissues', 'Achilles_fitness_decrease', 'Achilles_fitness_increase', 'Aging_Perturbations_from_GEO_down', 'Aging_Perturbations_from_GEO_up', 'Allen_Brain_Atlas_10x_scRNA_2021', 'Allen_Brain_Atlas_down', 'Allen_Brain_Atlas_up', 'Azimuth_2023', 'Azimuth_Cell_Types_2021', 'BioCarta_2013', 'BioCarta_2015', 'BioCarta_2016', 'BioPlanet_2019', 'BioPlex_2017', 'CCLE_Proteomics_2020', 'CORUM', 'COVID-19_Related_Gene_Sets', 'COVID-19_Related_Gene_Sets_2021', 'Cancer_Cell_Line_Encyclopedia', 'CellMarker_Augmented_2021', 'ChEA_2013', 'ChEA_2015', 'ChEA_2016', 'ChEA_2022', 'Chromosome_Location', 'Chromosome_Location_hg19', 'ClinVar_2019', 'DSigDB', 'Data_Acquisition_Method_Most_Popular_Genes', 'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019', 'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019', 'Descartes_Cell_Types_and_Tissue_2021', 'Diabetes_Perturbations_GEO_2022', 'DisGeNET', 'Disease_Perturbations_from_GEO_down', 'Disease_Perturbations_from_GEO_up', 'Disease_Signatures_from_GEO_down_2014', 'Disease_Signatures_from_GEO_up_2014', 'DrugMatrix', 'Drug_Perturbations_from_GEO_2014', 'Drug_Perturbations_from_GEO_down', 'Drug_Perturbations_from_GEO_up', 'ENCODE_Histone_Modifications_2013', 'ENCODE_Histone_Modifications_2015', 'ENCODE_TF_ChIP-seq_2014', 'ENCODE_TF_ChIP-seq_2015', 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X', 'ESCAPE', 'Elsevier_Pathway_Collection', 'Enrichr_Libraries_Most_Popular_Genes', 'Enrichr_Submissions_TF-Gene_Coocurrence', 'Enrichr_Users_Contributed_Lists_2020', 'Epigenomics_Roadmap_HM_ChIP-seq', 'FANTOM6_lncRNA_KD_DEGs', 'GO_Biological_Process_2013', 'GO_Biological_Process_2015', 'GO_Biological_Process_2017', 'GO_Biological_Process_2017b', 'GO_Biological_Process_2018', 'GO_Biological_Process_2021', 'GO_Biological_Process_2023', 'GO_Cellular_Component_2013', 'GO_Cellular_Component_2015', 'GO_Cellular_Component_2017', 'GO_Cellular_Component_2017b', 'GO_Cellular_Component_2018', 'GO_Cellular_Component_2021', 'GO_Cellular_Component_2023', 'GO_Molecular_Function_2013', 'GO_Molecular_Function_2015', 'GO_Molecular_Function_2017', 'GO_Molecular_Function_2017b', 'GO_Molecular_Function_2018', 'GO_Molecular_Function_2021', 'GO_Molecular_Function_2023', 'GTEx_Aging_Signatures_2021', 'GTEx_Tissue_Expression_Down', 'GTEx_Tissue_Expression_Up', 'GTEx_Tissues_V8_2023', 'GWAS_Catalog_2019', 'GWAS_Catalog_2023', 'GeDiPNet_2023', 'GeneSigDB', 'Gene_Perturbations_from_GEO_down', 'Gene_Perturbations_from_GEO_up', 'Genes_Associated_with_NIH_Grants', 'Genome_Browser_PWMs', 'GlyGen_Glycosylated_Proteins_2022', 'HDSigDB_Human_2021', 'HDSigDB_Mouse_2021', 'HMDB_Metabolites', 'HMS_LINCS_KinomeScan', 'HomoloGene', 'HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression', 'HuBMAP_ASCTplusB_augmented_2022', 'HumanCyc_2015', 'HumanCyc_2016', 'Human_Gene_Atlas', 'Human_Phenotype_Ontology', 'IDG_Drug_Targets_2022', 'InterPro_Domains_2019', 'Jensen_COMPARTMENTS', 'Jensen_DISEASES', 'Jensen_TISSUES', 'KEA_2013', 'KEA_2015', 'KEGG_2013', 'KEGG_2015', 'KEGG_2016', 'KEGG_2019_Human', 'KEGG_2019_Mouse', 'KEGG_2021_Human', 'KOMP2_Mouse_Phenotypes_2022', 'Kinase_Perturbations_from_GEO_down', 'Kinase_Perturbations_from_GEO_up', 'L1000_Kinase_and_GPCR_Perturbations_down', 'L1000_Kinase_and_GPCR_Perturbations_up', 'LINCS_L1000_CRISPR_KO_Consensus_Sigs', 'LINCS_L1000_Chem_Pert_Consensus_Sigs', 'LINCS_L1000_Chem_Pert_down', 'LINCS_L1000_Chem_Pert_up', 'LINCS_L1000_Ligand_Perturbations_down', 'LINCS_L1000_Ligand_Perturbations_up', 'Ligand_Perturbations_from_GEO_down', 'Ligand_Perturbations_from_GEO_up', 'MAGMA_Drugs_and_Diseases', 'MAGNET_2023', 'MCF7_Perturbations_from_GEO_down', 'MCF7_Perturbations_from_GEO_up', 'MGI_Mammalian_Phenotype_2013', 'MGI_Mammalian_Phenotype_2017', 'MGI_Mammalian_Phenotype_Level_3', 'MGI_Mammalian_Phenotype_Level_4', 'MGI_Mammalian_Phenotype_Level_4_2019', 'MGI_Mammalian_Phenotype_Level_4_2021', 'MSigDB_Computational', 'MSigDB_Hallmark_2020', 'MSigDB_Oncogenic_Signatures', 'Metabolomics_Workbench_Metabolites_2022', 'Microbe_Perturbations_from_GEO_down', 'Microbe_Perturbations_from_GEO_up', 'MoTrPAC_2023', 'Mouse_Gene_Atlas', 'NCI-60_Cancer_Cell_Lines', 'NCI-Nature_2015', 'NCI-Nature_2016', 'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions', 'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions', 'NIH_Funded_PIs_2017_Human_AutoRIF', 'NIH_Funded_PIs_2017_Human_GeneRIF', 'NURSA_Human_Endogenous_Complexome', 'OMIM_Disease', 'OMIM_Expanded', 'Old_CMAP_down', 'Old_CMAP_up', 'Orphanet_Augmented_2021', 'PFOCR_Pathways', 'PFOCR_Pathways_2023', 'PPI_Hub_Proteins', 'PanglaoDB_Augmented_2021', 'Panther_2015', 'Panther_2016', 'Pfam_Domains_2019', 'Pfam_InterPro_Domains', 'PheWeb_2019', 'PhenGenI_Association_2021', 'Phosphatase_Substrates_from_DEPOD', 'ProteomicsDB_2020', 'Proteomics_Drug_Atlas_2023', 'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO', 'RNAseq_Automatic_GEO_Signatures_Human_Down', 'RNAseq_Automatic_GEO_Signatures_Human_Up', 'RNAseq_Automatic_GEO_Signatures_Mouse_Down', 'RNAseq_Automatic_GEO_Signatures_Mouse_Up', 'Rare_Diseases_AutoRIF_ARCHS4_Predictions', 'Rare_Diseases_AutoRIF_Gene_Lists', 'Rare_Diseases_GeneRIF_ARCHS4_Predictions', 'Rare_Diseases_GeneRIF_Gene_Lists', 'Reactome_2013', 'Reactome_2015', 'Reactome_2016', 'Reactome_2022', 'Rummagene_kinases', 'Rummagene_signatures', 'Rummagene_transcription_factors', 'SILAC_Phosphoproteomics', 'SubCell_BarCode', 'SynGO_2022', 'SysMyo_Muscle_Gene_Sets', 'TF-LOF_Expression_from_GEO', 'TF_Perturbations_Followed_by_Expression', 'TG_GATES_2020', 'TRANSFAC_and_JASPAR_PWMs', 'TRRUST_Transcription_Factors_2019', 'Table_Mining_of_CRISPR_Studies', 'Tabula_Muris', 'Tabula_Sapiens', 'TargetScan_microRNA', 'TargetScan_microRNA_2017', 'The_Kinase_Library_2023', 'Tissue_Protein_Expression_from_Human_Proteome_Map', 'Tissue_Protein_Expression_from_ProteomicsDB', 'Transcription_Factor_PPIs', 'UK_Biobank_GWAS_v1', 'Virus-Host_PPI_P-HIPSTer_2020', 'VirusMINT', 'Virus_Perturbations_from_GEO_down', 'Virus_Perturbations_from_GEO_up', 'WikiPathway_2021_Human', 'WikiPathway_2023_Human', 'WikiPathways_2013', 'WikiPathways_2015', 'WikiPathways_2016', 'WikiPathways_2019_Human', 'WikiPathways_2019_Mouse', 'dbGaP', 'huMAP', 'lncHUB_lncRNA_Co-Expression', 'miRTarBase_2017']
Get the significant DEGs for the Covid patients.
#?gseapy.enrichr
= sc.get.rank_genes_groups_df(cl1_sub, group='Covid', key='wilcoxon', log2fc_min=0.25, pval_cutoff=0.05)['names'].squeeze().str.strip().tolist()
glist print(len(glist))
7
= gseapy.enrichr(gene_list=glist, organism='Human', gene_sets='GO_Biological_Process_2018', cutoff = 0.5)
enr_res enr_res.results.head()
Gene_set | Term | Overlap | P-value | Adjusted P-value | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | |
---|---|---|---|---|---|---|---|---|---|---|
0 | GO_Biological_Process_2018 | positive regulation of inflammatory response (... | 2/73 | 0.000273 | 0.021549 | 0 | 0 | 112.236620 | 921.142995 | NFKBIA;S100A9 |
1 | GO_Biological_Process_2018 | positive regulation of defense response (GO:00... | 2/74 | 0.000280 | 0.021549 | 0 | 0 | 110.672222 | 905.289889 | NFKBIA;S100A9 |
2 | GO_Biological_Process_2018 | positive regulation of response to external st... | 2/90 | 0.000414 | 0.021549 | 0 | 0 | 90.477273 | 704.697251 | NFKBIA;S100A9 |
3 | GO_Biological_Process_2018 | positive regulation of NF-kappaB transcription... | 2/128 | 0.000836 | 0.032592 | 0 | 0 | 63.069841 | 446.990813 | NFKBIA;S100A9 |
4 | GO_Biological_Process_2018 | cellular protein complex assembly (GO:0043623) | 2/144 | 0.001056 | 0.032941 | 0 | 0 | 55.918310 | 383.234256 | HSP90AB1;POMP |
Some databases of interest:
GO_Biological_Process_2017b
KEGG_2019_Human
KEGG_2019_Mouse
WikiPathways_2019_Human
WikiPathways_2019_Mouse
You visualize your results using a simple barplot, for example:
='GO_Biological_Process_2018') gseapy.barplot(enr_res.res2d,title
<Axes: title={'center': 'GO_Biological_Process_2018'}, xlabel='$- \\log_{10}$ (Adjusted P-value)'>
10 Gene Set Enrichment Analysis (GSEA)
Besides the enrichment using hypergeometric test, we can also perform gene set enrichment analysis (GSEA), which scores ranked genes list (usually based on fold changes) and computes permutation test to check if a particular gene set is more present in the Up-regulated genes, among the DOWN_regulated genes or not differentially regulated.
We need a table with all DEGs and their log foldchanges. However, many lowly expressed genes will have high foldchanges and just contribue noise, so also filter for expression in enough cells.
= sc.get.rank_genes_groups_df(cl1_sub, group='Covid', key='wilcoxon')[['names','logfoldchanges']]
gene_rank =['logfoldchanges'], inplace=True, ascending=False)
gene_rank.sort_values(by
# calculate_qc_metrics will calculate number of cells per gene
=None, log1p=False, inplace=True)
sc.pp.calculate_qc_metrics(cl1, percent_top
# filter for genes expressed in at least 30 cells.
= gene_rank[gene_rank['names'].isin(cl1.var_names[cl1.var.n_cells_by_counts>30])]
gene_rank
gene_rank
names | logfoldchanges | |
---|---|---|
526 | SLFN5 | 26.829697 |
368 | CXCL8 | 26.812254 |
209 | EGR1 | 5.063837 |
38 | PPBP | 4.969337 |
211 | PF4 | 4.870691 |
... | ... | ... |
18062 | NXPH4 | -2.804427 |
18449 | MME | -3.049736 |
18380 | DHDDS | -3.202402 |
18282 | KDM1B | -3.256811 |
18607 | ZNF296 | -4.392631 |
7105 rows × 2 columns
Once our list of genes are sorted, we can proceed with the enrichment itself. We can use the package to get gene set from the Molecular Signature Database (MSigDB) and select KEGG pathways as an example.
#Available databases : ‘Human’, ‘Mouse’, ‘Yeast’, ‘Fly’, ‘Fish’, ‘Worm’
= gseapy.get_library_name(organism='Human')
gene_set_names print(gene_set_names)
['ARCHS4_Cell-lines', 'ARCHS4_IDG_Coexp', 'ARCHS4_Kinases_Coexp', 'ARCHS4_TFs_Coexp', 'ARCHS4_Tissues', 'Achilles_fitness_decrease', 'Achilles_fitness_increase', 'Aging_Perturbations_from_GEO_down', 'Aging_Perturbations_from_GEO_up', 'Allen_Brain_Atlas_10x_scRNA_2021', 'Allen_Brain_Atlas_down', 'Allen_Brain_Atlas_up', 'Azimuth_2023', 'Azimuth_Cell_Types_2021', 'BioCarta_2013', 'BioCarta_2015', 'BioCarta_2016', 'BioPlanet_2019', 'BioPlex_2017', 'CCLE_Proteomics_2020', 'CORUM', 'COVID-19_Related_Gene_Sets', 'COVID-19_Related_Gene_Sets_2021', 'Cancer_Cell_Line_Encyclopedia', 'CellMarker_Augmented_2021', 'ChEA_2013', 'ChEA_2015', 'ChEA_2016', 'ChEA_2022', 'Chromosome_Location', 'Chromosome_Location_hg19', 'ClinVar_2019', 'DSigDB', 'Data_Acquisition_Method_Most_Popular_Genes', 'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019', 'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019', 'Descartes_Cell_Types_and_Tissue_2021', 'Diabetes_Perturbations_GEO_2022', 'DisGeNET', 'Disease_Perturbations_from_GEO_down', 'Disease_Perturbations_from_GEO_up', 'Disease_Signatures_from_GEO_down_2014', 'Disease_Signatures_from_GEO_up_2014', 'DrugMatrix', 'Drug_Perturbations_from_GEO_2014', 'Drug_Perturbations_from_GEO_down', 'Drug_Perturbations_from_GEO_up', 'ENCODE_Histone_Modifications_2013', 'ENCODE_Histone_Modifications_2015', 'ENCODE_TF_ChIP-seq_2014', 'ENCODE_TF_ChIP-seq_2015', 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X', 'ESCAPE', 'Elsevier_Pathway_Collection', 'Enrichr_Libraries_Most_Popular_Genes', 'Enrichr_Submissions_TF-Gene_Coocurrence', 'Enrichr_Users_Contributed_Lists_2020', 'Epigenomics_Roadmap_HM_ChIP-seq', 'FANTOM6_lncRNA_KD_DEGs', 'GO_Biological_Process_2013', 'GO_Biological_Process_2015', 'GO_Biological_Process_2017', 'GO_Biological_Process_2017b', 'GO_Biological_Process_2018', 'GO_Biological_Process_2021', 'GO_Biological_Process_2023', 'GO_Cellular_Component_2013', 'GO_Cellular_Component_2015', 'GO_Cellular_Component_2017', 'GO_Cellular_Component_2017b', 'GO_Cellular_Component_2018', 'GO_Cellular_Component_2021', 'GO_Cellular_Component_2023', 'GO_Molecular_Function_2013', 'GO_Molecular_Function_2015', 'GO_Molecular_Function_2017', 'GO_Molecular_Function_2017b', 'GO_Molecular_Function_2018', 'GO_Molecular_Function_2021', 'GO_Molecular_Function_2023', 'GTEx_Aging_Signatures_2021', 'GTEx_Tissue_Expression_Down', 'GTEx_Tissue_Expression_Up', 'GTEx_Tissues_V8_2023', 'GWAS_Catalog_2019', 'GWAS_Catalog_2023', 'GeDiPNet_2023', 'GeneSigDB', 'Gene_Perturbations_from_GEO_down', 'Gene_Perturbations_from_GEO_up', 'Genes_Associated_with_NIH_Grants', 'Genome_Browser_PWMs', 'GlyGen_Glycosylated_Proteins_2022', 'HDSigDB_Human_2021', 'HDSigDB_Mouse_2021', 'HMDB_Metabolites', 'HMS_LINCS_KinomeScan', 'HomoloGene', 'HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression', 'HuBMAP_ASCTplusB_augmented_2022', 'HumanCyc_2015', 'HumanCyc_2016', 'Human_Gene_Atlas', 'Human_Phenotype_Ontology', 'IDG_Drug_Targets_2022', 'InterPro_Domains_2019', 'Jensen_COMPARTMENTS', 'Jensen_DISEASES', 'Jensen_TISSUES', 'KEA_2013', 'KEA_2015', 'KEGG_2013', 'KEGG_2015', 'KEGG_2016', 'KEGG_2019_Human', 'KEGG_2019_Mouse', 'KEGG_2021_Human', 'KOMP2_Mouse_Phenotypes_2022', 'Kinase_Perturbations_from_GEO_down', 'Kinase_Perturbations_from_GEO_up', 'L1000_Kinase_and_GPCR_Perturbations_down', 'L1000_Kinase_and_GPCR_Perturbations_up', 'LINCS_L1000_CRISPR_KO_Consensus_Sigs', 'LINCS_L1000_Chem_Pert_Consensus_Sigs', 'LINCS_L1000_Chem_Pert_down', 'LINCS_L1000_Chem_Pert_up', 'LINCS_L1000_Ligand_Perturbations_down', 'LINCS_L1000_Ligand_Perturbations_up', 'Ligand_Perturbations_from_GEO_down', 'Ligand_Perturbations_from_GEO_up', 'MAGMA_Drugs_and_Diseases', 'MAGNET_2023', 'MCF7_Perturbations_from_GEO_down', 'MCF7_Perturbations_from_GEO_up', 'MGI_Mammalian_Phenotype_2013', 'MGI_Mammalian_Phenotype_2017', 'MGI_Mammalian_Phenotype_Level_3', 'MGI_Mammalian_Phenotype_Level_4', 'MGI_Mammalian_Phenotype_Level_4_2019', 'MGI_Mammalian_Phenotype_Level_4_2021', 'MSigDB_Computational', 'MSigDB_Hallmark_2020', 'MSigDB_Oncogenic_Signatures', 'Metabolomics_Workbench_Metabolites_2022', 'Microbe_Perturbations_from_GEO_down', 'Microbe_Perturbations_from_GEO_up', 'MoTrPAC_2023', 'Mouse_Gene_Atlas', 'NCI-60_Cancer_Cell_Lines', 'NCI-Nature_2015', 'NCI-Nature_2016', 'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions', 'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions', 'NIH_Funded_PIs_2017_Human_AutoRIF', 'NIH_Funded_PIs_2017_Human_GeneRIF', 'NURSA_Human_Endogenous_Complexome', 'OMIM_Disease', 'OMIM_Expanded', 'Old_CMAP_down', 'Old_CMAP_up', 'Orphanet_Augmented_2021', 'PFOCR_Pathways', 'PFOCR_Pathways_2023', 'PPI_Hub_Proteins', 'PanglaoDB_Augmented_2021', 'Panther_2015', 'Panther_2016', 'Pfam_Domains_2019', 'Pfam_InterPro_Domains', 'PheWeb_2019', 'PhenGenI_Association_2021', 'Phosphatase_Substrates_from_DEPOD', 'ProteomicsDB_2020', 'Proteomics_Drug_Atlas_2023', 'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO', 'RNAseq_Automatic_GEO_Signatures_Human_Down', 'RNAseq_Automatic_GEO_Signatures_Human_Up', 'RNAseq_Automatic_GEO_Signatures_Mouse_Down', 'RNAseq_Automatic_GEO_Signatures_Mouse_Up', 'Rare_Diseases_AutoRIF_ARCHS4_Predictions', 'Rare_Diseases_AutoRIF_Gene_Lists', 'Rare_Diseases_GeneRIF_ARCHS4_Predictions', 'Rare_Diseases_GeneRIF_Gene_Lists', 'Reactome_2013', 'Reactome_2015', 'Reactome_2016', 'Reactome_2022', 'Rummagene_kinases', 'Rummagene_signatures', 'Rummagene_transcription_factors', 'SILAC_Phosphoproteomics', 'SubCell_BarCode', 'SynGO_2022', 'SysMyo_Muscle_Gene_Sets', 'TF-LOF_Expression_from_GEO', 'TF_Perturbations_Followed_by_Expression', 'TG_GATES_2020', 'TRANSFAC_and_JASPAR_PWMs', 'TRRUST_Transcription_Factors_2019', 'Table_Mining_of_CRISPR_Studies', 'Tabula_Muris', 'Tabula_Sapiens', 'TargetScan_microRNA', 'TargetScan_microRNA_2017', 'The_Kinase_Library_2023', 'Tissue_Protein_Expression_from_Human_Proteome_Map', 'Tissue_Protein_Expression_from_ProteomicsDB', 'Transcription_Factor_PPIs', 'UK_Biobank_GWAS_v1', 'Virus-Host_PPI_P-HIPSTer_2020', 'VirusMINT', 'Virus_Perturbations_from_GEO_down', 'Virus_Perturbations_from_GEO_up', 'WikiPathway_2021_Human', 'WikiPathway_2023_Human', 'WikiPathways_2013', 'WikiPathways_2015', 'WikiPathways_2016', 'WikiPathways_2019_Human', 'WikiPathways_2019_Mouse', 'dbGaP', 'huMAP', 'lncHUB_lncRNA_Co-Expression', 'miRTarBase_2017']
Next, we will run GSEA. This will result in a table containing information for several pathways. We can then sort and filter those pathways to visualize only the top ones. You can select/filter them by either p-value
or normalized enrichment score (NES
).
= gseapy.prerank(rnk=gene_rank, gene_sets='KEGG_2021_Human')
res
= res.res2d.Term
terms 10] terms[:
0 Cytokine-cytokine receptor interaction
1 AGE-RAGE signaling pathway in diabetic complic...
2 Viral protein interaction with cytokine and cy...
3 Rheumatoid arthritis
4 IL-17 signaling pathway
5 Bladder cancer
6 Chemokine signaling pathway
7 NF-kappa B signaling pathway
8 Legionellosis
9 Chagas disease
Name: Term, dtype: object
=res.ranking, term=terms[0], **res.results[terms[0]]) gseapy.gseaplot(rank_metric
[<Axes: xlabel='Gene Rank', ylabel='Ranked metric'>,
<Axes: >,
<Axes: >,
<Axes: ylabel='Enrichment Score'>]
Which KEGG pathways are upregulated in this cluster? Which KEGG pathways are dowregulated in this cluster? Change the pathway source to another gene set (e.g. CP:WIKIPATHWAYS or CP:REACTOME or CP:BIOCARTA or GO:BP) and check the if you get similar results?
Finally, let’s save the integrated data for further analysis.
'./data/covid/results/scanpy_covid_qc_dr_scanorama_cl_dge.h5ad') adata.write_h5ad(
11 Session info
Click here
sc.logging.print_versions()
-----
anndata 0.10.5.post1
scanpy 1.9.6
-----
PIL 10.0.0
anyio NA
asttokens NA
attr 23.1.0
babel 2.12.1
backcall 0.2.0
certifi 2023.11.17
cffi 1.15.1
charset_normalizer 3.1.0
colorama 0.4.6
comm 0.1.3
cycler 0.12.1
cython_runtime NA
dateutil 2.8.2
debugpy 1.6.7
decorator 5.1.1
defusedxml 0.7.1
exceptiongroup 1.2.0
executing 1.2.0
fastjsonschema NA
future 0.18.3
gmpy2 2.1.2
gseapy 1.0.6
h5py 3.9.0
idna 3.4
igraph 0.10.8
ipykernel 6.23.1
ipython_genutils 0.2.0
jedi 0.18.2
jinja2 3.1.2
joblib 1.3.2
json5 NA
jsonpointer 2.0
jsonschema 4.17.3
jupyter_events 0.6.3
jupyter_server 2.6.0
jupyterlab_server 2.22.1
kiwisolver 1.4.5
leidenalg 0.10.1
llvmlite 0.41.1
louvain 0.8.1
markupsafe 2.1.2
matplotlib 3.8.0
matplotlib_inline 0.1.6
matplotlib_venn 0.11.9
mpl_toolkits NA
mpmath 1.3.0
natsort 8.4.0
nbformat 5.8.0
numba 0.58.1
numpy 1.26.3
opt_einsum v3.3.0
overrides NA
packaging 23.1
pandas 2.2.0
parso 0.8.3
patsy 0.5.6
pexpect 4.8.0
pickleshare 0.7.5
pkg_resources NA
platformdirs 3.5.1
prometheus_client NA
prompt_toolkit 3.0.38
psutil 5.9.5
ptyprocess 0.7.0
pure_eval 0.2.2
pvectorc NA
pybiomart 0.2.0
pycparser 2.21
pydev_ipython NA
pydevconsole NA
pydevd 2.9.5
pydevd_file_utils NA
pydevd_plugins NA
pydevd_tracing NA
pygments 2.15.1
pyparsing 3.1.1
pyrsistent NA
pythonjsonlogger NA
pytz 2023.3
requests 2.31.0
requests_cache 0.4.13
rfc3339_validator 0.1.4
rfc3986_validator 0.1.1
scipy 1.12.0
seaborn 0.13.2
send2trash NA
session_info 1.0.0
six 1.16.0
sklearn 1.4.0
sniffio 1.3.0
socks 1.7.1
sparse 0.15.1
stack_data 0.6.2
statsmodels 0.14.1
sympy 1.12
texttable 1.7.0
threadpoolctl 3.2.0
torch 2.0.0
tornado 6.3.2
tqdm 4.65.0
traitlets 5.9.0
typing_extensions NA
urllib3 2.0.2
wcwidth 0.2.6
websocket 1.5.2
yaml 6.0
zmq 25.0.2
zoneinfo NA
zstandard 0.19.0
-----
IPython 8.13.2
jupyter_client 8.2.0
jupyter_core 5.3.0
jupyterlab 4.0.1
notebook 6.5.4
-----
Python 3.10.11 | packaged by conda-forge | (main, May 10 2023, 18:58:44) [GCC 11.3.0]
Linux-5.15.0-92-generic-x86_64-with-glibc2.35
-----
Session information updated at 2024-02-06 00:41