import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import warnings
import os
import urllib.request
="ignore", category=Warning)
warnings.simplefilter(action
# verbosity: errors (0), warnings (1), info (2), hints (3)
= 2
sc.settings.verbosity =80) sc.settings.set_figure_params(dpi
Code chunks run Python commands unless it starts with %%bash
, in which case, those chunks run shell commands.
Celltype prediction can either be performed on indiviudal cells where each cell gets a predicted celltype label, or on the level of clusters. All methods are based on similarity to other datasets, single cell or sorted bulk RNAseq, or uses known marker genes for each cell type.
We will select one sample from the Covid data, ctrl_13
and predict celltype by cell on that sample.
Some methods will predict a celltype to each cell based on what it is most similar to, even if that celltype is not included in the reference. Other methods include an uncertainty so that cells with low similarity scores will be unclassified.
There are multiple different methods to predict celltypes, here we will just cover a few of those.
Here we will use a reference PBMC dataset that we get from scanpy datasets and classify celltypes based on two methods:
- Using scanorama for integration just as in the integration lab, and then do label transfer based on closest neighbors.
- Using ingest to project the data onto the reference data and transfer labels.
First, lets load required libraries
Let’s read in the saved Covid-19 data object from the clustering step.
# download pre-computed data if missing or long compute
= True
fetch_data
# url for source and intermediate data
= "https://export.uppmax.uu.se/naiss2023-23-3/workshops/workshop-scrnaseq"
path_data
= "data/covid/results"
path_results if not os.path.exists(path_results):
=True)
os.makedirs(path_results, exist_ok
# path_file = "data/covid/results/scanpy_covid_qc_dr_scanorama_cl.h5ad"
= "data/covid/results/scanpy_covid_qc_dr_scanorama_cl.h5ad"
path_file if fetch_data and not os.path.exists(path_file):
urllib.request.urlretrieve(os.path.join('covid/results/scanpy_covid_qc_dr_scanorama_cl.h5ad'), path_file)
path_data,
= sc.read_h5ad(path_file)
adata adata
AnnData object with n_obs × n_vars = 7222 × 2626
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'phase', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'leiden_1.0', 'leiden_0.6', 'leiden_0.4', 'leiden_1.4', 'louvain_1.0', 'louvain_0.6', 'louvain_0.4', 'louvain_1.4', 'kmeans5', 'kmeans10', 'kmeans15', 'hclust_5', 'hclust_10', 'hclust_15'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'dendrogram_leiden_0.6', 'dendrogram_louvain_0.6', 'doublet_info_colors', 'hclust_10_colors', 'hclust_15_colors', 'hclust_5_colors', 'hvg', 'kmeans10_colors', 'kmeans15_colors', 'kmeans5_colors', 'leiden', 'leiden_0.4_colors', 'leiden_0.6_colors', 'leiden_1.0_colors', 'leiden_1.4_colors', 'log1p', 'louvain', 'louvain_0.4_colors', 'louvain_0.6_colors', 'louvain_1.0_colors', 'louvain_1.4_colors', 'neighbors', 'pca', 'phase_colors', 'sample_colors', 'tsne', 'umap'
obsm: 'Scanorama', 'X_pca', 'X_tsne', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
'log1p']['base']=None
adata.uns[print(adata.shape)
print(adata.raw.shape)
(7222, 2626)
(7222, 19468)
Subset one patient.
= adata[adata.obs["sample"] == "ctrl_13",:]
adata print(adata.shape)
(1121, 2626)
"louvain_0.6"].value_counts() adata.obs[
louvain_0.6
0 244
2 187
3 184
5 139
8 129
1 100
4 65
7 33
9 33
6 4
10 3
Name: count, dtype: int64
As you can see, we have only one cell from cluster 10 in this sample, so lets remove that cell for now.
= adata[adata.obs["louvain_0.6"] != "10",:] adata
sc.pl.umap(=["louvain_0.6"], palette=sc.pl.palettes.default_20
adata, color )
1 Reference data
Load the reference data from scanpy.datasets
. It is the annotated and processed pbmc3k dataset from 10x.
= sc.datasets.pbmc3k_processed()
adata_ref
'sample']='pbmc3k'
adata_ref.obs[
print(adata_ref.shape)
adata_ref.obs
(2638, 1838)
n_genes | percent_mito | n_counts | louvain | sample | |
---|---|---|---|---|---|
index | |||||
AAACATACAACCAC-1 | 781 | 0.030178 | 2419.0 | CD4 T cells | pbmc3k |
AAACATTGAGCTAC-1 | 1352 | 0.037936 | 4903.0 | B cells | pbmc3k |
AAACATTGATCAGC-1 | 1131 | 0.008897 | 3147.0 | CD4 T cells | pbmc3k |
AAACCGTGCTTCCG-1 | 960 | 0.017431 | 2639.0 | CD14+ Monocytes | pbmc3k |
AAACCGTGTATGCG-1 | 522 | 0.012245 | 980.0 | NK cells | pbmc3k |
... | ... | ... | ... | ... | ... |
TTTCGAACTCTCAT-1 | 1155 | 0.021104 | 3459.0 | CD14+ Monocytes | pbmc3k |
TTTCTACTGAGGCA-1 | 1227 | 0.009294 | 3443.0 | B cells | pbmc3k |
TTTCTACTTCCTCG-1 | 622 | 0.021971 | 1684.0 | B cells | pbmc3k |
TTTGCATGAGAGGC-1 | 454 | 0.020548 | 1022.0 | B cells | pbmc3k |
TTTGCATGCCTCAC-1 | 724 | 0.008065 | 1984.0 | CD4 T cells | pbmc3k |
2638 rows × 5 columns
='louvain') sc.pl.umap(adata_ref, color
Make sure we have the same genes in both datset by taking the intersection
print(adata_ref.shape[1])
print(adata.shape[1])
= adata_ref.var_names.intersection(adata.var_names)
var_names print(len(var_names))
= adata_ref[:, var_names]
adata_ref = adata[:, var_names] adata
1838
2626
419
First we need to rerun pca and umap with the same gene set for both datasets.
sc.pp.pca(adata_ref)
sc.pp.neighbors(adata_ref)
sc.tl.umap(adata_ref)='louvain') sc.pl.umap(adata_ref, color
computing PCA
with n_comps=50
finished (0:00:00)
computing neighbors
using 'X_pca' with n_pcs = 50
finished (0:00:12)
computing UMAP
finished (0:00:04)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)='louvain_0.6') sc.pl.umap(adata, color
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:00)
computing neighbors
using 'X_pca' with n_pcs = 50
finished (0:00:00)
computing UMAP
finished (0:00:02)
2 Integrate with scanorama
import scanorama
#subset the individual dataset to the same variable genes as in MNN-correct.
= dict()
alldata 'ctrl']=adata
alldata['ref']=adata_ref
alldata[
#convert to list of AnnData objects
= list(alldata.values())
adatas
# run scanorama.integrate
= 50) scanorama.integrate_scanpy(adatas, dimred
Found 419 genes among all datasets
[[0. 0.96511628]
[0. 0. ]]
Processing datasets (0, 1)
# add in sample info
'sample']='pbmc3k'
adata_ref.obs[
# create a merged scanpy object and add in the scanorama
= alldata['ctrl'].concatenate(alldata['ref'], batch_key='sample', batch_categories=['ctrl','pbmc3k'])
adata_merged
= np.concatenate([ad.obsm['X_scanorama'] for ad in adatas], axis=0)
embedding 'Scanorama'] = embedding adata_merged.obsm[
#run umap.
=50, use_rep = "Scanorama")
sc.pp.neighbors(adata_merged, n_pcs sc.tl.umap(adata_merged)
computing neighbors
finished (0:00:00)
computing UMAP
finished (0:00:05)
=["sample","louvain"]) sc.pl.umap(adata_merged, color
2.1 Label transfer
Using the function in the Spatial tutorial at the scanpy website we will calculate normalized cosine distances between the two datasets and tranfer labels to the celltype with the highest scores.
from sklearn.metrics.pairwise import cosine_distances
= 1 - cosine_distances(
distances 'sample'] == "pbmc3k"].obsm["Scanorama"],
adata_merged[adata_merged.obs['sample'] == "ctrl"].obsm["Scanorama"],
adata_merged[adata_merged.obs[
)
def label_transfer(dist, labels, index):
= pd.get_dummies(labels)
lab = lab.to_numpy().T @ dist
class_prob = np.linalg.norm(class_prob, 2, axis=0)
norm = class_prob / norm
class_prob = (class_prob.T - class_prob.min(1)) / class_prob.ptp(1)
class_prob # convert to df
= pd.DataFrame(
cp_df =lab.columns
class_prob, columns
)= index
cp_df.index # classify as max score
= cp_df.idxmax(axis=1)
m
return m
= label_transfer(distances, adata_ref.obs.louvain, adata.obs.index)
class_def
# add to obs section of the original object
'predicted'] = class_def
adata.obs[
="predicted") sc.pl.umap(adata, color
# add to merged object.
"predicted"] = pd.concat(
adata_merged.obs["louvain"]], axis=0
[class_def, adata_ref.obs[
).tolist()
=["sample","louvain",'predicted'])
sc.pl.umap(adata_merged, color#plot only ctrl cells.
'sample']=='ctrl'], color='predicted') sc.pl.umap(adata_merged[adata_merged.obs[
Now plot how many cells of each celltypes can be found in each cluster.
= pd.crosstab(adata.obs['louvain_0.6'],adata.obs['predicted'], normalize='index')
tmp =True).legend(bbox_to_anchor=(1.8, 1),loc='upper right') tmp.plot.bar(stacked
<matplotlib.legend.Legend at 0x7f1f2733dfc0>
3 Ingest
Another method for celltype prediction is Ingest, for more information, please look at https://scanpy-tutorials.readthedocs.io/en/latest/integrating-data-using-ingest.html
='louvain')
sc.tl.ingest(adata, adata_ref, obs=['louvain','louvain_0.6'], wspace=0.5) sc.pl.umap(adata, color
running ingest
finished (0:00:35)
Now plot how many cells of each celltypes can be found in each cluster.
= pd.crosstab(adata.obs['louvain_0.6'],adata.obs['louvain'], normalize='index')
tmp =True).legend(bbox_to_anchor=(1.8, 1),loc='upper right') tmp.plot.bar(stacked
<matplotlib.legend.Legend at 0x7f1f271c5db0>
4 Compare results
The predictions from ingest is stored in the column ‘louvain’ while we named the label transfer with scanorama as ‘predicted’
=['louvain','predicted'], wspace=0.5) sc.pl.umap(adata, color
As you can see, the main celltypes are the same, but dendritic cells are mainly predicted to cluster 8 by ingest and the proportions of the different celltypes are different.
The only way to make sure which method you trust is to look at what genes the different celltypes express and use your biological knowledge to make decisions.
5 Gene set analysis
Another way of predicting celltypes is to use the differentially expressed genes per cluster and compare to lists of known cell marker genes. This requires a list of genes that you trust and that is relevant for the tissue you are working on.
You can either run it with a marker list from the ontology or a list of your choice as in the example below.
= 'data/human_cell_markers.txt'
path_file if not os.path.exists(path_file):
urllib.request.urlretrieve(os.path.join('human_cell_markers.txt'), path_file) path_data,
= pd.read_table(path_file)
df
df
print(df.shape)
(2868, 15)
# Filter for number of genes per celltype
'nG'] = df.geneSymbol.str.split(",").str.len()
df[
= df[df['nG'] > 5]
df = df[df['nG'] < 100]
df = df[df['cancerType'] == "Normal"]
d print(df.shape)
(445, 16)
= df.cellName
df.index = df.geneSymbol.str.split(",").to_dict() gene_dict
# run differential expression per cluster
'louvain_0.6', method='wilcoxon', key_added = "wilcoxon") sc.tl.rank_genes_groups(adata,
ranking genes
finished (0:00:02)
# do gene set overlap to the groups in the gene list and top 300 DEGs.
import gseapy
= dict()
gsea_res = dict()
pred
for cl in adata.obs['louvain_0.6'].cat.categories.tolist():
print(cl)
= sc.get.rank_genes_groups_df(adata, group=cl, key='wilcoxon')[
glist 'names'].squeeze().str.strip().tolist()
= gseapy.enrichr(gene_list=glist[:300],
enr_res ='Human',
organism=gene_dict,
gene_sets=adata.raw.shape[1],
background=1)
cutoffif enr_res.results.shape[0] == 0:
= "Unass"
pred[cl] else:
enr_res.results.sort_values(="P-value", axis=0, ascending=True, inplace=True)
byprint(enr_res.results.head(2))
= enr_res
gsea_res[cl] = enr_res.results["Term"][0] pred[cl]
0
Gene_set Term Overlap P-value Adjusted P-value \
0 gs_ind_0 Cancer stem-like cell 1/6 0.088981 0.226652
6 gs_ind_0 Macrophage 1/6 0.088981 0.226652
Odds Ratio Combined Score Genes
0 14.996147 36.280703 ANPEP
6 14.996147 36.280703 AIF1
1
Gene_set Term Overlap P-value Adjusted P-value \
2 gs_ind_0 Effector memory T cell 1/7 0.103024 0.180292
4 gs_ind_0 Monocyte 1/7 0.103024 0.180292
Odds Ratio Combined Score Genes
2 12.995993 29.537229 IL7R
4 12.995993 29.537229 CD52
2
Gene_set Term Overlap P-value Adjusted P-value \
6 gs_ind_0 Monocyte 1/7 0.103024 0.244332
7 gs_ind_0 Parietal progenitor cell 1/7 0.103024 0.244332
Odds Ratio Combined Score Genes
6 12.995993 29.537229 CD52
7 12.995993 29.537229 ANXA1
3
Gene_set Term Overlap P-value Adjusted P-value \
6 gs_ind_0 Effector memory T cell 1/7 0.103024 0.226084
8 gs_ind_0 Naive T cell 1/7 0.103024 0.226084
Odds Ratio Combined Score Genes
6 12.995993 29.537229 IL7R
8 12.995993 29.537229 CCR7
4
Gene_set Term Overlap P-value Adjusted P-value Odds Ratio \
0 gs_ind_0 B cell 1/6 0.088981 0.116851 14.996147
4 gs_ind_0 Monocyte 1/7 0.103024 0.116851 12.995993
Combined Score Genes
0 36.280703 CD19
4 29.537229 CD52
5
Gene_set Term Overlap P-value \
11 gs_ind_0 Myeloid-derived suppressor cell 1/6 0.088981
2 gs_ind_0 Dendritic cell 1/7 0.103024
Adjusted P-value Odds Ratio Combined Score Genes
11 0.183109 14.996147 36.280703 ITGAM
2 0.183109 12.995993 29.537229 ITGAM
6
Gene_set Term Overlap P-value \
0 gs_ind_0 Cancer stem-like cell 1/6 0.088981
4 gs_ind_0 Induced pluripotent stem cell 1/6 0.088981
Adjusted P-value Odds Ratio Combined Score Genes
0 0.164838 14.996147 36.280703 ANPEP
4 0.164838 14.996147 36.280703 ITGA6
7
Gene_set Term Overlap P-value Adjusted P-value Odds Ratio \
0 gs_ind_0 B cell 1/6 0.088981 0.140221 14.996147
5 gs_ind_0 Monocyte 1/7 0.103024 0.140221 12.995993
Combined Score Genes
0 36.280703 CD19
5 29.537229 CD52
8
Gene_set Term Overlap P-value \
2 gs_ind_0 Macrophage 1/6 0.088981
3 gs_ind_0 Monocyte derived dendritic cell 1/8 0.116851
Adjusted P-value Odds Ratio Combined Score Genes
2 0.233702 14.996147 36.280703 AIF1
3 0.233702 11.466464 24.616832 ITGAX
9
Gene_set Term Overlap P-value Adjusted P-value \
3 gs_ind_0 PROM1Low progenitor cell 1/7 0.103024 0.309489
1 gs_ind_0 M2 macrophage 1/12 0.170068 0.309489
Odds Ratio Combined Score Genes
3 12.995993 29.537229 ALCAM
1 7.795593 13.810336 CD163
# prediction per cluster
pred
{'0': 'Cancer stem-like cell',
'1': 'CD4+ T cell',
'2': 'CD8+ T cell',
'3': 'Activated T cell',
'4': 'B cell',
'5': 'CD16+ dendritic cell',
'6': 'Cancer stem-like cell',
'7': 'B cell',
'8': 'Circulating fetal cell',
'9': 'Circulating fetal cell'}
= [pred[x] for x in adata.obs['louvain_0.6']]
prediction "GS_overlap_pred"] = prediction
adata.obs[
='GS_overlap_pred') sc.pl.umap(adata, color
As you can see, it agrees to some extent with the predictions from label transfer and ingest, but there are clear differences, which do you think looks better?
6 Session info
Click here
sc.logging.print_versions()
-----
anndata 0.10.5.post1
scanpy 1.9.6
-----
PIL 10.0.0
annoy NA
anyio NA
array_api_compat 1.4.1
asttokens NA
attr 23.1.0
babel 2.12.1
backcall 0.2.0
certifi 2023.11.17
cffi 1.15.1
charset_normalizer 3.1.0
colorama 0.4.6
comm 0.1.3
cycler 0.12.1
cython_runtime NA
dateutil 2.8.2
debugpy 1.6.7
decorator 5.1.1
defusedxml 0.7.1
exceptiongroup 1.2.0
executing 1.2.0
fastjsonschema NA
fbpca NA
gmpy2 2.1.2
gseapy 1.0.6
h5py 3.9.0
idna 3.4
igraph 0.10.8
intervaltree NA
ipykernel 6.23.1
ipython_genutils 0.2.0
jedi 0.18.2
jinja2 3.1.2
joblib 1.3.2
json5 NA
jsonpointer 2.0
jsonschema 4.17.3
jupyter_events 0.6.3
jupyter_server 2.6.0
jupyterlab_server 2.22.1
kiwisolver 1.4.5
leidenalg 0.10.1
llvmlite 0.41.1
louvain 0.8.1
markupsafe 2.1.2
matplotlib 3.8.0
matplotlib_inline 0.1.6
mpl_toolkits NA
mpmath 1.3.0
natsort 8.4.0
nbformat 5.8.0
numba 0.58.1
numpy 1.26.3
opt_einsum v3.3.0
overrides NA
packaging 23.1
pandas 2.2.0
parso 0.8.3
patsy 0.5.6
pexpect 4.8.0
pickleshare 0.7.5
pkg_resources NA
platformdirs 3.5.1
prometheus_client NA
prompt_toolkit 3.0.38
psutil 5.9.5
ptyprocess 0.7.0
pure_eval 0.2.2
pvectorc NA
pycparser 2.21
pydev_ipython NA
pydevconsole NA
pydevd 2.9.5
pydevd_file_utils NA
pydevd_plugins NA
pydevd_tracing NA
pygments 2.15.1
pynndescent 0.5.11
pyparsing 3.1.1
pyrsistent NA
pythonjsonlogger NA
pytz 2023.3
requests 2.31.0
rfc3339_validator 0.1.4
rfc3986_validator 0.1.1
scanorama 1.7.4
scipy 1.12.0
send2trash NA
session_info 1.0.0
six 1.16.0
sklearn 1.4.0
sniffio 1.3.0
socks 1.7.1
sortedcontainers 2.4.0
sparse 0.15.1
stack_data 0.6.2
statsmodels 0.14.1
sympy 1.12
texttable 1.7.0
threadpoolctl 3.2.0
torch 2.0.0
tornado 6.3.2
tqdm 4.65.0
traitlets 5.9.0
typing_extensions NA
umap 0.5.5
urllib3 2.0.2
wcwidth 0.2.6
websocket 1.5.2
yaml 6.0
zmq 25.0.2
zoneinfo NA
zstandard 0.19.0
-----
IPython 8.13.2
jupyter_client 8.2.0
jupyter_core 5.3.0
jupyterlab 4.0.1
notebook 6.5.4
-----
Python 3.10.11 | packaged by conda-forge | (main, May 10 2023, 18:58:44) [GCC 11.3.0]
Linux-5.15.0-92-generic-x86_64-with-glibc2.35
-----
Session information updated at 2024-02-06 00:43