from IPython.display import Image
Image('assests/UMAP_DataIntegration.png', width=2000)

import os
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.layers import concatenate

import warnings
warnings.filterwarnings("ignore")

scRNAseq = pd.read_csv('data/scRNAseq.txt', sep='\t')
scProteomics = pd.read_csv('data/scProteomics.txt', sep='\t')

X_scRNAseq = scRNAseq.values[:, 0:(scRNAseq.shape[1]-1)]
Y_scRNAseq = scRNAseq.values[:, scRNAseq.shape[1]-1]
X_scProteomics = scProteomics.values[:, 0:(scProteomics.shape[1]-1)]
Y_scProteomics = scProteomics.values[:, scProteomics.shape[1]-1]

X_scRNAseq = np.log(X_scRNAseq + 1)
X_scProteomics = np.log(X_scProteomics + 1)

2024-10-15 09:44:41.921077: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-15 09:44:41.963042: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

X_scRNAseq.shape

(8617, 976)

X_scProteomics.shape

(8617, 10)

from umap import UMAP
from sklearn.decomposition import PCA

X_reduced = PCA(n_components = 20).fit_transform(X_scRNAseq)
model = UMAP(n_components = 2, min_dist = 1, n_neighbors = 93, init = X_reduced[:, 0:2], n_epochs = 1000, 
             verbose = 2)
umap = model.fit_transform(X_reduced)

plt.figure(figsize = (20,15))
plt.scatter(umap[:, 0], umap[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 20)
plt.title('UMAP: scRNAseq', fontsize = 25); 
plt.xlabel("UMAP1", fontsize = 22); plt.ylabel("UMAP2", fontsize = 22)
plt.show()

UMAP(init=array([[45.31235894, 0.07477159],
       [46.53271754, 1.71486063],
       [46.8638235 , 0.98846408],
       ...,
       [ 1.11426643, -1.86512337],
       [ 4.54796886, -0.69171758],
       [ 2.61794017, -0.97065852]]), min_dist=1, n_epochs=1000, n_neighbors=93, verbose=2)
Tue Oct 15 09:44:56 2024 Construct fuzzy simplicial set
Tue Oct 15 09:44:56 2024 Finding Nearest Neighbors
Tue Oct 15 09:44:56 2024 Building RP forest with 10 trees
Tue Oct 15 09:45:00 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	Stopping threshold met -- exiting after 3 iterations
Tue Oct 15 09:45:16 2024 Finished Nearest Neighbor Search
Tue Oct 15 09:45:18 2024 Construct embedding

Epochs completed:   0%|            4/1000 [00:01]

	completed  0  /  1000 epochs

Epochs completed:  10%| █          101/1000 [00:25]

	completed  100  /  1000 epochs

Epochs completed:  20%| ██         201/1000 [00:51]

	completed  200  /  1000 epochs

Epochs completed:  30%| ███        301/1000 [01:18]

	completed  300  /  1000 epochs

Epochs completed:  40%| ████       401/1000 [01:44]

X_reduced = PCA(n_components = 10).fit_transform(X_scProteomics)
model = UMAP(n_components = 2, min_dist = 0.8, n_neighbors = 93, init = X_reduced[:, 0:2], n_epochs = 1000, 
             verbose = 2)
umap = model.fit_transform(X_reduced)

plt.figure(figsize = (20,15))
plt.scatter(umap[:, 0], umap[:, 1], c = Y_scProteomics, cmap = 'tab20', s = 20)
plt.title('UMAP: scProteomics', fontsize = 25); 
plt.xlabel("UMAP1", fontsize = 22); plt.ylabel("UMAP2", fontsize = 22)
plt.show()

UMAP(init=array([[-1.65366402, -0.2564025 ],
       [-1.7084814 , 1.69115812],
       [-1.03060202, -0.13172222],
       ...,
       [-2.19859839, -5.16795233],
       [-2.10143214, -2.48594223],
       [-2.43134931, -3.33697131]]), min_dist=0.8, n_epochs=1000, n_neighbors=93, verbose=2)
Tue Oct 15 09:49:45 2024 Construct fuzzy simplicial set
Tue Oct 15 09:49:45 2024 Finding Nearest Neighbors
Tue Oct 15 09:49:45 2024 Building RP forest with 10 trees
Tue Oct 15 09:49:45 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	Stopping threshold met -- exiting after 3 iterations
Tue Oct 15 09:49:51 2024 Finished Nearest Neighbor Search
Tue Oct 15 09:49:52 2024 Construct embedding

Epochs completed:   0%|            4/1000 [00:01]

	completed  0  /  1000 epochs

Epochs completed:  10%| █          101/1000 [00:26]

	completed  100  /  1000 epochs

Epochs completed:  20%| ██         201/1000 [00:52]

	completed  200  /  1000 epochs

Epochs completed:  30%| ███        301/1000 [01:19]

	completed  300  /  1000 epochs

Epochs completed:  40%| ████       401/1000 [01:46]

import umap

X_reduced_scRNAseq = PCA(n_components = 20).fit_transform(X_scRNAseq)
X_reduced_scProteomics = PCA(n_components = 10).fit_transform(X_scProteomics)

fit1 = umap.UMAP(n_components = 2, min_dist = 1, n_neighbors = 93, n_epochs = 1000, 
                 init = X_reduced_scRNAseq[:, 0:2], verbose = 2).fit(X_reduced_scRNAseq)
fit2 = umap.UMAP(n_components = 2, min_dist = 0.8, n_neighbors = 93, n_epochs = 1000, 
                 init = X_reduced_scProteomics[:, 0:2], verbose = 2).fit(X_reduced_scProteomics)
intersection = umap.umap_. general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight = 0.45)
intersection = umap.umap_.reset_local_connectivity(intersection)
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data, intersection, fit1.n_components, 
                                                fit1.learning_rate, fit1._a, fit1._b, 
                                                fit1.repulsion_strength, fit1.negative_sample_rate, 
                                                1000, 'random', np.random, fit1.metric, 
                                                fit1._metric_kwds, False, densmap_kwds=False, output_dens=False)

UMAP(init=array([[45.31235894, 0.07477159],
       [46.53271754, 1.71486063],
       [46.8638235 , 0.98846408],
       ...,
       [ 1.11426643, -1.86512337],
       [ 4.54796886, -0.69171758],
       [ 2.61794017, -0.97065852]]), min_dist=1, n_epochs=1000, n_neighbors=93, verbose=2)
Tue Oct 15 09:54:18 2024 Construct fuzzy simplicial set
Tue Oct 15 09:54:18 2024 Finding Nearest Neighbors
Tue Oct 15 09:54:18 2024 Building RP forest with 10 trees
Tue Oct 15 09:54:18 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	Stopping threshold met -- exiting after 3 iterations
Tue Oct 15 09:54:24 2024 Finished Nearest Neighbor Search
Tue Oct 15 09:54:25 2024 Construct embedding

Epochs completed:   0%|            5/1000 [00:01]

	completed  0  /  1000 epochs

Epochs completed:  10%| █          102/1000 [00:16]

	completed  100  /  1000 epochs

Epochs completed:  20%| ██         202/1000 [00:33]

	completed  200  /  1000 epochs

Epochs completed:  30%| ███        302/1000 [00:50]

	completed  300  /  1000 epochs

Epochs completed:  40%| ████       402/1000 [01:06]

plt.figure(figsize = (20,15))
plt.scatter(embedding[0][:, 0], embedding[0][:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 20)
plt.title('UMAP: scRNAseq + scProteomics', fontsize = 25); 
plt.xlabel("UMAP1", fontsize = 22); plt.ylabel("UMAP2", fontsize = 22)
plt.show()

UMAP for Data Integration¶