Graph Neural Networks in integrative omics - Lab notebook¶
- Sergiu Netotea, NBIS, 2024
Node2Vec (Word2Vec on graphs) example: clustering in the karate club graph¶
Here's a simple application of Node2Vec using PyTorch and PyTorch Geometric to perform node embedding on a small graph. We'll use the torch_geometric library to load a toy dataset, apply the Node2Vec model, and visualize the learned embeddings. The dataset is the popular Karate Club graph from torch_geometric.datasets: a small graph representing social interactions in a karate club, with 34 nodes (members) and 78 edges (interactions between members).
This example shows how to use Node2Vec from PyTorch Geometric to generate node embeddings from a graph. After training the model, we reduce the dimensionality using t-SNE and visualize the node embeddings in a 2D space. The coloring of nodes indicates their class (community) labels, showing that nodes with similar labels are placed close together in the embedding space.
import networkx as nx
import matplotlib.pyplot as plt
import torch
from torch_geometric.datasets import KarateClub
import numpy as np
# Load the Karate Club graph dataset
dataset = KarateClub()
data = dataset[0]
# Convert PyTorch geometric data to NetworkX format
G = nx.Graph()
# Add edges to the NetworkX graph
edges = data.edge_index.t().tolist()
G.add_edges_from(edges)
# Get node labels (used for coloring)
node_labels = data.y.numpy() if torch.is_tensor(data.y) else data.y
# Define the position layout for the graph
pos = nx.spring_layout(G, seed=42)
# Plot the graph with node colors based on their label using the same color map
plt.figure(figsize=(8, 6))
nodes = nx.draw_networkx_nodes(G, pos, node_color=node_labels, cmap='coolwarm', node_size=300,
nodelist=G.nodes(), node_shape='o')
edges = nx.draw_networkx_edges(G, pos, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=10)
# Add colorbar for the nodes
plt.colorbar(nodes)
plt.title("Karate Club Graph with Node2Vec Coloring")
plt.show()
print(data.y)
tensor([1, 1, 1, 1, 3, 3, 3, 1, 0, 1, 3, 1, 1, 1, 0, 0, 3, 1, 0, 1, 0, 1, 0, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 0])
import torch
from torch_geometric.datasets import KarateClub
from torch_geometric.nn import Node2Vec
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# Load the Karate Club graph dataset
dataset = KarateClub()
data = dataset[0]
# Define Node2Vec model
embedding_dim = 64 # Number of dimensions for the node embeddings
walk_length = 20 # Length of each random walk
context_size = 10 # Window size in Skip-Gram model
num_walks_per_node = 10 # Number of random walks per node
p = 1 # Return hyperparameter
q = 1 # In-out hyperparameter
node2vec = Node2Vec(
data.edge_index,
embedding_dim=embedding_dim,
walk_length=walk_length,
context_size=context_size,
walks_per_node=num_walks_per_node,
p=p,
q=q,
sparse=True # Use sparse gradients to save memory
).to('cpu')
# Define the optimizer
optimizer = torch.optim.SparseAdam(list(node2vec.parameters()), lr=0.01)
# Training loop
def train():
node2vec.train()
total_loss = 0
for epoch in range(100): # Train for 100 epochs
optimizer.zero_grad()
# Select a batch of nodes (in this case, all nodes)
batch = torch.arange(data.num_nodes, device='cpu')
# Generate positive and negative random walks for the batch
pos_rw = node2vec.pos_sample(batch)
neg_rw = node2vec.neg_sample(batch)
# Calculate loss
loss = node2vec.loss(pos_rw, neg_rw)
loss.backward()
optimizer.step()
total_loss += loss.item()
return total_loss / 100
# Train the Node2Vec model
loss = train()
# Get the learned node embeddings
node_embeddings = node2vec.embedding.weight.data.cpu()
# Convert the PyTorch tensor to a NumPy array
node_embeddings_np = node_embeddings.numpy()
# Apply t-SNE to reduce dimensions for visualization
tsne = TSNE(n_components=2)
node_embeddings_2d = tsne.fit_transform(node_embeddings_np).astype('float64') # Ensure t-SNE output is float64
# Plot the embeddings
plt.figure(figsize=(8, 6))
# Ensure node_embeddings_2d is float64 and data.y is converted to NumPy if needed
sc = plt.scatter(
node_embeddings_2d[:, 0].astype('float64'),
node_embeddings_2d[:, 1].astype('float64'),
c=data.y.numpy() if torch.is_tensor(data.y) else data.y, # Labels for coloring the points
cmap='coolwarm',
s=100
)
# Add node labels
for i, (x, y) in enumerate(node_embeddings_2d):
plt.text(x, y, str(i), fontsize=9, ha='right', va='bottom')
# Add colorbar for the nodes
plt.colorbar(sc)
plt.title("Node2Vec Embeddings for Karate Club Graph with Node Labels")
plt.show()
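As a quick, optional sanity check on the claim that nodes with the same label end up close together, one can cluster the embeddings and compare the clusters against the community labels. A minimal sketch, assuming the node_embeddings_np array and data object from the cells above are still in scope:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Cluster the learned embeddings into as many groups as there are community labels
n_communities = int(data.y.max().item()) + 1
kmeans = KMeans(n_clusters=n_communities, n_init=10, random_state=42)
cluster_assignments = kmeans.fit_predict(node_embeddings_np)

# An Adjusted Rand Index of 1.0 means the clusters match the community labels exactly
ari = adjusted_rand_score(data.y.numpy(), cluster_assignments)
print(f"Adjusted Rand Index between k-means clusters and community labels: {ari:.3f}")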
GCN example: graph citation networks¶
Graph Convolutional Networks (GCNs) are a type of neural network designed to operate on graph-structured data. We will start with a simple application of performing node classification in a graph. For example, in a citation network, nodes represent documents, edges represent citations between them, and the goal is to classify each document into predefined categories (e.g., "AI", "Physics", etc.).
In this first example, we’ll use a simple synthetic graph dataset to perform node classification. We'll generate a small graph where each node has a feature vector and belongs to one of two classes. Then, we’ll implement a basic GCN to classify nodes based on their neighbors' features and graph structure.
Dataset Generation:
- We’ll generate a graph with 20 nodes. Each node will have a feature vector (say, a 5-dimensional vector). The nodes will be connected at random (Erdős–Rényi model, 20% chance of edge formation between any pair of nodes), and each node will be randomly assigned one of 2 class labels. The GCN should be able to classify the nodes based on their features and the graph structure; the accuracy will depend on the randomness of the dataset and the structure of the graph.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import networkx as nx
from sklearn.model_selection import train_test_split
from torch_geometric.nn import GCNConv
import matplotlib.pyplot as plt
np.random.seed(42)
# Generate the synthetic citation network dataset
def generate_synthetic_graph(num_nodes=20, num_features=5, num_classes=2):
# Generate a random feature matrix (each node has a num_features-dimensional feature vector)
feature_matrix = np.random.randn(num_nodes, num_features)
# Create a random adjacency matrix
G = nx.erdos_renyi_graph(num_nodes, 0.2) # probability of edge creation is 0.2
# Get the adjacency matrix as a sparse matrix
adj_matrix_sparse = nx.adjacency_matrix(G)
# Convert the sparse matrix to a dense numpy array (optional)
adjacency_matrix = adj_matrix_sparse.toarray()
# Generate random labels for each node (num_classes possible classes)
labels = np.random.randint(0, num_classes, size=num_nodes)
return feature_matrix, adjacency_matrix, labels, G
# Generate the dataset
num_nodes = 20
features, adjacency, labels, graph = generate_synthetic_graph(num_nodes=num_nodes)
# Visualize the graph
nx.draw(graph, with_labels=True, node_color=labels, cmap=plt.get_cmap('Set1'))
plt.show()
print("Feature Matrix Shape:", features.shape)
print("Adjacency Matrix Shape:", adjacency.shape)
print("Labels:", labels)
Feature Matrix Shape: (20, 5)
Adjacency Matrix Shape: (20, 20)
Labels: [0 1 0 0 0 1 1 1 0 0 1 1 1 1 0 1 0 1 0 1]
GCN Implementation:
- we implement the Graph Convolutional Network (GCN) using PyTorch Geometric, a library designed for graph-based deep learning.
- The GCN will take the feature matrix and the adjacency matrix as input and produce predictions for node classes.
# Train/Test split (use integers for mask)
#train_mask_np, test_mask_np = train_test_split(np.arange(num_nodes), test_size=0.2, random_state=42)
# Assume test_nodes_list contains the indices of the selected test nodes
test_nodes_list = [18, 16] # Example list of test node indices
# Define test_mask_np based on the selected test nodes
test_mask_np = np.array(test_nodes_list)
# Define train_mask_np as all nodes except the test nodes
train_mask_np = np.setdiff1d(np.arange(num_nodes), test_mask_np)
# Convert to PyTorch Long Tensors
train_mask = torch.tensor(train_mask_np, dtype=torch.long) # Convert train_mask to PyTorch tensor
test_mask = torch.tensor(test_mask_np, dtype=torch.long) # Convert test_mask to PyTorch tensor
# Convert the dataset to PyTorch tensors
edge_index = torch.tensor(np.array(graph.edges()).T, dtype=torch.long) # Edge index for PyTorch Geometric
x = torch.tensor(features, dtype=torch.float) # Feature matrix
y = torch.tensor(labels, dtype=torch.long) # Labels
# Get the training nodes (nodes not in test_mask)
train_nodes = np.setdiff1d(np.arange(len(labels)), test_mask_np)
Defining the GCN model:
- GCN Layers: The GCNConv layers perform the graph convolution operation. We use two layers:
- The first layer transforms the input features into a hidden layer.
- The second layer outputs the class predictions.
- Training Loop: The model is trained using gradient descent with the Adam optimizer. We evaluate the accuracy on the test nodes every 10 epochs.
# Define the GCN model
class GCN(torch.nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels):
super(GCN, self).__init__()
self.conv1 = GCNConv(in_channels, hidden_channels)
self.conv2 = GCNConv(hidden_channels, out_channels)
def forward(self, x, edge_index):
# First convolution layer
x = self.conv1(x, edge_index)
x = torch.relu(x)
# Second convolution layer
x = self.conv2(x, edge_index)
return torch.log_softmax(x, dim=1)
# Model training
model = GCN(in_channels=features.shape[1], hidden_channels=16, out_channels=2) # 5 input features, 16 hidden units, 2 output classes
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.NLLLoss()
# Training function
def train():
model.train()
optimizer.zero_grad()
out = model(x, edge_index)
# Use train_mask as indices for loss calculation
loss = loss_fn(out[train_mask], y[train_mask])
loss.backward()
optimizer.step()
return loss.item()
# Test function
def test():
model.eval()
out = model(x, edge_index)
pred = out.argmax(dim=1) # Get predicted class (highest log probability)
test_correct = pred[test_mask] == y[test_mask]
test_acc = int(test_correct.sum()) / int(test_mask.size(0))
return test_acc
# Training loop
epochs = 100
for epoch in range(epochs):
loss = train()
if epoch % 10 == 0:
test_acc = test()
print(f'Epoch: {epoch}, Loss: {loss:.4f}, Test Accuracy: {test_acc:.4f}')
Epoch: 0, Loss: 0.5578, Test Accuracy: 0.5000
Epoch: 10, Loss: 0.3876, Test Accuracy: 0.5000
Epoch: 20, Loss: 0.3196, Test Accuracy: 0.5000
Epoch: 30, Loss: 0.2702, Test Accuracy: 0.5000
Epoch: 40, Loss: 0.2289, Test Accuracy: 0.5000
Epoch: 50, Loss: 0.1932, Test Accuracy: 0.5000
Epoch: 60, Loss: 0.1618, Test Accuracy: 1.0000
Epoch: 70, Loss: 0.1344, Test Accuracy: 1.0000
Epoch: 80, Loss: 0.1130, Test Accuracy: 1.0000
Epoch: 90, Loss: 0.0956, Test Accuracy: 0.5000
# Plot the graph
# Use a spring layout; a larger 'k' increases the preferred distance between nodes for readability
pos = nx.spring_layout(graph, k=3.5, iterations=100)
plt.figure(figsize=(8, 8))
for edge in graph.edges(data=True):
node1, node2, weight = edge
color = 'gray' if labels[node1] == labels[node2] else 'blue'
    # 'weight' here is the edge attribute dict: it is empty for this random graph, but carries a 'weight' key when the weighted graph built later is reused
if weight:
nx.draw_networkx_edges(graph, pos, edgelist=[(node1, node2)], width=weight['weight'] * 5, edge_color=color, alpha=0.2)
else:
nx.draw_networkx_edges(graph, pos, edgelist=[(node1, node2)], width=1, edge_color=color, alpha=0.2)
# Draw nodes with larger labels positioned above and to the right
colors = ["red", "blue"]
node_colors = [colors[labels[i]] for i in graph.nodes()]
node_sizes = [400 if i in set(test_mask_np) else 200 for i in graph.nodes()]
nx.draw_networkx_nodes(graph, pos, node_size=node_sizes, node_color=node_colors)
# Compute the label positions more precisely
label_pos = {k: pos[k] + np.array([0.005, 0.05]) for k in pos}
nx.draw_networkx_labels(graph, label_pos, font_size=14, font_color="black", verticalalignment='bottom', horizontalalignment='left')
plt.title("(node classes: gray/red, blue links for bipartite edges)")
plt.show()
# Test the model based on input (test on a specific node)
def test_single_node(node_idx):
model.eval()
out = model(x, edge_index)
pred = out.argmax(dim=1)
predicted_class = pred[node_idx].item()
actual_class = y[node_idx].item()
print(f"Node {node_idx}: Predicted Class = {predicted_class}, Actual Class = {actual_class}")
# Test the model on a specific node from the test set
for node_idx in test_mask:
test_single_node(node_idx.item())
Node 18: Predicted Class = 0, Actual Class = 1
Node 16: Predicted Class = 1, Actual Class = 1
In order to get better models, one needs better data, so let us generate a better dataset - which is also a useful graph exercise! Just as programming benefits from writing tests first, the ability to generate in silico datasets is very useful in multi-omics.
If you return here from setting up the improved dataset below, re-run the GCN training above and see whether the score improves.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
# Set seed for reproducibility
np.random.seed(42)
# Parameters
n_samples = 20 # Total number of samples
n_features = 5 # Number of features
n_clusters = 2 # Number of clusters
cluster_spread = 0.5 # Spread of points around cluster centers
# Define a pairwise distance matrix between clusters
"""
# Example: pairwise distances for 4 clusters
distance_matrix = np.array([
[0, 3, 2, 5],
[3, 0, 4, 6],
[2, 4, 0, 3],
[5, 6, 3, 0]
])
"""
distance_matrix = np.array([
[0, 1],
[1, 0],
])
# Use Multidimensional Scaling (MDS) to find coordinates of the cluster centers in 2D latent space
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
latent_factors_centers = mds.fit_transform(distance_matrix)
# Assign samples to clusters and generate latent factors for each sample
samples_per_cluster = n_samples // n_clusters
latent_factors = np.zeros((n_samples, 2))
for i in range(n_clusters):
cluster_start = i * samples_per_cluster
cluster_end = (i + 1) * samples_per_cluster
latent_factors[cluster_start:cluster_end, 0] = np.random.normal(
loc=latent_factors_centers[i, 0], scale=cluster_spread, size=samples_per_cluster
) # First latent factor (influences PCA1)
latent_factors[cluster_start:cluster_end, 1] = np.random.normal(
loc=latent_factors_centers[i, 1], scale=cluster_spread, size=samples_per_cluster
) # Second latent factor (influences PCA2)
# Generate observed features as linear combinations of latent factors plus some noise
W = np.random.rand(2, n_features) # Random weights for mapping latent factors to features
noise = np.random.normal(0, 0.3, size=(n_samples, n_features)) # Adding some noise for realism
X = np.dot(latent_factors, W) + noise # Map latent factors to observed features with noise
# Convert to a dataframe for easier visualization
df = pd.DataFrame(X, columns=[f"Feature_{i+1}" for i in range(n_features)])
# Apply PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# Plot the results
plt.figure(figsize=(8, 6))
colors = ['red', 'blue', 'green', 'orange', 'magenta']
labels = np.array([0] * n_samples) # initialize labels as zeroes
for i in range(n_clusters):
cluster_start = i * samples_per_cluster
cluster_end = (i + 1) * samples_per_cluster
for j in range(cluster_start,cluster_end):
labels[j] = i
plt.scatter(X_pca[j, 0], X_pca[j, 1], alpha=0.7, color=colors[i])
plt.text(X_pca[j, 0] + 0.03, X_pca[j, 1], str(j), fontsize=12)
plt.title('PCA of Dataset with Parametrized Pairwise Cluster Distances')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.grid(True)
plt.show()
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity matrix
similarity_matrix = cosine_similarity(X)
# Graph extraction using a threshold
threshold = 0.7
num_samples = n_samples
graph = nx.Graph()
for i in range(num_samples):
for j in range(i + 1, num_samples):
if similarity_matrix[i, j] > threshold:
graph.add_edge(i, j, weight=1.7**similarity_matrix[i, j])
# Find and add any missing nodes
for node in range(num_samples):
if node not in graph.nodes:
# Find the closest neighbor for the missing node
#closest_neighbor = np.argmax(similarity_matrix[node])
# Get the similarities, excluding the node itself by setting its value to -1
similarities = similarity_matrix[node].copy()
similarities[node] = -1 # Exclude self-similarity
closest_neighbor = np.argmax(similarities)
graph.add_edge(node, closest_neighbor, weight=similarity_matrix[node, closest_neighbor])
# Plot the graph
# Use a spring layout; a larger 'k' increases the preferred distance between nodes for readability
pos = nx.spring_layout(graph, k=3.5, iterations=100)
plt.figure(figsize=(8, 8))
for edge in graph.edges(data=True):
node1, node2, weight = edge
color = 'gray' if labels[node1] == labels[node2] else 'blue'
nx.draw_networkx_edges(graph, pos, edgelist=[(node1, node2)], width=weight['weight'] * 5, edge_color=color, alpha=0.2)
# Draw nodes with larger labels positioned above and to the right
nx.draw_networkx_nodes(graph, pos, node_size=300, node_color=['red' if labels[i] == 1 else 'gray' for i in graph.nodes()])
label_pos = {k: pos[k] + np.array([0.005, 0.05]) for k in pos}
nx.draw_networkx_labels(graph, label_pos, font_size=14, font_color="black", verticalalignment='bottom', horizontalalignment='left')
plt.title("(node classes: gray/red, blue links for bipartite edges)")
plt.show()
# Pick a vertex to analyze
picked_vertex = 18
# Line plotting its neighbors
neighbors = list(graph.neighbors(picked_vertex))
nodes_to_plot = [picked_vertex] + neighbors
plt.figure(figsize=(6, 4))
for node in nodes_to_plot:
color = 'gray' if labels[node] == 0 else 'red'
plt.plot(X[node], label=f'Node {node}', color=color)
plt.title(f"Feature Plot for Node {picked_vertex} and its Neighbors")
plt.xlabel("Feature Index")
plt.ylabel("Feature Value")
plt.legend()
plt.show()
# Subgraph of picked vertex's neighbors
subgraph = graph.subgraph([picked_vertex] + neighbors)
plt.figure(figsize=(4, 4))
nx.draw(subgraph, with_labels=True, node_color=['red' if labels[i] == 1 else 'gray' for i in subgraph.nodes()], node_size=300, pos=nx.spring_layout(subgraph), font_size=12)
plt.title(f"Subgraph of Node {picked_vertex} and its Neighbors")
plt.show()
# graph was already computed
# labels was already computed
num_nodes = n_samples
features = X
# Get the adjacency matrix as a sparse matrix
adj_matrix_sparse = nx.adjacency_matrix(graph)
# Convert the sparse matrix to a dense numpy array (optional)
adjacency_matrix = adj_matrix_sparse.toarray()
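Before returning to the GCN above, the PyTorch tensors and train/test masks have to be rebuilt from this new graph. A minimal sketch, mirroring the preparation step from the citation example (the held-out node indices below are only an example):
# Rebuild the PyTorch Geometric inputs from the similarity graph
edge_index = torch.tensor(np.array(list(graph.edges())).T, dtype=torch.long)
x = torch.tensor(features, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)

# Hold out a couple of nodes for testing (indices chosen arbitrarily here)
test_mask_np = np.array([18, 16])
train_mask_np = np.setdiff1d(np.arange(num_nodes), test_mask_np)
train_mask = torch.tensor(train_mask_np, dtype=torch.long)
test_mask = torch.tensor(test_mask_np, dtype=torch.long)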
Now return to the first network training and let's see if we can improve our score!
Questions:
- How could this model be applied in a multi-omics context?
Clustering with a GCN¶
Question: How would you adapt the citation model to cluster cell types in a dataset that consists of single cell data?
Using a GCN to cluster cell types from single-cell data can be an effective approach, particularly when the relationships between cells are captured as a graph. Here’s how you can adapt the citation model we discussed earlier for single-cell data:
- Single-Cell Dataset
- Gene expression matrix: Each row corresponds to a single cell, and each column corresponds to a gene. The entries in this matrix represent the expression levels of genes in individual cells.
- Cell metadata: Information about cells (if available), like known cell types, conditions, or other phenotypic traits.
- Graph Structure in Single-Cell Data. To apply a GCN, we need to represent the single-cell dataset as a graph. The idea is to:
- Our approach: nodes represent individual cells while edges connect similar cells based on some similarity metric (like gene expression similarity).
- Other approaches: Links may be defined using more complex patterns, such as in Cellograph where they denote connections between transcriptionally similar cells (ATAC-Seq based). Other studies can infer cell links based on other omics types.
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
expression_matrix = X
# Assume `expression_matrix` is the gene expression data (cells x genes)
# Normalize the expression matrix
normalized_expression = expression_matrix / np.linalg.norm(expression_matrix, axis=1, keepdims=True)
# Compute the similarity matrix between cells (e.g., cosine similarity)
similarity_matrix = cosine_similarity(normalized_expression)
# Generate adjacency matrix (e.g., using k-NN with thresholding)
k = 5 # Number of nearest neighbors to connect
adjacency_matrix = np.zeros(similarity_matrix.shape)
for i in range(similarity_matrix.shape[0]):
# Get indices of k nearest neighbors
knn_indices = np.argsort(similarity_matrix[i])[-(k+1):-1] # Exclude the cell itself
adjacency_matrix[i, knn_indices] = 1
# Convert adjacency matrix to edge index format for PyTorch Geometric
edge_index = torch.tensor(np.array(np.nonzero(adjacency_matrix)), dtype=torch.long)
GCN Model Architecture
The GCN model architecture remains similar to the one from the citation network model. The difference is that the input features will now be the gene expression profiles, and instead of classifying nodes (as in the citation model), we are focusing on clustering the nodes (cells).
Clustering Objective
Instead of using a classification loss (like cross-entropy), we can use an unsupervised approach to cluster cells based on their embeddings after the GCN layers. One common approach is to apply k-means clustering on the learned representations of cells after GCN.
- Alternatives: One could use a VAE model to also perform dimensionality reduction. In clustering it is very important to select as few features as possible, or to transform the data into a lower-dimensional embedding.
class GCNCellClustering(torch.nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels):
super(GCNCellClustering, self).__init__()
self.conv1 = GCNConv(in_channels, hidden_channels)
self.conv2 = GCNConv(hidden_channels, out_channels)
def forward(self, x, edge_index):
# First convolution layer
x = self.conv1(x, edge_index)
x = torch.relu(x)
# Second convolution layer
x = self.conv2(x, edge_index)
return x # For clustering, no softmax is applied
Training the GCN
We train the GCN to learn meaningful representations that can be clustered effectively. To guide the GCN in learning useful representations for clustering, we can use an unsupervised loss like Deep Graph Infomax (DGI) or leverage some contrastive learning methods. Here’s how you could apply a simple k-means clustering on the GCN output:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import torch.optim as optim
# Prepare data for GCN (convert gene expression matrix to PyTorch tensor)
x = torch.tensor(expression_matrix, dtype=torch.float)
# Initialize model and optimizer
model = GCNCellClustering(in_channels=x.shape[1], hidden_channels=64, out_channels=32)
optimizer = optim.Adam(model.parameters(), lr=0.01)
# Training loop
for epoch in range(100):
model.train()
optimizer.zero_grad()
# Forward pass through the GCN
embeddings = model(x, edge_index)
    loss = torch.mean(embeddings.norm(dim=1)) # Placeholder unsupervised loss: it only shrinks the embedding norms; a DGI or contrastive objective would be more principled
loss.backward()
optimizer.step()
if epoch % 10 == 0:
print(f'Epoch: {epoch}, Loss: {loss.item():.4f}')
# Apply k-means clustering on the learned embeddings
model.eval()
with torch.no_grad():
embeddings = model(x, edge_index).cpu().numpy()
# Use PCA to reduce dimensionality (optional)
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)
# Apply k-means clustering
kmeans = KMeans(n_clusters=2)
cluster_labels = kmeans.fit_predict(reduced_embeddings)
# Plot the clustered cells
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=cluster_labels, cmap='viridis')
plt.show()
Epoch: 0, Loss: 0.4815
Epoch: 10, Loss: 0.0685
Epoch: 20, Loss: 0.0276
Epoch: 30, Loss: 0.0119
Epoch: 40, Loss: 0.0071
Epoch: 50, Loss: 0.0061
Epoch: 60, Loss: 0.0053
Epoch: 70, Loss: 0.0039
Epoch: 80, Loss: 0.0037
Epoch: 90, Loss: 0.0040
embeddings.shape
(20, 32)
import numpy
print(numpy.__version__)
1.26.4
Clustering Interpretation
- K-Means Clustering: After training the GCN, we use k-means clustering to group the cells into distinct clusters. The assumption is that cells of the same type will have learned similar embeddings, making them easier to cluster.
- Dimensionality Reduction (PCA): To visualize the clustering in 2D, we can apply PCA to reduce the dimensionality of the learned representations.
Benefits of Using GCN for Single-Cell Clustering:
- Graph Structure: A GCN naturally leverages the graph structure that captures relationships (e.g., gene expression similarity) between cells, which can improve clustering performance.
- Feature Propagation: GCN layers aggregate information from neighboring nodes (cells), allowing the model to capture local and global structure in the dataset.
- Unsupervised Learning: GCNs can be trained without labels, making them ideal for exploratory tasks like cell-type discovery.
Conclusion:
By building a graph from single-cell RNA-seq data and applying a GCN, we can effectively learn cell representations and cluster them based on their gene expression profiles. The process combines the power of graph-based learning with clustering techniques to reveal meaningful patterns in the data, such as distinct cell types in a biological sample.
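As noted above, the norm-shrinking loss used in the training loop is only a placeholder. A more principled unsupervised objective is Deep Graph Infomax (DGI); below is a minimal sketch using PyTorch Geometric's DeepGraphInfomax wrapper, reusing the x and edge_index tensors from the clustering example (the encoder size and number of epochs are illustrative choices, not tuned values):
import torch
from torch_geometric.nn import DeepGraphInfomax, GCNConv

class Encoder(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv = GCNConv(in_channels, hidden_channels)
    def forward(self, x, edge_index):
        return torch.relu(self.conv(x, edge_index))

def corruption(x, edge_index):
    # Standard DGI corruption: shuffle node features, keep the graph structure
    return x[torch.randperm(x.size(0))], edge_index

dgi = DeepGraphInfomax(
    hidden_channels=32,
    encoder=Encoder(x.shape[1], 32),
    summary=lambda z, *args, **kwargs: torch.sigmoid(z.mean(dim=0)),
    corruption=corruption,
)
optimizer = torch.optim.Adam(dgi.parameters(), lr=0.01)

for epoch in range(100):
    dgi.train()
    optimizer.zero_grad()
    pos_z, neg_z, summary = dgi(x, edge_index)
    loss = dgi.loss(pos_z, neg_z, summary)
    loss.backward()
    optimizer.step()

# The positive embeddings (pos_z) can then be clustered with k-means as above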
Graph Attention Networks example: cell type classifier¶
Generate a Synthetic Multi-Class Single-Cell Dataset
For simplicity, we will again simulate a small dataset where cells belong to three different cell types. Each cell will have a gene expression vector, and the goal is to classify cells into their respective types. By applying a Graph Attention Network (GAT) to this synthetic single-cell dataset, we can classify cells into their respective types. The attention mechanism allows the model to weigh the importance of neighboring cells, making it highly effective for learning from graph-structured biological data such as single-cell RNA sequencing datasets.
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from torch_geometric.data import Data
from torch_geometric.utils import dense_to_sparse
# Create a synthetic single-cell RNA-seq dataset with 3 cell types
np.random.seed(42)
num_cells = 300
num_genes = 50
# Simulate gene expression profiles for 3 different cell types
cell_type_1 = np.random.normal(loc=2.0, scale=1.0, size=(100, num_genes))
cell_type_2 = np.random.normal(loc=0.0, scale=1.0, size=(100, num_genes))
cell_type_3 = np.random.normal(loc=-2.0, scale=1.0, size=(100, num_genes))
# Combine into a single dataset
gene_expression = np.vstack([cell_type_1, cell_type_2, cell_type_3])
labels = np.array([0] * 100 + [1] * 100 + [2] * 100) # Cell type labels
# Normalize the data
scaler = StandardScaler()
gene_expression = scaler.fit_transform(gene_expression)
# Use PCA to reduce dimensionality (optional, but useful for visualization)
pca = PCA(n_components=10)
gene_expression_pca = pca.fit_transform(gene_expression)
# Convert to PyTorch tensors
x = torch.tensor(gene_expression_pca, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)
Create the Graph
We'll create a graph where each node represents a cell, and edges represent similarity between cells based on their gene expression. For simplicity, we'll create a fully connected graph with weighted edges based on the Euclidean distance between cells.
from sklearn.metrics.pairwise import euclidean_distances
# Create a fully connected graph with distances as edge weights
distance_matrix = euclidean_distances(gene_expression_pca)
adjacency_matrix = np.exp(-distance_matrix / distance_matrix.std()) # Exponential similarity kernel: larger distances give smaller edge weights
# Convert the adjacency matrix to edge_index and edge_attr format for PyG
edge_index, edge_weight = dense_to_sparse(torch.tensor(adjacency_matrix, dtype=torch.float))
# Create the PyTorch Geometric Data object
data = Data(x=x, edge_index=edge_index, edge_attr=edge_weight, y=y)
print(data)
Data(x=[300, 10], edge_index=[2, 90000], edge_attr=[90000], y=[300])
Define and Train the GAT Model
Next, we'll define a simple Graph Attention Network (GAT) using torch_geometric. The model will use the attention mechanism to weigh the importance of neighboring cells when classifying each cell.
import torch.nn.functional as F
from torch_geometric.nn import GATConv
# Define the Graph Attention Network
class GAT(torch.nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels, heads=4):
super(GAT, self).__init__()
self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1, concat=False, dropout=0.6)
def forward(self, data):
x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
# First layer of GAT with attention
x = F.elu(self.conv1(x, edge_index, edge_weight))
x = F.dropout(x, p=0.6, training=self.training)
# Second layer of GAT for output (cell type classification)
x = self.conv2(x, edge_index, edge_weight)
return F.log_softmax(x, dim=1)
# Initialize the model, optimizer, and loss function
model = GAT(in_channels=10, hidden_channels=8, out_channels=3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
loss_fn = torch.nn.CrossEntropyLoss()
# Train the GAT model
def train():
model.train()
optimizer.zero_grad()
out = model(data)
loss = loss_fn(out, data.y)
loss.backward()
optimizer.step()
return loss.item()
# Evaluate the GAT model
def test():
model.eval()
out = model(data)
pred = out.argmax(dim=1)
correct = (pred == data.y).sum().item()
accuracy = correct / data.num_nodes
return accuracy
# Train the model for a few epochs
for epoch in range(100):
loss = train()
if epoch % 10 == 0:
accuracy = test()
print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')
Epoch: 000, Loss: 3.0026, Test Accuracy: 0.0200
Epoch: 010, Loss: 1.2802, Test Accuracy: 0.3333
Epoch: 020, Loss: 1.2053, Test Accuracy: 0.3333
Epoch: 030, Loss: 1.0826, Test Accuracy: 0.3333
Epoch: 040, Loss: 1.0627, Test Accuracy: 0.3333
Epoch: 050, Loss: 1.0209, Test Accuracy: 0.3333
Epoch: 060, Loss: 0.9878, Test Accuracy: 0.4733
Epoch: 070, Loss: 0.9236, Test Accuracy: 0.3833
Epoch: 080, Loss: 0.9246, Test Accuracy: 0.3700
Epoch: 090, Loss: 0.9140, Test Accuracy: 0.3333
# Final evaluation after training
accuracy = test()
print(f'Final Test Accuracy: {accuracy:.4f}')
Final Test Accuracy: 0.3333
Explanation of Results¶
- The GAT model dynamically learns to assign attention weights to neighboring cells based on their gene expression similarities.
- Each node's feature (cell's gene expression) is updated by aggregating information from its neighbors, weighted by the learned attention coefficients.
- The model uses two layers: the first layer computes attention scores, and the second layer produces the final cell type predictions.
- The model outputs the predicted cell type for each node, and the test accuracy gives us the proportion of cells correctly classified.
Problems?
The issue we are facing is that the accuracy is 0.3333. This is equivalent to random guessing for a 3-class problem. The loss is decreasing, which typically suggests that the model is learning something, but it may not be learning in a way that impacts classification performance. Several factors could be contributing to this:
- Class imbalance or label shuffling: Make sure that the labels are assigned correctly and not shuffled or misaligned with the input data. In a synthetic dataset, if labels are not properly assigned, it could result in poor performance.
- Over-smoothing: In graph-based models like GAT, information from neighbors is aggregated. If the model over-smooths (aggregates too much from neighbors), the node features could become too similar, making classification difficult. To avoid over-smoothing, try experimenting with fewer GAT layers. Two GAT layers are often enough, but reducing or adjusting the size of the hidden channels could help prevent the model from aggregating too much information.
- Edge weights and graph structure: Fully connected graphs (with an edge between every node) can sometimes overwhelm the attention mechanism, making it difficult for the model to learn meaningful differences between nodes. You might want to reduce the number of edges to focus more on local cell similarities rather than a dense, fully connected structure.
- Learning rate or hyperparameters: The learning rate might be too low or too high, which can result in poor training. Try experimenting with the learning rate and the number of attention heads. The most basic option is to initialize the model with different seeds. Since the performance of GAT can be sensitive to initialization, try setting a different seed for reproducibility and performance stabilization.
- Overfitting: If the model is too complex for the simple synthetic dataset, it may be overfitting. Adding regularization techniques like dropout or weight decay could help.
Attempted solutions
- Reduce Graph Density: A fully connected graph might not be ideal for this task. Let's try using a k-nearest neighbors (KNN) graph to reduce the number of edges and make the graph more sparse, focusing the model on local similarities rather than global ones.
- Adjust Learning Rate and Dropout: We are adjusting the learning rate and the dropout rate to improve performance. Let's decrease the learning rate and increase dropout to reduce overfitting.
- Monitor Over-smoothing: As noted above, fewer GAT layers or smaller hidden channels reduce the risk of aggregating too much information from neighbors. In our case we kept two layers but removed the edge-weight channel from the GAT, passing only the edge index to the attention layers.
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import kneighbors_graph
from torch_geometric.data import Data
from torch_geometric.utils import dense_to_sparse
import torch.nn.functional as F
from torch_geometric.nn import GATConv
# Set random seed for reproducibility
torch.manual_seed(42)
# Generate the synthetic single-cell RNA-seq dataset
np.random.seed(42)
num_cells = 300
num_genes = 50
cell_type_1 = np.random.normal(loc=2.0, scale=1.0, size=(100, num_genes))
cell_type_2 = np.random.normal(loc=0.0, scale=1.0, size=(100, num_genes))
cell_type_3 = np.random.normal(loc=-2.0, scale=1.0, size=(100, num_genes))
gene_expression = np.vstack([cell_type_1, cell_type_2, cell_type_3])
labels = np.array([0] * 100 + [1] * 100 + [2] * 100)
# Normalize and apply PCA for dimensionality reduction
scaler = StandardScaler()
gene_expression = scaler.fit_transform(gene_expression)
pca = PCA(n_components=10)
gene_expression_pca = pca.fit_transform(gene_expression)
x = torch.tensor(gene_expression_pca, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)
# Create a k-nearest neighbors graph
k = 10
adjacency_matrix_knn = kneighbors_graph(gene_expression_pca, n_neighbors=k, mode='connectivity', include_self=True)
edge_index, _ = dense_to_sparse(torch.tensor(adjacency_matrix_knn.toarray(), dtype=torch.float))
# Create the PyTorch Geometric Data object
data = Data(x=x, edge_index=edge_index, y=y)
# Define the GAT model
class GAT(torch.nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels, heads=4):
super(GAT, self).__init__()
self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1, concat=False, dropout=0.6)
def forward(self, data):
x, edge_index = data.x, data.edge_index
x = F.elu(self.conv1(x, edge_index))
x = F.dropout(x, p=0.6, training=self.training)
x = self.conv2(x, edge_index)
return F.log_softmax(x, dim=1)
# Initialize the model and optimizer
model = GAT(in_channels=10, hidden_channels=8, out_channels=3, heads=4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
loss_fn = torch.nn.CrossEntropyLoss()
# Training and evaluation functions
def train():
model.train()
optimizer.zero_grad()
out = model(data)
loss = loss_fn(out, data.y)
loss.backward()
optimizer.step()
return loss.item()
def test():
model.eval()
out = model(data)
pred = out.argmax(dim=1)
correct = (pred == data.y).sum().item()
accuracy = correct / data.num_nodes
return accuracy
# Train the model
for epoch in range(100):
loss = train()
if epoch % 10 == 0:
accuracy = test()
print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')
Epoch: 000, Loss: 1.5666, Test Accuracy: 0.4700
Epoch: 010, Loss: 1.1289, Test Accuracy: 0.4900
Epoch: 020, Loss: 0.9075, Test Accuracy: 0.7967
Epoch: 030, Loss: 1.1501, Test Accuracy: 0.8300
Epoch: 040, Loss: 0.6872, Test Accuracy: 0.8467
Epoch: 050, Loss: 0.6313, Test Accuracy: 0.8833
Epoch: 060, Loss: 0.6732, Test Accuracy: 0.9133
Epoch: 070, Loss: 0.5575, Test Accuracy: 0.9467
Epoch: 080, Loss: 0.5404, Test Accuracy: 0.9600
Epoch: 090, Loss: 0.4871, Test Accuracy: 0.9733
A few extra remarks:
On estimating the number of hidden channels:
- The number of hidden channels is typically determined empirically, starting with a reasonable value based on the size of the input and output, and adjusted based on performance. For complex datasets (overlapping classes, high-dimensional input) you might need more hidden units, while simpler datasets (with clear separations between classes) may require fewer.
- The number of hidden channels (also known as hidden units or hidden dimensions) in a neural network, including Graph Attention Networks (GAT), determines the size of the feature space that each layer outputs. Selecting the number of hidden channels is often a balance between model capacity (the ability to learn complex patterns) and the risk of overfitting or making the model too complex.
- Typically, the number of hidden channels is chosen as a value between the size of the input features (in_channels) and the number of output classes (out_channels). If the model has a high number of input features or needs to learn complex interactions, you might choose more hidden channels. For example, if the input is 10-dimensional (after PCA) and the output is 3 classes, you might choose a hidden channel size between 8 and 32.
- Use powers of 2. A common practice in deep learning is to set the number of hidden channels as powers of 2, which often helps in optimizing GPU-based operations. In practice, you may start with a smaller value like 8 or 16 and increase it if the model underfits (i.e., fails to capture the complexity of the data).
- Exercise: grid search or hyperparameter tuning. You can perform hyperparameter tuning using cross-validation or grid search to find the optimal number of hidden channels. Start with a small number of hidden units and gradually increase it until you observe a significant improvement in performance. Try different values such as 8, 16, 32, 64, and measure the model's accuracy on a validation set, choosing the number that provides the best trade-off between accuracy and computational complexity. If the model does not perform well or underfits, increase the hidden dimension; reduce it if overfitting occurs. A minimal sketch of such a sweep is shown after this list.
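A minimal sketch of a sweep over hidden channel sizes, assuming the GAT class, the data object, and the train() and test() functions defined above are still in scope (here the accuracy on all nodes stands in for a proper validation split):
results = {}
for hidden in [8, 16, 32, 64]:
    torch.manual_seed(42)  # same initialization seed for a fairer comparison
    model = GAT(in_channels=10, hidden_channels=hidden, out_channels=3, heads=4)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
    for epoch in range(100):
        train()
    results[hidden] = test()

for hidden, acc in results.items():
    print(f"hidden_channels={hidden}: accuracy={acc:.4f}")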
How to estimate the number of heads and other parameters in the multi-head attention layer?
- Heads refer to the concept of multi-head attention, which is borrowed from the attention mechanism in models like Transformers. In the GAT architecture, the attention mechanism computes attention coefficients between neighboring nodes in a graph to determine their importance during feature aggregation. Multi-head attention involves running multiple independent attention mechanisms (or heads) in parallel. In short, instead of computing a single attention coefficient for each edge (connection between nodes), GAT computes several independent attention scores (one for each "head"). Each attention head aggregates the features of neighboring nodes in a slightly different way, allowing the model to capture different types of relationships or patterns between nodes. This way, by learning different attention weights for a given node and its neighbors, different heads can focus on different relationships or node features, enabling the model to capture a variety of patterns.
- Stabilizing the Learning Process: Multi-head attention improves the robustness and stability of the learning process. By having multiple attention heads, the model can "look" at different aspects of the neighborhood, making it less likely to overfit to specific attention weights. In the GAT layer (GATConv) we are using 4 independent attention mechanisms in parallel.
- Feature Aggregation Across Heads: If the attention heads' outputs are concatenated (when concat=True), the dimensionality of the output from each node increases. For example, if you have 8 heads and each head produces 16 features, the final node representation would have 8 x 16 = 128 dimensions. Averaging (when concat=False) instead of concatenating the output from each head keeps the output dimensionality fixed.
- Summary (checked numerically in the shape sketch after this list):
- in_channels=10: Each node initially has 10 features (e.g., after dimensionality reduction with PCA).
- out_channels=8: Each attention head outputs 8 features for each node.
- heads=4: There are 4 independent attention heads. With concatenation (concat=True), the output for each node will be 8 x 4 = 32 features
- If concat=False, the output dimension will remain 8, as the outputs from the 4 heads will be averaged instead of concatenated.
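A small shape check of the head/concatenation arithmetic above, on a toy 5-node graph (the graph and feature values are arbitrary; only the output dimensions matter):
import torch
from torch_geometric.nn import GATConv

x_toy = torch.randn(5, 10)                    # 5 nodes with 10 input features each
edge_index_toy = torch.tensor([[0, 1, 2, 3],  # a small chain graph
                               [1, 2, 3, 4]], dtype=torch.long)

gat_concat = GATConv(10, 8, heads=4, concat=True)
gat_mean = GATConv(10, 8, heads=4, concat=False)

print(gat_concat(x_toy, edge_index_toy).shape)  # torch.Size([5, 32]): 8 features x 4 heads, concatenated
print(gat_mean(x_toy, edge_index_toy).shape)    # torch.Size([5, 8]): the 4 heads are averaged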
How to defend the pick of a certain loss function?
- Cross Entropy Loss is the most appropriate loss function for multi-class classification tasks, which is exactly what identifying cell types amounts to. It measures the difference between the true label (one-hot encoded) and the predicted probability distribution over the classes, encouraging the model to output a high probability for the correct class while minimizing the probabilities of the incorrect ones.
- How Cross Entropy Loss Works. In PyTorch, CrossEntropyLoss combines the softmax and the negative log-likelihood in one function, which is why we don’t need to manually apply softmax in the model's output layer (a small numerical check of this equivalence follows this list). The model's final layer typically outputs raw scores (logits); Cross Entropy Loss first applies a softmax function to convert these logits into probabilities, then applies the logarithmic loss to penalize wrong predictions more severely, particularly when the model is highly confident but wrong.
- Comparison to Other Loss Functions. Here’s why Cross Entropy Loss is more appropriate than some other common loss functions:
- Mean Squared Error (MSE): This is typically used for regression tasks (predicting continuous values). It is not well-suited for classification because it does not focus on minimizing the difference between the predicted probabilities and the correct class probabilities.
- Binary Cross Entropy: This is used for binary classification (two classes). In our case there are more than two classes, so multi-class Cross Entropy Loss is the correct choice.
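A quick numerical check of the statement that CrossEntropyLoss combines softmax and negative log-likelihood (toy logits and labels, unrelated to the dataset above):
import torch
import torch.nn.functional as F

logits = torch.randn(4, 3)            # 4 samples, 3 classes (raw scores)
targets = torch.tensor([0, 2, 1, 2])  # true class indices

ce = torch.nn.CrossEntropyLoss()(logits, targets)
nll = torch.nn.NLLLoss()(F.log_softmax(logits, dim=1), targets)

print(ce.item(), nll.item())  # the two values agree up to floating-point precision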
Why was ELU used?
- The function F.elu in PyTorch applies the Exponential Linear Unit (ELU) activation function to the input tensor. ELU is a type of non-linear activation function used in neural networks, designed to introduce non-linearity into the model while helping to address some of the problems associated with other activation functions, such as the vanishing gradient problem.
- Non-linearity: Like ReLU (Rectified Linear Unit) and other activation functions, ELU introduces non-linearity into the model. This allows the neural network to capture more complex relationships between inputs and outputs.
- Smooth for negative inputs: For negative values of x, ELU outputs values that are smaller but never exactly zero. Unlike ReLU, which has a hard zero for negative inputs, ELU's negative side is smooth and continuous, which can result in better learning for some models.
- Helps with vanishing gradient: ELU, unlike functions like Sigmoid or Tanh, avoids the vanishing gradient problem for positive inputs because the gradient of x for positive values is 1. For negative values, the gradient approaches 0 but not as quickly as in other functions like Sigmoid, which might reduce the gradient magnitude too early.
- Negative saturation: ELU saturates for negative inputs, but in a smooth way. This helps the network push the weights in the negative direction if necessary, which can lead to faster convergence and better generalization in some cases.
- Zero-centered: ELU outputs values that are centered around zero, which helps in making gradient updates more balanced and can improve training speed compared to ReLU, which only outputs positive values.
- Comparison to Other Activation Functions:
- ReLU: ReLU outputs x for positive inputs and zero for negative inputs. The problem with ReLU is that for negative inputs the gradient is zero (known as the dying ReLU problem). ELU addresses this issue by giving non-zero, smooth outputs and gradients for negative inputs (compared numerically after this list).
- Leaky ReLU: Leaky ReLU is a variation of ReLU where a small, constant gradient is allowed for negative inputs. ELU differs from this because the gradient for negative values is not constant but instead exponentially approaches zero, providing smoother updates.
- Sigmoid / Tanh: These activation functions suffer from the vanishing gradient problem when inputs are far from zero. ELU addresses this by having gradients closer to 1 for positive inputs.
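A small numerical illustration of the difference between ELU and ReLU on negative inputs (toy values only):
import torch
import torch.nn.functional as F

t = torch.tensor([-3.0, -1.0, -0.1, 0.0, 0.5, 2.0])
print(F.relu(t))  # negative inputs are clipped to exactly zero
print(F.elu(t))   # negative inputs decay smoothly towards -1 instead of a hard zero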
How to adjust dropout rate?
- Similar to how the human mind learns to neglect unimportant features, dropout is a regularization technique that randomly drops a fraction of neurons during training to prevent overfitting. The parameter p controls the dropout rate (the fraction of neurons to be dropped). Typical values range from 0.2 to 0.6. Dropout is only applied during training (training=self.training), and it is automatically disabled during evaluation. You can adjust the dropout rate based on performance (validation accuracy) to balance between overfitting and underfitting.
- Small models or small datasets: Use higher dropout (.5, .6) to avoid overfitting. This ensures the model does not memorize the data too quickly. Large models or large datasets: Use smaller dropout (.2, .3). For large datasets, the risk of overfitting is lower, so less regularization is needed.
- Where to Apply Dropout: In GAT, or any neural network, dropout can be applied:
- Between layers: After a GAT or fully connected layer, to regularize the learned representations.
- Before the output layer: Sometimes dropout is used right before the output layer to prevent the model from relying on specific features.
- Exercise: start with a common value such as 0.5. If you notice overfitting (i.e., high training accuracy but low validation accuracy), increase the dropout rate; if you notice underfitting (i.e., both training and validation accuracies are low), decrease it (to 0.3 or 0.2). Use cross-validation: experiment with different dropout rates on a validation set to find the rate that performs best. A small illustration of training-time versus evaluation-time dropout follows below.
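A small illustration of the point that dropout is only active during training (toy tensor; the exact zeroed positions are random):
import torch
import torch.nn.functional as F

t = torch.ones(10)
print(F.dropout(t, p=0.5, training=True))   # roughly half the entries zeroed, survivors scaled by 1/(1-p) = 2
print(F.dropout(t, p=0.5, training=False))  # unchanged: dropout is disabled at evaluation time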