## Preamble
import sklearn, leidenalg, itertools, random
import pandas as pd
import numpy as np
import igraph as ig
import sklearn.neighbors
import scipy as sp
from statsmodels.stats.multitest import multipletests
import gseapy as gp

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


data=pd.read_csv('data/met_genes.tsv', sep="\t", index_col=0)
data.head()


any(data.index.duplicated())

False


data.groupby('Type').agg('count')[['p10']] #1992 genes, 125 metabolites


data.shape #2117 features, 25 samples

(2117, 25)


values=data.loc[:,data.columns!='Type']
meta=data[['Type']]


values.isna().any().any() #we have no rows with NA

False


#Correlation and P val matrices
Rmatrix, Pmatrix= sp.stats.spearmanr(values.T)
Rmatrix=pd.DataFrame(Rmatrix, index=values.index.copy(), columns=values.index.copy())

#resulting R matrix
sns.clustermap(Rmatrix, cmap="RdBu_r", center=0);

/Users/rui.benfeitas/miniconda3/envs/merged1/lib/python3.8/site-packages/seaborn/matrix.py:659: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)


Pmatrix=pd.DataFrame(Pmatrix, index=values.index.copy(), columns=values.index.copy())
changed_Pmatrix=Pmatrix.copy()
changed_Pmatrix[changed_Pmatrix>0.01]=1
#resulting P matrix, similar to that seen in the section above
sns.clustermap(changed_Pmatrix);

/Users/rui.benfeitas/miniconda3/envs/merged1/lib/python3.8/site-packages/seaborn/matrix.py:659: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/Users/rui.benfeitas/miniconda3/envs/merged1/lib/python3.8/site-packages/seaborn/matrix.py:629: ClusterWarning: scipy.cluster: The symmetric non-negative hollow observation matrix looks suspiciously like an uncondensed distance matrix
  linkage = hierarchy.linkage(self.array, method=self.method,


#prepare P matrix
Psquared=Pmatrix.where(np.triu(np.ones(Pmatrix.shape),1).astype(np.bool))
Psquared.columns.name='Feat2'
Pmatrix=Pmatrix.stack()
Pmatrix.index.names=['v1','v2']
Pmatrix=Pmatrix.reset_index()
Pmatrix.columns=['feat1','feat2','P']

#prepare R matrix
Rmatrix=Rmatrix.where(np.triu(np.ones(Rmatrix.shape),1).astype(np.bool))
Rmatrix.columns.name='Feat2'
Rmatrix=Rmatrix.stack()
Rmatrix.index.names=['v1','v2'] #Avoid stacked names colliding
Rmatrix=Rmatrix.reset_index()
Rmatrix.columns=['feat1','feat2','R']

# join both
PRmatrix=pd.merge(Rmatrix.copy(), Pmatrix.copy(), on=['feat1','feat2']) #Correlation matrix with both R and P
PRmatrix=PRmatrix.loc[PRmatrix.feat1!=PRmatrix.feat2].dropna()

#Multiple hypothesis correction computed on the P column
adjP=pd.DataFrame(multipletests(PRmatrix['P'], method='bonferroni', alpha=0.01)[1], columns=['Padj'])
FDR=pd.DataFrame(multipletests(PRmatrix['P'], method='fdr_bh', alpha=0.01)[1], columns=['FDR'])

PRmatrix=pd.concat([ PRmatrix, adjP], axis=1)
PRmatrix=pd.concat([ PRmatrix, FDR], axis=1)

PRmatrix.head()


#total number of correlations w/o repetition: 2.2m
PRmatrix.shape[0]

2239786


sum(PRmatrix.Padj<0.01)

16305


sum(PRmatrix.FDR<0.01)

402368


PRmatrix.loc[:,'R (padj)']=PRmatrix['R'].copy()
PRmatrix.loc[:,'R (fdr)']=PRmatrix['R'].copy()
PRmatrix.loc[PRmatrix['Padj']>0.01,'R (padj)']=0
PRmatrix.loc[PRmatrix['FDR']>0.01,'R (fdr)']=0

all_mets=meta.loc[meta.Type=='met'].index
PRmatrix['feat1_type']=['met' if x in all_mets else 'gene' for x in PRmatrix.feat1 ]
PRmatrix['feat2_type']=['met' if x in all_mets else 'gene' for x in PRmatrix.feat2 ]
PRmatrix['int_type']=PRmatrix.feat1_type+'_'+PRmatrix.feat2_type
PRmatrix.to_csv('data/association_matrix.tsv', sep="\t", index=False) #export correlation matrix for faster loading


#Transforming to a squared matrix again
PRQ=pd.concat([
    PRmatrix.copy(), 
    PRmatrix.copy().rename(columns={'feat1':'feat2','feat2':'feat1'}).loc[:,PRmatrix.columns]
         ]).drop_duplicates()

Rmatrix_fdr=PRQ.copy().pivot(index='feat1',columns='feat2',values='R (fdr)')
Rmatrix_fdr=Rmatrix_fdr.loc[Rmatrix_fdr.sum()!=0]
Rmatrix_padj=PRQ.copy().pivot(index='feat1',columns='feat2',values='R (padj)')

Rmatrix_fdr=Rmatrix_fdr.loc[Rmatrix_fdr.index,Rmatrix_fdr.index].fillna(0)
Rmatrix_padj=Rmatrix_padj.loc[Rmatrix_fdr.index,Rmatrix_fdr.index].fillna(0)


#Showing only the top correlated features
top_features=Rmatrix_fdr.index[(Rmatrix_fdr!=0).sum()>0.05*Rmatrix_fdr.shape[0]] #top features based on FDR
Rmatrix_fdr_top=Rmatrix_fdr.copy().loc[top_features,top_features] #subsetting R (fdr corrected) matrix
Rmatrix_padj_top=Rmatrix_padj.copy().loc[top_features,top_features] #subsetting R (bonferroni corrected) matrix

g=sns.clustermap(Rmatrix_padj_top, cmap="RdBu_r", center=0);
g.fig.suptitle('Spearman R (Padj < 0.01, Bonferroni)');
plt.show()

/Users/rui.benfeitas/miniconda3/envs/merged1/lib/python3.8/site-packages/seaborn/matrix.py:659: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)


g=sns.clustermap(Rmatrix_fdr_top, cmap="RdBu_r", center=0);
g.fig.suptitle('Spearman R (FDR<0.01)');
plt.show()


shortPR=PRmatrix.copy().loc[:,['feat1','feat2','R (padj)','R (fdr)']]
shortPR=shortPR.loc[shortPR.feat1!=shortPR.feat2]

fig=plt.figure(figsize=(8,4))
p=sns.histplot(shortPR['R (padj)'][shortPR['R (padj)']!=0], color='black', label='Bonferroni (<0.01)', kde=True, bins=100);
p.set(ylabel='PDF (Bonferroni)')
ax2=p.twinx()
g=sns.histplot(shortPR['R (fdr)'][shortPR['R (fdr)']!=0], ax=ax2, color='red', label='FDR (<0.01)', kde=True, bins=100);
g.set(ylabel='PDF (FDR)')

fig.legend()
plt.xlabel('R')
plt.title('Distribution of correlation coefficients')
plt.show()


neighbor_number=pd.DataFrame()
for n_neighbors in np.arange(1,21):
    padj_count=((Rmatrix_padj!=0).sum()==n_neighbors).sum()
    fdr_count=((Rmatrix_fdr!=0).sum()==n_neighbors).sum()
    
    out=pd.Series([padj_count, fdr_count], index=['num_features(Padj)','num_features(FDR)'], name=n_neighbors)
    neighbor_number=pd.concat([neighbor_number, out],1)
    
neighbor_number=neighbor_number.T.rename_axis('num_neighbors').reset_index()

fig,ax=plt.subplots(figsize=(16, 8), nrows=2)
ax=ax.flatten()
sns.lineplot(data=neighbor_number, x='num_neighbors', y='num_features(Padj)', color='black', label='Padj', ax=ax[0]);
sns.lineplot(data=neighbor_number, x='num_neighbors', y='num_features(FDR)', color='red', label='FDR', ax=ax[0]);
ax[0].set(ylabel='Number of features')
fig.suptitle('Number of features with <21 neighbors')
ax[0].set_xticks(np.arange(1,21,1));

### boxplot requires a long df
feat_associations_padj=pd.concat([
    shortPR.copy().loc[shortPR['R (padj)']!=0,][['feat1','feat2']],
    shortPR.copy().loc[shortPR['R (padj)']!=0,][['feat2','feat1']].rename(columns={'feat1':'feat2','feat2':'feat1'})]).drop_duplicates().groupby('feat1').agg('count')

feat_associations_fdr=pd.concat([
    shortPR.copy().loc[shortPR['R (fdr)']!=0,][['feat1','feat2']],
    shortPR.copy().loc[shortPR['R (fdr)']!=0,][['feat2','feat1']].rename(columns={'feat1':'feat2','feat2':'feat1'})]).drop_duplicates().groupby('feat1').agg('count')

feat_associations=pd.concat([feat_associations_padj, feat_associations_fdr],1, sort=True)
feat_associations.fillna(0,inplace=True)
feat_associations.columns=['Padj','FDR']

sns.boxplot( data=feat_associations, notch=True, ax=ax[1], orient='h', palette={'Padj':'black','FDR':'red'});
plt.title('Number of associations per feature')
fig.tight_layout()
plt.show()


fdr_df=pd.DataFrame()
for fdr in np.append(np.geomspace(1e-4, 0.01, 30), [0.02, 0.03, 0.04, 0.05,0.1]):
    out=pd.Series([fdr,(PRmatrix.Padj<fdr).sum(),np.round(fdr*((PRmatrix.Padj<fdr).sum()),2)],
                  index=['FDR','num_edges','num_fp'])
    fdr_df=pd.concat([fdr_df, out],1)
fdr_df=fdr_df.T
    
fig,ax=plt.subplots(figsize=(10,4))
sns.lineplot(data=fdr_df, x='FDR', y='num_edges', color='black', ax=ax, label='Edge number')
sns.lineplot(data=fdr_df, x='FDR', y='num_fp', color='red', ax=ax, label='# potential false positives')
ax.set(xscale='log', yscale='log', ylabel='Edge #');
ax.set_xticks([1e-4,1e-3, 1e-2, 1e-1, 0.05])
ax.set_xticklabels([1e-4,1e-3, 1e-2, 1e-1, 0.05])
ax.axvline(0.01, dashes=[0,0,5,5], color='gray');


fig,ax=plt.subplots(figsize=(10,4))

for fdr in [0.05, 0.01, 0.001, 0.0001]:
    ## Plotting R distribution
    temp=PRmatrix.copy()
    temp.loc[:,'R (fdr)']=temp['R'].copy()
    temp.loc[temp['FDR']>fdr,'R (fdr)']=0
    
    temp=temp.loc[temp.feat1!=shortPR.feat2]
    
    color={0.05:'gray',0.01:'green',0.001:'red',0.0001:'blue'}[fdr]

    
    p=sns.histplot(
        temp['R (fdr)'][temp['R (fdr)']!=0], color=color, label='FDR <'+str(fdr), kde=True, ax=ax, bins=50);
    p.set(ylabel='PDF (FDR)')

fig.legend()
plt.xlabel('R (FDR)');
plt.title('Distribution of correlation coefficients');


bonferroni_significant=PRmatrix.copy().loc[PRmatrix.Padj<0.01].loc[:,['feat1','feat2','R','int_type']]
bonferroni_significant['sig_test']='bonferroni'
fdr_significant=PRmatrix.copy().loc[PRmatrix.FDR<0.01].loc[:,['feat1','feat2','R','int_type']]
fdr_significant['sig_test']='FDR'

pd.concat([bonferroni_significant,fdr_significant]).groupby(['sig_test','int_type'])['R'].agg('count').reset_index()


# Prepares table for being read by igraph
PRmatrix=pd.read_csv('data/association_matrix.tsv', sep="\t")
PRmatrix.loc[PRmatrix['FDR']>0.01,'R (fdr)']=0
PRmatrix=PRmatrix.loc[PRmatrix['R (fdr)']!=0,['feat1','feat2','R (fdr)']]
PRmatrix=PRmatrix.loc[PRmatrix.feat1!=PRmatrix.feat2] #drops self correlations

fdr_pos_mat=PRmatrix.loc[PRmatrix['R (fdr)']>0]
fdr_neg_mat=PRmatrix.loc[PRmatrix['R (fdr)']<0]

PRmatrix=PRmatrix.loc[PRmatrix.isin(pd.unique(fdr_pos_mat[['feat1','feat2']].values.flatten())).sum(1)==2,] #selects only nodes also found in the positive network so that we can compare networks of the same sizes


#Imports and normalizes met and gene data so that we can compute similarities between them
from sklearn.preprocessing import StandardScaler
data=pd.read_csv('data/met_genes.tsv', sep="\t", index_col=0)

scaled_data=pd.DataFrame(StandardScaler().fit_transform(data.loc[:,data.columns!='Type'].T).T, columns=data.columns[data.columns!='Type'], index=data.index)
scaled_data_values=scaled_data.copy()
scaled_data['Type']=data.Type

#Plots the data distribution
fig,ax=plt.subplots(figsize=(12,8))
sns.kdeplot(
    scaled_data.loc[scaled_data.Type=='met',scaled_data.columns!='Type'].values.flatten(), 
    color='r', label='Mets', ax=ax, legend=False)
ax2=ax.twinx()
sns.kdeplot(
    scaled_data.loc[scaled_data.Type=='genes',scaled_data.columns!='Type'].values.flatten(),
    color='b', label='Genes', ax=ax2, legend=False)
fig.suptitle('Distribution for met and genes');
fig.legend();


### Generating the kNN graph
#Computes a kNN adjacency matrix from the input dataset
#and prepares table for being read by igraph
input_ds=scaled_data_values.loc[scaled_data_values.index.isin(pd.unique(fdr_pos_mat[['feat1','feat2']].values.flatten()))]
knnG=sklearn.neighbors.kneighbors_graph(input_ds.values, 200, metric='euclidean')
knnG=pd.DataFrame(knnG.toarray(), columns=input_ds.index.copy(), index=input_ds.index.copy())  #adjacency matrix
knnG.index.name='gene1'
knnG.columns.name='gene2'
knnG=knnG.stack().reset_index().rename(columns={0:'Connectivity'})
knnG=knnG.loc[knnG['Connectivity']!=0]

### Generates each of the graphs
#positive associations, weighted
pos_w=ig.Graph.TupleList([tuple(x) for x in fdr_pos_mat.values], directed=False, edge_attrs=['w'])

#full network, unweighted
edge_list=PRmatrix.copy().loc[PRmatrix.isin(pd.unique(fdr_pos_mat[['feat1','feat2']].values.flatten())).sum(1)==2,['feat1','feat2']].values
all_u=ig.Graph.TupleList([tuple(x) for x in edge_list], directed=False)

#knnG, unweighted
knn=ig.Graph.TupleList([tuple(x) for x in knnG.values], directed=False)

#random network, unweighted, node and edge number based on a network of the same size
random_posw=ig.Graph.Erdos_Renyi(n=input_ds.shape[0], m=len(fdr_pos_mat.values), directed=False, loops=False)

random_allu=ig.Graph.Erdos_Renyi(n=input_ds.shape[0], m=len(edge_list), directed=False, loops=False)

random_knn=ig.Graph.Erdos_Renyi(n=input_ds.shape[0], m=len(knnG.values), directed=False, loops=False)


#random subset of knn graph for plotting
short_knn=ig.Graph.TupleList([tuple(x) for x in knnG.values[random.sample(list(np.arange(len(knnG.values))), 5000)]], directed=False)

#This plots each graph, using degree to present node size:
short_knn.vs['degree']=short_knn.degree() 
short_knn.vs['degree_size']=[(x*15)/(max(short_knn.vs['degree'])) for x in short_knn.vs['degree']] #degree is multiplied by 10 so that we can see all nodes

layout = short_knn.layout_mds()
ig.plot(short_knn, layout=layout, vertex_color='white', edge_color='silver', vertex_size=short_knn.vs['degree_size'])


#function to get graph properties, takes a few minutes to run
def graph_prop(input_graph):
    ncount=nn.vcount()
    ecount=nn.ecount()
    diameter=nn.diameter()
    av_path=nn.average_path_length()
    dens=nn.density()
    clustering=nn.transitivity_undirected() #this is the global clustering coefficient
    conn=nn.is_connected()
    min_cut=nn.mincut_value()
    out=pd.DataFrame([ncount, ecount, diameter, av_path, dens, clustering, conn, min_cut],
                 index=['node_count','edge_count','diameter','av_path_length','density','clustering_coef','connected?','minimum_cut']).T
    return(out)

#summarizing graph properties
network_stats=pd.DataFrame()
for nn in [pos_w, all_u, knn, random_posw, random_allu, random_knn]:
    network_stats=pd.concat([network_stats,graph_prop(nn)])
    
network_stats.index=['pos_w','all_u','knn','pos_w_random', 'all_u_random', 'knn_random']
network_stats


fig, axes=plt.subplots(nrows=1, ncols=3, figsize=(15,5))

for i, ax in zip(range(4), axes.flat):
    sns.regplot(x=[pos_w,all_u,knn][i].degree(), y=[random_posw, random_allu, random_knn][i].degree(), 
                ax=ax, color=['blue','lightsalmon','green'][i],
               line_kws={'color':'black'}
               );
    ax.set_title(['pos','all','knn'][i])
    ax.set(xlabel='k ('+['pos','all','knn'][i]+')', ylabel='k (random)')


def transform_degree(graph):
    alldegs=graph.degree()
    alldegs=pd.DataFrame([[key,len(list(group))] for key,group in itertools.groupby(alldegs)], columns=['k','count'])
    alldegs['P(k)']=[x/alldegs['count'].sum() for x in alldegs['count']]
    alldegs=alldegs.loc[:,['k','P(k)']]
    alldegs.drop_duplicates(inplace=True)
    alldegs.reset_index(drop=True, inplace=True)
    return(alldegs)


fig, ax = plt.subplots(figsize=(7, 7))
# ax.set(yscale='log', xscale='log')
p=sp.stats.probplot(pos_w.degree(), plot=ax)
a=sp.stats.probplot(all_u.degree(), plot=ax)
k=sp.stats.probplot(knn.degree(), plot=ax)
r=sp.stats.probplot(random_posw.degree(), plot=ax)
r2=sp.stats.probplot(random_allu.degree(), plot=ax)
r3=sp.stats.probplot(random_knn.degree(), plot=ax)

col=['blue','','peru','','green','','lightblue','','lightsalmon','','greenyellow','']
for x in np.arange(0,11,2):
    ax.get_lines()[x].set_markerfacecolor(col[x])
    ax.get_lines()[x].set_markeredgewidth(0)
    ax.get_lines()[x+1].set_color(col[x])


fig.legend(labels=['pos','pos','all','all','knn','knn','pos_random','pos_random','all_random','all_random','knn_random','knn_random']);

ax.set(xlabel='Data quantiles', ylabel='observed degree (k)')
plt.show()


centralities_list=['degree (larger ~ central)','eccentricity (smaller ~ central)','betweenness (larger ~ central)', 'closeness (larger ~ central)','eigenvector (larger ~ central)']

def combine_raw_centralities():
    """
    Combines centrality metrics
    """
    def centrality_raw(input_graph, graph_name):
        """
        Computes centrality metrics for a graph.
        """
        deg=input_graph.degree(loops=False)
        node_n=input_graph.vcount()
        #scaled to account for network size
        ecc=[(2*x / ((node_n-1)*(node_n-2))) for x in input_graph.eccentricity()]
        btw=[(2*x / ((node_n-1)*(node_n-2))) for x in input_graph.betweenness(directed=False)] 
        eig=input_graph.eigenvector_centrality(directed=False, scale=False)
        
        # For disconnected graphs, computes closeness from the largest connected component
        if(input_graph.is_connected()):
            cls=input_graph.closeness(normalized=True)
        else:
            cls=input_graph.clusters(mode='WEAK').giant().closeness(normalized=True)
        
        out=pd.DataFrame([deg, ecc, btw, cls,eig], index=centralities_list).T
        out['graph']=graph_name
        out=out.loc[:,np.append(['graph'],out.columns[out.columns!='graph'])]
        
        ##Adds centralities for each node in the network
        input_graph.vs['degree']=out['degree (larger ~ central)']
        input_graph.vs['eccentricity']=out['eccentricity (smaller ~ central)']
        input_graph.vs['betweenness']=out['betweenness (larger ~ central)']
        input_graph.vs['closeness']=out['closeness (larger ~ central)']
        input_graph.vs['eigenvector']=out['eigenvector (larger ~ central)']
        
        return(out)
    
    #Computes centralities for all networks
    network_centralities_raw=pd.DataFrame()
    network_centralities_raw=pd.concat([network_centralities_raw,centrality_raw(pos_w,'pos_w')])
    network_centralities_raw=pd.concat([network_centralities_raw,centrality_raw(all_u,'all_u')])
    network_centralities_raw=pd.concat([network_centralities_raw,centrality_raw(knn,'knn')])
    network_centralities_raw=pd.concat([network_centralities_raw,centrality_raw(random_posw,'pos_w_random')])
    network_centralities_raw=pd.concat([network_centralities_raw,centrality_raw(random_allu,'all_u_random')])
    network_centralities_raw=pd.concat([network_centralities_raw,centrality_raw(random_knn,'knn_random')])
    return(network_centralities_raw)

network_centralities_raw=combine_raw_centralities()


fig, axes=plt.subplots(nrows=5, figsize=(16,20), sharey='row')
for i, ax in zip(range(6), axes.flat):
    sns.boxplot(
        data=network_centralities_raw, x='graph', y=centralities_list[i],  notch=True, ax=ax, showfliers=False)
    ax.set_title(centralities_list[i])
    ax.set(xlabel='')


network_stats


### Compute centrality ranks so that we can compare them within and between networks
full_centralities=pd.DataFrame()
for net in [0,1,2]:
    net_in=[pos_w, all_u, knn][net]
    net_nm=['pos_w', 'all_u', 'knn'][net]
    temp=pd.DataFrame([net_in.vs[att] for att in ['name','degree','betweenness', 'closeness','eccentricity','eigenvector']], index=['name','degree','betweenness', 'closeness','eccentricity','eigenvector']).T
    temp.columns=[x+'|'+net_nm for x in temp.columns]
    temp.rename(columns={'name|'+net_nm:'name'}, inplace=True)
    
    ## For all but eccentricity centrality, we compute the rank in ascending mode
    ## so that higher ranking means more central. we need to reverse this for eccentricity
    temp.loc[:,temp.columns.str.contains('deg|bet|clos|eig')]=temp.loc[:,temp.columns.str.contains('deg|bet|clos|eig')].rank(pct=True, ascending=True)
    temp.loc[:,temp.columns.str.contains('eccentricity')]=temp.loc[:,temp.columns.str.contains('eccentricity')].rank(pct=True, ascending=False
                                                              )
    temp['median_centrality|'+net_nm]=temp.loc[:,temp.columns!='name'].median(1)
    if(net==0):
        full_centralities=temp
    else:
        full_centralities=pd.merge(full_centralities, temp, on='name')
full_centralities.set_index('name', inplace=True)
full_centralities=pd.merge(full_centralities, data[['Type']], left_index=True, right_index=True, how='left')
full_centralities['median|ALL']=full_centralities.loc[:,full_centralities.columns.str.contains('median')].median(1)


### Correlations are computed between ranks, after inverting the rank for eccentricity
def correlations_centralities(graph_name):
    """
    Returns squared correlation matrix.
    """
    temp_corr=full_centralities.copy().loc[:,full_centralities.columns!='Type'].dropna().astype('float')
    temp_corr=temp_corr.loc[:,temp_corr.columns.str.contains(graph_name)]
    temp_corr.columns=temp_corr.columns.str.replace('\|.+','')
    temp_corr=temp_corr.corr(method='spearman')
    np.fill_diagonal(temp_corr.values, np.nan)
    return(temp_corr)

all_u_centcorr=correlations_centralities('all_u')
knn_centcorr=correlations_centralities('knn')
pos_w_centcorr=correlations_centralities('pos_w')

fig,ax=plt.subplots(nrows=3, figsize=(8,16), sharex=True)
ax=ax.flatten()
for i in range(3):
    tdata=[all_u_centcorr,knn_centcorr, pos_w_centcorr][i]
    tname=['all_u','knn','pos_w'][i]
    sns.heatmap(tdata, cmap="RdBu_r", center=0, annot=True, ax=ax[i]);
    ax[i].set(title=tname)


knn_centralities=full_centralities.loc[:,full_centralities.columns.str.contains('knn')]
knn_centralities=knn_centralities.loc[:,knn_centralities.columns!='median_centrality|knn']
knn_centralities.columns=knn_centralities.columns.str.replace('\\|.+','')

##top nodes based on eccentricity
knn_top_ecc=knn_centralities['eccentricity'].sort_values(ascending=False).index.values[:1]

##top nodes based on degree
knn_top_deg=knn_centralities['degree'].sort_values(ascending=False).index.values[:1]

##random nodes to help visualize
##warning, do not increase this value much higher than 500, or you may have problems rendering this image
knn_others=knn_centralities.sample(500).index.values

## full list
node_list=np.append(np.append(knn_top_deg, knn_top_ecc), knn_others)

## subsets nodes
knn_to_draw=knn.subgraph(knn.vs.select([x.index for x in knn.vs if x['name'] in node_list]))
knn_to_draw.vs['color']=['red' if x['name']  in knn_top_deg else 'green' if x['name'] in knn_top_ecc else 'white' for x in knn_to_draw.vs]

layout = knn_to_draw.layout_auto()
ig.plot(knn_to_draw, layout=layout, vertex_color=knn_to_draw.vs['color'], edge_color='silver', vertex_size=7)


g = ig.Graph([(0,1), (0,2), (1,2), (0,3), (2,3), (1,3), (2,4), (4,5), (5,6), (4,6), (4,7), (5,7), (6,7)])

g.vs["name"] = ["A", "B", "C", "D", "E", "F", "G","H"]
g.vs["module"] = ["f", "f", "f", "f", "m", "m","m", "f"]


layout = g.layout_circle()
g.vs["label"] = g.vs["name"]
color_dict = {"m": "cyan", "f": "pink"}
g.vs["color"] = [color_dict[module] for module in g.vs["module"]]
ig.plot(g, layout = layout, bbox = (300, 300), margin = 50)


pd.DataFrame( g.get_adjacency(), index=g.vs["name"],  columns=g.vs["name"] )


def modularity(graph, membership):
    """
    Computes the modularity of `graph` for a given module `membership`.
    """
    m=len(graph.es.indices) #edge number
    A=pd.DataFrame(graph.get_adjacency()) #adjacency matrix
    Q=[] 
    for i in A.index:
        for j in A.columns:
            if(membership[i]==membership[j]):
                deltaij=1
            else:
                deltaij=0
            Q=np.append(Q, (A.loc[i,j]-(g.vs[i].degree()*g.vs[j].degree())/(2*m))*deltaij)
    Q=(1/(2*m))*np.sum(Q)
    out=Q
    return(out)

modularity(g, [1,1,1,1,2,2,2,1])

0.16568047337278102


all_memberships=[x for x in itertools.product([1,2], repeat=8)]


membership_modularity=pd.DataFrame()
for membership in all_memberships:
    group1=[g.vs['name'][x] for x in range(len(g.vs['name'])) if membership[x]==1]
    group2=[g.vs['name'][x] for x in range(len(g.vs['name'])) if membership[x]==2]
    out=pd.Series([group1, group2, membership, modularity(g, membership)], index=['comm1', 'comm2','memb','Q'])
    membership_modularity=pd.concat([membership_modularity, out], 1)
    
membership_modularity=membership_modularity.T.sort_values('Q', ascending=False)
membership_modularity.head(10)


pos_comm = leidenalg.find_partition(pos_w, leidenalg.ModularityVertexPartition)
pos_w_comm = leidenalg.find_partition(pos_w, leidenalg.ModularityVertexPartition, weights='w')
all_comm = leidenalg.find_partition(all_u, leidenalg.ModularityVertexPartition)
knn_comm = leidenalg.find_partition(knn, leidenalg.ModularityVertexPartition)
random_all = leidenalg.find_partition(random_allu, leidenalg.ModularityVertexPartition)
random_posw = leidenalg.find_partition(random_posw, leidenalg.ModularityVertexPartition)
random_knn = leidenalg.find_partition(random_knn, leidenalg.ModularityVertexPartition)


np.round(pos_comm.modularity,3)

0.591


np.round(pos_w_comm.modularity,3)

0.591


np.round(all_comm.modularity,3)

0.421


np.round(knn_comm.modularity,3)

0.588


np.round(random_all.modularity,3)

0.032


np.round(random_posw.modularity,3)

0.039


np.round(random_knn.modularity,3)

0.032


#Compiles feat lists per community
def get_community_table():
    comm_counts=pd.DataFrame()
    feat_lists=pd.DataFrame()
    for i in [0,1,2,3]:
        graph=[pos_w,pos_w,all_u,knn][i]
        comm=[pos_comm,pos_w_comm,all_comm,knn_comm][i]
        name=['pos','pos_w','all','knn'][i]
        temp=pd.DataFrame(list(zip(graph.vs['name'],[x+1 for x in comm.membership]))).rename(columns={0:'feat',1:'community'})
        counts=pd.DataFrame(temp.groupby('community')['feat'].agg(len))
        counts.columns=[name]
        comm_counts=pd.concat([comm_counts, counts],1)
        
        gl=pd.DataFrame(temp.groupby('community')['feat'].apply(list)).reset_index()
        gl['community']=['c'+str(i) for i in gl['community']]
        gl['network']=name
        gl=gl.loc[:,['network','community','feat']]
        feat_lists=pd.concat([feat_lists, gl])
        
    comm_counts.index=['c'+str(i) for i in comm_counts.index]
    return([comm_counts,feat_lists])

#Plotting community sizes
import seaborn as sns
fig, ax = plt.subplots(figsize=(7, 4))
bar_data=get_community_table()[0].fillna(0).T
bar_data.plot(kind='bar', stacked=True, ax=ax);
# ## number of communities in each
# for index, row in groupedvalues.iterrows():
#     g.text(row.name,row.tip, round(row.total_bill,2), color='black', ha="center")
ax.legend(get_community_table()[0].index, loc='right', bbox_to_anchor=(1.15, 1));
ax.set_title('Community size')
plt.xticks(rotation=0)
plt.show()


fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(
    data=bar_data.T.unstack().reset_index().rename(columns={'level_0':'network','level_1':'community',0:'size'}),
    x='network',y='size', hue='community'
           )

ax.set(yscale='log');
ax.legend(loc='right', bbox_to_anchor=(1.15, 1));
ax.set_title('Community size')
plt.xticks(rotation=0)
plt.show()


#All the available human libraries
gp.get_library_name(database='Human')[:15]

['ARCHS4_Cell-lines',
 'ARCHS4_IDG_Coexp',
 'ARCHS4_Kinases_Coexp',
 'ARCHS4_TFs_Coexp',
 'ARCHS4_Tissues',
 'Achilles_fitness_decrease',
 'Achilles_fitness_increase',
 'Aging_Perturbations_from_GEO_down',
 'Aging_Perturbations_from_GEO_up',
 'Allen_Brain_Atlas_down',
 'Allen_Brain_Atlas_up',
 'BioCarta_2013',
 'BioCarta_2015',
 'BioCarta_2016',
 'BioPlanet_2019']


#we will search 3 libraries for significantly enriched gene sets
gene_sets=['GO_Biological_Process_2018','KEGG_2019_Human','OMIM_Disease']
background=[x for x in all_u.copy().vs['name'] if x in data.loc[data.Type=='genes'].index]
all_genes=data.loc[data.Type=='genes'].index

def perform_enrich(network):
    temp=get_community_table()[1].copy()
    temp=temp.loc[temp['network']==network]
    output_enrichr=pd.DataFrame()
    for comm in temp['community'].values:
        gl=list(temp.loc[temp['community']==comm, 'feat'])[0]
        gl=list([x for x in gl if x in all_genes])
        
        if(len(gl)<30):
            continue
        print('Found '+str(len(gl))+' genes in community '+comm)
        for bp in gene_sets:
            print('Analyzing '+network+' network | Comm: '+comm+'/'+str(len(temp.index))+'  | BP: '+bp)
            enr=gp.enrichr(
                gene_list=gl,
                gene_sets=bp,
                background=background,
                outdir='Enrichr',
                format='png'
            )

            results=enr.results.sort_values('Adjusted P-value', ascending=True)
            results=results.loc[results['Adjusted P-value']<0.05,]
            results['BP']=bp
            results['Comm']=comm
            results['Graph']=network
            output_enrichr=pd.concat([output_enrichr, results])
            
    return(output_enrichr)

all_enriched=pd.DataFrame()
for net in ['pos', 'pos_w', 'all', 'knn']: 
    all_enriched=pd.concat([all_enriched,perform_enrich(net)])

Found 903 genes in community c1
Analyzing pos network | Comm: c1/7  | BP: GO_Biological_Process_2018

2020-10-01 18:27:22,499 Warning: No enrich terms using library GO_Biological_Process_2018 when cutoff = 0.05

Analyzing pos network | Comm: c1/7  | BP: KEGG_2019_Human

2020-10-01 18:27:27,051 Warning: No enrich terms using library KEGG_2019_Human when cutoff = 0.05

Analyzing pos network | Comm: c1/7  | BP: OMIM_Disease

2020-10-01 18:27:31,523 Warning: No enrich terms using library OMIM_Disease when cutoff = 0.05

Found 547 genes in community c2
Analyzing pos network | Comm: c2/7  | BP: GO_Biological_Process_2018
Analyzing pos network | Comm: c2/7  | BP: KEGG_2019_Human
Analyzing pos network | Comm: c2/7  | BP: OMIM_Disease
Found 513 genes in community c3
Analyzing pos network | Comm: c3/7  | BP: GO_Biological_Process_2018
Analyzing pos network | Comm: c3/7  | BP: KEGG_2019_Human
Analyzing pos network | Comm: c3/7  | BP: OMIM_Disease

2020-10-01 18:28:00,542 Warning: No enrich terms using library OMIM_Disease when cutoff = 0.05

Found 902 genes in community c1
Analyzing pos_w network | Comm: c1/7  | BP: GO_Biological_Process_2018

2020-10-01 18:28:05,614 Warning: No enrich terms using library GO_Biological_Process_2018 when cutoff = 0.05


enriched_terms=all_enriched.loc[:,['Graph','Comm','Term','Adjusted P-value']].copy()


enriched_terms['Adjusted P-value']=-1*np.log10(enriched_terms['Adjusted P-value'])


enriched_terms.head()


fig, ax = plt.subplots(figsize=(7, 4))
data_bars=pd.DataFrame(enriched_terms.groupby(['Graph','Comm'])['Term'].agg('count')).stack().reset_index().rename(columns={0:'Count'})
sns.barplot(x='Graph', y='Count', data=data_bars, hue='Comm')
ax.set_title('Number of significant Terms (Q < 0.05) per community')
ax.legend(loc='right', bbox_to_anchor=(1.15, 1));
plt.xticks(rotation=0)
plt.show()


###Number of genes/community
# We skipped communities with <30 genes
get_community_table()[0].fillna(0).T


pd.DataFrame(enriched_terms.groupby(['Graph','Comm'])['Term'].agg('count'))


#Finding consensus
temp=enriched_terms.copy()
temp['comm_term']=temp.Comm+'_'+temp.Term
temp=temp.loc[:,['Graph','comm_term']]

consensus=pd.DataFrame()
consensus=pd.concat([consensus, temp.loc[temp['Graph']=='pos']])
consensus=pd.merge(consensus,
                   temp.loc[temp['Graph']=='pos_w'], on="comm_term", how='outer', suffixes=['pos','pos_w'])
consensus=pd.merge(consensus, 
                   temp.loc[temp['Graph']=='all'], on="comm_term", how='outer').rename(columns={'Graph':'all'})

consensus=consensus.loc[consensus.isna().sum(1)==0].loc[:,['comm_term','Graphpos','Graphpos_w','all']]


consensus['comm']=[x[0] for x in consensus.comm_term.str.split('\_')]
consensus.groupby('comm')['comm_term'].agg('count')

comm
c3    54
Name: comm_term, dtype: int64


#Requires running the community detection from the previous section
patlas=pd.read_csv('data/proteinatlas.tsv', sep="\t").loc[:,['Ensembl','Gene']]


#Compiles gene lists per community. We need Ensembl ids for further analyses
def get_ensembl():
    comm_counts=pd.DataFrame()
    gene_lists=pd.DataFrame()
    for i in [0,1,2,3]:
        graph=[pos_w,pos_w,all_u,knn][i]
        comm=[pos_comm,pos_w_comm,all_comm,knn_comm][i]
        name=['pos','pos_w','all','knn'][i]
        temp=pd.DataFrame(list(zip(graph.vs['name'],[x+1 for x in comm.membership]))).rename(columns={0:'gene',1:'community'})

        gl=pd.DataFrame(temp.groupby('community')['gene'].apply(list)).reset_index()
        gl['community']=['c'+str(i) for i in gl['community']]
        gl['network']=name
        gl=gl.loc[:,['network','community','gene']]
        gene_lists=pd.concat([gene_lists, gl])

    gene_communities=gene_lists
    gene_mat=pd.DataFrame()
    for net in gene_communities['network'].unique():
        temp=gene_communities.copy().loc[gene_communities['network']==net,]
        for comm in temp['community'].unique():
            gl=list(temp.copy().loc[temp['community']==comm,'gene'])[0]
            el=[patlas.loc[patlas['Gene']==x,'Ensembl'].iloc[0] for x in gl if x in patlas['Gene'].values]

            df=pd.DataFrame([net,comm,gl,el]).T
            df.columns=['network','community','Gene','Ensembl']
            gene_mat=pd.concat([gene_mat, df])
                
    return(gene_mat)

get_ensembl().to_csv('data/gene_communities.tsv', sep="\t", index=False)

	Type	p10	p11	p12	p14	p15	p16	p18	p22	p23	...	p37	p38	p4	p40	p41	p46	p48	p5	p8	p9
feature
C0_accarnitines	met	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.103152	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.049757
C2_accarnitines	met	0.007356	0.038454	0.011738	0.011923	0.015104	0.012317	0.021905	0.017777	0.039760	...	0.000000	0.012542	0.018896	0.020212	0.015263	0.012510	0.009677	0.015757	0.010706	0.018615
C3_accarnitines	met	0.000000	0.000000	0.000000	0.000000	0.000888	0.001116	0.001682	0.001024	0.002179	...	0.000000	0.000827	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000995
C3DC_C4OH_accarnitines	met	0.000000	0.000000	0.000455	0.001163	0.000000	0.000000	0.000000	0.000000	0.001065	...	0.000000	0.000480	0.000000	0.000717	0.000000	0.000782	0.000398	0.000000	0.000000	0.000312
C5DC_C6OH_accarnitines	met	0.001035	0.001479	0.000430	0.001527	0.000485	0.000792	0.000954	0.000995	0.001356	...	0.005792	0.000400	0.000987	0.001255	0.000885	0.001710	0.000597	0.001218	0.000848	0.000507

	feat1	feat2	R	P	Padj	FDR
0	C0_accarnitines	C2_accarnitines	0.375326	0.070718	1.0	0.149290
1	C0_accarnitines	C3_accarnitines	0.519174	0.009329	1.0	0.033441
2	C0_accarnitines	C3DC_C4OH_accarnitines	0.395363	0.055848	1.0	0.125357
3	C0_accarnitines	C5DC_C6OH_accarnitines	-0.029917	0.889631	1.0	0.928289
4	C0_accarnitines	C5MDC_accarnitines	-0.137801	0.520795	1.0	0.643703

	sig_test	int_type	R
0	FDR	gene_gene	398274
1	FDR	met_gene	118
2	FDR	met_met	3976
3	bonferroni	gene_gene	15434
4	bonferroni	met_met	871

	node_count	edge_count	diameter	av_path_length	density	clustering_coef	connected?	minimum_cut
pos_w	2102	293544	8	2.93304	0.132937	0.71453	False	0
all_u	2102	402360	7	2.11979	0.182216	0.610098	False	0
knn	2102	420400	4	2.32002	0.190386	0.590294	True	203
pos_w_random	2102	293544	2	1.86706	0.132937	0.132886	True	228
all_u_random	2102	402360	2	1.81778	0.182216	0.182149	True	326
knn_random	2102	420400	2	1.80961	0.190386	0.190349	True	339

	node_count	edge_count	diameter	av_path_length	density	clustering_coef	connected?	minimum_cut
pos_w	2102	293544	8	2.93304	0.132937	0.71453	False	0
all_u	2102	402360	7	2.11979	0.182216	0.610098	False	0
knn	2102	420400	4	2.32002	0.190386	0.590294	True	203
pos_w_random	2102	293544	2	1.86706	0.132937	0.132886	True	228
all_u_random	2102	402360	2	1.81778	0.182216	0.182149	True	326
knn_random	2102	420400	2	1.80961	0.190386	0.190349	True	339

Omics Integration course 2020

Rui Benfeitas, Scilifelab, NBIS National Bioinformatics Infrastructure Sweden

Biological network topology analysis¶

Data preparation¶

Association analysis¶

Questions¶

Network construction and preliminary analysis¶

Questions:¶

Centrality analysis¶

Questions:¶

Questions¶

Community analysis¶

Modularity of a small graph¶

Modularity of gene-metabolite networks¶

Conclusion¶

	A	B	C	D	E	F	G	H
A	0	1	1	1	0	0	0	0
B	1	0	1	1	0	0	0	0
C	1	1	0	1	1	0	0	0
D	1	1	1	0	0	0	0	0
E	0	0	1	0	0	1	1	1
F	0	0	0	0	1	0	1	1
G	0	0	0	0	1	1	0	1
H	0	0	0	0	1	1	1	0

comm1	comm2	memb	Q
[A, B, C, D]	[E, F, G, H]	(1, 1, 1, 1, 2, 2, 2, 2)	0.423077
[E, F, G, H]	[A, B, C, D]	(2, 2, 2, 2, 1, 1, 1, 1)	0.423077
[A, B, C, D, E]	[F, G, H]	(1, 1, 1, 1, 1, 2, 2, 2)	0.221893
[A, B, D]	[C, E, F, G, H]	(1, 1, 2, 1, 2, 2, 2, 2)	0.221893
[F, G, H]	[A, B, C, D, E]	(2, 2, 2, 2, 2, 1, 1, 1)	0.221893
[C, E, F, G, H]	[A, B, D]	(2, 2, 1, 2, 1, 1, 1, 1)	0.221893
[A, B, C, D, H]	[E, F, G]	(1, 1, 1, 1, 2, 2, 2, 1)	0.16568
[B, C, D]	[A, E, F, G, H]	(2, 1, 1, 1, 2, 2, 2, 2)	0.16568
[D, E, F, G, H]	[A, B, C]	(2, 2, 2, 1, 1, 1, 1, 1)	0.16568
[A, E, F, G, H]	[B, C, D]	(1, 2, 2, 2, 1, 1, 1, 1)	0.16568

	Graph	Comm	Term	Adjusted P-value
0	pos	c2	cotranslational protein targeting to membrane ...	66.645270
1	pos	c2	protein targeting to ER (GO:0045047)	65.268506
2	pos	c2	SRP-dependent cotranslational protein targetin...	65.153180
3	pos	c2	nuclear-transcribed mRNA catabolic process, no...	56.891640
4	pos	c2	viral gene expression (GO:0019080)	54.347075

	c1	c2	c3	c4	c5	c6	c7
pos	906.0	547.0	515.0	119.0	10.0	3.0	2.0
pos_w	905.0	555.0	507.0	119.0	11.0	3.0	2.0
all	626.0	577.0	499.0	261.0	134.0	3.0	2.0
knn	887.0	485.0	335.0	272.0	123.0	0.0	0.0

		Term
Graph	Comm
all	c1	234
	c2	1
	c3	135
	c4	112
knn	c2	387
	c3	199
	c4	91
pos	c2	250
pos	c3	372
pos_w	c2	244
pos_w	c3	405

	A	B	C	D	E	F	G	H
A	0	1	1	1	0	0	0	0
B	1	0	1	1	0	0	0	0
C	1	1	0	1	1	0	0	0
D	1	1	1	0	0	0	0	0
E	0	0	1	0	0	1	1	1
F	0	0	0	0	1	0	1	1
G	0	0	0	0	1	1	0	1
H	0	0	0	0	1	1	1	0

	A	B	C	D	E	F	G	H
A	0	1	1	1	0	0	0	0
B	1	0	1	1	0	0	0	0
C	1	1	0	1	1	0	0	0
D	1	1	1	0	0	0	0	0
E	0	0	1	0	0	1	1	1
F	0	0	0	0	1	0	1	1
G	0	0	0	0	1	1	0	1
H	0	0	0	0	1	1	1	0