import json
import pandas as pd
import gseapy as gp
import warnings
warnings.filterwarnings('ignore')

with open('/home/jovyan/lab/data/serialization/degree_data.json', 'r') as file:
    degree_data = json.load(file)

feat_lists = pd.read_csv("/home/jovyan/lab/data/serialization/feat_lists.csv", sep = "\t")
comm_counts = pd.read_csv("/home/jovyan/lab/data/serialization/comm_counts.csv", sep = "\t", index_col = 0)
data=pd.read_csv('/home/jovyan/lab/data/met_genes.tsv', sep="\t", index_col=0)

# extract all the available human libraries in enrichr
libraries = gp.get_library_name()
print([lib for lib in libraries if 'Human' in lib])
print([lib for lib in libraries if 'GO_Biological_Process' in lib])
print([lib for lib in libraries if 'KEGG' in lib])
print([lib for lib in libraries if 'OMIM_Disease' in lib])

['HDSigDB_Human_2021', 'HumanCyc_2015', 'HumanCyc_2016', 'Human_Gene_Atlas', 'Human_Phenotype_Ontology', 'KEGG_2019_Human', 'KEGG_2021_Human', 'NIH_Funded_PIs_2017_Human_AutoRIF', 'NIH_Funded_PIs_2017_Human_GeneRIF', 'NURSA_Human_Endogenous_Complexome', 'RNAseq_Automatic_GEO_Signatures_Human_Down', 'RNAseq_Automatic_GEO_Signatures_Human_Up', 'Tissue_Protein_Expression_from_Human_Proteome_Map', 'WikiPathway_2021_Human', 'WikiPathway_2023_Human', 'WikiPathways_2019_Human', 'WikiPathways_2024_Human']
['GO_Biological_Process_2013', 'GO_Biological_Process_2015', 'GO_Biological_Process_2017', 'GO_Biological_Process_2017b', 'GO_Biological_Process_2018', 'GO_Biological_Process_2021', 'GO_Biological_Process_2023']
['KEGG_2013', 'KEGG_2015', 'KEGG_2016', 'KEGG_2019_Human', 'KEGG_2019_Mouse', 'KEGG_2021_Human']
['OMIM_Disease']

import ast
import gseapy as gp

# we will search 3 libraries for significantly enriched gene sets
gene_sets = ['GO_Biological_Process_2023','KEGG_2021_Human','OMIM_Disease']
background=[x for x in degree_data['all_u_names'] if x in data.loc[data.Type=='genes'].index]
all_genes=data.loc[data.Type=='genes'].index

def perform_enrich(network):
    temp=feat_lists.copy()
    temp=temp.loc[temp['network']==network]
    output_enrichr=pd.DataFrame()
    for comm in temp['community'].values:
        # the dataframe holds the gene lists as strings, we parse them into lists
        gl_string =temp.loc[temp['community']==comm, 'feat'].values[0]
        gl = ast.literal_eval(gl_string)
        gl=list([x for x in gl if x in all_genes])
        # if the list is small we don't do the test
        if(len(gl)<30):
            continue
        for bp in gene_sets:
            print('Analyzing '+network+' network | Comm: '+comm+'/'+str(len(temp.index))+'  | BP: '+bp)
            # When P-values are under thresholds the program exits
            # We avoid this by using a try catch block
            try:
                enr=gp.enrichr(
                    gene_list=gl,
                    gene_sets=bp,
                    background=background,
                    outdir='Enrichr',
                    format='png'
                )
                results=enr.results.sort_values('Adjusted P-value', ascending=True)
                results=results.loc[results['Adjusted P-value']<0.05,]
                results['BP']=bp
                results['Comm']=comm
                results['Graph']=network
                output_enrichr=pd.concat([output_enrichr, results])
            except Exception as e:
                #print(f"An unexpected error occurred: {e}")
                print("Error for:", network, comm, bp)
    return(output_enrichr)

all_enriched=pd.DataFrame()
#perform_enrich('pos')
for net in ['pos', 'pos_w', 'all', 'knn']: 
    all_enriched=pd.concat([all_enriched,perform_enrich(net)])

Analyzing pos network | Comm: c1/7  | BP: GO_Biological_Process_2023
Error for: pos c1 GO_Biological_Process_2023
Analyzing pos network | Comm: c1/7  | BP: KEGG_2021_Human
Error for: pos c1 KEGG_2021_Human
Analyzing pos network | Comm: c1/7  | BP: OMIM_Disease
Error for: pos c1 OMIM_Disease
Analyzing pos network | Comm: c2/7  | BP: GO_Biological_Process_2023
Analyzing pos network | Comm: c2/7  | BP: KEGG_2021_Human
Analyzing pos network | Comm: c2/7  | BP: OMIM_Disease
Error for: pos c2 OMIM_Disease
Analyzing pos network | Comm: c3/7  | BP: GO_Biological_Process_2023
Analyzing pos network | Comm: c3/7  | BP: KEGG_2021_Human
Analyzing pos network | Comm: c3/7  | BP: OMIM_Disease
Error for: pos c3 OMIM_Disease
Analyzing pos_w network | Comm: c1/7  | BP: GO_Biological_Process_2023
Error for: pos_w c1 GO_Biological_Process_2023
Analyzing pos_w network | Comm: c1/7  | BP: KEGG_2021_Human
Error for: pos_w c1 KEGG_2021_Human
Analyzing pos_w network | Comm: c1/7  | BP: OMIM_Disease
Error for: pos_w c1 OMIM_Disease
Analyzing pos_w network | Comm: c2/7  | BP: GO_Biological_Process_2023
Analyzing pos_w network | Comm: c2/7  | BP: KEGG_2021_Human
Analyzing pos_w network | Comm: c2/7  | BP: OMIM_Disease
Error for: pos_w c2 OMIM_Disease
Analyzing pos_w network | Comm: c3/7  | BP: GO_Biological_Process_2023
Analyzing pos_w network | Comm: c3/7  | BP: KEGG_2021_Human
Analyzing pos_w network | Comm: c3/7  | BP: OMIM_Disease
Error for: pos_w c3 OMIM_Disease
Analyzing all network | Comm: c1/6  | BP: GO_Biological_Process_2023
Analyzing all network | Comm: c1/6  | BP: KEGG_2021_Human
Analyzing all network | Comm: c1/6  | BP: OMIM_Disease
Error for: all c1 OMIM_Disease
Analyzing all network | Comm: c2/6  | BP: GO_Biological_Process_2023
Analyzing all network | Comm: c2/6  | BP: KEGG_2021_Human
Analyzing all network | Comm: c2/6  | BP: OMIM_Disease
Error for: all c2 OMIM_Disease
Analyzing all network | Comm: c3/6  | BP: GO_Biological_Process_2023
Analyzing all network | Comm: c3/6  | BP: KEGG_2021_Human
Analyzing all network | Comm: c3/6  | BP: OMIM_Disease
Error for: all c3 OMIM_Disease
Analyzing all network | Comm: c4/6  | BP: GO_Biological_Process_2023
Error for: all c4 GO_Biological_Process_2023
Analyzing all network | Comm: c4/6  | BP: KEGG_2021_Human
Error for: all c4 KEGG_2021_Human
Analyzing all network | Comm: c4/6  | BP: OMIM_Disease
Analyzing knn network | Comm: c1/5  | BP: GO_Biological_Process_2023
Error for: knn c1 GO_Biological_Process_2023
Analyzing knn network | Comm: c1/5  | BP: KEGG_2021_Human
Error for: knn c1 KEGG_2021_Human
Analyzing knn network | Comm: c1/5  | BP: OMIM_Disease
Error for: knn c1 OMIM_Disease
Analyzing knn network | Comm: c2/5  | BP: GO_Biological_Process_2023
Analyzing knn network | Comm: c2/5  | BP: KEGG_2021_Human
Analyzing knn network | Comm: c2/5  | BP: OMIM_Disease
Error for: knn c2 OMIM_Disease
Analyzing knn network | Comm: c3/5  | BP: GO_Biological_Process_2023
Analyzing knn network | Comm: c3/5  | BP: KEGG_2021_Human
Error for: knn c3 KEGG_2021_Human
Analyzing knn network | Comm: c3/5  | BP: OMIM_Disease
Error for: knn c3 OMIM_Disease
Analyzing knn network | Comm: c4/5  | BP: GO_Biological_Process_2023
Analyzing knn network | Comm: c4/5  | BP: KEGG_2021_Human
Analyzing knn network | Comm: c4/5  | BP: OMIM_Disease
Error for: knn c4 OMIM_Disease
Analyzing knn network | Comm: c5/5  | BP: GO_Biological_Process_2023
Analyzing knn network | Comm: c5/5  | BP: KEGG_2021_Human
Analyzing knn network | Comm: c5/5  | BP: OMIM_Disease
Error for: knn c5 OMIM_Disease

all_enriched.head()

import numpy as np

enriched_terms=all_enriched.loc[:,['Graph','Comm','Term','Adjusted P-value']].copy()
enriched_terms['Adjusted P-value']=-1*np.log10(enriched_terms['Adjusted P-value'])
enriched_terms.head()

###Number of genes/community
# We skipped communities with <30 genes
comm_counts.fillna(0).T

pd.DataFrame(enriched_terms.groupby(['Graph','Comm'])['Term'].agg('count'))

#Finding consensus
temp=enriched_terms.copy()
temp['comm_term']=temp.Comm+'_'+temp.Term
temp=temp.loc[:,['Graph','comm_term']]

consensus=pd.DataFrame()
consensus=pd.concat([consensus, temp.loc[temp['Graph']=='pos']])
consensus=pd.merge(consensus,
                   temp.loc[temp['Graph']=='pos_w'], on="comm_term", how='outer', suffixes=['pos','pos_w'])
consensus=pd.merge(consensus, 
                   temp.loc[temp['Graph']=='all'], on="comm_term", how='outer').rename(columns={'Graph':'all'})

consensus=consensus.loc[consensus.isna().sum(1)==0].loc[:,['comm_term','Graphpos','Graphpos_w','all']]

consensus['comm']=[x[0] for x in consensus.comm_term.str.split('_')]
consensus.groupby('comm')['comm_term'].agg('count')

comm
c2    19
Name: comm_term, dtype: int64

consensus.head()

enriched_terms.to_csv("/home/jovyan/lab/data/serialization/enriched_terms.csv", sep = "\t", index = False)

# Compiles gene lists per community. We need Ensembl ids for further analyses
# TODO: the code bellow is not working as it requires all three environments, or lots of files being saved
# Homework?
# Requires running the community detection from the previous section
patlas=pd.read_csv('/home/jovyan/lab/data/proteinatlas.tsv', sep="\t").loc[:,['Ensembl','Gene']]
def get_ensembl():
    comm_counts=pd.DataFrame()
    gene_lists=pd.DataFrame()
    for i in [0,1,2,3]:
        graph=["pos_w","pos_w","all_u","knn"][i]
        comm=[pos_comm,pos_w_comm,all_comm,knn_comm][i]
        name=['pos','pos_w','all','knn'][i]
        temp=pd.DataFrame(list(zip(graph.vs['name'],[x+1 for x in comm.membership]))).rename(columns={0:'gene',1:'community'})

        gl=pd.DataFrame(temp.groupby('community')['gene'].apply(list)).reset_index()
        gl['community']=['c'+str(i) for i in gl['community']]
        gl['network']=name
        gl=gl.loc[:,['network','community','gene']]
        gene_lists=pd.concat([gene_lists, gl])

    gene_communities=gene_lists
    gene_mat=pd.DataFrame()
    for net in gene_communities['network'].unique():
        temp=gene_communities.copy().loc[gene_communities['network']==net,]
        for comm in temp['community'].unique():
            gl=list(temp.copy().loc[temp['community']==comm,'gene'])[0]
            el=[patlas.loc[patlas['Gene']==x,'Ensembl'].iloc[0] for x in gl if x in patlas['Gene'].values]

            df=pd.DataFrame([net,comm,gl,el]).T
            df.columns=['network','community','Gene','Ensembl']
            gene_mat=pd.concat([gene_mat, df])
                
    return(gene_mat)

get_ensembl().to_csv('/home/jovyan/lab/data/gene_communities.tsv', sep="\t", index=False)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 35
     31             gene_mat=pd.concat([gene_mat, df])
     33     return(gene_mat)
---> 35 get_ensembl().to_csv('/home/jovyan/lab/data/gene_communities.tsv', sep="\t", index=False)

Cell In[14], line 11, in get_ensembl()
      9 for i in [0,1,2,3]:
     10     graph=["pos_w","pos_w","all_u","knn"][i]
---> 11     comm=[pos_comm,pos_w_comm,all_comm,knn_comm][i]
     12     name=['pos','pos_w','all','knn'][i]
     13     temp=pd.DataFrame(list(zip(graph.vs['name'],[x+1 for x in comm.membership]))).rename(columns={0:'gene',1:'community'})

NameError: name 'pos_comm' is not defined

	Gene_set	Term	P-value	Adjusted P-value	Odds Ratio	Combined Score	Genes	BP	Comm	Graph
0	GO_Biological_Process_2023	Cytoplasmic Translation (GO:0002181)	6.596737e-21	1.768585e-17	10.248988	476.247014	EIF4A2;RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;...	GO_Biological_Process_2023	c2	pos
1	GO_Biological_Process_2023	Translation (GO:0006412)	2.464224e-20	3.303293e-17	8.477737	382.768320	RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;R...	GO_Biological_Process_2023	c2	pos
2	GO_Biological_Process_2023	Macromolecule Biosynthetic Process (GO:0009059)	7.129468e-20	6.371368e-17	8.330309	367.262217	RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;R...	GO_Biological_Process_2023	c2	pos
3	GO_Biological_Process_2023	Peptide Biosynthetic Process (GO:0043043)	2.043715e-19	1.369800e-16	8.693117	374.102635	RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;R...	GO_Biological_Process_2023	c2	pos
4	GO_Biological_Process_2023	Gene Expression (GO:0010467)	3.477726e-19	1.864757e-16	6.725773	285.863776	RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;H...	GO_Biological_Process_2023	c2	pos

	c1	c2	c3	c4	c5	c6	c7
pos	907.0	555.0	507.0	119.0	9.0	3.0	2.0
pos_w	902.0	556.0	506.0	122.0	11.0	3.0	2.0
all	686.0	653.0	613.0	145.0	3.0	2.0	0.0
knn	539.0	487.0	462.0	347.0	267.0	0.0	0.0

Topology lab, part 3¶

Functional analysis¶

Conclusion¶

	Graph	Comm	Term	Adjusted P-value
0	pos	c2	Cytoplasmic Translation (GO:0002181)	16.752374
1	pos	c2	Translation (GO:0006412)	16.481053
2	pos	c2	Macromolecule Biosynthetic Process (GO:0009059)	16.195767
3	pos	c2	Peptide Biosynthetic Process (GO:0043043)	15.863343
4	pos	c2	Gene Expression (GO:0010467)	15.729378

	comm_term	Graphpos	Graphpos_w	all	comm
67	c2_Circulatory System Development (GO:0072359)	pos	pos_w	all	c2
68	c2_Coronavirus disease	pos	pos_w	all	c2
69	c2_Cytoplasmic Translation (GO:0002181)	pos	pos_w	all	c2
71	c2_ECM-receptor interaction	pos	pos_w	all	c2
72	c2_Focal adhesion	pos	pos_w	all	c2