import sys import os sys.path.append(os.path.abspath('../../stratipy_cluster')) from stratipy import load_data, formatting_data # GO Enrich import pandas as pd import goenrich import goenrich.tools as tools import numpy as np import scipy.stats as st from tqdm import trange, tqdm def ratio(x, y): try: return x/y except ZeroDivisionError: return 0 def get_enrichment_p_count(O, interesting, min_category_size, max_category_size, max_category_depth): query = set(interesting['GeneID']) options = {'min_category_size': min_category_size, 'max_category_size': max_category_size, 'max_category_depth': max_category_depth} enrichment = goenrich.enrich.analyze(O, query, 'reference', **options).dropna().sort_values(by='q') count_bio = enrichment[ enrichment['namespace'] == 'biological_process'].shape[0] count_mol = enrichment[ enrichment['namespace'] == 'molecular_function'].shape[0] count_cel = enrichment[ enrichment['namespace'] == 'cellular_component'].shape[0] enrichment = enrichment[enrichment['p'] < 0.05] p_count_bio = enrichment[ enrichment['namespace'] == 'biological_process'].shape[0] p_count_mol = enrichment[ enrichment['namespace'] == 'molecular_function'].shape[0] p_count_cel = enrichment[ enrichment['namespace'] == 'cellular_component'].shape[0] ratio_bio = ratio(p_count_bio, count_bio) ratio_mol = ratio(p_count_mol, count_mol) ratio_cel = ratio(p_count_cel, count_cel) enrichment['LOD'] = (enrichment.x / enrichment.n) / \ (enrichment.N / enrichment.M - enrichment.N) enrichment['Z'] = st.norm.ppf(enrichment.p) enrichment['Score'] = -np.log(enrichment.p) df_enrich = enrichment[['x', 'n', 'namespace', 'name', 'p', 'Score', 'Z', 'LOD']] return (df_enrich, p_count_bio, p_count_mol, p_count_cel, ratio_bio, ratio_mol, ratio_cel) def merge_p_count(data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data, n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O, min_category_size, max_category_size, max_category_depth): p_count_bio_list = [] p_count_mol_list = [] p_count_cel_list = [] ratio_bio_list = [] ratio_mol_list = [] ratio_cel_list = [] for k in range(n_components): cluster = k+1 file_directory = (data_folder + 'text/clusters_EntrezGene/{}_{}_{}_{}/k={}/'. format(ssc_mutation_data, ssc_subgroups, gene_data, ppi_data, n_components)) text_file = file_directory + ( '{}_{}_ngh={}_permut={}_lambd={}' .format(mut_type, alpha, ngh_max, n_permutations, lambd)) cluster_file = text_file + '_cluster={}.txt'.format(cluster) interesting = pd.read_table(cluster_file, names=['GeneID']) (df_enrich, p_count_bio, p_count_mol, p_count_cel, ratio_bio, ratio_mol, ratio_cel) = get_enrichment_p_count( O, interesting, min_category_size, max_category_size, max_category_depth) p_count_bio_list.append(p_count_bio) p_count_mol_list.append(p_count_mol) p_count_cel_list.append(p_count_cel) ratio_bio_list.append(ratio_bio) ratio_mol_list.append(ratio_mol) ratio_cel_list.append(ratio_cel) # print("---") # print("bio ", sum(p_count_bio_list), max(ratio_bio_list) ) # print("mol ", sum(p_count_mol_list), max(ratio_mol_list) ) # print("cel ", sum(p_count_cel_list), max(ratio_cel_list) ) return(df_enrich, sum(p_count_bio_list), max(ratio_bio_list), sum(p_count_mol_list), max(ratio_mol_list), sum(p_count_cel_list), max(ratio_cel_list)) def p_df_by_subgroup(data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data, n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O, min_category_size, max_category_size, max_category_depth): k = [] p_bio_list = [] p_mol_list = [] p_cel_list = [] r_bio_list = [] r_mol_list = [] r_cel_list = [] df_enrich_list =[] for n_components in tqdm(range(2, 51), desc='k'): df_enrich, p_bio, r_bio, p_mol, r_mol, p_cel, r_cel = merge_p_count( data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data, n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O, min_category_size, max_category_size, max_category_depth) df_enrich['ind_group'] = ssc_subgroups df_enrich['k'] = n_components df_enrich_list.append(df_enrich) k.append(n_components) p_bio_list.append(p_bio) p_mol_list.append(p_mol) p_cel_list.append(p_cel) r_bio_list.append(r_bio) r_mol_list.append(r_mol) r_cel_list.append(r_cel) df_enrich_merged = pd.concat(df_enrich_list) df_pcount = pd.DataFrame({'k': k, 'p_bio': p_bio_list, 'r_bio': r_bio_list, 'p_mol': p_mol_list, 'r_mol': r_mol_list, 'p_cel': p_cel_list, 'r_cel': r_cel_list}) return df_enrich_merged, df_pcount def merge_2_subgroups(data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data, n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O, gene2go, min_category_size, max_category_size, max_category_depth): ################ SSC 1 ################ ssc_subgroups = 'SSC1' print(ssc_subgroups) mutation_profile, gene_id_patient, patient_id = ( load_data.load_specific_SSC_mutation_profile( data_folder, ssc_mutation_data, ssc_subgroups, gene_data)) gene_id_ppi, network = load_data.load_PPI_network(data_folder, ppi_data) network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only = ( formatting_data.classify_gene_index( network, mutation_profile, gene_id_ppi, gene_id_patient)) # idx_mut : List of common genes' indexes in individuals' mutation profiles. reference = pd.DataFrame({'GeneID': [gene_id_patient[i] for i in idx_mut]}) background = tools.generate_background( gene2go, reference, 'GO_ID', 'GeneID') goenrich.enrich.propagate(O, background, 'reference') df_enrich1, df_pcount1 = p_df_by_subgroup( data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data, n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O, min_category_size, max_category_size, max_category_depth) ################ SSC 2 ################ ssc_subgroups = 'SSC2' print(ssc_subgroups) mutation_profile, gene_id_patient, patient_id = ( load_data.load_specific_SSC_mutation_profile( data_folder, ssc_mutation_data, ssc_subgroups, gene_data)) gene_id_ppi, network = load_data.load_PPI_network(data_folder, ppi_data) network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only = ( formatting_data.classify_gene_index( network, mutation_profile, gene_id_ppi, gene_id_patient)) # idx_mut : List of common genes' indexes in individuals' mutation profiles. reference = pd.DataFrame({'GeneID': [gene_id_patient[i] for i in idx_mut]}) background = tools.generate_background( gene2go, reference, 'GO_ID', 'GeneID') goenrich.enrich.propagate(O, background, 'reference') df_enrich2, df_pcount2 = p_df_by_subgroup( data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data, n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O, min_category_size, max_category_size, max_category_depth) ################ merge ################ df_enrich = pd.concat([df_enrich1, df_enrich2], ignore_index=True) # change col order cols = list(df_enrich) cols.insert(0, cols.pop(cols.index('k'))) cols.insert(0, cols.pop(cols.index('ind_group'))) df_enrich = df_enrich.loc[:, cols] df_pcount = df_pcount1.merge(df_pcount2, how='inner', left_on='k', right_on='k', suffixes=[1, 2]) directory = ( data_folder + 'result_biostat_genes_GOslim/' + ssc_mutation_data + '_' + gene_data + '/') os.makedirs(directory, exist_ok=True) file_name_enrich = 'GOslim_{}_lambd={}_{}.pkl'.format( mut_type, lambd, ppi_data) file_name_pcount = 'p_count_{}_lambd={}_{}.pkl'.format( mut_type, lambd, ppi_data) # save df_enrich.to_pickle('{}{}'.format(directory, file_name_enrich)) df_pcount.to_pickle('{}{}'.format(directory, file_name_pcount)) def biostat_go_enrichment( alpha, result_folder, mut_type, patient_data, data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data, lambd, n_components, ngh_max, n_permutations): O = goenrich.obo.ontology(data_folder + 'goslim_generic.obo') gene2go = goenrich.read.gene2go(data_folder + 'gene2go.gz') min_category_size = 2 # max_category_size = 10000 max_category_size = 500 max_category_depth = 100000 merge_2_subgroups( data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data, n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O, gene2go, min_category_size, max_category_size, max_category_depth)