import sys import os from scipy.io import loadmat, savemat import scipy.sparse as sp import numpy as np import pandas as pd from numpy import genfromtxt from stratipy.nbs_class import Ppi from tqdm import trange import glob from statistics import median # NOTE some variable names changed: # dataFolder -> data_folder # net -> network, ids -> gene_id_ppi, # mutations -> mutation_profile, genes -> gene_id_patient # geneSymbol_profile -> gene_symbol_profile # subnet -> idx_ppi, good -> idx_mut, subnetNotmutated ->idx_ppi_only, # bad -> idx_mut_only # nnnet -> ppi_total, nnmut -> mut_total # nnnetFiltered -> ppi_filt, nnmut-> mut_filt # @profile def load_TCGA_UCEC_patient_data(data_folder): # TODO patients' ID, phenotypes in dictionary of dictionary or ...? print(" ==== TCGA patients' ID ") phenotypes = loadmat(data_folder+'UCEC_clinical_phenotype.mat') patient_id = [c[0][0] for c in phenotypes['UCECppheno'][0][0][0]] # mutation profiles print(' ==== TCGA mutation profiles ') somatic = loadmat(data_folder+'somatic_data_UCEC.mat') mutation_profile = sp.csc_matrix(somatic['gene_indiv_mat'] .astype(np.float32)) # Entrez gene ID and gene symbols in mutation profiles print(' ==== TCGA Entrez gene ID and gene symbols in mutation profiles ') gene_id_patient = [x[0] for x in somatic['gene_id_all']] gene_symbol_profile = [x[0][0] for x in somatic['gene_id_symbol']] # dictionnary = key:entrez gene ID, value:symbol # mutation_id_symb = dict(zip(gene_id_patient, gene_symbol_profile)) # print('mutation_profile', mutation_profile.dtype) return patient_id, mutation_profile, gene_id_patient, gene_symbol_profile # @profile def load_Faroe_Islands_data(data_folder): # TODO patients' ID, phenotypes in dictionary of dictionary or ...? print(" ==== Faroe Islands data ") df = pd.read_csv(data_folder + "Faroe_LGD_10percents_binary.txt", sep="\t") subjects = df.columns[1:] # http://www.genenames.org/cgi-bin/download?col=gd_app_sym&col=md_eg_id&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit hgnc = pd.read_csv(data_folder + "hgnc_2016-10-17.tsv", sep="\t") hgnc.rename(columns={'Approved Symbol': 'gene', 'Entrez Gene ID(supplied by NCBI)': 'EntrezID'}, inplace=True) hgnc = hgnc.loc[~hgnc.loc[:, 'gene'].str.contains('withdrawn')] mutations = df.merge(hgnc, on='gene', how='outer') mutations = mutations.loc[np.isfinite(mutations.EntrezID)] # mutations.loc[:, subjects] = mutations.loc[:, subjects].fillna(0) mutations = mutations.dropna() mutation_profile = sp.csc_matrix((mutations.loc[:, subjects].values.T) .astype(np.float32)) mutations.EntrezID = mutations.EntrezID.astype(int) gene_id_patient = mutations.EntrezID.tolist() return mutation_profile, gene_id_patient def get_indiv_list(indiv_from_df): # create individual ID list alist = [] for i in indiv_from_df: for j in i: alist.append(j) # set and keep unique ID (sort) alist = sorted(set(alist)) return alist def mutation_profile_coordinate_score(df, indiv_list, indiv_type, score_list): coord_gene = [] coord_indiv = [] weight = [] for i in trange(df.shape[0], desc="mutation profile coordinates"): # for each row (each gene), we get list of individuals' ID individuals_per_gene = coordinate(df[indiv_type][i], indiv_list) # for each element of coordinate listes x/y: for j in individuals_per_gene: # gene is saved as gene's INDEX (not EntrezGene) in dataframe coord_gene.append(i) # individual is saved as his/her INDEX (not ID) in indiv_list coord_indiv.append(j) # pLI score for each point weight.append(score_list[i]) return coord_indiv, coord_gene, weight def generate_mp(df, pLI_type): gene_id = [int(i) for i in df['entrez_id'].tolist()] indiv_type = 'Admixture_individual' # Admixture European probands df[indiv_type] = df[indiv_type].apply(eval) # create individual ID list indiv = get_indiv_list(df[indiv_type]) if '[' in indiv: indiv.remove('[') if ']' in indiv: indiv.remove(']') # pLI_oe score list pLI_score = df[pLI_type].tolist() # calculate coordinates genes x individuals -> sparse matrix coord_indiv, coord_gene, weight = mutation_profile_coordinate_score( df, indiv, indiv_type, pLI_score) mutation_profile = sp.coo_matrix((weight, (coord_indiv, coord_gene)), shape=(len(indiv), len(gene_id)), dtype=np.float32).tocsr() return mutation_profile, gene_id, indiv def merge_lof_mis(df_lof, df_mis, pLI_lof_name, pLI_mis_name): mp_lof, gene_lof, indiv_lof = generate_mp(df_lof, pLI_lof_name) mp_mis, gene_mis, indiv_mis = generate_mp(df_mis, pLI_mis_name) df_lof = pd.DataFrame(mp_lof.toarray(), columns=gene_lof, index=indiv_lof) df_mis = pd.DataFrame(mp_mis.toarray(), columns=gene_mis, index=indiv_mis) # merge 2 dataframes and take high value if needed df = pd.concat([df_lof, df_mis]).groupby(level=0).max() # fill NA by 0 df = df.fillna(0) # dataframe to scipy sparse matrix mutation_profile = sp.csr_matrix(df.values) indiv = df.index.values.tolist() gene_id = df.columns.tolist() return mutation_profile, gene_id, indiv def load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data): print(' ==== load overall_{}_mutation_profile'.format(ssc_mutation_data)) overall_mutation_profile_file = ( data_folder + "{}_overall_mutation_profile.mat".format(ssc_mutation_data)) existance_file = os.path.exists(overall_mutation_profile_file) if existance_file: print('***** overall_{}_mutation_profile file already exists *****' .format(ssc_mutation_data)) loadfile = loadmat(overall_mutation_profile_file) mutation_profile = loadfile['mutation_profile'] gene_id = (loadfile['gene_id'].flatten()).tolist() indiv = (loadfile['indiv'].flatten()).tolist() else: print('overall_{}_mutation_profile file is calculating.....' .format(ssc_mutation_data)) df_lof = pd.read_csv( data_folder + 'SSC_MAF1_LoF_gene_filtered_pLI_oe.csv', sep=';') if 'mis15' in ssc_mutation_data: df_mis = pd.read_csv( data_folder + 'SSC_MAF1_mis15_gene_filtered_pLI_oe.csv', sep=';') elif 'mis30' in ssc_mutation_data: df_mis = pd.read_csv( data_folder + 'SSC_MAF1_mis30_gene_filtered_pLI_oe.csv', sep=';') mutation_profile, gene_id, indiv = merge_lof_mis( df_lof, df_mis, 'oe_lof_rescaled', 'oe_mis_rescaled') savemat(overall_mutation_profile_file, {'mutation_profile': mutation_profile, 'gene_id': gene_id, 'indiv': indiv}, do_compression=True) return mutation_profile, gene_id, indiv def load_specific_SSC_mutation_profile(data_folder, ssc_mutation_data, ssc_subgroups, gene_data): print(' ==== load_{}_{}_{}_mutation_profile' .format(ssc_mutation_data, ssc_subgroups, gene_data)) mutation_profile_file = (data_folder + "{}_{}_{}_mutation_profile.mat" .format(ssc_mutation_data, ssc_subgroups, gene_data)) existance_file = os.path.exists(mutation_profile_file) if existance_file: print('***** {}_{}_{}_mutation_profile file already exists *****' .format(ssc_mutation_data, ssc_subgroups, gene_data)) loadfile = loadmat(mutation_profile_file) mutation_profile = loadfile['mutation_profile'] gene_id = (loadfile['gene_id'].flatten()).tolist() indiv = (loadfile['indiv'].flatten()).tolist() else: mutation_profile, gene_id, indiv = ( load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data)) print("SSC overall mutation profile matrix\n shape: {}\n stored elements: {}".format(mutation_profile.shape, mutation_profile.nnz)) if ssc_subgroups != "SSC": print('{}_{}_mutation_profile file is calculating.....'.format( ssc_mutation_data, ssc_subgroups)) if (ssc_subgroups == 'SSC1') or (ssc_subgroups == 'SSC2'): print('{}_{}_mutation_profile file is calculating.....'.format( ssc_mutation_data, ssc_subgroups)) df_ssc = pd.read_csv(data_folder + '{}_phenotype.csv' .format(ssc_subgroups), sep='\t') ind_ssc_raw = df_ssc.individual.tolist() # ind_ssc_raw = df_ssc.individual.apply(eval).tolist() # 'SSC_all', 'SSC_male', 'SSC_female' else: if ssc_subgroups == 'SSC_all': indiv_file = data_folder + 'admixture_1175_all.txt' elif ssc_subgroups == 'SSC_male': indiv_file = data_folder + 'admixture_1175_male.txt' elif ssc_subgroups == 'SSC_female': indiv_file = data_folder + 'admixture_1175_female.txt' with open(indiv_file) as f: ind_ssc_raw = [line.rstrip('\n') for line in f] # looking for corresponding individual ID in overall data (indiv) # then append their index in a list (ind_ssc) ind_ssc = [] [ind_ssc.append(indiv.index(i)) for i in ind_ssc_raw if i in indiv] print("After filtering by {} mutation in {} data: {} to {} individuals ({} removed from metadata)" .format(ssc_mutation_data, ssc_subgroups, len(ind_ssc_raw), len(ind_ssc), len(ind_ssc_raw) - len(ind_ssc))) indiv = [indiv[i] for i in ind_ssc] # slice overall mutation profile by SSC subgroup individuals mutation_profile = mutation_profile[ind_ssc, :] if gene_data != 'allGenes': print('genes filtering according to:', gene_data) df_gene = pd.read_csv(data_folder + 'EntrezGene_{}.csv' .format(gene_data), sep='\t') specific_gene_id = [int(i) for i in df_gene.entrez_id.tolist()] # looking for corresponding EntrezGene ID in overall data (gene_id) # then append their index in a list (gene_filtered) gene_filtered = [] [gene_filtered.append( gene_id.index(i)) for i in specific_gene_id if i in gene_id] print("After filtering by {} data: {} to {} genes ({} removed)" .format(gene_data, len(gene_id), len(gene_filtered), len(gene_id) - len(gene_filtered))) gene_id = [gene_id[i] for i in gene_filtered] # slice overall mutation profile by filtered gene mutation_profile = mutation_profile[:, gene_filtered] savemat(mutation_profile_file, {'mutation_profile': mutation_profile, 'gene_id': gene_id, 'indiv': indiv}, do_compression=True) print("Mutation profile matrix\n shape: {}\n stored elements: {}".format(mutation_profile.shape, mutation_profile.nnz)) return mutation_profile, gene_id, indiv # @profile def load_PPI(data_folder, ppi_data, load_gene_id_ppi=True): if ppi_data == 'STRING': filename = 'PPI_STRING_v10_5.mat' else: filename = 'PPI_' + ppi_data + '.mat' loadfile = loadmat(data_folder + filename) network = loadfile['adj_mat'].astype(np.float32) if load_gene_id_ppi: gene_id_ppi = (loadfile['entrez_id'].flatten()).tolist() return gene_id_ppi, network else: return network return gene_id_ppi, network # @profile def load_Hofree_PPI_String(data_folder, ppi_data): # Entrez gene ID in PPI print(' ==== load_PPI_String and gene_id_ppi') entrez_to_idmat = loadmat(data_folder+'entrez_to_idmat.mat') gene_id_ppi = [x[0][0] for x in entrez_to_idmat['entrezid'][0]] # NOTE nan values in gene_id_ppi (choice of gene ID type) network = load_PPI(data_folder, ppi_data, load_gene_id_ppi=False) # print('---network ', type(network), network.dtype) return gene_id_ppi, network # @profile def coordinate(prot_list, all_list): coo_list = [] for prot in prot_list: i = all_list.index(prot) coo_list.append(i) return coo_list def create_adjacency_matrix(prot1, prot2, PPI_file): # prot1 and prot2 are two columns including Entrez gene ID, from PPI data edge_list = np.vstack((prot1, prot2)).T gene_id_ppi = (edge_list.flatten()).tolist() gene_id_ppi = list(set(gene_id_ppi)) # From ID list to coordinate list coo1 = coordinate(prot1.tolist(), gene_id_ppi) coo2 = coordinate(prot2.tolist(), gene_id_ppi) # Adjacency matrix n = len(gene_id_ppi) weight = np.ones(len(coo1)) # if interaction -> 1 network = sp.coo_matrix((weight, (coo1, coo2)), shape=(n, n)) network = network + network.T # symmetric matrix network = network.astype(bool).astype(int) # to binary values savemat(PPI_file, {'adj_mat': network, 'entrez_id': gene_id_ppi}, do_compression=True) return gene_id_ppi, network def load_PPI_network(data_folder, ppi_data): print(' ==== load_PPI_{}'.format(ppi_data)) PPI_file = data_folder + 'PPI_' + ppi_data + '.mat' existance_file = os.path.exists(PPI_file) if existance_file: print('***** PPI_{} file already exists *****'.format(ppi_data)) gene_id_ppi, network = load_PPI( data_folder, ppi_data, load_gene_id_ppi=True) else: print('PPI_{} file is calculating.....'.format(ppi_data)) if ppi_data == "Y2H": data = genfromtxt(data_folder+'PPI_Y2H_raw.tsv', delimiter='\t', dtype=int) # List of all proteins with Entrez gene ID prot1 = data[1:, 0] prot2 = data[1:, 1] else: newest_file = max(glob.iglob(data_folder + 'PPI_APID*.csv'), key=os.path.getctime) data = pd.read_csv(newest_file, sep="\t") prot1 = data.EntrezGene_1 prot2 = data.EntrezGene_2 gene_id_ppi, network = create_adjacency_matrix(prot1, prot2, PPI_file) return gene_id_ppi, network