import sys import numpy as np import scipy.sparse as sp import pandas as pd from numpy.linalg import norm from scipy.io import loadmat, savemat from stratipy.nbs_class import Ppi, Patient from subprocess import call import os import glob import time import datetime # NOTE mutationProfileDiffusion -> propagation # mutationProfile -> M, PPIAdjacencyMatrix -> adj, dataFolder -> result_folder # PPI_influence_min -> ppi_influence_min, PPI_influence_max-> ppi_influence_max # PPI_influence()-> calcul_ppi_influence(), PPI_influence -> ppi_influence # influenceDistance->influence_distance # influenceMat -> ppi_influence, PPIneighboorsMax -> ngh_max, # bestInfluencers -> best_influencers # filteredGenes -> deg0, keepSingletons -> keep_singletons # mutationsMin -> min_mutation, mutationsMax -> mutationsMax # newnet -> ppi_ngh, netFinal -> ppi_final, mutFinal -> mut_final # filteredPatients -> filtered_patients # @profile def propagation(M, adj, alpha, tol=1e-6): # TODO equation, M, alpha """Network propagation iterative process Iterative algorithm for apply propagation using random walk on a network: Initialize:: X1 = M Repeat:: X2 = alpha * X1.A + (1-alpha) * M X1 = X2 Until:: norm(X2-X1) < tol Where:: A : degree-normalized adjacency matrix Parameters ---------- M : sparse matrix Data matrix to be diffused. adj : sparse matrix Adjacency matrice. alpha : float Diffusion/propagation factor with 0 <= alpha <= 1. For alpha = 0 : no diffusion. For alpha = 1 : tol : float, default: 1e-6 Convergence threshold. Returns ------- X2 : sparse matrix Smoothed matrix. """ n = adj.shape[0] # diagonal = 1 -> degree A = adj.toarray() np.fill_diagonal(A, 1) # set diagonal elements to 1 A = A / A.sum(axis=0) X1 = M.astype(np.float32).toarray() X2 = alpha * X1.dot(A) + (1-alpha) * M if tol: i = 0 while norm(X2-X1) > tol: X1 = X2 X2 = alpha * X1.dot(A) + (1-alpha) * M i += 1 print(' Propagation iteration = {} ----- {}'.format( i, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")), flush=True) return sp.csc_matrix(X2) # @profile def compare_ij_ji(ppi, out_min=True, out_max=True): """Helper function for calcul_ppi_influence In most cases the influence (propagation) is not symmetric. We have to compare weight (a_ij) and (a_ji) for all pairs in order to obtain symmetric matrix/matrices. 2 choices available: minimum or maximum weight. a = min [(a_ij),(a_ji)] a = max [(a_ij),(a_ji)] Minimum weight is chosen to avoid Hubs phenomenon. Parameters ---------- ppi : sparse matrix Matrice to apply comparison. out_min, out_max : boolean, default: True Minimum and/or maximum weight is chosen. Returns ------- ppi_min, ppi_max : sparse matrix Symmertric matrix with minimum and/or maximum weight. Remarks ------- This implements the algorithm described in Vandin, Upfal, Raphael: Algorithms for Detecting Significantly Mutated Pathways in Cancer - Journal of Computational Biology, 2010, doi: 10.1089/cmb.2010.0265 """ # TODO matrice type of ppi n = ppi.shape[0] ppi = ppi.tolil() # need "lil_matrix" for reshape # transpose to compare ppi(ij) and ppi(ji) ppi_transp = sp.lil_matrix.transpose(ppi) # reshape to 1D matrix ppi_1d = ppi.reshape((1, n**2)) ppi_1d_transp = ppi_transp.reshape((1, n**2)) # reshapeto original size matrix after comparison (min/max) if out_min and out_max: ppi_min = (sp.coo_matrix.tolil( sp.coo_matrix.min(sp.vstack([ppi_1d, ppi_1d_transp]), axis=0)) ).reshape((n, n)).astype(np.float32) ppi_max = (sp.coo_matrix.tolil( sp.coo_matrix.max(sp.vstack([ppi_1d, ppi_1d_transp]), axis=0)) ).reshape((n, n)).astype(np.float32) return ppi_min, ppi_max elif out_min: ppi_min = (sp.coo_matrix.tolil( sp.coo_matrix.min(sp.vstack([ppi_1d, ppi_1d_transp]), axis=0, dtype=np.float32))).reshape((n, n)) return ppi_min elif out_max: ppi_max = (sp.coo_matrix.tolil( sp.coo_matrix.max(sp.vstack([ppi_1d, ppi_1d_transp]), axis=0, dtype=np.float32))).reshape((n, n)) return ppi_max else: print('You have to choice Min or Max') # TODO change error message # @profile def calcul_final_influence(M, adj, result_folder, alpha, influence_weight='min', simplification=True, compute=False, overwrite=False, tol=10e-6): """Compute network influence score Network propagation iterative process is applied on PPI. (1) The network influence distance matrix and (2) influence matrices based on minimum / maximum weight are saved as MATLAB-style files (.mat). - (1) : 'influence_distance_alpha={}_tol={}.mat' in 'influence_distance' directory - (2) : 'ppi_influence_alpha={}_tol={}.mat' in 'ppi_influence' directory Where {} are parameter values. The directories will be automatically created if not exist. If compute=False, the latest data of directory will be taken into account: - latest data with same parameters (alpha and tol) - if not exist, latest data of directory but with differents parameters Parameters ---------- M : sparse matrix Data matrix to be diffused. adj : sparse matrix Adjacency matrice. result_folder : str Path to create a new directory for save new files. If you want to creat in current directory, enter '/directory_name'. Absolute path is also supported. influence_weight : simplification : boolean, default: True compute : boolean, default: False If True, new network influence score will be computed. If False, the latest network influence score will be taken into account. overwrite : boolean, default: False If True, new network influence score will be computed even if the file which same parameters already exists in the directory. alpha : float Diffusion (propagation) factor with 0 <= alpha <= 1. For alpha = 0 : no diffusion. For alpha = 1 : tol : float, default: 10e-6 Convergence threshold. Returns ------- final_influence : sparse matrix Smoothed PPI influence matrices based on minimum / maximum weight. """ influence_distance_directory = result_folder + 'influence_distance/' influence_distance_file = ( influence_distance_directory + 'influence_distance_PPI_alpha={}_tol={}.mat'.format(alpha, tol)) ####### final_influence_directory = result_folder + 'final_influence/' final_influence_file = ( final_influence_directory + 'final_influence_PPI_simp={}_alpha={}_tol={}.mat'.format( simplification, alpha, tol)) ####### existance_same_param = os.path.exists(final_influence_file) # TODO overwrite condition # check if same parameters file exists in directory if existance_same_param: final_influence_data = loadmat(final_influence_file) if influence_weight == 'min': final_influence = final_influence_data['final_influence_min'] else: final_influence = final_influence_data['final_influence_max'] # print('final influence matrix', type(final_influence), final_influence.shape) print(' **** Same parameters file of FINAL INFLUENCE already exists') else: if compute: # check if influence distance file exists existance_same_influence = os.path.exists(influence_distance_file) if existance_same_influence: influence_data = loadmat(influence_distance_file) influence = influence_data['influence_distance'] print(' **** Same parameters file of INFLUENCE DISTANCE on PPI network already exists') else: print(' ==== Diffusion over PPI network (it can take approximately 10 min) ==== ', flush=True) influence = propagation(M, adj, alpha, tol) # save influence distance before simplification with parameters' values in filename os.makedirs(influence_distance_directory, exist_ok=True) # NOTE For Python ≥ 3.2 print(' Start to save INFLUENCE DISTANCE (before filtering) ----- {}' .format( datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") ), flush=True) savemat(influence_distance_file, {'influence_distance': influence, 'alpha': alpha}, do_compression=True) # simplification: multiply by PPI adjacency matrix if simplification: influence = influence.multiply(sp.lil_matrix(adj)) # -> influence as csr_matrix else: print("---------- No simplification ----------") pass final_influence_min, final_influence_max = compare_ij_ji( influence, out_min=True, out_max=True) # save final influence with parameters' values in filename os.makedirs(final_influence_directory, exist_ok=True) print(' Start to save FINAL INFLUENCE (after filtering) ----- {}' .format( datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")), flush=True) savemat(final_influence_file, {'final_influence_min': final_influence_min, 'final_influence_max': final_influence_max, 'alpha': alpha}, do_compression=True) if influence_weight == 'min': final_influence = final_influence_min else: final_influence = final_influence_max # take most recent file else: for x in final_influence_file, influence_distance_directory: print(x) newest_file = max(glob.iglob(x + '*.mat'), key=os.path.getctime) final_influence_data = loadmat(newest_file) if x == final_influence_directory: if influence_weight == 'min': final_influence = final_influence_data['final_influence_min'] else: final_influence = final_influence_data['final_influence_max'] return final_influence # @profile def best_neighboors(ppi_filt, final_influence, ngh_max): """Helper function for filter_ppi_patients Keeps only the connections with the best influencers. Parameters ---------- ppi_filt : sparse matrix Filtration from ppi_total : only genes in PPI are considered. final_influence : Smoothed PPI influence matrices based on minimum or maximum weight. ngh_max : int Number of best influencers in PPI. Returns ------- ppi_ngh : sparse matrix PPI with only best influencers. """ ngh_max = ngh_max + 1 # central protein included final_influence = final_influence.todense() ppi_filt = ppi_filt.todense() ppi_ngh = np.zeros(ppi_filt.shape, dtype=np.float32) for i in range(ppi_filt.shape[0]): best_influencers = np.argpartition(-final_influence[i, :], ngh_max) best_influencers = np.asarray(best_influencers).squeeze()[:ngh_max] #NOTE different result if same value exists several times ppi_ngh[i, best_influencers] = ppi_filt[i, best_influencers] ppi_ngh = np.max(np.dstack((ppi_ngh, ppi_ngh.T)), axis=2) # too stringent if np.min return sp.csc_matrix(ppi_ngh) # @profile def filter_ppi_patients(result_folder, influence_weight, simplification, alpha, tol, ppi_total, mut_total, ppi_filt, final_influence, ngh_max, keep_singletons=False, min_mutation=0, max_mutation=2000): """Keeping only the connections with the best influencers and Filtering some patients based on mutation number 'the 11 most influential neighbors of each gene in the network as determined by network influence distance were used' 'Only mutation data generated using the Illumina GAIIx platform were retained for subsequent analy- sis, and patients with fewer than 10 mutations were discarded.' Parameters ---------- ppi_total : sparse matrix Built from all sparse sub-matrices (AA, ... , CC). mut_total : sparse matrix Patients' mutation profiles of all genes (rows: patients, columns: genes of AA, BB and CC). ppi_filt : sparse matrix Filtration from ppi_total : only genes in PPI are considered. final_influence : Smoothed PPI influence matrices based on minimum or maximum weight. ngh_max : int Number of best influencers in PPI. keep_singletons : boolean, default: False If True, proteins not annotated in PPI (genes founded only in patients' mutation profiles) will be also considered. If False, only annotated proteins in PPI will be considered. min_mutation, max_mutation : int Numbers of lowest mutations and highest mutations per patient. Returns ------- ppi_final, mut_final : sparse matrix PPI and mutation profiles after filtering. """ ppi_final_directory = result_folder + 'final_influence/' ppi_final_file = ( ppi_final_directory + 'PPI_final_weight={}_simp={}_alpha={}_tol={}_singletons={}_ngh={}.mat' .format(influence_weight, simplification, alpha, tol, keep_singletons, ngh_max)) existance_same_param = os.path.exists(ppi_final_file) if existance_same_param: ppi_final_data = loadmat(ppi_final_file) ppi_final = ppi_final_data['ppi_final'] print(' **** Same parameters file of PPI FINAL already exists') if keep_singletons: mut_final = mut_total else: mut_final = mut_total[:, Ppi(ppi_total).deg > 0] else: ppi_ngh = best_neighboors(ppi_filt, final_influence, ngh_max) deg0 = Ppi(ppi_total).deg == 0 # True if protein degree = 0 if keep_singletons: ppi_final = sp.bmat([ [ppi_ngh, sp.csc_matrix((ppi_ngh.shape[0], sum(deg0)))], [sp.csc_matrix((sum(deg0), ppi_ngh.shape[0])), sp.csc_matrix((sum(deg0), sum(deg0)))] ]) # -> COO matrix # mut_final=sp.bmat([[mut_total[:,deg0==False],mut_total[:,deg0==True]]]) mut_final = mut_total else: ppi_final = ppi_ngh mut_final = mut_total[:, Ppi(ppi_total).deg > 0] savemat(ppi_final_file, {'ppi_final': ppi_final}, do_compression=True) # to avoid worse comparison '== False' mut_final = mut_final[np.array([min_mutation <= k <= max_mutation for k in Patient(mut_final).mut_per_patient])] print(" Removing %i patients with less than %i or more than %i mutations" % (mut_total.shape[0]-mut_final.shape[0], min_mutation, max_mutation), flush=True) return ppi_final, mut_final def quantile_norm_mean(df): """Helper function for propagation_profile Forces the observations/variables to have identical intensity distribution. Parameters ---------- Returns ------- """ if not isinstance(df, pd.DataFrame): df = pd.DataFrame(df) rank_mean = df.stack().groupby(df.rank(method='first').stack().astype(int)).mean() df_norm = df.rank(method='min').stack().astype(int).map(rank_mean).unstack() return df_norm.values def quantile_norm_median(df): """Helper function for propagation_profile Forces the observations/variables to have identical intensity distribution. Parameters ---------- Returns ------- """ if not isinstance(df, pd.DataFrame): df = pd.DataFrame(df) rank_median = df.stack().groupby(df.rank(method='first').stack().astype(int)).median() df_norm = df.rank(method='min').stack().astype(int).map(rank_median).unstack() return df_norm.values def propagation_profile(mut_raw, adj, result_folder, alpha, tol, mut_type): # TODO error messages final_influence_mutation_directory = result_folder + 'final_influence/' final_influence_mutation_file = ( final_influence_mutation_directory + 'final_influence_mutation_profile_{}_alpha={}_tol={}.mat'.format( mut_type, alpha, tol)) existance_same_param = os.path.exists(final_influence_mutation_file) if existance_same_param: final_influence_data = loadmat(final_influence_mutation_file) mut_propag = final_influence_data['mut_propag'] print(' **** Same parameters file of FINAL INFLUENCE on Mutation Profile already exists') else: if mut_type == 'raw': mut_propag = mut_raw.todense() else: print(' ==== Diffusion over mutation profile ==== ', flush=True) mut_propag = propagation(mut_raw, adj, alpha, tol).todense() mut_propag[np.isnan(mut_propag)] = 0 if mut_type == 'mean_qn': mut_propag = quantile_norm_mean(mut_propag) elif mut_type == 'median_qn': mut_propag = quantile_norm_median(mut_propag) savemat(final_influence_mutation_file, {'mut_propag': mut_propag, 'alpha': alpha}, do_compression=True) return mut_propag def filtering(ppi_filt, result_folder, influence_weight, simplification, compute, overwrite, alpha, tol, ppi_total, mut_total, ngh_max, keep_singletons, min_mutation, max_mutation, mut_type): final_influence = (calcul_final_influence( sp.eye(ppi_filt.shape[0], dtype=np.float32), ppi_filt, result_folder, alpha, influence_weight, simplification, compute, overwrite, tol)) ppi_final, mut_final = filter_ppi_patients( result_folder, influence_weight, simplification, alpha, tol, ppi_total, mut_total, ppi_filt, final_influence, ngh_max, keep_singletons, min_mutation, max_mutation) mut_propag = propagation_profile( mut_final, ppi_filt, result_folder, alpha, tol, mut_type) return ppi_final, mut_propag