import sys import scipy.sparse as sp import numpy as np from stratipy.nbs_class import Ppi import warnings def check_sparsity(X): if not sp.issparse(X): X = sp.csc_matrix(X) return X def check_shape_matching(X, L, array_name, list_name): if X.shape[0] != len(L): raise Exception("Numbers in {} shape ({}) and in {} ({}) don't match " .format(array_name, X.shape, list_name, len(L))) #TODO check ID order in list and network def check_PPI_data(network, gene_id_ppi): if network.shape[0] != network.shape[1]: raise Exception("Network data format is not a square matrix") def check_patient_data(mutation_profile, gene_id_patient): if mutation_profile.shape[1] != len(gene_id_patient): raise Exception("Column number of mutation profile does not correspond to patient's gene number") #NOTE mutation_profile.shape[0] != len(patient_id) not verified because len(patient_id) could be bigger than mutation_profile.shape[0]. It depends on how many patients are annotated phenotypically. # @profile def classify_gene_index(network, mutation_profile, gene_id_ppi, gene_id_patient): """Gene index classification Compares genes' ID between PPI and mutaton profile in order to classify them by indexes. Parameters ---------- network : sparse matrix PPI data matrix, called also 'adjacency matrix'. gene_id_ppi : list List of Entrez Gene ID in PPI. The ID must be in the same order as in network. gene_id_patient : list List of Entrez Gene ID in patients' mutation profiles. The ID must be in the same order as in mutation profiles. Returns ------- idx_ppi : list List of common genes' indexes in PPI. idx_mut : list List of common genes' indexes in patients' mutation profiles. idx_ppi_only : list List of genes' indexes only in PPI. idx_mut_only : list List of genes' indexes only in patients' mutation profiles. """ # check step check_shape_matching(network, gene_id_ppi, 'PPI network matrix', 'List of Entrez Gene ID in PPI') check_PPI_data(network, gene_id_ppi) check_patient_data(mutation_profile, gene_id_patient) idx_ppi = [] idx_mut = [] idx_mut_only = [] for j, g in enumerate(gene_id_patient): try: i = gene_id_ppi.index(g) idx_ppi.append(i) idx_mut.append(j) except: i = np.nan idx_mut_only.append(j) network = check_sparsity(network) mutation_profile = check_sparsity(mutation_profile) idx_ppi_only = [g for g in range(network.shape[1]) if not(g in idx_ppi)] # print('---network ', type(network), network.dtype) # print('mutation_profile', mutation_profile.dtype) return network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only # @profile def all_genes_in_submatrices(network, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only, mutation_profile): """Processing of sub-matrices for each case of genes Extract the sub-matrices of references genes and Zero-padding of the adjacency matrix. ┌ - - - - - - - ┐ | | | | | AA |AB| AC | | | | | |- - - - - - - -| | BA |BB| BC | |- - - - - - - -| | CA |CB| CC | └ - - - - - - - ┘ AA, BB and CC are all adjacency matrces (0/1 matrices with 0 on diagonal) : AA : common genes between PPI and patients' mutation profiles BB : genes founded only in PPI CC : genes founded only in patients' mutation profiles = zero matrix Parameters ---------- network : sparse matrix, shape (len(gene_id_ppi),len(gene_id_ppi)) PPI data matrix, called also 'adjacency matrix'. idx_ppi : list List of common genes' indexes in PPI. idx_ppi_only : list List of genes' indexes only in PPI. idx_mut_only : list List of genes' indexes only in patients' mutation profiles. Returns ------- ppi_total : sparse matrix Built from all sparse sub-matrices (AA, ... , CC). mut_total : sparse matrix Patients' mutation profiles of all genes (rows: patients, columns: genes of AA, BB and CC). ppi_filt : sparse matrix Filtration from ppi_total : only genes in PPI are considered. ┌ - - - - -┐ | | | | AA |AB| | | | |- - - - - | | BA |BB| └ - - - - -┘ mut_filt : sparse matrix Filtration from mut_total : only genes in PPI are considered. """ AA = network[idx_ppi][:, idx_ppi] if AA.shape[0] == 0: warnings.warn("There are no common genes between PPI network and patients' mutation profile") AB = network[idx_ppi][:, idx_ppi_only] AC = sp.csc_matrix((len(idx_ppi), len(idx_mut_only))).astype(np.float32) BA = network[idx_ppi_only][:, idx_ppi] BB = network[idx_ppi_only][:, idx_ppi_only] BC = sp.csc_matrix((len(idx_ppi_only), len(idx_mut_only))).astype(np.float32) # TODO condition: if mutOnly = 0 CA = sp.csc_matrix((len(idx_mut_only), len(idx_ppi)), dtype=np.float32) CB = sp.csc_matrix((len(idx_mut_only), len(idx_ppi_only)), dtype=np.float32) CC = sp.csc_matrix((len(idx_mut_only), len(idx_mut_only)), dtype=np.float32) ppi_total = sp.bmat([[AA, AB, AC], [BA, BB, BC], [CA, CB, CC]], format='csc') # NOTE ppi_total in COO matrix -> csc matrix mut_total = sp.bmat([[mutation_profile[:, idx_mut], sp.csc_matrix((mutation_profile.shape[0], len(idx_ppi_only)), dtype=np.float32), mutation_profile[:, idx_mut_only]]], format='csc') # filter only genes in PPI degree = Ppi(ppi_total).deg ppi_filt = ppi_total[degree > 0, :][:, degree > 0] mut_filt = mut_total[:, degree > 0] return ppi_total, mut_total, ppi_filt, mut_filt # TODO for numpy docring: Raises (errors), Note, Examples # errors : sparse (ppi, patients), 0 on diagonal (ppi) def formatting(network, mutation_profile, gene_id_ppi, gene_id_patient): (network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only) = (classify_gene_index( network, mutation_profile, gene_id_ppi, gene_id_patient)) (ppi_total, mut_total, ppi_filt, mut_filt) = (all_genes_in_submatrices( network, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only, mutation_profile)) return idx_ppi, idx_ppi_only, ppi_total, mut_total, ppi_filt