In [2]:
import sys
import os
sys.path.append(os.path.abspath('../../stratipy'))
from stratipy import load_data
import importlib  # NOTE for python >= Python3.4
import scipy.sparse as sp
import numpy as np
import pandas as pd
import time
from datetime import datetime
from scipy.io import loadmat, savemat

In [3]:
data_folder = '../data/'
# ppi_data = 'APID'


# Load APID PPI network data

In [3]:
APID_gene_id, APID_network = load_data.load_PPI_Y2H_or_APID(data_folder, ppi_data)

 ==== load_PPI_APID
***** PPI_APID file already exists *****
 ==== load_PPI 
 ==== load_gene_id_ppi 


In [4]:
APID_network

<14080x14080 sparse matrix of type '<class 'numpy.float32'>'
	with 227160 stored elements in Compressed Sparse Column format>

# Load STRING PPI network data
data download from: https://stringdb-static.org/download/protein.links.v10.5/9606.protein.links.v10.5.txt.gz

9606 = Homosapiens

In [4]:
string_ppi = pd.read_table(data_folder + '9606.protein.links.v10.5.txt',  delim_whitespace=True)

In [5]:
print("Raw STRING PPI number: ", string_ppi.shape[0])

Raw STRING PPI number:  11353056


In [7]:
string_ppi.head()

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000263431,260
1,9606.ENSP00000000233,9606.ENSP00000353863,164
2,9606.ENSP00000000233,9606.ENSP00000342026,159
3,9606.ENSP00000000233,9606.ENSP00000240874,194
4,9606.ENSP00000000233,9606.ENSP00000379847,164


In [6]:
# sort top 10% of combine_score
top10_string = string_ppi.nlargest(round(string_ppi.shape[0]/10), 'combined_score')

In [7]:
print("Top 10% score of STRING PPI number: ", top10_string.shape[0])

Top 10% score of STRING PPI number:  1135306


In [8]:
# remove Homosapiens ID ('9606.')
top10_string['protein1'] = top10_string['protein1'].str.replace('9606.', '')
top10_string['protein2'] = top10_string['protein2'].str.replace('9606.', '')

# rename
top10_string.rename(columns={'protein1': 'ensembl_1',
                            'protein2': 'ensembl_2'}, inplace = True)
top10_string.head()

Unnamed: 0,ensembl_1,ensembl_2,combined_score
23489,ENSP00000007516,ENSP00000252102,999
23542,ENSP00000007516,ENSP00000354961,999
23598,ENSP00000007516,ENSP00000276062,999
23723,ENSP00000007516,ENSP00000389160,999
23741,ENSP00000007516,ENSP00000266544,999


In [13]:
top10_string.shape

(1135306, 3)

In [11]:
# unique values: proteins
top10_string['ensembl_1'].nunique()

17997

In [12]:
top10_string['ensembl_2'].nunique()

17995

## ID mapping with BioMart data: Ensembl protein ID (ENSP) -> EntrezGene ID
### !!! In STRING data: Ensembl protein ID (ENSP) and not Ensembl gene ID (ENSG)
Downloaded from __[BioMart](https://grch37.ensembl.org/biomart/martview)__ <br>
Human genes __(GRCh37.p13)__

Attributes:
- Gene stable ID (ENSG)
- Protein stable ID (ENSP)
- EntrezGene ID
- HGNC symbol

"mart_export.txt" (2018/06/25)

In [13]:
df_biomart = pd.read_csv(data_folder + "mart_export.txt", sep="\t", index_col=False)
df_biomart.head()

Unnamed: 0,Gene stable ID,Protein stable ID,EntrezGene ID,HGNC symbol
0,ENSG00000261657,ENSP00000456546,115286.0,SLC25A26
1,ENSG00000261657,ENSP00000457004,115286.0,SLC25A26
2,ENSG00000261657,ENSP00000456312,115286.0,SLC25A26
3,ENSG00000261657,ENSP00000456306,115286.0,SLC25A26
4,ENSG00000261657,,115286.0,SLC25A26


In [14]:
df_biomart.shape

(233738, 4)

In [63]:
# keep only ENSP ID and EntrezGene ID
df_biomart_ENSP_Entrez = df_biomart[['Protein stable ID', 'EntrezGene ID']]
df_biomart_ENSP_Entrez.head()

Unnamed: 0,Protein stable ID,EntrezGene ID
0,ENSP00000456546,115286.0
1,ENSP00000457004,115286.0
2,ENSP00000456312,115286.0
3,ENSP00000456306,115286.0
4,,115286.0


In [66]:
print("NaN in ENSP ID: {} ({}%)"
      .format(df_biomart_ENSP_Entrez['Protein stable ID'].isnull().sum(),
              round(df_biomart_ENSP_Entrez['Protein stable ID'].isnull().sum()*100/df_biomart.shape[0], 1)))
print("NaN in EntrezGene ID: {} ({}%)"
      .format(df_biomart_ENSP_Entrez['EntrezGene ID'].isnull().sum(),
              round(df_biomart_ENSP_Entrez['EntrezGene ID'].isnull().sum()*100/df_biomart.shape[0], 1)))

# remove ENSP and EntrezGene NaN rows
df_biomart_compact = df_biomart_ENSP_Entrez[pd.notnull(df_biomart_ENSP_Entrez['Protein stable ID'])]
df_biomart_compact = df_biomart_compact[pd.notnull(df_biomart_compact['EntrezGene ID'])]
df_biomart_compact.shape

NaN in ENSP ID: 119763 (51.2%)
NaN in EntrezGene ID: 42551 (18.2%)


(111490, 2)

In [79]:
df_biomart_compact.head()

Unnamed: 0,Protein stable ID,EntrezGene ID
0,ENSP00000456546,115286.0
1,ENSP00000457004,115286.0
2,ENSP00000456312,115286.0
3,ENSP00000456306,115286.0
115,ENSP00000455676,685.0


In [100]:
print('Duplicated ENSP number: ',
      df_biomart_compact.duplicated(subset=['Protein stable ID'], keep=False).sum())
print('Duplicated EntrezGene number: ',
      df_biomart_compact.duplicated(subset=['EntrezGene ID'], keep=False).sum())

Duplicated ENSP number:  15163
Duplicated EntrezGene number:  108095


### caca ex: 1 ENSG/ENSP/Gene symbol -> several EntrezGene IDs

In [86]:
df_biomart[df_biomart['Protein stable ID']=='ENSP00000459754']

Unnamed: 0,Gene stable ID,Protein stable ID,EntrezGene ID,HGNC symbol
233679,ENSG00000262596,ENSP00000459754,100996743.0,KIR2DS1
233680,ENSG00000262596,ENSP00000459754,100133046.0,KIR2DS1
233681,ENSG00000262596,ENSP00000459754,100132285.0,KIR2DS1
233682,ENSG00000262596,ENSP00000459754,3810.0,KIR2DS1
233683,ENSG00000262596,ENSP00000459754,3809.0,KIR2DS1
233684,ENSG00000262596,ENSP00000459754,3808.0,KIR2DS1
233685,ENSG00000262596,ENSP00000459754,3806.0,KIR2DS1
233686,ENSG00000262596,ENSP00000459754,3804.0,KIR2DS1


In [91]:
# remove all ENSP duplicates rows
df_biomart_uniqENSP = df_biomart_compact.drop_duplicates(subset=['Protein stable ID'], keep=False)
removed_ensp = df_biomart_compact.shape[0] - df_biomart_uniqENSP.shape[0]
print("Removed ENSP duplicates rows: {} ({}%)"
     .format(removed_ensp, round(removed_ensp*100/df_biomart_compact.shape[0], 1)))
df_biomart_uniqENSP.shape

Removed ENSP duplicates rows: 15163 (13.6%)


(96327, 2)

In [94]:
# merge based on ensembl_1 (_x)
str_mart1 = top10_string.merge(df_biomart_uniqENSP, how='left', left_on='ensembl_1', right_on='Protein stable ID')
# str_mart1.shape

# merge based on ensembl_2 (_y)
str_mart2 = str_mart1.merge(df_biomart_uniqENSP, how='left', left_on='ensembl_2', right_on='Protein stable ID')
str_mart2.shape

(1135306, 7)

In [103]:
df_string = str_mart2[['ensembl_1', 'ensembl_2', 'EntrezGene ID_x', 'EntrezGene ID_y']]
df_string = df_string.rename(columns={'EntrezGene ID_x': 'EntrezGene ID_1',
                                     'EntrezGene ID_y': 'EntrezGene ID_2'})
df_string.head()

Unnamed: 0,ensembl_1,ensembl_2,EntrezGene ID_1,EntrezGene ID_2
0,ENSP00000007516,ENSP00000252102,4706.0,4695.0
1,ENSP00000007516,ENSP00000354961,4706.0,4538.0
2,ENSP00000007516,ENSP00000276062,4706.0,54539.0
3,ENSP00000007516,ENSP00000389160,4706.0,126328.0
4,ENSP00000007516,ENSP00000266544,4706.0,4704.0


In [109]:
print("NaN in EntrezGene ID_1: {} ({}%)"
      .format(df_string['EntrezGene ID_1'].isnull().sum(),
              round(df_string['EntrezGene ID_1'].isnull().sum()*100/df_string.shape[0], 1)))
print("NaN in EntrezGene ID_2: {} ({}%)"
      .format(df_string['EntrezGene ID_2'].isnull().sum(),
              round(df_string['EntrezGene ID_2'].isnull().sum()*100/df_string.shape[0], 1)))

# remove EntrezGene NaN rows
df_string = df_string[pd.notnull(df_string['EntrezGene ID_1'])]
df_string = df_string[pd.notnull(df_string['EntrezGene ID_2'])]
df_string.shape

NaN in EntrezGene ID_1: 72733 (6.4%)
NaN in EntrezGene ID_2: 72779 (6.4%)


(997965, 4)

In [123]:
df_string_compact = df_string.drop_duplicates(subset=['EntrezGene ID_1', 'EntrezGene ID_2'])
print("Removed duplicated EntrezGene PPI: {} ({}%)"
      .format(df_string.shape[0]-df_string_compact.shape[0],
             round((df_string.shape[0]-df_string_compact.shape[0])*100/df_string.shape[0], 1)))
df_string_compact.shape

Removed duplicated EntrezGene PPI: 1994 (0.2%)


(995971, 4)

## Create STRING PPI network matrix

In [141]:
# EntrezGene ID in lists
entrez1 = df_string_compact['EntrezGene ID_1']#.tolist()
entrez2 = df_string_compact['EntrezGene ID_2']#.tolist()

# from float to int
# entrez1 = [int(i) for i in entrez1]
# entrez2 = [int(i) for i in entrez2]

In [69]:
def coordinate(prot_list, all_list):
    coo_list = []
    for prot in prot_list:
        i = all_list.index(prot)
        coo_list.append(i)
    return coo_list

def create_adjacency_matrix(prot1, prot2):
    # remove if self interaction
    prot1, prot2 = zip(*((x, y) for x, y in zip(prot1, prot2) if x!=y))
#     prot1, prot2 = list(prot1), list(prot2)
    edge_list = np.vstack((prot1, prot2)).T
    gene_id_ppi = (edge_list.flatten()).tolist()
    gene_id_ppi = list(set(gene_id_ppi))

    # From ID list to coordinate list
    print(' ==== coordinates ')
#     coo1 = coordinate(prot1.tolist(), gene_id_ppi)
#     coo2 = coordinate(prot2.tolist(), gene_id_ppi)
    coo1 = coordinate(list(prot1), gene_id_ppi)
    coo2 = coordinate(list(prot2), gene_id_ppi)

    # Adjacency matrix
    print(' ==== Adjacency matrix ')
    n = len(gene_id_ppi)
    weight = np.ones(len(coo1))  # if interaction -> 1
    network = sp.coo_matrix((weight, (coo1, coo2)), shape=(n, n))
    network = network + network.T  # symmetric matrix
    network.setdiag(0)
#     savemat(PPI_file, {'adj_mat': network, 'entrez_id': gene_id_ppi},
#             do_compression=True)
    return gene_id_ppi, network

In [147]:
STRING_gene_id, STRING_network = create_adjacency_matrix(entrez1, entrez2)

 ==== coordinates 
 ==== Adjacency matrix 


In [148]:
STRING_network

<16590x16590 sparse matrix of type '<class 'numpy.float64'>'
	with 996925 stored elements in Compressed Sparse Row format>

In [150]:
savemat(data_folder + 'PPI_STRING_v10_5.mat',
        {'adj_mat': STRING_network, 'entrez_id': STRING_gene_id},do_compression=True)

In [1]:
996925/2

498462.5

In [70]:
list1 = [1, 1, 2, 3, 4, 5, 2]
list2 = [2, 4, 6, 6, 6, 5, 1]

df = pd.DataFrame(
    {'prot1': list1,
     'prot2': list2})

gene, net = create_adjacency_matrix(df['prot1'], df['prot2'])

df

 ==== coordinates 
 ==== Adjacency matrix 




Unnamed: 0,prot1,prot2
0,1,2
1,1,4
2,2,6
3,3,6
4,4,6
5,5,5
6,2,1


In [71]:
gene

[1, 2, 3, 4, 6]

In [45]:
net

<6x6 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [72]:
net

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [74]:
net.todense()

matrix([[ 0.,  2.,  0.,  1.,  0.],
        [ 2.,  0.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  0.,  1.],
        [ 1.,  0.,  0.,  0.,  1.],
        [ 0.,  1.,  1.,  1.,  0.]])

In [36]:
net.todense()

matrix([[ 0.,  2.,  0.,  1.,  0.,  0.],
        [ 2.,  0.,  0.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  0.,  0.,  1.],
        [ 1.,  0.,  0.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  0.,  2.,  0.],
        [ 0.,  1.,  1.,  1.,  0.,  0.]])

In [62]:
l1, l2 = zip(*((x, y) for x, y in zip(list1, list2) if x!=y))
l1, l2 = list(l1), list(l2)

In [60]:
l1 = list(l1)
l1

[1, 1, 2, 3, 4, 2]

In [61]:
l2 = list(l2)
l2

[2, 4, 6, 6, 6, 1]

In [63]:
l2

[2, 4, 6, 6, 6, 1]

In [64]:
l1

[1, 1, 2, 3, 4, 2]