"""Plot, for each ``k``, how significantly phenotypes differ between clusters.

Biostatistics result pickles are loaded for the SSC1 and SSC2 data sets,
their p-values are turned into a ternary significance level (0, 1 or 2) and
one half-filled marker per (k, phenotype) pair is drawn: the left half
stands for SSC1, the right half for SSC2.
"""
import sys
import os
# The project package is resolved relative to the working directory, so this
# path tweak has to stay in front of the ``stratipy`` import.
sys.path.append(os.path.abspath('../../stratipy'))
from stratipy import biostat
import numpy as np
import pandas as pd
import re
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.lines import Line2D
plt.switch_backend('agg')

# (figure label, p-value column name) pairs, in plotting order.
_PHENOTYPE_COLUMNS = (
    ('Sex', 'sex_pval'),
    ('Affected / Unaffected', 'sp_pval'),
    ('IQ', 'iq_pval'),
    ('SRS', 'srs_pval'),
    ('Vineland', 'vineland_pval'),
    ('Ancestral distance', 'distCEU_pval'),
    ('# mutated genes', 'mutation_pval'),
)

# Marker face colors per significance level (level 0 is drawn unfilled).
_LEVEL_COLORS = {1: 'orange', 2: 'orangered'}


def get_biostat_files(result_directory):
    """Return the paths of all files named ``biostat*`` below a directory.

    Parameters
    ----------
    result_directory : str
        Root directory that is walked recursively.

    Returns
    -------
    list of str
        Joined ``directory/name`` paths, in ``os.walk`` order. Empty when
        nothing matches (or the directory does not exist).
    """
    return [os.path.join(path, name)
            for path, _subdirs, files in os.walk(result_directory)
            for name in files
            if name.startswith('biostat')]


def p_stringent_loose(val, p_stringent, p_loose):
    """Map a p-value onto a ternary significance level.

    Returns 2 when ``val < p_stringent``, 1 when ``val < p_loose`` and 0
    otherwise (which includes NaN, since every comparison with NaN is false).
    """
    if val < p_stringent:
        return 2
    if val < p_loose:
        return 1
    return 0


def concatenate_2_pval(row, col1, col2, p_stringent, p_loose):
    """Return ``[level(row[col1]), level(row[col2])]`` for one data row.

    ``row`` is anything indexable by ``col1`` / ``col2`` (a pandas row, a
    dict, ...); the levels come from :func:`p_stringent_loose`.
    """
    return [p_stringent_loose(row[col], p_stringent, p_loose)
            for col in (col1, col2)]


def formatting_biostat_data(data_folder, result_folder,
                            biostat_factorization_directory, ssc_mutation_data,
                            gene_data, ppi_data, patient_data, p_stringent,
                            p_loose):
    """Load the biostat pickles and build the per-``k`` significance table.

    Parameters
    ----------
    data_folder, ssc_mutation_data, gene_data, ppi_data
        Not used by this function; kept so existing callers keep working.
    result_folder : str
        Directory searched for biostat files when ``patient_data`` is not
        ``'SSC'``.
    biostat_factorization_directory : str
        Directory name containing ``SSC`` followed by one character; that
        part is rewritten to ``SSC1`` and ``SSC2`` to locate both data sets
        when ``patient_data == 'SSC'``.
    patient_data : str
        ``'SSC'`` selects the SSC1 + SSC2 directories.
    p_stringent, p_loose : float
        Thresholds forwarded to :func:`p_stringent_loose`.

    Returns
    -------
    pandas.DataFrame
        One row per ``k`` present in both SSC1 and SSC2, sorted by ``k``.
        Holds the raw p-value columns suffixed with ``1`` / ``2`` plus one
        column per phenotype label containing ``[level_SSC1, level_SSC2]``.

    Raises
    ------
    ValueError
        If no biostat file is found.
    """
    print(" ==== Plot data formatting", flush=True)
    # to avoid SettingWithCopyWarning (process-wide pandas option)
    pd.options.mode.chained_assignment = None

    # create list of all biostatistics files in directory
    if patient_data == 'SSC':
        # load both SSC1 & SSC2
        result1 = re.sub(r'SSC.', 'SSC1', biostat_factorization_directory)
        result2 = re.sub(r'SSC.', 'SSC2', biostat_factorization_directory)
        biostat_files = get_biostat_files(result1) + get_biostat_files(result2)
    else:
        # NOTE(review): the legacy implementation at the bottom of this file
        # searched biostat_factorization_directory here instead of the whole
        # result_folder; the rows are still split on 'SSC1' / 'SSC2' below --
        # confirm this branch is intended.
        biostat_files = get_biostat_files(result_folder)

    if not biostat_files:
        # pd.concat would raise a ValueError as well, but without saying why.
        raise ValueError('No biostat file found to plot '
                         '(patient_data={!r})'.format(patient_data))

    # read and concatenate them
    # NOTE(security): unpickling can execute arbitrary code; only load result
    # files produced by this pipeline.
    df = pd.concat(pd.read_pickle(path) for path in biostat_files)

    # keep only the identifying columns and the p-values
    pval_columns = [pval for _label, pval in _PHENOTYPE_COLUMNS]
    df = df[['data_k', 'data_ssc'] + pval_columns]

    # slice into SSC 1 & 2, then pair the rows that share the same k
    df1 = df[df['data_ssc'] == 'SSC1']
    df2 = df[df['data_ssc'] == 'SSC2']
    df = df1.merge(df2, how='inner', on='data_k', suffixes=('1', '2'))
    df = df.rename(columns={'data_k': 'k'})
    df = df.sort_values(by=['k']).reset_index(drop=True)

    # new columns with p-value -> ternary profile [SSC1, SSC2]
    for label, pval in _PHENOTYPE_COLUMNS:
        df[label] = [
            [p_stringent_loose(first, p_stringent, p_loose),
             p_stringent_loose(second, p_stringent, p_loose)]
            for first, second in zip(df[pval + '1'], df[pval + '2'])]

    return df


def marker_style(p_profile):
    """Return the matplotlib marker keyword arguments for one profile.

    Parameters
    ----------
    p_profile : sequence of two ints
        ``[level_SSC1, level_SSC2]`` with levels 0, 1 (p-value < 0.1) or
        2 (p-value < 0.05).

    Returns
    -------
    dict
        Keyword arguments for ``Axes.plot`` / ``Line2D``: an empty gray
        circle when nothing is significant, a half-filled circle when one
        side is significant, and a black-edged circle when both are.

    Raises
    ------
    ValueError
        If ``p_profile`` is not a pair of levels in {0, 1, 2}.
    """
    try:
        left, right = p_profile
    except (TypeError, ValueError):
        raise ValueError('Wrong profile format: {!r}'.format(p_profile))
    if left not in (0, 1, 2) or right not in (0, 1, 2):
        raise ValueError('Wrong profile format: {!r}'.format(p_profile))
    left, right = int(left), int(right)

    style = dict(marker='o', markersize=16, markeredgecolor='gray')

    # significant in neither data set: empty marker
    if left == 0 and right == 0:
        style['fillstyle'] = 'none'
        return style

    # the filled half is the side with the higher level
    half = 'left' if left > right else 'right'

    # significant in exactly one data set: only that half is colored
    if left == 0 or right == 0:
        style.update(fillstyle=half, color=_LEVEL_COLORS[max(left, right)])
        return style

    # significant in both data sets: thick black edge
    style.update(markeredgecolor='black', markeredgewidth=2)
    if left == right:
        style.update(fillstyle='full', color=_LEVEL_COLORS[left])
    else:
        # one side < 0.05, the other < 0.1: two-colored marker, the main
        # color on the more significant half and the alternate on the other
        style.update(fillstyle=half, color=_LEVEL_COLORS[2],
                     markerfacecoloralt=_LEVEL_COLORS[1])
    return style


def biostat_individuals_plot(df, data_folder, ssc_mutation_data, gene_data,
                             patient_data, ppi_data, mut_type, lambd):
    """Draw the significance grid and save it as PNG and SVG.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``k`` column and one ``[level_SSC1, level_SSC2]``
        column per phenotype label (see :func:`formatting_biostat_data`).
    data_folder : str
        Prefix of the output directory; it is concatenated as-is, so it is
        expected to end with a path separator.
    ssc_mutation_data, gene_data, ppi_data, mut_type
        Used in the figure title and in the output path / file name.
    patient_data : str
        ``'SSC'`` selects the ``<mutation>_<gene>_<ppi>`` output directory,
        anything else ``<patient_data>_<ppi>``.
    lambd : number
        ``> 0`` labels the figure 'GNMF', otherwise 'NMF'; also part of the
        file name.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    print(" ==== Plotting", flush=True)
    p_col = ['k'] + [label for label, _pval in _PHENOTYPE_COLUMNS]
    df_fill = df[p_col]
    n_rows, n_cols = df_fill.shape
    if n_rows == 0:
        raise ValueError('No biostat result to plot: empty data frame')

    # Legend handles for the (currently disabled) legend call below.
    m_style = dict(marker='o', markersize=16, markeredgecolor='gray')
    legend_elements = [
        Line2D([0], [0], fillstyle='left', linestyle='', color='gray',
               **m_style, label='SSC1'),
        Line2D([0], [0], fillstyle='right', linestyle='', color='gray',
               **m_style, label='SSC2'),
        Line2D([0], [0], fillstyle='none', linestyle='',
               markeredgecolor='black', markeredgewidth=2, marker='o',
               markersize=16, label='reciprocal SSC1 & SSC2'),
        Line2D([0], [0], fillstyle='full', linestyle='', color='orange',
               **m_style, label='p-value < 0.1'),
        Line2D([0], [0], fillstyle='full', linestyle='', color='orangered',
               **m_style, label='p-value < 0.05')]

    # k=20 -> figsize=(5, 9)
    # squeeze=False keeps ``ax`` two-dimensional even for a single row.
    fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, sharex=True,
                           sharey=True, figsize=(5, 20), squeeze=False)
    nmf = 'GNMF' if lambd > 0 else 'NMF'

    fig.suptitle(
        "Statistical significance between individual clusters\n(mutation:{} // gene:{} // PPI:{} // {} // {})".
        format(ssc_mutation_data, gene_data, ppi_data, mut_type, nmf),
        x=0.5, y=1.15, fontsize=14, linespacing=2)
    # y=1.15 for k=20

    for col, label in enumerate(p_col):
        for row in range(n_rows):
            cell = ax[row, col]
            if col == 0:
                # first column: the value of k as text
                cell.text(0.02, 0.5, df_fill.iloc[row, col],
                          horizontalalignment='right',
                          verticalalignment='center', fontsize=12)
            else:
                cell.plot(0.5, **marker_style(df_fill.iloc[row, col]))
            cell.axis('off')

        # column header, drawn above the first row
        ax[0, col].text(0, 0.54, label, horizontalalignment='left',
                        verticalalignment='bottom', rotation=45, fontsize=12)

    fig.subplots_adjust(hspace=0, wspace=0)
    # plt.legend(handles=legend_elements, loc='center', labelspacing=1,
    #            fontsize=14, bbox_to_anchor=(6, 12), frameon=False)

    if patient_data == 'SSC':
        fig_directory = (
            data_folder + 'figures/biostat_individuals/' +
            ssc_mutation_data + '_' + gene_data + '_' + ppi_data + '/')
    else:
        fig_directory = (data_folder + 'figures/biostat_individuals/' +
                         patient_data + '_' + ppi_data + '/')
    os.makedirs(fig_directory, exist_ok=True)
    fig_name = ('{}_lambd={}'.format(mut_type, lambd))
    fig.savefig('{}{}.png'.format(fig_directory, fig_name),
                bbox_inches='tight')
    fig.savefig('{}{}.svg'.format(fig_directory, fig_name),
                bbox_inches='tight')
    plt.close(fig)


def load_plot_biostat_individuals(result_folder, data_folder,
                                  ssc_mutation_data, gene_data, patient_data,
                                  ppi_data, mut_type, lambd, influence_weight,
                                  simplification, alpha, tol, keep_singletons,
                                  ngh_max, min_mutation, max_mutation,
                                  n_components, n_permutations, tol_nmf,
                                  linkage_method):
    """Locate the biostat results, format them and save the figure.

    The result directory is obtained from ``biostat.biostatistics_file``
    (its second return value is not needed here), the table is built by
    :func:`formatting_biostat_data` with thresholds 0.05 / 0.1 and the
    figure is written by :func:`biostat_individuals_plot`.
    """
    p_stringent = 0.05
    p_loose = 0.1

    biostat_factorization_directory, _biostat_file = (
        biostat.biostatistics_file(
            result_folder, mut_type, influence_weight, simplification, alpha,
            tol, keep_singletons, ngh_max, min_mutation, max_mutation,
            n_components, n_permutations, lambd, tol_nmf, linkage_method))

    df = formatting_biostat_data(
        data_folder, result_folder, biostat_factorization_directory,
        ssc_mutation_data, gene_data, ppi_data, patient_data, p_stringent,
        p_loose)

    biostat_individuals_plot(
        df, data_folder, ssc_mutation_data, gene_data, patient_data, ppi_data,
        mut_type, lambd)


# ---------------------------------------------------------------------------
# Legacy implementation (binary significant / not significant version),
# kept commented out for reference.
# ---------------------------------------------------------------------------
# def binary_cmap(val_seqence, i):
#     cmap = matplotlib.cm.get_cmap('binary')
#     normalize = matplotlib.colors.Normalize(vmin=0, vmax=max(val_seqence))
#     colors = [cmap(normalize(value)) for value in val_seqence]
#     return colors[i]
#
# def marker_filling(row, col1, col2):
#     if row[col1]:
#         if row[col2]:
#             return 'full'
#         else:
#             return 'left'
#     else:
#         if row[col2]:
#             return 'right'
#         else:
#             return 'none'
#
#
# def graduated_marker_style(left, right):
#     if (left[0] == 1) & (right[0] == 1):
#         fillstyle = 'none'
#     else:
#         fillstyle = 'left'
#     return dict(fillstyle=fillstyle, color=left, markerfacecoloralt=right,
#                 markeredgecolor='black', marker='o', markersize=15)
#
#
# def formatting_biostat_data(data_folder, result_folder,
#                             biostat_factorization_directory, ssc_mutation_data,
#                             gene_data, ppi_data, p_val_threshold,
#                             patient_data):
#     # to avoid SettingWithCopyWarning
#     pd.options.mode.chained_assignment = None
#     # create list of all biostatistics files in directory
#     if patient_data == 'SSC':
#         # load both SSC1 & SSC2
#         result1 = re.sub(r'SSC.', 'SSC1', biostat_factorization_directory)
#         result2 = re.sub(r'SSC.', 'SSC2', biostat_factorization_directory)
#         biostat_files = get_biostat_files(result1) + get_biostat_files(result2)
#     else:
#         biostat_files = get_biostat_files(biostat_factorization_directory)
#
#     # read and concatenate them
#     df = pd.concat((pd.read_pickle(file) for file in biostat_files))
#     df = df.reset_index(drop=True)
#
#     df_plot = df[[
#         'data_k', 'data_ssc', 'sp_pval', 'iq_pval', 'sex_pval', 'srs_pval',
#         'vineland_pval', 'distCEU_pval', 'mutation_pval']]
#
#     # new columns: True if significant p-value
#     df_plot.loc[:, 'sp'] = (df_plot['sp_pval'] < p_val_threshold)
#     df_plot.loc[:, 'iq'] = (df_plot['iq_pval'] < p_val_threshold)
#     df_plot.loc[:, 'sex'] = (df_plot['sex_pval'] < p_val_threshold)
#     df_plot.loc[:, 'srs'] = (df_plot['srs_pval'] < p_val_threshold)
#     df_plot.loc[:, 'vineland'] = (df_plot['vineland_pval'] < p_val_threshold)
#     df_plot.loc[:, 'distCEU'] = (df_plot['distCEU_pval'] < p_val_threshold)
#     df_plot.loc[:, 'mutation'] = (df_plot['mutation_pval'] < p_val_threshold)
#
#     # keep only binary results
#     df_bin = df_plot.drop(['sp_pval', 'iq_pval', 'sex_pval', 'srs_pval',
#                            'vineland_pval', 'distCEU_pval', 'mutation_pval'],
#                           axis=1)
#     # slice into SSC 1 & 2
#     df_bin1 = df_bin[df_bin['data_ssc'] == 'SSC1']
#     df_bin2 = df_bin[df_bin['data_ssc'] == 'SSC2']
#     # count True by row (k)
#     df_bin1['Total'] = df_bin1.iloc[:, 1:].sum(axis=1)
#     df_bin2['Total'] = df_bin2.iloc[:, 1:].sum(axis=1)
#
#     # then merge on k
#     df_bin = df_bin1.merge(df_bin2, how='inner', left_on='data_k',
#                            right_on='data_k', suffixes=[1, 2])
#     df_bin = df_bin.sort_values(by=['data_k'])
#     df_bin = df_bin.drop(['data_ssc1', 'data_ssc2'], axis=1)
#     df_bin = df_bin.reset_index(drop=True)
#
#     # count all True by row (k) for SSC1&2
#     df_bin['Total'] = df_bin[['Total1', 'Total2']].sum(axis=1)
#
#     # new columns with marker filling style
#     df_bin['Affected / Unaffected'] = df_bin.apply(
#         lambda row: marker_filling(row, 'sp1', 'sp2'), axis=1)
#     df_bin['IQ'] = df_bin.apply(
#         lambda row: marker_filling(row, 'iq1', 'iq2'), axis=1)
#     df_bin['Sex'] = df_bin.apply(
#         lambda row: marker_filling(row, 'sex1', 'sex2'), axis=1)
#     df_bin['SRS'] = df_bin.apply(
#         lambda row: marker_filling(row, 'srs1', 'srs2'), axis=1)
#     df_bin['Vineland'] = df_bin.apply(
#         lambda row: marker_filling(row, 'vineland1', 'vineland2'), axis=1)
#     df_bin['Ancestral distance'] = df_bin.apply(
#         lambda row: marker_filling(row, 'distCEU1', 'distCEU2'), axis=1)
#     df_bin['# mutated genes'] = df_bin.apply(
#         lambda row: marker_filling(row, 'mutation1', 'mutation2'), axis=1)
#
#     df_bin.rename(columns={'data_k': 'k'}, inplace=True)
#
#     return df_bin
#
#
# def biostat_individuals_plot(df_bin, data_folder, ssc_mutation_data, gene_data,
#                              patient_data, ppi_data, mut_type, lambd):
#     p_col = ['k', 'Total', 'Affected / Unaffected', 'IQ', 'Sex', 'SRS',
#              'Vineland', 'Ancestral distance', '# mutated genes']
#     df_fill = df_bin[p_col]
#
#     marker_style = dict(color='black', marker='o', markersize=15)
#     legend_elements = [
#         Line2D([0], [0], fillstyle='left', linestyle='', **marker_style,
#                label='SSC1'),
#         Line2D([0], [0], fillstyle='right', linestyle='', **marker_style,
#                label='SSC2')]
#
#     fig, ax = plt.subplots(nrows=df_fill.shape[0], ncols=df_fill.shape[1],
#                            sharex=True, sharey=True, figsize=(6, 9))
#     if lambd > 0:
#         nmf = 'GNMF'
#     else:
#         nmf = 'NMF'
#
#     fig.suptitle(
#         "Statistical significance between individual clusters\n(mutation:{} // gene:{} // PPI:{} // {} // {})".
#         format(ssc_mutation_data, gene_data, ppi_data, mut_type, nmf), y=1.15,
#         fontsize=15, linespacing=2)
#
#     for col in range(len(p_col)):
#         for row in range(df_fill.shape[0]):
#             if col == 0:
#                 ax[row, col].text(0.02, 0.5, df_fill.iloc[row, col],
#                                   horizontalalignment='right',
#                                   verticalalignment='center', fontsize=12)
#                 ax[row, col].axis('off')
#             elif col == 1:
#                 color_left = binary_cmap(df_bin['Total1'], row)
#                 color_right = binary_cmap(df_bin['Total2'], row)
#                 ax[row, col].plot(0.5, **graduated_marker_style(color_left,
#                                                                 color_right))
#                 ax[row, col].spines['left'].set_visible(False)
#                 ax[row, col].spines['top'].set_visible(False)
#                 ax[row, col].spines['bottom'].set_visible(False)
#                 ax[row, col].set_xticklabels([])
#                 ax[row, col].tick_params(axis='both', which='both', length=0)
#             else:
#                 ax[row, col].plot(0.5, fillstyle=df_fill.iloc[row, col],
#                                   **marker_style)
#                 ax[row, col].axis('off')
#
#         ax[0, col].text(0, 0.54, p_col[col], horizontalalignment='left',
#                         verticalalignment='bottom', rotation=45, fontsize=12)
#
#     fig.subplots_adjust(hspace=0, wspace=0)
#     plt.legend(handles=legend_elements, loc='center', labelspacing=1,
#                fontsize=14, bbox_to_anchor=(3, 17.9), frameon=False)
#
#     if patient_data == 'SSC':
#         fig_directory = (
#             data_folder + 'figures/biostat_individuals/' + ssc_mutation_data +
#             '_' + gene_data + '_' + ppi_data + '/')
#     else:
#         fig_directory = (data_folder + 'figures/biostat_individuals/' +
#                          patient_data + '_' + ppi_data + '/')
#     os.makedirs(fig_directory, exist_ok=True)
#     fig_name = ('{}_lambd={}'.format(mut_type, lambd))
#     plt.savefig('{}{}.png'.format(fig_directory, fig_name),
#                 bbox_inches='tight')
#     plt.savefig('{}{}.svg'.format(fig_directory, fig_name),
#                 bbox_inches='tight')
#     plt.close()