########################################################### # from consensus matrix, get PAC index ########################################################### # Ref: Critical limitations of consensus clustering in class discovery # Yasin Senbabaoglu, George Michailidis & Jun Z. Li # Scientific Reports 4, Article number: 6207 (2014) # doi:10.1038/srep06207 # "the "proportion of ambigous clustering" (PAC), # defined as the fraction of sample pairs with # consensus index values falloing in the intermediate # sub-interval (x1, x2), element of [0, 1]. A low value # of PAC indicates a flat middle segment, allowing inference # of the optimal K by the lowest PAC" setwd(dir="github/stratipy/simulation/r_analysis/") # install.packages("R.matlab") library(R.matlab) # install.packages("diceR") library(diceR) # PAC threashold x1 <- 0.05 x2 <- 0.95 ########################################################### # to change! sub_nb <- 10 pat_nb <- 300 subset_name <- paste0(sub_nb, "sub_", pat_nb, "pat") plot_title <- paste0(sub_nb, " sub-networks with ", pat_nb, " patients") output_dir <- paste0(subset_name, "/plot") dir.create(subset_name) dir.create(output_dir) base_file <- paste0("../output/nbs_", subset_name, "/consensus_clustering") mut_type <- c("raw", "diff", "mean_qn", "median_qn") nmf_type <- c("nmf", "gnmf") influence_weight <- "min" simplification <- "True" alpha <- c(0, 0.7) tol <- 10e-3 keep_singletons <- "False" ngh_max <- c(5, 10) min_mutation <- 0 max_mutation <- 100 n_components <- 2:12 n_permutations <- 1000 lambd <- c(0, 200) tol_nmf <- 1e-3 a_greek <- intToUtf8(0x03B1) l_greek <- intToUtf8(0x03BB) df <- data.frame(type=character(), mut_type=character(), nmf_type=character(), influence_weight=character(), simplification=logical(), alpha=numeric(), tol=numeric(), keep_singletons=logical(), ngh_max=integer(), min_mutation=integer(), max_mutation=integer(), n_components=integer(), n_permutations=integer(), lambd=integer(), tol_nmf=numeric(), pac=numeric(), stringsAsFactors = FALSE) # iterate all consensus matrix file names for (i0 in mut_type){ f0 <- paste(base_file, i0, sep="/") for (i1 in nmf_type){ f1 <- paste(f0, i1, sep='/') for (i2 in influence_weight){ f2 <- paste(f1, i2, sep='/consensus_weight=') for (i3 in simplification){ f3 <- paste(f2, i3, sep="_simp=") for (i4 in alpha){ if(i0!="raw" & i4==0) next if(i0=="raw" & i4!=0) next f4 <- paste(f3, i4, sep="_alpha=") for (i5 in tol){ f5 <- paste(f4, i5, sep="_tol=") for (i6 in keep_singletons){ f6 <- paste(f5, i6, sep="_singletons=") for (i7 in ngh_max){ f7 <- paste(f6, i7, sep="_ngh=") for (i8 in min_mutation){ f8 <- paste(f7, i8, sep="_minMut=") for (i9 in max_mutation){ f9 <- paste(f8, i9, sep="_maxMut=") for (i10 in n_components){ f10 <- paste(f9, i10, sep="_comp=") for (i11 in n_permutations){ f11 <- paste(f10, i11, sep="_permut=") for (i12 in lambd){ if(i1=="nmf" & i12!=0) next if(i1=="gnmf" & i12==0) next f12 <- paste(f11, i12, sep="_lambd=") for (i13 in tol_nmf){ f13 <- paste(f12, i13, sep="_tolNMF=") filename <- paste(f13, ".mat", sep="") if (file.exists(filename)){ ia <- paste0(a_greek, "=", i4) il <- paste0(l_greek, "=", i12) ingh <- paste0(i7, "ngh") type_name <- paste(toupper(i1), i0, ia, il, ingh) # PCA function and save as a file data <- readMat(filename) cons_mat <- data$distance.patients pac <- PAC(cons_mat, lower=x1, upper=x2) # add all values to data frame df[nrow(df)+1,] = c(type_name, i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,pac) } else print(paste0("No such file: ", filename)) } } } } } } } } } } } } } } # convert some colums' mode (because all mode is character) logic_col <- c("simplification","keep_singletons") numeric_col <- c("alpha","tol","tol_nmf","pac") int_col <- c("ngh_max","min_mutation","max_mutation","n_components","n_permutations","lambd") for (i in logic_col) df[, i] <- as.logical(df[, i]) for (i in numeric_col) df[, i] <- as.numeric(df[, i]) for (i in int_col) df[, i] <- as.integer(df[, i]) # save data frame as csv file write.table(df, file=paste0(subset_name, "/all_param_pca.csv"), row.names=FALSE, sep="\t") # a <- read.csv(file="test.csv") # b <- df[, c("n_components", "pac")] ########################################################### library(ggplot2) # install.packages('svglite') # require("ggplot2") # plot saving function # savePlot <- function(myPlot) { # pdf("myPlot.pdf") # print(myPlot) # dev.off() # } # global plot ggplot(data=df, aes(x=n_components, y=pac, group=type)) + geom_line(aes(color=type)) + geom_point(aes(color=type)) + scale_x_continuous(breaks=n_components) + labs(x="Component number", y="PAC") + ylim(0, 1) + ggtitle(plot_title) + theme_bw() + theme(legend.title=element_blank(), panel.grid.minor.x = element_blank()) ggsave("all_data.png", plot = last_plot(), path = output_dir, width = 7, height = 6, dpi = 300) # get some rows for plot # select_rows <- function(data, x1, x2, x3, x4, x5){ # subset(data, # mut_type==x1 & # nmf_type==x2 & # alpha==x3 & # ngh_max==x4 & # lambd==x5) # } # # nmf_raw <- select_rows(df, "raw", "nmf", alpha[1], ngh_max, lambd[1]) # nmf_diff <- select_rows(df, "diff", "nmf", alpha[2], ngh_max, lambd[1]) # nmf_mean <- select_rows(df, "mean_qn", "nmf", alpha[2], ngh_max, lambd[1]) # nmf_median <- select_rows(df, "median_qn", "nmf", alpha[2], ngh_max, lambd[1]) nmf_df <- subset(df, nmf_type=="nmf") gnmf200_df <- subset(df, nmf_type=="gnmf" & lambd==200) # gnmf1800_df <- subset(df, nmf_type=="gnmf"& lambd==1800) # NMF plot ggplot(data=nmf_df, aes(x=n_components, y=pac, group=type)) + geom_line(aes(color=type)) + geom_point(aes(color=type)) + scale_x_continuous(breaks=n_components) + labs(x="Component number", y="PAC") + ylim(0, 1) + ggtitle(plot_title) + theme_bw() + theme(legend.title=element_blank(), panel.grid.minor.x = element_blank()) ggsave("nmf.png", plot = last_plot(), path = output_dir, width = 7, height = 6, dpi = 300) # GNMF (lambd=200) plot ggplot(data=gnmf200_df, aes(x=n_components, y=pac, group=type)) + geom_line(aes(color=type)) + geom_point(aes(color=type)) + scale_x_continuous(breaks=n_components) + labs(x="Component number", y="PAC") + ylim(0, 1) + ggtitle(plot_title) + theme_bw() + theme(legend.title=element_blank(), panel.grid.minor.x = element_blank()) ggsave("gnmf_200.png", plot = last_plot(), path = output_dir, width = 7, height = 6, dpi = 300) # GNMF (lambd=1800) plot # ggplot(data=gnmf1800_df, aes(x=n_components, y=pac, group=type)) + # geom_line(aes(color=type)) + # geom_point(aes(color=type)) + # scale_x_continuous(breaks=n_components) + # labs(x="Component number", y="PAC") + # ylim(0, 1) + # ggtitle(plot_title) + # theme_bw() + # theme(legend.title=element_blank(), panel.grid.minor.x = element_blank()) # # ggsave("gnmf_1800.png", plot = last_plot(), path = output_dir, # width = 7, height = 6, dpi = 300) ###########################################################