--- title: "R Notebook" output: html_notebook --- ```{r} library(tidyverse) library(ggplot2) library(ggridges) library(PNWColors) ``` ```{r} #we need to download all of our simulated data setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") outdir_files = "files/" outdir_figures = "figures/" sims_hnhs_file = "files/sims_hNhS_ratios_per_gene.txt" sims_hnhs = read.table(sims_hnhs_file, header=TRUE, stringsAsFactors = FALSE) obs_hnhs_file = "files/obs_hNhS_ratios_per_gene.txt" obs_hnhs = read.table(obs_hnhs_file, header=TRUE, stringsAsFactors = FALSE) sig_analysis_file = "files/hNhS_per_gene_sig_hits.txt" sig_analysis = read.table(sig_analysis_file, header=TRUE, stringsAsFactors = FALSE) ``` We'll be using this function in future plots -- split violins are a bit of a pain. Source: https://rpubs.com/egnielsen/challenge_category_learning who cites DeBruine 2018 Another good source: https://psyteachr.github.io/msc-conv/visualisation.html We filter out B6 Young Liver from these analyses ```{r} cond_filtered_obs_hnhs = obs_hnhs %>% filter(!(STRAIN == "B6" & TISSUE == "Liver" & AGE_BIN == "YOUNG")) cond_filtered_sims_hnhs = sims_hnhs %>% filter(!(STRAIN == "B6" & TISSUE == "Liver" & AGE_BIN == "YOUNG")) cond_filtered_sig_analysis = sig_analysis %>% filter(!(STRAIN == "B6" & TISSUE == "Liver" & AGE_BIN == "YOUNG")) ``` ```{r} rm(obs_hnhs, sims_hnhs, sig_analysis) ``` ```{r} GeomSplitViolin <- ggproto( "GeomSplitViolin", GeomViolin, draw_group = function(self, data, ..., draw_quantiles = NULL) { data <- transform(data, xminv = x - violinwidth * (x - xmin), xmaxv = x + violinwidth * (xmax - x)) grp <- data[1,'group'] newdata <- plyr::arrange( transform(data, x = if(grp%%2==1) xminv else xmaxv), if(grp%%2==1) y else -y ) newdata <- rbind(newdata[1, ], newdata, newdata[nrow(newdata), ], newdata[1, ]) newdata[c(1,nrow(newdata)-1,nrow(newdata)), 'x'] <- round(newdata[1, 'x']) if (length(draw_quantiles) > 0 & !scales::zero_range(range(data$y))) { stopifnot(all(draw_quantiles >= 0), all(draw_quantiles <= 1)) quantiles <- ggplot2:::create_quantile_segment_frame(data, draw_quantiles) aesthetics <- data[rep(1, nrow(quantiles)), setdiff(names(data), c("x", "y")), drop = FALSE] aesthetics$alpha <- rep(1, nrow(quantiles)) both <- cbind(quantiles, aesthetics) quantile_grob <- GeomPath$draw_panel(both, ...) ggplot2:::ggname("geom_split_violin", grid::grobTree(GeomPolygon$draw_panel(newdata, ...), quantile_grob)) } else { ggplot2:::ggname("geom_split_violin", GeomPolygon$draw_panel(newdata, ...)) } } ) geom_split_violin <- function (mapping = NULL, data = NULL, stat = "ydensity", position = "identity", ..., draw_quantiles = NULL, trim = TRUE, scale = "area", na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) { layer(data = data, mapping = mapping, stat = stat, geom = GeomSplitViolin, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list(trim = trim, scale = scale, draw_quantiles = draw_quantiles, na.rm = na.rm, ...) ) } ``` Filtering out ratios that we cannot calculate (e.g. nsyn and syn counts are 0) ```{r} #we did not lose any more entries (filtered out syn != 0 in earlier processing ) filtered_obs_hnhs = cond_filtered_obs_hnhs %>% filter(OBS_NONSYN_MUT_COUNT != 0) %>% filter(OBS_SYN_MUT_COUNT != 0) %>% select(STRAIN, TISSUE, AGE_BIN, GENE, OBS_HNHS) ``` ```{r} #filtered 4,310 sims 1e-3 of the simulations filtered_sims_hnhs = cond_filtered_sims_hnhs %>% filter(SIM_NONSYN_MUT_COUNT != 0) %>% filter(SIM_SYN_MUT_COUNT != 0) ``` Calculating the average simulated hnhs ratio per gene in each experimental condition ```{r} avg_sim_ratios = filtered_sims_hnhs %>% select(STRAIN, TISSUE, AGE_BIN, GENE, SIM_HNHS) %>% group_by(STRAIN, TISSUE, AGE_BIN, GENE) %>% summarise(SIM_AVG_RATIO = mean(SIM_HNHS)) setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") write.table(avg_sim_ratios, file = paste(outdir_files,"/avg_sim_ratios.txt", sep = ""), sep = "\t", quote = F, row.names = T) ``` Note for reader: filtered_obs: contains all hnhs ratios that could be calculated regardless of significance avg_sims: contains the average hnhs ratio for each gene in each simulation sig_analysis: contains significance info of our obs hnhs ratios Calculating summary stats and cleaning data to compare obs. v sims hnhs ratios ```{r} plotting_df = left_join(filtered_obs_hnhs, avg_sim_ratios, by = c("STRAIN", "TISSUE", "AGE_BIN", "GENE")) %>% pivot_longer(OBS_HNHS:SIM_AVG_RATIO, names_to = "TYPE", values_to = "RATIO") %>% mutate(TYPE_LABEL = ifelse(TYPE == "OBS_HNHS", "Observed", "Simulated")) %>% mutate(SELECTION = ifelse(RATIO < 1, "N", "P")) %>% #x position for our split violin mutate(X_POS = 1) sig_hnhs_plotting_df = cond_filtered_sig_analysis %>% select(OBS_HNHS, P_ADJ) %>% filter(P_ADJ < 0.01) %>% mutate(TYPE_LABEL = "OBS_HNHS", X_POS = 1.15) %>% rename(RATIO = OBS_HNHS) %>% select(RATIO, X_POS, TYPE_LABEL) plotting_df_sum = plotting_df %>% select(TYPE_LABEL, SELECTION, RATIO) %>% group_by(TYPE_LABEL, SELECTION) %>% summarise(MEAN = mean(log10(RATIO)), SEM = sd(log10(RATIO))/sqrt(n())) %>% mutate(X_POS = 1) plotting_df$TYPE_LABEL = factor(plotting_df$TYPE_LABEL, level = c("Simulated", "Observed")) plotting_df_sum$TYPE_LABEL = factor(plotting_df_sum$TYPE_LABEL, level = c("Simulated", "Observed")) ``` Plotting the simulated and observed ratio distributions ```{r} sims_v_obs_plot = ggplot(data = plotting_df, aes(x = log10(RATIO), y = TYPE_LABEL, color = TYPE_LABEL)) sims_v_obs_plot = sims_v_obs_plot + geom_density_ridges(fill = "white", size = 0.85, scale = 0.65) + geom_point(shape = "|", size = 1,position=position_nudge(y=-.05)) + geom_point(aes(x=MEAN, y=TYPE_LABEL, color = TYPE_LABEL), data=plotting_df_sum, size = 1.25) + geom_errorbarh(data=plotting_df_sum, aes(xmin= MEAN-SEM,xmax=MEAN+SEM,y=TYPE_LABEL,color=TYPE_LABEL), height = 0.15, inherit.aes = FALSE) + geom_vline(xintercept = 0 , color = "grey", linetype = "dashed", size = 0.4) + xlab("log10(hN/hS)") + ylab("Data") + theme_bw() + scale_y_discrete(expand = expansion(mult = c(0.01, .7))) + scale_color_manual(name = "Data", values = c( "#d2848d", "black")) + theme(panel.grid=element_blank(), text = element_text(family = "sans"), panel.border = element_blank(), strip.background = element_blank(), axis.title.x = element_text(size = 10), axis.title.y = element_text(size = 12), strip.text.x = element_text(size = 11), strip.text.y = element_text(size = 11), axis.text.y=element_text(size = 10), axis.text.x =element_text(size = 10), legend.positio = "none") setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") pdf(paste(outdir_figures,"sims_v_obs_ratios.pdf",sep=""), width=3.5,height=2) print(sims_v_obs_plot) dev.off() ``` Plotting obs v simulated hnhs ratios as split violins with SIM AVERAGES ```{r} sims_v_obs_violin = ggplot(data = plotting_df, aes(x = X_POS, y = log10(RATIO), fill = TYPE_LABEL)) + geom_split_violin(trim = FALSE, alpha = 0.4) + geom_boxplot(aes(color = TYPE_LABEL), width = .15, alpha = .6, position = position_dodge(.25), outlier.shape = NA) + #geom_point(aes(x = X_POS, y=MEAN, color = TYPE_LABEL), data=plotting_df_sum, size = 1) + #geom_errorbar(data=plotting_df_sum, aes(ymin= MEAN-SEM,ymax=MEAN+SEM,x=X_POS, color = TYPE_LABEL), width = 0.05, inherit.aes = FALSE) + geom_point(data = sig_hnhs_plotting_df, aes(x = X_POS, y = log10(RATIO)), color = "#d2848d", size = 1, inherit.aes = FALSE) + geom_hline(yintercept = 0, linetype = "dashed", size = 0.5, color = "gray") + theme_light() + xlab("") + ylab("log10(hN/hS)") + scale_fill_manual(name = "Data", values = c("black", "#d2848d")) + scale_color_manual(name = "Data", values = c("black", "#d2848d")) + theme(text = element_text(family = "sans"), strip.background = element_blank(), axis.title.y = element_text(size = 12), axis.text.y=element_text(size = 10), axis.text.x =element_blank(), axis.ticks.x =element_blank(), legend.position = "none") setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") pdf(paste(outdir_figures,"violin_sims_v_obs_ratios.pdf",sep=""), width=2,height=3.5) print(sims_v_obs_violin) dev.off() ``` Simulations v observed counts with all simulated counts ```{r} obs_info = filtered_obs_hnhs %>% select(OBS_HNHS) %>% rename(RATIO = OBS_HNHS) %>% mutate(TYPE = "OBS") sim_info = filtered_sims_hnhs %>% select(SIM_HNHS) %>% rename(RATIO = SIM_HNHS) %>% mutate(TYPE = "SIM") plotting_all_sims_df = rbind(obs_info, sim_info) %>% mutate(X_POS = 1, TYPE_LABEL = ifelse(TYPE == "OBS", "Observed", "Simulated")) rm(obs_info, sim_info) ``` Main Fig. 5A: comparing our simulated and observed ratios ```{r} plotting_all_sims_df$TYPE_LABEL = factor(plotting_all_sims_df$TYPE_LABEL, level = c("Simulated", "Observed")) all_sims_v_obs_violin = ggplot(data = plotting_all_sims_df, aes(x = X_POS, y = log10(RATIO), fill = TYPE_LABEL)) + geom_split_violin(trim = FALSE, alpha = 0.4) + geom_boxplot(aes(fill = TYPE_LABEL), color = "black", width = .15, alpha = .6, position = position_dodge(.25), outlier.shape = NA) + geom_hline(yintercept = 0, linetype = "dashed", size = 0.5, color = "gray") + theme_light() + xlab("") + ylab("log10(hN/hS)") + scale_fill_manual(name = "Data", values = c("black", "white")) + scale_color_manual(name = "Data", values = c("black", "white")) + theme(text = element_text(family = "sans"), strip.background = element_blank(), axis.title.y = element_text(size = 12), axis.text.y=element_text(size = 10), axis.text.x =element_blank(), axis.ticks.x =element_blank(), legend.position = "none") setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") pdf(paste(outdir_figures,"violin_all_sims_v_obs_ratios.pdf",sep=""), width=2,height=3.5) print(all_sims_v_obs_violin) dev.off() setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") pdf(paste(outdir_figures,"leg_violin_all_sims_v_obs_ratios.pdf",sep=""), width=3.25,height=3) print(all_sims_v_obs_violin + theme(legend.position = "bottom", legend.text = element_text(size = 14))) dev.off() ``` Running the Wilcoxon Rank Sum Test to see if there's a significant difference in medians between the simulated and obs. distribution ```{R} #incredibly large sample size that we see approached normality sim_ratios = (plotting_all_sims_df %>% mutate(LOG10_RATIO = log10(RATIO)) %>% filter(TYPE == "SIM"))$LOG10_RATIO #testing if the sample mean is different from 0 t_summary = t.test(sim_ratios, alternative = "two.sided") print(t_summary) ``` ```{r} #comparing if the average obs and sim ratios are different from each other obs_ratios = (plotting_all_sims_df %>% mutate(LOG10_RATIO = log10(RATIO)) %>% filter(TYPE == "OBS"))$LOG10_RATIO wilcox.test(sim_ratios, obs_ratios, alternative = "two.sided") ``` Creating a split violin for all conditions ```{r} sim_ratios = avg_sim_ratios %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, SIM_AVG_RATIO) %>% rename(HNHS = SIM_AVG_RATIO) %>% mutate(LABEL = paste("SIM", AGE_BIN, sep = "_")) obs_ratios = filtered_obs_hnhs %>% rename(HNHS = OBS_HNHS) %>% mutate(LABEL = paste("OBS", AGE_BIN, sep = "_")) %>% select(STRAIN, TISSUE, AGE_BIN, HNHS, LABEL) ``` ```{r} plotting_all_cond_df = rbind(sim_ratios, obs_ratios) %>% mutate(X_POS = ifelse(AGE_BIN == "OLD", "2", "1")) sig_plotting_points = cond_filtered_sig_analysis %>% select(STRAIN, TISSUE, AGE_BIN, OBS_HNHS, P_ADJ) %>% filter(P_ADJ < 0.01) %>% mutate(LABEL = paste("OBS", AGE_BIN, sep = "_"), X_POS = ifelse(AGE_BIN == "OLD", 2.15, 1.15)) %>% rename(HNHS = OBS_HNHS) %>% select(STRAIN, TISSUE, HNHS, X_POS, LABEL) plotting_all_cond_df$LABEL = factor(plotting_all_cond_df$LABEL, level = c("SIM_OLD", "OBS_OLD", "SIM_YOUNG", "OBS_YOUNG")) sig_plotting_points$LABEL = factor(sig_plotting_points$LABEL, level = c("SIM_OLD", "OBS_OLD", "SIM_YOUNG", "OBS_YOUNG")) plotting_all_cond_df$STRAIN = factor(plotting_all_cond_df$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB" )) sig_plotting_points$STRAIN = factor(sig_plotting_points$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB" )) ``` Plotting all conditions as split violins ```{r} all_conds_violin = ggplot(data = plotting_all_cond_df, aes(x = X_POS, y = log10(HNHS), fill = LABEL)) + geom_split_violin(trim = FALSE, alpha = 0.4) + geom_boxplot(aes(color = LABEL), width = .15, alpha = .6, position = position_dodge(.25), outlier.shape = NA) + geom_point(data = sig_plotting_points %>% filter(LABEL == "OBS_YOUNG"), aes(x = X_POS, y = log10(HNHS)), color ="#DE5D83", size = 0.7, inherit.aes = FALSE) + geom_point(data = sig_plotting_points %>% filter(LABEL == "OBS_OLD"), aes(x = X_POS, y = log10(HNHS)), color ="#A94064", size = 0.7, inherit.aes = FALSE) + geom_hline(yintercept = 0, linetype = "dashed", size = 0.5, color = "gray") + facet_grid(STRAIN ~ TISSUE) + theme_bw() + xlab("Age") + ylab("log10(hN/hS)") + scale_x_discrete(breaks = c(1,2), labels = c("1" = "Young", "2" = "Old")) + scale_fill_manual(name = "Data", values = c("black", "#A94064", "grey45", "#DE5D83" )) + scale_color_manual(name = "Data", values = c("black", "#A94064", "grey45", "#DE5D83" )) + theme(text = element_text(family = "sans"), strip.background = element_blank(), axis.title.y = element_text(size = 12), axis.title.x = element_text(size = 12), axis.text.y=element_text(size = 10), axis.text.x=element_text(size = 10), legend.position = "none") setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") pdf(paste(outdir_figures,"violin_all_conds.pdf",sep=""), width=6,height=5) print(all_conds_violin) dev.off() pdf(paste(outdir_figures,"leg_violin_all_conds.pdf",sep=""), width=8,height=5) print(all_conds_violin + #scale_fill_manual(labels = c("Simulated", "Observed", "Simulated", "Observed")) + theme(legend.position = "bottom")) dev.off() ```