--- title: "R Notebook" output: html_notebook --- ```{r} library(tidyverse) library(ggplot2) ``` ```{r} setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") outdir_files = "files/" outdir_figures = "figures/" sig_analysis_file = "files/hNhS_per_gene_sig_hits.txt" sig_analysis = read.table(sig_analysis_file, header=TRUE, stringsAsFactors = FALSE) sim_ratios_file = "files/avg_sim_ratios.txt" sims_ratios = read.table(sim_ratios_file, header=TRUE, stringsAsFactors = FALSE) ``` We filter out B6 Young Liver from these analyses ```{r} cond_filtered_sig_analysis = sig_analysis %>% filter(!(STRAIN == "B6" & TISSUE == "Liver" & AGE_BIN == "YOUNG")) ``` ```{r} rm(sig_analysis) ``` Filtering for our significant hits There are 67 genes out of 377 that we identified a significant signal for selection ```{r} sig_hits = cond_filtered_sig_analysis %>% filter(P_ADJ < 0.01) %>% #filtering out the sig hit that does not make sense filter(!(STRAIN == "FVB" & TISSUE == "Brain" & AGE_BIN == "OLD" & GENE == "mt-Co1")) ``` We compare the proportion of genes under selection across age ```{r} selection_age_comp = sig_hits %>% select(STRAIN, TISSUE, AGE_BIN) %>% group_by(STRAIN, TISSUE, AGE_BIN) %>% summarise(COUNT = n()) ``` We fill need to account for the conditions that do not have any genes under selection ```{r} triplets = cond_filtered_sig_analysis %>% select(STRAIN, TISSUE, AGE_BIN) %>% unique() imputed_selection_age_comp = triplets %>% left_join(selection_age_comp, by = c("STRAIN", "TISSUE", "AGE_BIN")) %>% mutate(COUNT = ifelse(is.na(COUNT), 0, COUNT)) %>% mutate(PROP = COUNT/13) ``` ```{r} rm(triplets) ``` ```{r} plotting_selection_age_comp_df = imputed_selection_age_comp %>% ungroup() %>% select(AGE_BIN, PROP) %>% group_by(AGE_BIN) %>% summarise(MEAN = mean(PROP), SEM = sd(PROP)/sqrt(n())) %>% mutate(AGE_LABEL = ifelse(AGE_BIN == "OLD", "O", "Y")) plotting_selection_age_comp_df$AGE_LABEL = factor(plotting_selection_age_comp_df$AGE_LABEL, level = c("Y", "O")) ``` ```{r} setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") selection_age_comp_plot = ggplot(plotting_selection_age_comp_df, aes(x = AGE_LABEL, y = MEAN, fill = AGE_LABEL)) selection_age_comp_plot = selection_age_comp_plot + geom_bar(stat = "identity", position = position_dodge(), width = 0.75) + geom_errorbar(aes(ymin=MEAN-SEM, ymax=MEAN+SEM), width=.25, position=position_dodge(0.9)) + theme_bw(base_size = 16) + ylab("Proportion of genes") + xlab("Selection") + scale_fill_manual(name = "Age", labels = c("Young", "Old"), values = c("grey70", "grey53")) + theme(text = element_text(family = "sans"), legend.position = "none", axis.title.x = element_text(size = 10), axis.title.y = element_text(size = 10), axis.text.x = element_text(size = 8), axis.text.y = element_text(size = 8)) pdf(paste(outdir_figures,"barplot_gen_selection_age_comp.pdf",sep=""), width=1.5,height=2.5) print(selection_age_comp_plot) dev.off() ``` We want to compare the proportion of genes under selection across age based on the type of selection ```{r} selection_types_counts = sig_hits %>% select(STRAIN, TISSUE, AGE_BIN, OBS_HNHS) %>% mutate(SELECTION = ifelse(OBS_HNHS < 1, "N", "P")) %>% select(STRAIN, TISSUE, AGE_BIN, SELECTION) %>% group_by(STRAIN, TISSUE, AGE_BIN, SELECTION) %>% summarise(COUNT = n()) ``` Again we need to make sure we account for every condition when we compute the average ```{r} triplets = cond_filtered_sig_analysis %>% select(STRAIN, TISSUE, AGE_BIN) %>% unique() ext_triplets = triplets[rep(seq_len(nrow(triplets)), each = 2),] #we need to extend this df to account for P/N selection SELECTION = rep(c("N", "P"), 29) selection_df = as.data.frame(SELECTION) dummy_df = cbind(ext_triplets, selection_df) rm(selection_df, ext_triplets, triplets) ``` ```{r} imputed_selection_types_counts = dummy_df %>% left_join(selection_types_counts, by = c("STRAIN", "TISSUE", "AGE_BIN", "SELECTION")) %>% mutate(COUNT = ifelse(is.na(COUNT), 0, COUNT)) %>% select(AGE_BIN, SELECTION, COUNT) %>% mutate(PROP = COUNT /13) %>% group_by(AGE_BIN, SELECTION) %>% summarise(MEAN = mean(PROP), SEM = sd(PROP)/ sqrt(n())) %>% mutate(AGE_LABEL = ifelse(AGE_BIN == "OLD", "O", "Y")) imputed_selection_types_counts$AGE_LABEL = factor(imputed_selection_types_counts$AGE_LABEL, level = c("Y", "O")) ``` ```{r} setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") selection_type_age_comp_plot = ggplot(imputed_selection_types_counts, aes(y = SELECTION, x = MEAN, fill = AGE_LABEL)) selection_type_age_comp_plot = selection_type_age_comp_plot + geom_bar(stat = "identity", position = position_dodge(), width = 0.9) + geom_errorbarh(aes(xmin=MEAN-SEM, xmax=MEAN+SEM), height=.25, position=position_dodge(0.9)) + theme_bw(base_size = 16) + xlab("Proportion of genes") + ylab("Selection") + scale_fill_manual(name = "Age", labels = c("Young", "Old"), values = c("grey70", "grey53")) + theme(text = element_text(family = "sans"), legend.position = "none", axis.title.x = element_text(size = 10), axis.title.y = element_text(size = 10), axis.text.x = element_text(size = 8), axis.text.y = element_text(size = 8)) #pdf(paste(outdir_figures,"selection_type_age_comp.pdf",sep=""), width=1.75,height=3) pdf(paste(outdir_figures,"selection_type_age_comp.pdf",sep=""), width=3,height=1.5) print(selection_type_age_comp_plot) dev.off() pdf(paste(outdir_figures,"leg_selection_type_age_comp.pdf",sep=""), width=3,height=3) print(selection_type_age_comp_plot + theme(legend.position = "bottom")) dev.off() ``` We want to compare the strength of selection between age groups ```{r} age_comp_selection_strength = cond_filtered_sig_analysis %>% select(STRAIN, TISSUE, AGE_BIN, GENE, OBS_HNHS, P_ADJ) %>% mutate(AGE_LABEL = ifelse(P_ADJ >= 0.01, paste(AGE_BIN,"N.S.", sep = "_"), AGE_BIN)) %>% select(AGE_LABEL, OBS_HNHS, AGE_BIN) %>% rename(HNHS = OBS_HNHS) ``` Plotting just the average selection strength and SEM for our signficant observed hits ```{r} simple_selection_type_strength_df = cond_filtered_sig_analysis %>% filter(!(STRAIN == "FVB" & TISSUE == "Brain" & AGE_BIN == "OLD" & GENE == "mt-Co1")) %>% select(AGE_BIN, OBS_HNHS, P_ADJ) %>% filter(P_ADJ < 0.01) %>% select(AGE_BIN, OBS_HNHS) %>% mutate(SELECTION = ifelse(OBS_HNHS < 1, "N", "P")) %>% group_by(AGE_BIN, SELECTION) %>% summarise(MEAN = mean(log10(OBS_HNHS)), SEM = sd(log10(OBS_HNHS))/sqrt(n())) simple_selection_type_strength_df$X_POS = c(1.15, 2.15, 0.85, 1.85) simple_selection_type_strength_df$AGE_BIN = factor(simple_selection_type_strength_df$AGE_BIN, level = c("YOUNG", "OLD")) ``` Comparing the distribution of hnhs ratios for young and old samples ```{r} w_test_df = cond_filtered_sig_analysis %>% filter(!(STRAIN == "FVB" & TISSUE == "Brain" & AGE_BIN == "OLD" & GENE == "mt-Co1")) %>% select(AGE_BIN, OBS_HNHS, P_ADJ) %>% filter(P_ADJ < 0.01) %>% select(AGE_BIN, OBS_HNHS) %>% mutate(SELECTION = ifelse(OBS_HNHS < 1, "N", "P")) %>% mutate(LABEL = paste(SELECTION, AGE_BIN, sep ="_")) %>% select(LABEL, OBS_HNHS) ``` ```{r} lm_df = cond_filtered_sig_analysis %>% filter(!(STRAIN == "FVB" & TISSUE == "Brain" & AGE_BIN == "OLD" & GENE == "mt-Co1")) %>% select(AGE_BIN, OBS_HNHS, P_ADJ) %>% filter(P_ADJ < 0.01) %>% select(AGE_BIN, OBS_HNHS) %>% mutate(SELECTION = ifelse(OBS_HNHS < 1, "N", "P")) %>% filter(SELECTION == "N") lm_df$AGE_BIN = factor(lm_df$AGE_BIN, level = c("YOUNG", "OLD")) lm_df$SELECTION = factor(lm_df$SELECTION, level = c("P", "N")) ``` ```{r} lm_model = lm(data = lm_df, OBS_HNHS ~ AGE_BIN) summary(lm_model) ``` ```{r} groups = list(c("N_YOUNG", "N_OLD"), c("P_YOUNG", "P_OLD")) combos = groups %>% set_names(map_chr(., ~ paste(., collapse = "_"))) p_values = map_df(combos, function(y) { filter(w_test_df, LABEL %in% y) %>% wilcox.test(OBS_HNHS ~ LABEL, data = .) %>% broom::tidy() }, .id = "contrast") %>% mutate(p_adj = p.adjust(p.value, method = "BH")) %>% mutate(SIG_STATUS = ifelse(p_adj < 0.01, "SIG", "N.S.")) ``` ```{r} young = (w_test_df %>% filter(LABEL == "N_YOUNG"))$OBS_HNHS old = (w_test_df %>% filter(LABEL == "N_OLD"))$OBS_HNHS t.test(young, old, var.equal = TRUE) ``` ```{r} simple_selection_type_strength_plot = ggplot(simple_selection_type_strength_df, aes(y = X_POS, x = MEAN, color = AGE_BIN )) + geom_point(size = 2) + theme_bw(base_size = 16) + #geom_vline(xintercept = -0.3311625 , color = "magenta", linetype = "dashed", size = 0.6) + scale_y_continuous(breaks = c(1,2), labels = c("N","P")) + scale_color_manual(name = "Age", labels = c("Young", "Old"), values = c("grey70", "grey53")) + geom_errorbarh(aes(xmin=MEAN-SEM, xmax=MEAN+SEM), height=.15, position=position_dodge(0.9)) + theme_bw(base_size = 16) + xlab("log10(hN/hS)") + ylab("Selection") + scale_fill_manual(name = "Age", labels = c("Young", "Old"), values = c("grey70", "grey53")) + theme(text = element_text(family = "sans"), legend.position = "none", axis.title.x = element_text(size = 10), axis.title.y = element_text(size = 10.5), axis.text.x = element_text(size = 8), axis.text.y = element_text(size = 8)) setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") #pdf(paste(outdir_figures,"simple_selection_type_strength_plot.pdf",sep=""), width=1.75,height=3) pdf(paste(outdir_figures,"simple_selection_type_strength_plot.pdf",sep=""), width=3,height=1.5) print(simple_selection_type_strength_plot) dev.off() pdf(paste(outdir_figures,"leg_simple_selection_type_strength_plot.pdf",sep=""), width=1.75,height=3) print(simple_selection_type_strength_plot + theme(legend.position = "right")) dev.off() ``` ```{r} sims_info = sims_ratios %>% select(AGE_BIN, SIM_AVG_RATIO) %>% mutate(AGE_LABEL = ifelse(AGE_BIN == "OLD", "OLD_SIMS", "YOUNG_SIMS")) %>% rename(HNHS = SIM_AVG_RATIO) %>% select(AGE_LABEL, HNHS, AGE_BIN) ``` Merging our sims and obs ratio info together ```{r} plotting_selection_strength_df = rbind(sims_info, age_comp_selection_strength) ``` Now to create our summary info df ```{r} sum_plotting_selection_strength_df = plotting_selection_strength_df %>% mutate(SELECTION = ifelse(HNHS < 1, "N", "P")) %>% group_by(AGE_BIN, AGE_LABEL, SELECTION) %>% summarise(MEAN = mean(log10(HNHS)), SEM = sd(log10(HNHS))/sqrt(n())) ``` ```{r} library(ggridges) age_comp_selection_strength_plot = ggplot(plotting_selection_strength_df, aes(x = log10(HNHS), y = AGE_LABEL, color = AGE_LABEL)) age_comp_selection_strength_plot = age_comp_selection_strength_plot + geom_density_ridges(fill = "white", size = 0.5, scale = 0.65) + geom_point(shape = "|", size = 1,position=position_nudge(y=-.15)) + geom_point(aes(x=MEAN, y= AGE_LABEL, color = AGE_LABEL), data=sum_plotting_selection_strength_df, size = 1) + geom_errorbarh(data=sum_plotting_selection_strength_df, aes(xmin= MEAN-SEM,xmax=MEAN+SEM,y=AGE_LABEL,color=AGE_LABEL), height = 0.1, inherit.aes = FALSE) + geom_vline(xintercept = 0 , color = "gray", linetype = "dashed", size = 0.6) + geom_vline(xintercept = -0.3311625 , color = "magenta", linetype = "dashed", size = 0.6) + xlab("log10(hN/hS)") + ylab("") + theme_bw() + scale_y_discrete(expand = expansion(mult = c(0.01, 0.4))) + scale_color_manual(values = c("#C71585", "grey66", "black", "#DB7093", "grey66", "black")) + theme(panel.grid=element_blank(), text = element_text(family = "sans"), panel.border = element_blank(), strip.background = element_blank(), axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12), strip.text.x = element_text(size = 11), strip.text.y = element_text(size = 11), axis.text.y=element_text(size = 10), axis.text.x =element_text(size = 10), legend.positio = "none") setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/") pdf(paste(outdir_figures,"age_comp_selection_strength_plot.pdf",sep=""), width=4,height=4) print(age_comp_selection_strength_plot) dev.off() ```