---
title: "R Notebook"
output: html_notebook
---
```{r}
library(tidyverse)
library(ggplot2)
```
```{r}
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/")
outdir_files = "files/"
outdir_figures = "figures/"
sig_analysis_file = "files/hNhS_per_gene_sig_hits.txt"
sig_analysis = read.table(sig_analysis_file, header=TRUE, stringsAsFactors = FALSE)
sim_ratios_file = "files/avg_sim_ratios.txt"
sims_ratios = read.table(sim_ratios_file, header=TRUE, stringsAsFactors = FALSE)
```
We filter out B6 Young Liver from these analyses
```{r}
cond_filtered_sig_analysis = sig_analysis %>%
filter(!(STRAIN == "B6" & TISSUE == "Liver" & AGE_BIN == "YOUNG"))
```
```{r}
rm(sig_analysis)
```
Filtering for our significant hits
There are 67 genes out of 377 that we identified a significant signal for selection
```{r}
sig_hits = cond_filtered_sig_analysis %>%
filter(P_ADJ < 0.01) %>%
#filtering out the sig hit that does not make sense
filter(!(STRAIN == "FVB" & TISSUE == "Brain" & AGE_BIN == "OLD" & GENE == "mt-Co1"))
```
We compare the proportion of genes under selection across age
```{r}
selection_age_comp = sig_hits %>%
select(STRAIN, TISSUE, AGE_BIN) %>%
group_by(STRAIN, TISSUE, AGE_BIN) %>%
summarise(COUNT = n())
```
We fill need to account for the conditions that do not have any genes under selection
```{r}
triplets = cond_filtered_sig_analysis %>%
select(STRAIN, TISSUE, AGE_BIN) %>%
unique()
imputed_selection_age_comp = triplets %>%
left_join(selection_age_comp, by = c("STRAIN", "TISSUE", "AGE_BIN")) %>%
mutate(COUNT = ifelse(is.na(COUNT), 0, COUNT)) %>%
mutate(PROP = COUNT/13)
```
```{r}
rm(triplets)
```
```{r}
plotting_selection_age_comp_df = imputed_selection_age_comp %>%
ungroup() %>%
select(AGE_BIN, PROP) %>%
group_by(AGE_BIN) %>%
summarise(MEAN = mean(PROP), SEM = sd(PROP)/sqrt(n())) %>%
mutate(AGE_LABEL = ifelse(AGE_BIN == "OLD", "O", "Y"))
plotting_selection_age_comp_df$AGE_LABEL = factor(plotting_selection_age_comp_df$AGE_LABEL, level = c("Y", "O"))
```
```{r}
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/")
selection_age_comp_plot = ggplot(plotting_selection_age_comp_df, aes(x = AGE_LABEL, y = MEAN, fill = AGE_LABEL))
selection_age_comp_plot = selection_age_comp_plot +
geom_bar(stat = "identity", position = position_dodge(), width = 0.75) +
geom_errorbar(aes(ymin=MEAN-SEM, ymax=MEAN+SEM), width=.25,
position=position_dodge(0.9)) +
theme_bw(base_size = 16) +
ylab("Proportion of genes") +
xlab("Selection") +
scale_fill_manual(name = "Age", labels = c("Young", "Old"), values = c("grey70", "grey53")) +
theme(text = element_text(family = "sans"),
legend.position = "none",
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8))
pdf(paste(outdir_figures,"barplot_gen_selection_age_comp.pdf",sep=""), width=1.5,height=2.5)
print(selection_age_comp_plot)
dev.off()
```
We want to compare the proportion of genes under selection across age based on the type of selection
```{r}
selection_types_counts = sig_hits %>%
select(STRAIN, TISSUE, AGE_BIN, OBS_HNHS) %>%
mutate(SELECTION = ifelse(OBS_HNHS < 1, "N", "P")) %>%
select(STRAIN, TISSUE, AGE_BIN, SELECTION) %>%
group_by(STRAIN, TISSUE, AGE_BIN, SELECTION) %>%
summarise(COUNT = n())
```
Again we need to make sure we account for every condition when we compute the average
```{r}
triplets = cond_filtered_sig_analysis %>%
select(STRAIN, TISSUE, AGE_BIN) %>%
unique()
ext_triplets = triplets[rep(seq_len(nrow(triplets)), each = 2),]
#we need to extend this df to account for P/N selection
SELECTION = rep(c("N", "P"), 29)
selection_df = as.data.frame(SELECTION)
dummy_df = cbind(ext_triplets, selection_df)
rm(selection_df, ext_triplets, triplets)
```
```{r}
imputed_selection_types_counts = dummy_df %>%
left_join(selection_types_counts, by = c("STRAIN", "TISSUE", "AGE_BIN", "SELECTION")) %>%
mutate(COUNT = ifelse(is.na(COUNT), 0, COUNT)) %>%
select(AGE_BIN, SELECTION, COUNT) %>%
mutate(PROP = COUNT /13) %>%
group_by(AGE_BIN, SELECTION) %>%
summarise(MEAN = mean(PROP), SEM = sd(PROP)/ sqrt(n())) %>%
mutate(AGE_LABEL = ifelse(AGE_BIN == "OLD", "O", "Y"))
imputed_selection_types_counts$AGE_LABEL = factor(imputed_selection_types_counts$AGE_LABEL, level = c("Y", "O"))
```
```{r}
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/")
selection_type_age_comp_plot = ggplot(imputed_selection_types_counts, aes(y = SELECTION, x = MEAN, fill = AGE_LABEL))
selection_type_age_comp_plot = selection_type_age_comp_plot +
geom_bar(stat = "identity", position = position_dodge(), width = 0.9) +
geom_errorbarh(aes(xmin=MEAN-SEM, xmax=MEAN+SEM), height=.25,
position=position_dodge(0.9)) +
theme_bw(base_size = 16) +
xlab("Proportion of genes") +
ylab("Selection") +
scale_fill_manual(name = "Age", labels = c("Young", "Old"), values = c("grey70", "grey53")) +
theme(text = element_text(family = "sans"),
legend.position = "none",
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8))
#pdf(paste(outdir_figures,"selection_type_age_comp.pdf",sep=""), width=1.75,height=3)
pdf(paste(outdir_figures,"selection_type_age_comp.pdf",sep=""), width=3,height=1.5)
print(selection_type_age_comp_plot)
dev.off()
pdf(paste(outdir_figures,"leg_selection_type_age_comp.pdf",sep=""), width=3,height=3)
print(selection_type_age_comp_plot + theme(legend.position = "bottom"))
dev.off()
```
We want to compare the strength of selection between age groups
```{r}
age_comp_selection_strength = cond_filtered_sig_analysis %>%
select(STRAIN, TISSUE, AGE_BIN, GENE, OBS_HNHS, P_ADJ) %>%
mutate(AGE_LABEL = ifelse(P_ADJ >= 0.01, paste(AGE_BIN,"N.S.", sep = "_"), AGE_BIN)) %>%
select(AGE_LABEL, OBS_HNHS, AGE_BIN) %>%
rename(HNHS = OBS_HNHS)
```
Plotting just the average selection strength and SEM for our signficant observed hits
```{r}
simple_selection_type_strength_df = cond_filtered_sig_analysis %>%
filter(!(STRAIN == "FVB" & TISSUE == "Brain" & AGE_BIN == "OLD" & GENE == "mt-Co1")) %>%
select(AGE_BIN, OBS_HNHS, P_ADJ) %>%
filter(P_ADJ < 0.01) %>%
select(AGE_BIN, OBS_HNHS) %>%
mutate(SELECTION = ifelse(OBS_HNHS < 1, "N", "P")) %>%
group_by(AGE_BIN, SELECTION) %>%
summarise(MEAN = mean(log10(OBS_HNHS)), SEM = sd(log10(OBS_HNHS))/sqrt(n()))
simple_selection_type_strength_df$X_POS = c(1.15, 2.15, 0.85, 1.85)
simple_selection_type_strength_df$AGE_BIN = factor(simple_selection_type_strength_df$AGE_BIN, level = c("YOUNG", "OLD"))
```
Comparing the distribution of hnhs ratios for young and old samples
```{r}
w_test_df = cond_filtered_sig_analysis %>%
filter(!(STRAIN == "FVB" & TISSUE == "Brain" & AGE_BIN == "OLD" & GENE == "mt-Co1")) %>%
select(AGE_BIN, OBS_HNHS, P_ADJ) %>%
filter(P_ADJ < 0.01) %>%
select(AGE_BIN, OBS_HNHS) %>%
mutate(SELECTION = ifelse(OBS_HNHS < 1, "N", "P")) %>%
mutate(LABEL = paste(SELECTION, AGE_BIN, sep ="_")) %>%
select(LABEL, OBS_HNHS)
```
```{r}
lm_df = cond_filtered_sig_analysis %>%
filter(!(STRAIN == "FVB" & TISSUE == "Brain" & AGE_BIN == "OLD" & GENE == "mt-Co1")) %>%
select(AGE_BIN, OBS_HNHS, P_ADJ) %>%
filter(P_ADJ < 0.01) %>%
select(AGE_BIN, OBS_HNHS) %>%
mutate(SELECTION = ifelse(OBS_HNHS < 1, "N", "P")) %>%
filter(SELECTION == "N")
lm_df$AGE_BIN = factor(lm_df$AGE_BIN, level = c("YOUNG", "OLD"))
lm_df$SELECTION = factor(lm_df$SELECTION, level = c("P", "N"))
```
```{r}
lm_model = lm(data = lm_df, OBS_HNHS ~ AGE_BIN)
summary(lm_model)
```
```{r}
groups = list(c("N_YOUNG", "N_OLD"),
c("P_YOUNG", "P_OLD"))
combos = groups %>%
set_names(map_chr(., ~ paste(., collapse = "_")))
p_values = map_df(combos, function(y) {
filter(w_test_df, LABEL %in% y) %>%
wilcox.test(OBS_HNHS ~ LABEL, data = .) %>%
broom::tidy()
}, .id = "contrast") %>%
mutate(p_adj = p.adjust(p.value, method = "BH")) %>%
mutate(SIG_STATUS = ifelse(p_adj < 0.01, "SIG", "N.S."))
```
```{r}
young = (w_test_df %>% filter(LABEL == "N_YOUNG"))$OBS_HNHS
old = (w_test_df %>% filter(LABEL == "N_OLD"))$OBS_HNHS
t.test(young, old, var.equal = TRUE)
```
```{r}
simple_selection_type_strength_plot = ggplot(simple_selection_type_strength_df, aes(y = X_POS, x = MEAN, color = AGE_BIN )) +
geom_point(size = 2) +
theme_bw(base_size = 16) +
#geom_vline(xintercept = -0.3311625 , color = "magenta", linetype = "dashed", size = 0.6) +
scale_y_continuous(breaks = c(1,2), labels = c("N","P")) +
scale_color_manual(name = "Age", labels = c("Young", "Old"), values = c("grey70", "grey53")) +
geom_errorbarh(aes(xmin=MEAN-SEM, xmax=MEAN+SEM), height=.15,
position=position_dodge(0.9)) +
theme_bw(base_size = 16) +
xlab("log10(hN/hS)") +
ylab("Selection") +
scale_fill_manual(name = "Age", labels = c("Young", "Old"), values = c("grey70", "grey53")) +
theme(text = element_text(family = "sans"),
legend.position = "none",
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10.5),
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8))
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/")
#pdf(paste(outdir_figures,"simple_selection_type_strength_plot.pdf",sep=""), width=1.75,height=3)
pdf(paste(outdir_figures,"simple_selection_type_strength_plot.pdf",sep=""), width=3,height=1.5)
print(simple_selection_type_strength_plot)
dev.off()
pdf(paste(outdir_figures,"leg_simple_selection_type_strength_plot.pdf",sep=""), width=1.75,height=3)
print(simple_selection_type_strength_plot + theme(legend.position = "right"))
dev.off()
```
```{r}
sims_info = sims_ratios %>%
select(AGE_BIN, SIM_AVG_RATIO) %>%
mutate(AGE_LABEL = ifelse(AGE_BIN == "OLD", "OLD_SIMS", "YOUNG_SIMS")) %>%
rename(HNHS = SIM_AVG_RATIO) %>%
select(AGE_LABEL, HNHS, AGE_BIN)
```
Merging our sims and obs ratio info together
```{r}
plotting_selection_strength_df = rbind(sims_info, age_comp_selection_strength)
```
Now to create our summary info df
```{r}
sum_plotting_selection_strength_df = plotting_selection_strength_df %>%
mutate(SELECTION = ifelse(HNHS < 1, "N", "P")) %>%
group_by(AGE_BIN, AGE_LABEL, SELECTION) %>%
summarise(MEAN = mean(log10(HNHS)), SEM = sd(log10(HNHS))/sqrt(n()))
```
```{r}
library(ggridges)
age_comp_selection_strength_plot = ggplot(plotting_selection_strength_df, aes(x = log10(HNHS), y = AGE_LABEL, color = AGE_LABEL))
age_comp_selection_strength_plot = age_comp_selection_strength_plot +
geom_density_ridges(fill = "white", size = 0.5, scale = 0.65) +
geom_point(shape = "|", size = 1,position=position_nudge(y=-.15)) +
geom_point(aes(x=MEAN, y= AGE_LABEL, color = AGE_LABEL), data=sum_plotting_selection_strength_df, size = 1) +
geom_errorbarh(data=sum_plotting_selection_strength_df, aes(xmin= MEAN-SEM,xmax=MEAN+SEM,y=AGE_LABEL,color=AGE_LABEL), height = 0.1, inherit.aes = FALSE) +
geom_vline(xintercept = 0 , color = "gray", linetype = "dashed", size = 0.6) +
geom_vline(xintercept = -0.3311625 , color = "magenta", linetype = "dashed", size = 0.6) +
xlab("log10(hN/hS)") +
ylab("") +
theme_bw() +
scale_y_discrete(expand = expansion(mult = c(0.01, 0.4))) +
scale_color_manual(values = c("#C71585", "grey66", "black", "#DB7093", "grey66", "black")) +
theme(panel.grid=element_blank(),
text = element_text(family = "sans"),
panel.border = element_blank(),
strip.background = element_blank(),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
strip.text.x = element_text(size = 11),
strip.text.y = element_text(size = 11),
axis.text.y=element_text(size = 10),
axis.text.x =element_text(size = 10),
legend.positio = "none")
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/")
pdf(paste(outdir_figures,"age_comp_selection_strength_plot.pdf",sep=""), width=4,height=4)
print(age_comp_selection_strength_plot)
dev.off()
```