---
title: "R Notebook"
output: html_notebook
---
```{r}
library(tidyverse)
library(broom)
library(ggplot2)
library(ggridges)
library(PNWColors)
```
Colors for our genes are based on the complex
```{r}
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/")
outdir_files = "files/"
outdir_figures = "figures/"
sims_hnhs_file = "files/sims_hNhS_ratios_per_gene.txt"
sims_hnhs = read.table(sims_hnhs_file, header=TRUE, stringsAsFactors = FALSE)
sig_analysis_file = "files/hNhS_per_gene_sig_hits.txt"
sig_analysis = read.table(sig_analysis_file, header=TRUE, stringsAsFactors = FALSE)
```
We filter out B6 Young Liver from these analyses
```{r}
cond_filtered_sims_hnhs = sims_hnhs %>%
filter(!(STRAIN == "B6" & TISSUE == "Liver" & AGE_BIN == "YOUNG"))
cond_filtered_sig_analysis = sig_analysis %>%
filter(!(STRAIN == "B6" & TISSUE == "Liver" & AGE_BIN == "YOUNG"))
```
```{r}
rm(sims_hnhs, sig_analysis)
```
```{r}
GeomSplitViolin <- ggproto(
"GeomSplitViolin",
GeomViolin,
draw_group = function(self, data, ..., draw_quantiles = NULL) {
data <- transform(data,
xminv = x - violinwidth * (x - xmin),
xmaxv = x + violinwidth * (xmax - x))
grp <- data[1,'group']
newdata <- plyr::arrange(
transform(data, x = if(grp%%2==1) xminv else xmaxv),
if(grp%%2==1) y else -y
)
newdata <- rbind(newdata[1, ], newdata, newdata[nrow(newdata), ],
newdata[1, ])
newdata[c(1,nrow(newdata)-1,nrow(newdata)), 'x'] <- round(newdata[1,
'x'])
if (length(draw_quantiles) > 0 & !scales::zero_range(range(data$y))) {
stopifnot(all(draw_quantiles >= 0), all(draw_quantiles <= 1))
quantiles <- ggplot2:::create_quantile_segment_frame(data,
draw_quantiles)
aesthetics <- data[rep(1, nrow(quantiles)), setdiff(names(data),
c("x", "y")),
drop = FALSE]
aesthetics$alpha <- rep(1, nrow(quantiles))
both <- cbind(quantiles, aesthetics)
quantile_grob <- GeomPath$draw_panel(both, ...)
ggplot2:::ggname("geom_split_violin",
grid::grobTree(GeomPolygon$draw_panel(newdata, ...),
quantile_grob))
} else {
ggplot2:::ggname("geom_split_violin",
GeomPolygon$draw_panel(newdata, ...))
}
}
)
geom_split_violin <- function (mapping = NULL,
data = NULL,
stat = "ydensity",
position = "identity", ...,
draw_quantiles = NULL,
trim = TRUE,
scale = "area",
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(data = data,
mapping = mapping,
stat = stat,
geom = GeomSplitViolin,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(trim = trim,
scale = scale,
draw_quantiles = draw_quantiles,
na.rm = na.rm, ...)
)
}
```
```{r}
plot_sims = cond_filtered_sims_hnhs %>%
filter(SIM_NONSYN_MUT_COUNT != 0) %>%
filter(SIM_SYN_MUT_COUNT != 0) %>%
select(STRAIN, TISSUE, AGE_BIN, GENE, SIM_HNHS) %>%
rename(RATIO = SIM_HNHS) %>%
mutate(TYPE = "Simulated") %>%
mutate(GENE_LABEL = case_when(grepl("mt-Nd1", GENE) ~ 1,
grepl("mt-Nd2", GENE) ~ 2,
grepl("mt-Nd3", GENE) ~ 3,
GENE == "mt-Nd4" ~ 4,
GENE == "mt-Nd4l" ~ 5,
grepl("mt-Nd5", GENE) ~ 6,
grepl("mt-Nd6", GENE) ~ 7,
grepl("mt-Cytb", GENE) ~ 8,
grepl("mt-Co1", GENE) ~ 9,
grepl("mt-Co2", GENE) ~ 10,
grepl("mt-Co3", GENE) ~ 11,
grepl("mt-Atp6", GENE) ~ 12,
TRUE ~ 13
))
```
```{r}
sims_sum = plot_sims %>%
select(GENE_LABEL, RATIO) %>%
group_by(GENE_LABEL) %>%
summarise(SIM_MEDIAN = median(log10(RATIO)), SIM_AVG = mean(log10(RATIO)))
```
```{r}
plot_obs = cond_filtered_sig_analysis %>%
select(STRAIN, TISSUE, AGE_BIN, GENE, OBS_HNHS) %>%
rename(RATIO = OBS_HNHS) %>%
mutate(TYPE = "Observed") %>%
mutate(GENE_LABEL = case_when(grepl("mt-Nd1", GENE) ~ 1,
grepl("mt-Nd2", GENE) ~ 2,
grepl("mt-Nd3", GENE) ~ 3,
GENE == "mt-Nd4" ~ 4,
GENE == "mt-Nd4l" ~ 5,
grepl("mt-Nd5", GENE) ~ 6,
grepl("mt-Nd6", GENE) ~ 7,
grepl("mt-Cytb", GENE) ~ 8,
grepl("mt-Co1", GENE) ~ 9,
grepl("mt-Co2", GENE) ~ 10,
grepl("mt-Co3", GENE) ~ 11,
grepl("mt-Atp6", GENE) ~ 12,
TRUE ~ 13
))
```
```{r}
obs_sum = plot_obs %>%
select(GENE_LABEL, RATIO) %>%
group_by(GENE_LABEL) %>%
summarise(OBS_MEDIAN = median(log10(RATIO)), OBS_AVG = mean(log10(RATIO)))
```
```{r}
plotting_df = rbind(plot_obs, plot_sims)
plotting_df$TYPE = factor(plotting_df$TYPE, level = c("Simulated", "Observed"))
```
```{r}
obs_points = cond_filtered_sig_analysis %>%
select(STRAIN, TISSUE, AGE_BIN, GENE, P_ADJ, OBS_HNHS) %>%
mutate(SIG = ifelse(P_ADJ < 0.01, "Y", "N")) %>%
mutate(SIG = ifelse(STRAIN == "FVB" & TISSUE == "Brain" & AGE_BIN == "OLD" & GENE == "mt-Co1", "N", SIG)) %>%
mutate(GENE_LABEL = case_when(grepl("mt-Nd1", GENE) ~ 1,
grepl("mt-Nd2", GENE) ~ 2,
grepl("mt-Nd3", GENE) ~ 3,
GENE == "mt-Nd4" ~ 4,
GENE == "mt-Nd4l" ~ 5,
grepl("mt-Nd5", GENE) ~ 6,
grepl("mt-Nd6", GENE) ~ 7,
grepl("mt-Cytb", GENE) ~ 8,
grepl("mt-Co1", GENE) ~ 9,
grepl("mt-Co2", GENE) ~ 10,
grepl("mt-Co3", GENE) ~ 11,
grepl("mt-Atp6", GENE) ~ 12,
TRUE ~ 13
))
```
Plotting the violins
```{r}
genes_violins = ggplot(data = plotting_df, aes(y = log10(RATIO), x = factor(GENE_LABEL), fill = TYPE)) +
geom_split_violin(trim = FALSE, alpha = 0.4) +
geom_point(data = sims_sum, inherit.aes = FALSE, aes(x = GENE_LABEL - 0.12, y = SIM_AVG), shape = 18 , size = 0.85, color = "red") +
geom_point(data = obs_sum, inherit.aes = FALSE, aes(x = GENE_LABEL + 0.1, y = OBS_AVG), shape = 18 , size = 0.85, color = "red") +
geom_point(data = obs_points, inherit.aes = FALSE, aes(x = GENE_LABEL + 0.35, y = log10(OBS_HNHS), color = STRAIN, shape = SIG), size = 0.7) +
geom_hline(yintercept = 0, linetype = "dashed", size = 0.5, color = "gray") +
ylab("log10(hN/hS)") +
xlab("Gene") +
scale_fill_manual(values = c("black","white")) +
scale_shape_manual(label = c("N.S.", "Sig"), values = c(1, 19)) +
scale_color_manual(values = c("B6" = "#1d457f" , "AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) +
scale_x_discrete(labels = c("mt-Nd1", "mt-Nd2", "mt-Nd3", "mt-Nd4", "mt-Nd4l", "mt-Nd5", "mt-Nd6", "mt-Cytb", "mt-Co1", "mt-Co2", "mt-Co3", "mt-Atp6", "mt-Atp8")) +
theme_bw() +
theme(text = element_text(family = "sans"),
axis.title.y = element_text(size = 11),
axis.text.y=element_text(size = 11),
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
legend.position = "none")
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/")
pdf(paste(outdir_figures,"genes_selection_dist.pdf",sep=""), width=4.25,height=3.75)
print(genes_violins)
dev.off()
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/")
pdf(paste(outdir_figures,"leg_genes_selection_dist.pdf",sep=""), width=9.5,height=3.75)
print(genes_violins + theme(legend.position = "bottom") + guides(shape = guide_legend(override.aes = list(size = 2)), color = guide_legend(override.aes = list(size = 2))))
dev.off()
```
Comparing the averages of sims v obs per gene
```{r}
plotting_diff = sims_sum %>%
left_join(obs_sum, by = "GENE_LABEL") %>%
select(GENE_LABEL, SIM_AVG, OBS_AVG) %>%
mutate(DIFF = OBS_AVG - SIM_AVG)
```
Running our statistical tests:
1) Are our simulation log10(means) non-zero?
One sample t-test for each gene with a BH correction -- double checked by running a t-test on the genes independently
```{r}
plot_sims %>%
mutate(LOG10_RATIO = log10(RATIO)) %>%
select(GENE, LOG10_RATIO) %>%
group_by(GENE) %>%
do(tidy(t.test(.$LOG10_RATIO, alternative = "two.sided"))) %>%
mutate(p_adj = p.adjust(p.value, method = "BH")) %>%
mutate(SIG_STATUS = ifelse(p_adj < 0.01, "SIG", "N.S."))
```
2) Are our sims and obs distributions different? Wilcoxon Rank Sum Test with BH correction
```{r}
sims_test = plot_sims %>%
select(GENE, RATIO) %>%
mutate(LOG10_RATIO = log10(RATIO)) %>%
mutate(LABEL = paste("SIM", GENE, sep = "_")) %>%
select(LABEL, GENE, LOG10_RATIO)
```
```{r}
obs_test = plot_obs %>%
select(GENE, RATIO) %>%
mutate(LOG10_RATIO = log10(RATIO)) %>%
mutate(LABEL = paste("OBS", GENE, sep = "_")) %>%
select(LABEL, GENE, LOG10_RATIO)
```
```{r}
two_sample_t_test_df = rbind(obs_test, sims_test)
```
```{r}
groups = list(c("SIM_mt-Atp6", "OBS_mt-Atp6"),
c("SIM_mt-Atp8", "OBS_mt-Atp8"),
c("SIM_mt-Co1", "OBS_mt-Co1"),
c("SIM_mt-Co2", "OBS_mt-Co2"),
c("SIM_mt-Co3", "OBS_mt-Co3"),
c("SIM_mt-Cytb", "OBS_mt-Cytb"),
c("SIM_mt-Nd1", "OBS_mt-Nd1"),
c("SIM_mt-Nd2", "OBS_mt-Nd2"),
c("SIM_mt-Nd3", "OBS_mt-Nd3"),
c("SIM_mt-Nd4", "OBS_mt-Nd4"),
c("SIM_mt-Nd4l", "OBS_mt-Nd4l"),
c("SIM_mt-Nd5", "OBS_mt-Nd5"),
c("SIM_mt-Nd6", "OBS_mt-Nd6")
)
```
```{r}
combos = groups %>%
set_names(map_chr(., ~ paste(., collapse = "_")))
```
```{r}
two_sample_p_values = map_df(combos, function(y) {
filter(two_sample_t_test_df, LABEL %in% y) %>%
t.test(LOG10_RATIO ~ LABEL, data = .) %>%
broom::tidy()
}, .id = "contrast") %>%
mutate(p_adj = p.adjust(p.value, method = "BH")) %>%
mutate(SIG_STATUS = ifelse(p_adj < 0.01, "SIG", "N.S."))
sig_p_val = two_sample_p_values %>%
filter(SIG_STATUS == "SIG") %>%
mutate(X_POS = 0.35) %>%
mutate(GENE_LABEL = case_when(grepl("mt-Nd1", contrast) ~ 1,
grepl("mt-Nd2", contrast) ~ 2,
grepl("mt-Nd3", contrast) ~ 3,
grepl("mt-Nd4l", contrast) ~ 5,
grepl("mt-Nd5", contrast) ~ 6,
grepl("mt-Nd6", contrast) ~ 7,
grepl("mt-Cytb", contrast) ~ 8,
grepl("mt-Co1", contrast) ~ 9,
grepl("mt-Co2", contrast) ~ 10,
grepl("mt-Co3", contrast) ~ 11,
grepl("mt-Atp6", contrast) ~ 12,
grepl("SIM_mt-Nd4_OBS_mt-Nd4", contrast) ~ 4,
TRUE ~ 13
))
```
Plotting difference in means with statistical test info
```{r}
avg_diff = ggplot(plotting_diff, aes(y = factor(GENE_LABEL), x = DIFF))
avg_diff = avg_diff +
geom_point() +
geom_segment(aes(y = GENE_LABEL, yend = GENE_LABEL, x = 0, xend=DIFF)) +
geom_point(data = sig_p_val, inherit.aes = FALSE, aes(x = X_POS, y = GENE_LABEL), shape = 8, size = 1) +
geom_text(aes(x = 0.25, y = 12.5, label = "two sample t-test\n BH adj. p-val < 0.01"), size = 2.75) +
theme_bw() +
scale_y_discrete(labels = c("mt-Nd1", "mt-Nd2", "mt-Nd3", "mt-Nd4", "mt-Nd4l", "mt-Nd5", "mt-Nd6", "mt-Cytb", "mt-Co1", "mt-Co2", "mt-Co3", "mt-Atp6", "mt-Atp8")) +
scale_x_continuous(limits = c(0, 0.4), breaks = c(0, 0, 0.1, 0.2, 0.3, 0.4)) +
xlab("Gene") +
ylab("Difference of Average hN/hS \n[log10]") +
theme_bw() +
theme(text = element_text(family = "sans"),
axis.title.y = element_text(size = 11),
axis.title.x = element_text(size = 11),
axis.text.y=element_text(size = 10),
axis.text.x = element_text(size = 10),
legend.position = "none")
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/")
pdf(paste(outdir_figures,"avg_diff_genes.pdf",sep=""), width=2.75,height=3.75)
print(avg_diff)
dev.off()
```
Result: Not significant p-value ~ 0.5 and p-value = ~ 0.3 with a fisher's exact test
```{r}
chi_df = cond_filtered_sig_analysis %>%
select(STRAIN, OBS_HNHS, P_ADJ) %>%
mutate(SEL = ifelse(OBS_HNHS < 1, "N", "P")) %>%
filter(P_ADJ < 0.01) %>%
mutate(STRAIN_LABEL = ifelse(STRAIN == "B6", "WT", "CONPLASTIC")) %>%
select(STRAIN_LABEL, SEL) %>%
group_by(STRAIN_LABEL, SEL) %>%
summarise(COUNT = n()) %>%
ungroup() %>%
pivot_wider(names_from = STRAIN_LABEL, values_from = COUNT) %>%
column_to_rownames(var = "SEL")
chi_mat = as.matrix(chi_df)
fisher.test(chi_mat)
```
```{r}
cond_filtered_sig_analysis %>%
select(STRAIN, OBS_HNHS, P_ADJ) %>%
mutate(SEL = ifelse(OBS_HNHS < 1, "N", "P")) %>%
filter(P_ADJ < 0.01) %>%
filter(SEL == "N") %>%
group_by(STRAIN, SEL) %>%
summarise(COUNT = n())
#P: 56 - 1 (bc of that weird FVB hit) = 55
#N: 12
# total sig hits: 67
```