---
title: "R Notebook"
output: html_notebook
---
```{r}
library(tidyverse)
library(ggplot2)
library(PNWColors)
```
```{r}
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/")
outdir_figures = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/figures/"
outdir_files = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/files/"
supertable_file ="/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt"
supertable = read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE)
```
Normalize for sequencing depth:
```{r}
norm_seq_depth = supertable %>%
select(SAMPLE, STRAIN, TISSUE, AGE_BIN, REF, ALT, START, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>%
#unique here to get rid of redundant mutations present at pos that overlap in genes
unique() %>%
select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>%
group_by(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>%
summarise(SAMPLE_MUT_COUNT_AT_POS = sum(ALT_ALLELE_DEPTH)) %>%
ungroup() %>%
select(STRAIN, TISSUE, AGE_BIN, START, SAMPLE_MUT_COUNT_AT_POS, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>%
group_by(STRAIN, TISSUE, AGE_BIN, START, CONDITION_MUT_FREQ_AT_POS) %>%
summarise(CONDITION_MUT_COUNT_AT_POS = sum(SAMPLE_MUT_COUNT_AT_POS), CONDITION_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>%
ungroup() %>%
group_by(START) %>%
mutate(MIN_READ_DEPTH_AT_POS = min(CONDITION_READ_DEPTH_AT_POS)) %>%
mutate(FLOOR_MIN_MUT_FREQ_AT_POS = 1/MIN_READ_DEPTH_AT_POS) %>%
mutate(CONDITION_MUT_COUNT_AT_POS = ifelse(CONDITION_MUT_FREQ_AT_POS < FLOOR_MIN_MUT_FREQ_AT_POS, 0, CONDITION_MUT_COUNT_AT_POS))
```
Plotting time:
Color palette
```{r}
library(PNWColors)
bay_pal <- pnw_palette(name="Bay", type="discrete")
```
Are there high heteroplasmic areas in the OriL?
```{r}
norm_seq_depth %>%
filter(START > 5158, START < 5191) %>%
#this was the arbitrary frequency we define as being a high frequency position
filter(CONDITION_MUT_FREQ_AT_POS > 0.001) %>%
select(STRAIN, TISSUE, AGE_BIN, START) %>%
unique()
#5170 stands out as a high heteroplasmic variant
```
Pinpointing the high frequency region to the OriL region; we plot the neighboring regions to the OriL in order to contrast the mutation frequency across regions
```{r}
ori_reg = norm_seq_depth %>%
#this was the arbitrary frequency we define as being a high frequency position
filter(CONDITION_MUT_FREQ_AT_POS < 0.001) %>%
mutate(NORM_CONDITION_MUT_FREQ_AT_POS = CONDITION_MUT_COUNT_AT_POS/CONDITION_READ_DEPTH_AT_POS) %>%
ungroup() %>%
select(STRAIN, TISSUE, AGE_BIN, START, NORM_CONDITION_MUT_FREQ_AT_POS) %>%
#spans mt-Ta to mt-Ty
filter(START > 5000, START < 5400) %>%
filter(NORM_CONDITION_MUT_FREQ_AT_POS > 0)
ori_reg$STRAIN = factor(ori_reg$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB"))
ori_reg$AGE_BIN = factor(ori_reg$AGE_BIN, level = c("YOUNG", "OLD"))
```
```{r}
ori_reg_plot = ggplot(ori_reg %>%
mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Young", "OLD" = "Old")), aes(x = START, y = NORM_CONDITION_MUT_FREQ_AT_POS, color = TISSUE))
ori_reg_plot = ori_reg_plot +
geom_point(size = 0.5, alpha = 0.7) +
#this highlights where the OriL is in the region
geom_segment(x = 5159, xend = 5190, y = 0, yend = 0, color = "black") +
theme_bw() +
facet_grid(STRAIN~AGE_LABEL) +
ylab("Mutation frequency at position") +
xlab("Position on the mt-genome (bp)") +
scale_color_manual(name = "Tissue", values= bay_pal[c(1,5,4)]) +
guides(color = guide_legend(override.aes = list(size = 3))) +
theme(strip.background=element_blank(),
panel.grid=element_blank(),
text = element_text(family = "sans"),
axis.title = element_text(size = 10),
strip.text.x = element_text(size = 10, vjust = 1),
strip.text.y = element_text(size = 10, vjust = 1),
axis.text.y=element_text(size = 8),
axis.text.x=element_text(size = 8, angle = 45, vjust = 1, hjust = 1),
legend.position = "bottom")
pdf(paste(outdir_figures,"/mut_freq_oril_reg.pdf",sep=""),width=6,height=4)
print(ori_reg_plot)
dev.off()
```
Zooming into the OriL to find where in the OriL we have this high frequency cluster
```{r}
oriL_zoomies = norm_seq_depth %>%
#this was the arbitrary frequency we define as being a high frequency position
filter(CONDITION_MUT_FREQ_AT_POS < 0.001) %>%
mutate(NORM_CONDITION_MUT_FREQ_AT_POS = CONDITION_MUT_COUNT_AT_POS/CONDITION_READ_DEPTH_AT_POS) %>%
ungroup() %>%
select(STRAIN, TISSUE, AGE_BIN, START, NORM_CONDITION_MUT_FREQ_AT_POS) %>%
#the OriL region
filter(START>5158, START<5191)
oriL_zoomies$STRAIN = factor(oriL_zoomies$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB"))
oriL_zoomies$AGE_BIN = factor(oriL_zoomies$AGE_BIN, level = c("YOUNG", "OLD"))
```
```{r}
oriL_zoomies_plot = ggplot(oriL_zoomies %>%
mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Young", "OLD" = "Old")), aes(x = START, y = NORM_CONDITION_MUT_FREQ_AT_POS, color = TISSUE))
oriL_zoomies_plot = oriL_zoomies_plot +
geom_point(size = 0.5, alpha = 0.7) +
#geom_point(x = 9818, y = 0.00075, shape = 8, size = 0.7, color = "magenta") +
theme_bw() +
facet_grid(STRAIN~AGE_LABEL) +
ylab("Mutation frequency at position") +
xlab("Position on the mt-genome (bp)") +
scale_color_manual(name = "Tissue", values= bay_pal[c(1,5,4)]) +
scale_x_continuous(breaks = seq(5159, 5190,1)) +
guides(color = guide_legend(override.aes = list(size = 3))) +
theme(strip.background=element_blank(),
panel.grid=element_blank(),
text = element_text(family = "sans"),
axis.title = element_text(size = 8),
strip.text.x = element_text(size = 8, vjust = 1),
strip.text.y = element_text(size = 8, vjust = 1),
axis.text.y=element_text(size = 6),
axis.text.x=element_text(size = 4, angle = 45, vjust = 1, hjust = 1),
legend.position = "bottom")
print(oriL_zoomies_plot)
pdf(paste(outdir_figures,"/mut_freq_oriL.pdf",sep=""),width=4,height=3)
print(oriL_zoomies_plot)
dev.off()
```
Main figure OriL zoom in
We average the mutation frequency at each position across tissues
```{r}
main_fig_oriL = oriL_zoomies %>%
ungroup() %>%
filter(START > 5167,START < 5191) %>%
select(STRAIN, AGE_BIN, START, NORM_CONDITION_MUT_FREQ_AT_POS) %>%
group_by(STRAIN, AGE_BIN, START) %>%
summarise(STRAIN_AVG_FREQ = mean(NORM_CONDITION_MUT_FREQ_AT_POS)) %>%
mutate(STRAIN_LABEL = ifelse(STRAIN == "B6", "B6", "Conplastic")) %>%
filter(STRAIN_AVG_FREQ > 0) %>%
mutate(X_POS = ifelse(AGE_BIN == "YOUNG", START - 0.15, START + 0.15))
```
```{r}
X_LABEL = (supertable %>%
filter(START > 5167, START < 5191) %>%
select(START, REF) %>%
unique() %>%
filter(nchar(REF) == 1) %>%
mutate(X_LABEL = paste(REF, START, sep = "")))$X_LABEL
```
```{r}
main_fig_oriL_plot = ggplot(main_fig_oriL, aes(x = X_POS, y = STRAIN_AVG_FREQ, color = STRAIN, shape = AGE_BIN))
main_fig_oriL_plot = main_fig_oriL_plot +
geom_point(size = 0.3) +
geom_point(aes(x = 5170, y = 6e-4), shape = "*", color = "#ec008c", size = 2.5) +
theme_bw(base_size = 6) +
facet_wrap(STRAIN_LABEL~., nrow = 2) +
ylab("Mutation frequency") +
xlab("Position (bp)") +
scale_color_manual(values = c("B6"= "#1d457f","AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) +
scale_shape_manual(labels = c("Young", "Old") , values = c(1,19)) +
scale_x_continuous(breaks = seq(5168,5190,1), labels = X_LABEL) +
scale_y_continuous(labels = function(x) format(x, scientific = TRUE)) +
theme(strip.background=element_blank(),
panel.grid=element_blank(),
text = element_text(family = "sans"),
axis.title = element_text(size = 5),
strip.text.x = element_text(size = 6, vjust = 1),
strip.text.y = element_text(size = 6, vjust = 1),
axis.text.y=element_text(size = 3.5),
axis.text.x=element_text(size = 3.5, angle = 45, vjust = 1, hjust = 1),
legend.position = "none")
pdf(paste(outdir_figures,"/main_fig_oriL_plot.pdf",sep=""),width=1.85,height=1.25)
print(main_fig_oriL_plot)
dev.off()
pdf(paste(outdir_figures,"/leg_main_fig_oriL_plot.pdf",sep=""),width=3,height=1.75)
print(main_fig_oriL_plot + guides(shape = guide_legend(override.aes = list(size = 2)), color = guide_legend(override.aes = list(size = 2))) + theme(legend.position = "right", legend.key.size = unit(0.25, "cm")))
dev.off()
```
We want to consolidate the mutations in the A repeat region because we can't with certainty narrow down to the exact base
```{r}
A_repeat_region_mut_freq = norm_seq_depth %>%
#this was the arbitrary frequency we define as being a high frequency position
filter(CONDITION_MUT_FREQ_AT_POS < 0.001) %>%
ungroup() %>%
#zoom into the hotspot region
filter(START > 5170, START < 5182) %>%
select(STRAIN, TISSUE, AGE_BIN, CONDITION_MUT_COUNT_AT_POS, CONDITION_READ_DEPTH_AT_POS) %>%
group_by(STRAIN, TISSUE, AGE_BIN) %>%
summarise(REGION_MUT_COUNT = sum(CONDITION_MUT_COUNT_AT_POS), REGION_READ_DEPTH = sum(CONDITION_READ_DEPTH_AT_POS)) %>%
mutate(REGION_MUT_FREQ = REGION_MUT_COUNT/REGION_READ_DEPTH) %>%
ungroup() %>%
select(STRAIN, AGE_BIN, REGION_MUT_FREQ) %>%
group_by(STRAIN, AGE_BIN) %>%
#averaging across tissues in a strain
summarise(STRAIN_AVG_FREQ = mean(REGION_MUT_FREQ)) %>%
filter(STRAIN_AVG_FREQ > 0) %>%
mutate(STRAIN_LABEL = ifelse(STRAIN == "B6", "B6", "Conplastic")) %>%
mutate(START = 5171)
```
```{r}
nonA_repeat_region = main_fig_oriL %>%
filter(START < 5171 | START > 5181) %>%
select(STRAIN, AGE_BIN, START, STRAIN_AVG_FREQ, STRAIN_LABEL)
```
Merging our nonA and A repeat regions
```{r}
region_info_df = rbind(nonA_repeat_region, A_repeat_region_mut_freq)
```
We need to create a pseudo START so that we can avoid a gap in plotting
```{r}
start_pseudo = supertable %>%
filter(START > 5167, START < 5191) %>%
select(START, REF) %>%
unique() %>%
filter(nchar(REF) == 1) %>%
mutate(COMP_REF = case_when(REF == "C" ~ "G",
REF == "G" ~ "C",
REF == "T" ~ "A",
TRUE ~ "T")) %>%
mutate(X_LABEL = paste(COMP_REF, START, sep = "")) %>%
filter(!(START > 5171 & START < 5182)) %>%
mutate(START_PSEUDO = seq(1,13)) %>%
mutate(X_LABEL = ifelse(START == 5171, "T(5171-5181)", X_LABEL))
```
```{r}
region_plotting_df = region_info_df %>%
left_join(start_pseudo, by = ("START")) %>%
mutate(X_POS = ifelse(AGE_BIN == "YOUNG", START_PSEUDO - 0.15, START_PSEUDO + 0.15))
region_plotting_df$STRAIN = factor(region_plotting_df$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB"))
region_plotting_df$AGE_BIN = factor(region_plotting_df$AGE_BIN, level = c("YOUNG", "OLD"))
```
Plotting our region mut freq
```{r}
oriL_region_plot = ggplot(region_plotting_df, aes(x = X_POS, y = STRAIN_AVG_FREQ, color = STRAIN, shape = AGE_BIN))
oriL_region_plot = oriL_region_plot +
geom_point(size = 0.3) +
geom_point(aes(x = 3, y = 6e-4), shape = "*", color = "#ec008c", size = 2.5) +
theme_bw(base_size = 6) +
facet_wrap(STRAIN_LABEL~., nrow = 2) +
ylab("Mutation frequency") +
xlab("Position (bp)") +
scale_color_manual(values = c("B6"= "#1d457f","AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) +
scale_shape_manual(labels = c("Young", "Old") , values = c(1,19)) +
scale_x_continuous(breaks = seq(1,13), labels = start_pseudo$X_LABEL) +
scale_y_continuous(labels = function(x) format(x, scientific = TRUE)) +
theme(strip.background=element_blank(),
panel.grid=element_blank(),
text = element_text(family = "sans"),
axis.title = element_text(size = 5),
strip.text.x = element_text(size = 6, vjust = 1),
strip.text.y = element_text(size = 6, vjust = 1),
axis.text.y=element_text(size = 3.5),
axis.text.x=element_text(size = 3.5, angle = 45, vjust = 1, hjust = 1),
legend.position = "none")
pdf(paste(outdir_figures,"/region_oriL_plot.pdf",sep=""),width=1.85,height=1.25)
print(oriL_region_plot)
dev.off()
pdf(paste(outdir_figures,"/leg_region_oriL_plot.pdf",sep=""),width=3,height=1.75)
print(main_fig_oriL_plot + guides(shape = guide_legend(override.aes = list(size = 2)), color = guide_legend(override.aes = list(size = 2))) + theme(legend.position = "right", legend.key.size = unit(0.25, "cm")))
dev.off()
```
Types of mutations at the Ori region
```{r}
ori_mut_type = supertable %>%
select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, REF, ALT, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>%
unique() %>%
filter(CONDITION_MUT_FREQ_AT_POS < 1e-3) %>%
filter(START > 5167, START < 5191) %>%
mutate(MUTATION = paste(REF,ALT, sep = ">")) %>%
select(STRAIN, TISSUE, AGE_BIN, START, MUTATION, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS) %>%
group_by(STRAIN, TISSUE, AGE_BIN, START, MUTATION) %>%
summarise(COND_ALLELE_COUNT = sum(ALT_ALLELE_DEPTH), COND_READ_DEPTH = sum(READ_DEPTH_AT_POS)) %>%
filter(COND_ALLELE_COUNT > 0) %>%
group_by(STRAIN, TISSUE, AGE_BIN) %>%
mutate(TOTAL_MUTS = sum(COND_ALLELE_COUNT)) %>%
select(STRAIN, TISSUE, AGE_BIN, MUTATION, COND_ALLELE_COUNT, TOTAL_MUTS) %>%
ungroup() %>%
group_by(STRAIN, TISSUE, AGE_BIN, MUTATION, TOTAL_MUTS) %>%
#here we count how many of each allele is present across the entire region
summarise(MUT_TYPE_TOTAL_COUNT = sum(COND_ALLELE_COUNT)) %>%
mutate(MUT_PROP = MUT_TYPE_TOTAL_COUNT/TOTAL_MUTS)
ori_mut_type$STRAIN = factor(ori_mut_type$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB"))
ori_mut_type$AGE_BIN = factor(ori_mut_type$AGE_BIN, level = c("YOUNG", "OLD"))
```
```{r}
supertable %>%
select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, REF, ALT, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>%
unique() %>%
filter(CONDITION_MUT_FREQ_AT_POS < 1e-3) %>%
filter(START == 5169) %>%
mutate(MUTATION = paste(REF,ALT, sep = ">")) %>%
select(STRAIN, TISSUE, AGE_BIN, START, MUTATION, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS) %>%
group_by(STRAIN, TISSUE, AGE_BIN, START, MUTATION) %>%
summarise(COND_ALLELE_COUNT = sum(ALT_ALLELE_DEPTH), COND_READ_DEPTH = sum(READ_DEPTH_AT_POS)) %>%
filter(COND_ALLELE_COUNT > 0) %>%
mutate(MUT_FREQ = COND_ALLELE_COUNT/COND_READ_DEPTH)
```
```{r}
ori_mut_type_plot = ggplot(ori_mut_type, aes(x = STRAIN, y = MUT_PROP, fill = MUTATION)) +
geom_bar(position = "stack", stat = "identity") +
facet_grid(AGE_BIN~TISSUE) +
xlab("Strain") +
ylab("Allele mutation frequency") +
theme_bw(base_size = 10) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
strip.background = element_blank())
pdf(paste(outdir_figures,"/mut_type_oril_plot.pdf",sep=""),width=5,height=5)
print(ori_mut_type_plot)
dev.off()
```
```{R}
supertable %>%
filter(START == 5170)
```