---
title: "R Notebook"
output: html_notebook
---
```{r}
library(tidyverse)
library(ggplot2)
library(scales)
library(ggridges)
```
```{r}
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/")
outdir_figures = "figures/"
outdir_files = "files/"
#this file contains REVERSION haplotype sites (ALT > B6 REF)
#normalized_mut_freq_per_pos_file = "../files/normalized_mut_freq_per_pos_w_reversions.txt"
normalized_mut_freq_per_pos_file = "files/corr_NUMT_freqs_normalized_mut_freq_per_pos_w_reversions.txt"
normalized_mut_freq_per_pos = read.table(normalized_mut_freq_per_pos_file, header = TRUE, stringsAsFactors = FALSE)
#we're using this table to get the reversion mutation type at the haplotype positions
contingency_table_entries_file = "files/contingency_tables_entries.txt"
contingency_table_entries = read.table(contingency_table_entries_file, header = TRUE, stringsAsFactors = FALSE) %>%
filter(AGE_BIN == "YOUNG")
```
I took these out of NZB because they are the ins haplotype sites in NZB
```{r}
filtered_normalized_mut_freq_pos = normalized_mut_freq_per_pos %>%
filter(!(STRAIN == "NZB" & START == 9819)) %>%
filter(!(STRAIN == "NZB" & START == 5203))
```
We use this file to label the genes the reversions are in and to create our mt-genome map
```{r}
coordinates_file <- "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/Mouse_Mt_Genome_Coordinates"
coordinates <- read.table(coordinates_file, stringsAsFactors = FALSE, sep = "\t")
```
Formatting the coordinates file:
```{r}
#adding the D-Loop coordinates to our file
coordinates[nrow(coordinates) + 1, ] <- c("D-Loop", 15422, 16299)
#adding column names to dataframe
colnames(coordinates) <- c("GENE", "START", "END")
coordinates$START <- as.numeric(coordinates$START)
coordinates$END <- as.numeric(coordinates$END)
#so things don't get wonky if the coordinates are read as strings -- also we convert our coordinates to be 0-indexed
coordinates$START <- as.numeric(coordinates$START) - 1
coordinates$END <- as.numeric(coordinates$END) - 1
coordinates = coordinates %>%
mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA",
grepl("mt-T", GENE) ~ "tRNA",
GENE == "D-Loop" ~ "D-Loop",
GENE == "mt-OL" ~ "OriL",
TRUE ~ "Protein"))
```
Creating the fields that will hold the info we want to include in the figure
```{r}
gene_labeller = function(position){
gene_label = coordinates[((coordinates$START <= position) & (coordinates$END >= position)),]$GENE
if (length(gene_label) < 1){
gene_label = "intergene_region"
}
#in the case of the overlapping genes, assign the position to the first gene
if (length(gene_label) > 1){
gene_label = gene_label[1]
}
return(gene_label)
}
```
Pivot the table so that we can calculate the mut freq difference per position with age
```{r}
delta_df = filtered_normalized_mut_freq_pos %>%
ungroup() %>%
select(STRAIN, TISSUE, AGE_BIN, START, SITE_TYPE, NORM_MUT_FREQ_AT_POS) %>%
pivot_wider(names_from = AGE_BIN, values_from = NORM_MUT_FREQ_AT_POS) %>%
mutate(DELTA = OLD - YOUNG) %>%
filter(DELTA != 0) %>%
mutate(CHANGE_TYPE = ifelse(DELTA > 0, "Increase", "Decrease")) %>%
#we don't need B6 in this analysis since we're interested in the conplastic haplotype sites
filter(STRAIN != "B6")
delta_df$STRAIN = factor(delta_df$STRAIN, level = c("AKR", "ALR", "FVB", "NZB"))
delta_df = delta_df %>%
mutate(DUMMY_AGE_LABEL = recode(TISSUE, "Brain" = 1,
"Heart" = 2,
"Liver" = 3))
```
We need to find the empirical p-value for all of the haplotype sites that change in freq with age
```{r}
haplo_sites_delta = delta_df %>%
filter(SITE_TYPE == "HAPLO_SITE")
```
1) We calculate the total number of background sites that increase or decrease in freq with age
```{r}
total_sites_per_change_type = delta_df %>%
filter(SITE_TYPE == "NOT_HAPLO") %>%
select(STRAIN, TISSUE, CHANGE_TYPE) %>%
group_by(STRAIN, TISSUE, CHANGE_TYPE) %>%
summarise(TOTAL_COUNT = n())
```
```{r}
delta_background = delta_df %>%
filter(SITE_TYPE != "HAPLO_SITE")
```
Now let's create a function that will take the delta and condition info from the haplotype delta df and output the p-value
```{r}
p_value_calculator = function(delta, strain, tissue, change){
#this part works :)
total_count = (total_sites_per_change_type %>%
filter(STRAIN == strain, TISSUE == tissue, CHANGE_TYPE == change))$TOTAL_COUNT
df = delta_background %>%
filter(STRAIN == strain, TISSUE == tissue, CHANGE_TYPE == change)
if(change == "Decrease"){
count_above_haplo_delta = (df %>%
mutate(FLAG = ifelse(DELTA < delta, 1, 0)) %>%
select(FLAG) %>%
summarise(count_above_haplo_delta = sum(FLAG)))$count_above_haplo_delta}
if(change == "Increase"){
count_above_haplo_delta = (df %>%
mutate(FLAG = ifelse(DELTA > delta, 1, 0)) %>%
select(FLAG) %>%
summarise(count_above_haplo_delta = sum(FLAG)))$count_above_haplo_delta}
#this works! tested with individual cases
p_val = count_above_haplo_delta/total_count
return(p_val)
}
```
```{r}
haplo_sites_p_vals = haplo_sites_delta %>%
select(STRAIN, TISSUE, START, DELTA, CHANGE_TYPE) %>%
rowwise() %>%
mutate(P_VAL = p_value_calculator(DELTA, STRAIN, TISSUE, CHANGE_TYPE))
```
```{r}
adjusted_haplo_sites_p_vals = haplo_sites_p_vals %>%
mutate(p_adj = p.adjust(P_VAL, method = "BH")) %>%
mutate(SIG_STATUS = ifelse(p_adj < 0.01, "SIG", "N.S."))
```
```{r}
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/")
write.table(adjusted_haplo_sites_p_vals, file = paste(outdir_files,"sig_rev_change_in_freq_w_age.txt", sep = ""), sep = "\t", quote = F, row.names = T)
```
```{r}
plotting_df = adjusted_haplo_sites_p_vals %>%
rename(POS = START) %>%
left_join(contingency_table_entries, by = c("STRAIN", "TISSUE", "POS")) %>%
select(STRAIN, TISSUE, POS, DELTA, SIG_STATUS, CHANGE_TYPE, ORG_CONPLASTIC_ALLELE, REV_B6_ALLELE) %>%
rowwise() %>%
mutate(GENE = gene_labeller(POS)) %>%
mutate(MUT_TYPE = paste(ORG_CONPLASTIC_ALLELE, REV_B6_ALLELE, sep = ">")) %>%
mutate(MUT_TYPE_LABEL = recode(MUT_TYPE, "G>A" = "G>A/C>T",
"G>C" = "G>C/C>G",
"G>T" = "G>T/C>A",
"T>C" = "T>C/A>G",
"T>G" = "T>G/A>C",
"T>A" = "T>A/A>T"))
```
Text for the increasing alleles
```{r}
inc_label_text = plotting_df %>%
filter(CHANGE_TYPE == "Increase", SIG_STATUS == "SIG") %>%
select(STRAIN, TISSUE, GENE, POS, DELTA, MUT_TYPE_LABEL) %>%
group_by(STRAIN, POS, GENE, MUT_TYPE_LABEL) %>%
mutate(Y_POS = max(DELTA)) %>%
ungroup() %>%
select(STRAIN, GENE, POS, Y_POS, MUT_TYPE_LABEL) %>%
group_by(STRAIN, GENE, POS, Y_POS, MUT_TYPE_LABEL) %>%
summarise(TISSUE_COUNT = n()) %>%
mutate(X_POS = POS) %>%
mutate(LABEL = ifelse(TISSUE_COUNT > 2, paste(STRAIN, GENE, MUT_TYPE_LABEL, paste("(", TISSUE_COUNT,")"), sep = "\n"), "")) %>%
mutate(Y_POS = ifelse(STRAIN == "NZB" & GENE == "mt-Co2", 1e-1, Y_POS))
```
```{R}
#to note what other genes had a significant increasing reversion (these were tissue specific rev)
inc_sec_label_text = plotting_df %>%
filter(CHANGE_TYPE == "Increase", SIG_STATUS == "SIG") %>%
select(STRAIN, TISSUE, GENE, POS, DELTA, MUT_TYPE_LABEL) %>%
group_by(STRAIN, POS, GENE, MUT_TYPE_LABEL) %>%
mutate(Y_POS = max(DELTA)) %>%
ungroup() %>%
select(STRAIN, GENE, POS, Y_POS, MUT_TYPE_LABEL) %>%
group_by(STRAIN, GENE, POS, Y_POS, MUT_TYPE_LABEL) %>%
summarise(TISSUE_COUNT = n()) %>%
mutate(X_POS = POS) %>%
mutate(SEC_LABEL = ifelse(TISSUE_COUNT < 2, paste(GENE), "")) %>%
ungroup() %>%
select(X_POS, Y_POS, SEC_LABEL) %>%
distinct(SEC_LABEL, .keep_all = TRUE) %>%
mutate(Y_POS = ifelse(SEC_LABEL == "D-Loop", 1.1e-4, Y_POS - 5e-5)) %>%
mutate(X_POS = ifelse(SEC_LABEL == "D-Loop", 15500, X_POS ))
```
Now doing increasing pops
```{r}
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/")
library(PNWColors)
bay_pal <- pnw_palette(name="Bay", type="discrete")
inc_rev = ggplot(plotting_df %>%
filter(DELTA > 0)) +
geom_point(aes(x = POS, y = log10(DELTA), color = STRAIN, shape = SIG_STATUS, size = log10(DELTA)/4)) +
geom_text(data = inc_label_text, aes(x = ifelse(STRAIN == "AKR", X_POS + 500, X_POS - 800), y = log10(Y_POS), label = LABEL), size = 2.5) +
geom_text(data = inc_sec_label_text, aes(x = X_POS, y = log10(Y_POS), label = SEC_LABEL), size = 2.5) +
scale_shape_manual(name = "Significance", labels = c("N.S.", "Sig"), values = c(1, 19)) +
scale_color_manual(values = c("AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) +
#scale_y_continuous(limits = c(-6,0), labels=c("-6" = "1e-6", "-5" = "1e-5", "-4" = "1e-4", "-3" = "1e-3", "2" = "1e-2", "-1" = "1e-1", "0" = "0")) +
scale_y_continuous(limits = c(-7,0), breaks = seq(-6, 0 , by = 2), labels=c("-6" = "1e-6", "-4" = "1e-4", "2" = "1e-2", "0" = "")) +
#scale_y_log10() +
coord_cartesian(expand = FALSE,
xlim = c(0, 16299)) +
ylab("Delta Mutation Frequency \n (Old - Young) [log10]") +
xlab("") +
theme_bw() +
theme(legend.position = "none",
strip.background=element_blank(),
text = element_text(family = "sans"),
axis.ticks.x = element_blank())
pdf(paste(outdir_figures,"increasing_reversions.pdf",sep=""),width=9,height=2.5)
print(inc_rev)
dev.off()
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/")
pdf(paste(outdir_figures,"leg_reversions.pdf",sep=""),width=9,height=6)
print(inc_rev + guides(color = guide_legend(override.aes = list(size = 3)), shape = guide_legend(override.aes = list(size = 3))) + theme(legend.position = "right"))
dev.off()
```
```{r}
dec_rev = ggplot(plotting_df %>%
filter(DELTA < 0)) +
geom_point(aes(x = POS, y = log10(abs(DELTA)), color = STRAIN, shape = SIG_STATUS, size = log10(abs(DELTA))/4)) +
#no significant decreases :/
scale_shape_manual(name = "Significance", labels = c("N.S.", "Sig"), values = c(1, 19)) +
scale_color_manual(values = c("AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) +
scale_y_reverse(limits = c(0, -7), breaks = c(-6, -4, -2, 0), labels = c("-6" = "1e-6", "-4" = "1e-4", "-2" = "1e-2", "0" = "") ) +
#labels=c("-7" = "1e-7", "-5" = "1e-5", "-4.5" = "2.75e-5", "-4" = "1e-4", "-3.5" = "2.75e-4", "0" = "0")
coord_cartesian(expand = FALSE,
xlim = c(0, 16299)) +
scale_x_continuous(position = "top") +
ylab("Delta Mutation Frequency \n (Old - Young) [log10]") +
xlab("") +
theme_bw() +
theme(legend.position = "none",
strip.background=element_blank(),
text = element_text(family = "sans"),
axis.ticks.x = element_blank())
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/")
pdf(paste(outdir_figures, "decreasing_reversions.pdf",sep=""),width=9,height=2.5)
print(dec_rev)
dev.off()
```
Map of mt-genome for the aesthetic
```{r}
num_segs = length(coordinates$GENE)
#important part here is to include the very last coordinate on the mt-genome map otherwise we miss our "last x coord" at the end of our map
x = c(coordinates$START, 16299)
xs = c(x[1:num_segs], x[1:num_segs], x[-1], x[-1], x[1:num_segs])
ys = c(rep(0,num_segs), rep(1,num_segs), rep(1,num_segs), rep(0,num_segs), rep(0,num_segs))
idx = rep(seq(1,num_segs),5)
type = rep(coordinates$TYPE, 5)
mt_genome_map = data.frame(x_pos = xs, y = ys, idx = idx, type = type) %>%
arrange(idx)
#alternating the thickness of the protein regions to match the circular mt-genome map
mt_genome_map = mt_genome_map %>%
mutate(y = ifelse(type == "Protein", ifelse(y == 0, -0.25, 1.25), y))
```
Plotting the linear map of the mt-genome
```{r}
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/")
library(RColorBrewer)
lake_pal <- pnw_palette(name="Lake", type="discrete")
spectral = brewer.pal(11,"Spectral")
PRGn = brewer.pal(11,"PRGn")
set1 = brewer.pal(3,"Set1")
PuRd = brewer.pal(11,"PuRd")
Blues = brewer.pal(9, "Blues")
colors = c(Blues[9], set1[1], lake_pal[6], PuRd[6],PRGn[4])
#colors = c(blues[10],blues[3],spectral[1],spectral[8],spectral[11])
#lake_pal[c(3,7,1,6,5)]
#colors = c(lake_pal[c(3,7)],set1[1],lake_pal[6],PRGn[4])
linear_map = ggplot(mt_genome_map)
linear_map = linear_map +
geom_polygon(aes(x=x_pos, y=y, group=idx, fill=type)) +
theme_bw(base_size=16) +
theme(legend.position="None",
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.grid=element_blank()) +
scale_y_continuous("") +
scale_x_continuous("Position", breaks=seq(0,16000,4000)) +
scale_fill_manual(name = "Genomic Region", values = colors) +
theme(strip.background = element_blank(),
strip.text=element_blank(),
text = element_text(family = "sans"),
panel.border = element_blank(),
axis.title.x = element_text(size = 14),
legend.position = "none")
pdf(paste(outdir_figures,"linear_mtdna_map.pdf",sep=""),width=9,height=1)
print(linear_map)
dev.off()
```