---
title: "R Notebook"
output: html_notebook
---


```{r}
# Packages used by every later chunk: dplyr supplies %>% and the data verbs
# (filter, mutate, summarise, ...), ggplot2 supplies the plotting functions.
# They are attached here so the notebook runs from a fresh R session.
library(dplyr)
library(ggplot2)

# Root directory of the analysis. All paths below are derived from it, so it
# only has to be edited in one place. The trailing "/" is intentional.
project_dir <- "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/"
setwd(project_dir)

# Output locations for figures and derived files.
outdir_figures <- paste0(project_dir, "mutation_frequency_per_region/figures/")
outdir_files <- paste0(project_dir, "mutation_frequency_per_region/files/")

# Input table with one header row; strings are kept as character.
supertable_file <- paste0(project_dir, "input_files/supertable.txt")
supertable <- read.table(supertable_file, header = TRUE, stringsAsFactors = FALSE)

# Tab-separated gene coordinates without a header row (columns are named in
# the next chunk).
coordinates_file <- paste0(project_dir, "input_files/Mouse_Mt_Genome_Coordinates")
coordinates <- read.table(coordinates_file, stringsAsFactors = FALSE, sep = "\t")
```


```{r}
# Append the D-Loop as an extra row. A list (rather than c()) is used so the
# two coordinates stay numeric instead of the whole row being coerced to
# character.
coordinates[nrow(coordinates) + 1, ] <- list("D-Loop", 15422, 16299)

# The coordinates file has no header, so name the columns here.
colnames(coordinates) <- c("GENE", "START", "END")

# as.numeric() guards against the coordinates having been read as strings;
# subtracting 1 converts both ends to 0-indexed positions.
coordinates$START <- as.numeric(coordinates$START) - 1
coordinates$END <- as.numeric(coordinates$END) - 1
```

We need to figure out the length of each gene in order to normalize by length. 

```{r}

# Length of each gene of interest, used later to normalise mutation counts.
# Only rows whose GENE contains "mt-N", "mt-A" or "mt-C" are kept. The
# character class is written [NAC]: inside brackets "|" is a literal
# character, not an alternation.
length_of_gene <- coordinates %>%
  filter(grepl("mt-[NAC]", GENE)) %>%
  # START and END are both counted, hence the + 1
  mutate(LENGTH_OF_GENE = (END - START) + 1)
```

Now, we need to figure out the average read depth of each region for every condition 

```{r}

# Average read depth per gene for every STRAIN x TISSUE x AGE_BIN condition.
#  1. keep only the genes of interest ([NAC], not [N|A|C]: "|" is literal
#     inside a character class);
#  2. collapse to one row per sample and position, so a position that appears
#     once per mutation type is not counted several times;
#  3. sum the read depth over samples to get the depth per condition and
#     position;
#  4. average those per-position depths within each gene.
# The result stays grouped by STRAIN, TISSUE, AGE_BIN (summarise() drops only
# the last grouping variable).
avg_read_depth_per_gene <- supertable %>%
  filter(grepl("mt-[NAC]", GENE)) %>%
  distinct(SAMPLE, STRAIN, TISSUE, AGE_BIN, GENE, START, READ_DEPTH_AT_POS) %>%
  group_by(STRAIN, TISSUE, AGE_BIN, GENE, START) %>%
  summarise(COND_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>%
  group_by(STRAIN, TISSUE, AGE_BIN, GENE) %>%
  summarise(AVG_READ_DEPTH_GENE = mean(COND_READ_DEPTH_AT_POS))

```
We want to normalize for sequencing depth across our conditions (i.e., some mutations would not have been captured had we not sequenced as deeply as we did in some samples):

```{r}
# Per-position mutation counts, normalised for sequencing depth.
# The result is left grouped by GENE and START.
norm_seq_depth <- supertable %>%
  # total alternate-allele depth per sample and position (sums over the
  # mutation types observed at that position)
  group_by(
    SAMPLE, STRAIN, TISSUE, AGE_BIN, GENE, START,
    READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS
  ) %>%
  summarise(SAMPLE_MUT_COUNT_AT_POS = sum(ALT_ALLELE_DEPTH)) %>%
  # pool the samples belonging to the same condition
  group_by(STRAIN, TISSUE, AGE_BIN, GENE, START, CONDITION_MUT_FREQ_AT_POS) %>%
  summarise(
    CONDITION_MUT_COUNT_AT_POS = sum(SAMPLE_MUT_COUNT_AT_POS),
    CONDITION_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)
  ) %>%
  group_by(GENE, START) %>%
  mutate(
    # smallest pooled read depth seen at this position across all conditions
    MIN_READ_DEPTH_AT_POS = min(CONDITION_READ_DEPTH_AT_POS),
    # lowest mutation frequency detectable at that smallest depth
    FLOOR_MIN_MUT_FREQ_AT_POS = 1 / MIN_READ_DEPTH_AT_POS,
    # frequencies below the floor could only be seen thanks to deeper
    # sequencing, so their counts are reset to 0
    CONDITION_MUT_COUNT_AT_POS = ifelse(
      CONDITION_MUT_FREQ_AT_POS < FLOOR_MIN_MUT_FREQ_AT_POS,
      0,
      CONDITION_MUT_COUNT_AT_POS
    )
  )
  
  
``` 

Process our condition mutation count now that we've normalized for sequencing depth: 
```{r}

# Depth-normalised mutation count per gene and condition.
# Rows with CONDITION_MUT_FREQ_AT_POS >= 0.001 (the HFPs) are dropped, and only
# the genes of interest are kept ([NAC], not [N|A|C]: "|" is a literal
# character inside a character class).
# The result stays grouped by STRAIN, TISSUE, AGE_BIN.
gene_mut_count_df <- norm_seq_depth %>%
  ungroup() %>%
  filter(
    CONDITION_MUT_FREQ_AT_POS < 0.001,
    grepl("mt-[NAC]", GENE)
  ) %>%
  group_by(STRAIN, TISSUE, AGE_BIN, GENE) %>%
  summarise(CONDITION_MUT_COUNT_GENE = sum(CONDITION_MUT_COUNT_AT_POS))


```

Now we combine all of our data frames into one summary data frame; it holds the information with high frequency positions (HFPs) excluded.

```{r}

# One row per condition and gene: mutation count, gene length and average
# read depth joined together.
summary_df <- gene_mut_count_df %>%
  left_join(length_of_gene, by = "GENE") %>%
  left_join(avg_read_depth_per_gene, by = c("STRAIN", "TISSUE", "AGE_BIN", "GENE")) %>%
  # mutations per sequenced base of the gene (a fraction; it is not
  # multiplied by 100 despite the column name)
  mutate(PERC_GENE_MUTATED = CONDITION_MUT_COUNT_GENE / (LENGTH_OF_GENE * AVG_READ_DEPTH_GENE)) %>%
  # the B6 liver condition is excluded from this summary
  filter(!(STRAIN == "B6" & TISSUE == "Liver"))

# Fix the display order of strains and age bins. The argument is spelled out
# as `levels` rather than relying on partial matching of `level`.
summary_df$STRAIN <- factor(summary_df$STRAIN, levels = c("B6", "AKR", "ALR", "FVB", "NZB"))
summary_df$AGE_BIN <- factor(summary_df$AGE_BIN, levels = c("YOUNG", "OLD"))

```

Comparison of the average mutation frequency per gene
```{r}

# Dodged bars of PERC_GENE_MUTATED per gene, one panel per strain (rows) and
# tissue (columns); age bins are told apart by bar transparency.
gene_mut_freq_plot <- ggplot(
  summary_df,
  aes(x = GENE, y = PERC_GENE_MUTATED, fill = GENE, alpha = AGE_BIN)
) +
  geom_col(position = "dodge") +
  facet_grid(STRAIN ~ TISSUE) +
  scale_alpha_manual(values = c(0.6, 1)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    strip.background = element_blank()
  )

# Write the figure to the figures directory.
pdf(paste0(outdir_figures, "/avg_mut_freq_genes.pdf"), width = 6.5, height = 6)
print(gene_mut_freq_plot)
dev.off()
```


Plotting time:
Color palette

```{r}
# Discrete "Bay" palette from PNWColors; the tissue colours in the plots
# below are picked from it by index.
library(PNWColors)
bay_pal <- pnw_palette("Bay", type = "discrete")
```

Narrowing the high frequency region down to Nd2; we also plot the regions neighboring Nd2 in order to contrast the mutation frequency across regions
```{r}
# Per-position normalised mutation frequency in the window around Nd2.
nd2_reg <- norm_seq_depth %>%
  ungroup() %>%
  filter(
    # 0.001 is the (arbitrary) frequency above which a position is treated
    # as a high frequency position
    CONDITION_MUT_FREQ_AT_POS < 0.001,
    # spans mt-Ti to mt-Tn
    START > 3700,
    START < 5088
  ) %>%
  mutate(NORM_CONDITION_MUT_FREQ_AT_POS = CONDITION_MUT_COUNT_AT_POS / CONDITION_READ_DEPTH_AT_POS) %>%
  select(STRAIN, TISSUE, AGE_BIN, START, NORM_CONDITION_MUT_FREQ_AT_POS) %>%
  # only positions that still carry mutations after normalisation
  filter(NORM_CONDITION_MUT_FREQ_AT_POS > 0)

# Fix the display order; `levels` is spelled out instead of relying on
# partial matching of `level`.
nd2_reg$STRAIN <- factor(nd2_reg$STRAIN, levels = c("B6", "AKR", "ALR", "FVB", "NZB"))
nd2_reg$AGE_BIN <- factor(nd2_reg$AGE_BIN, levels = c("YOUNG", "OLD"))

```

```{r}

# Scatter of the normalised mutation frequency across the Nd2 neighbourhood,
# coloured by tissue, one panel per strain (rows) and age label (columns).
nd2_reg_plot <- nd2_reg %>%
  mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Young", "OLD" = "Old")) %>%
  ggplot(aes(x = START, y = NORM_CONDITION_MUT_FREQ_AT_POS, color = TISSUE)) +
  geom_point(size = 0.5, alpha = 0.7) +
  # black bar along y = 0 marking where Nd2 lies within the region
  geom_segment(x = 3913, xend = 4950, y = 0, yend = 0, color = "black") +
  facet_grid(STRAIN ~ AGE_LABEL) +
  labs(
    x = "Position on the mt-genome (bp)",
    y = "Mutation frequency at position"
  ) +
  scale_color_manual(name = "Tissue", values = bay_pal[c(1, 5, 4)]) +
  # enlarge the legend keys; the plotted points themselves are tiny
  guides(color = guide_legend(override.aes = list(size = 3))) +
  theme_bw() +
  theme(
    text = element_text(family = "sans"),
    axis.title = element_text(size = 10),
    axis.text.x = element_text(size = 8, angle = 45, vjust = 1, hjust = 1),
    axis.text.y = element_text(size = 8),
    strip.text.x = element_text(size = 10, vjust = 1),
    strip.text.y = element_text(size = 10, vjust = 1),
    strip.background = element_blank(),
    panel.grid = element_blank(),
    legend.position = "bottom"
  )

# Write the figure to the figures directory.
pdf(paste0(outdir_figures, "/mut_freq_nd2_reg.pdf"), width = 6, height = 4)
print(nd2_reg_plot)
dev.off()

```

Zooming into Nd2 to find where in the Nd2 we have this high frequency cluster 

```{r}
# Same per-position frequency as for the wider region, restricted to
# positions 3913-4950 (the span highlighted as Nd2 in the region plot).
nd2_zoomies <- norm_seq_depth %>%
  ungroup() %>%
  filter(
    # 0.001 is the (arbitrary) frequency above which a position is treated
    # as a high frequency position
    CONDITION_MUT_FREQ_AT_POS < 0.001,
    START > 3912,
    START < 4951
  ) %>%
  mutate(NORM_CONDITION_MUT_FREQ_AT_POS = CONDITION_MUT_COUNT_AT_POS / CONDITION_READ_DEPTH_AT_POS) %>%
  select(STRAIN, TISSUE, AGE_BIN, START, NORM_CONDITION_MUT_FREQ_AT_POS) %>%
  # only positions that still carry mutations after normalisation
  filter(NORM_CONDITION_MUT_FREQ_AT_POS > 0)

# Fix the display order; `levels` is spelled out instead of relying on
# partial matching of `level`.
nd2_zoomies$STRAIN <- factor(nd2_zoomies$STRAIN, levels = c("B6", "AKR", "ALR", "FVB", "NZB"))
nd2_zoomies$AGE_BIN <- factor(nd2_zoomies$AGE_BIN, levels = c("YOUNG", "OLD"))

```


```{r}
# Scatter of the normalised mutation frequency within Nd2, coloured by
# tissue, one panel per strain (rows) and age label (columns).
nd2_zoomies_plot <- nd2_zoomies %>%
  mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Young", "OLD" = "Old")) %>%
  ggplot(aes(x = START, y = NORM_CONDITION_MUT_FREQ_AT_POS, color = TISSUE)) +
  geom_point(size = 0.5, alpha = 0.7) +
  #geom_point(x = 9818, y = 0.00075, shape = 8, size = 0.7, color = "magenta") +
  facet_grid(STRAIN ~ AGE_LABEL) +
  labs(
    x = "Position on the mt-genome (bp)",
    y = "Mutation frequency at position"
  ) +
  scale_color_manual(name = "Tissue", values = bay_pal[c(1, 5, 4)]) +
  # NOTE(review): one break per bp gives 1,038 tick labels on a 4-inch-wide
  # figure -- confirm this density is intended
  scale_x_continuous(breaks = seq(3913, 4950, 1)) +
  # enlarge the legend keys; the plotted points themselves are tiny
  guides(color = guide_legend(override.aes = list(size = 3))) +
  theme_bw() +
  theme(
    text = element_text(family = "sans"),
    axis.title = element_text(size = 8),
    axis.text.x = element_text(size = 4, angle = 45, vjust = 1, hjust = 1),
    axis.text.y = element_text(size = 6),
    strip.text.x = element_text(size = 8, vjust = 1),
    strip.text.y = element_text(size = 8, vjust = 1),
    strip.background = element_blank(),
    panel.grid = element_blank(),
    legend.position = "bottom"
  )

# Show the plot in the notebook ...
print(nd2_zoomies_plot)

# ... and also write it to the figures directory.
pdf(paste0(outdir_figures, "/mut_freq_nd2.pdf"), width = 4, height = 3)
print(nd2_zoomies_plot)
dev.off()
```

```{r}

# Mean normalised mutation frequency per strain, age bin and position within
# positions 3913-4950; the mean is taken over the rows (tissues) of each
# strain/age/position group.
# The result stays grouped by STRAIN and AGE_BIN.
strain_avg_freq <- norm_seq_depth %>%
  ungroup() %>%
  filter(
    # 0.001 is the (arbitrary) frequency above which a position is treated
    # as a high frequency position
    CONDITION_MUT_FREQ_AT_POS < 0.001,
    START > 3912,
    START < 4951
  ) %>%
  mutate(NORM_CONDITION_MUT_FREQ_AT_POS = CONDITION_MUT_COUNT_AT_POS / CONDITION_READ_DEPTH_AT_POS) %>%
  group_by(STRAIN, AGE_BIN, START) %>%
  summarise(AVG_STRAIN = mean(NORM_CONDITION_MUT_FREQ_AT_POS)) %>%
  filter(AVG_STRAIN > 0) %>%
  # nudge young points left and old points right so the two age bins do not
  # overlap at the same position
  mutate(X_POS = ifelse(AGE_BIN == "YOUNG", START - 0.15, START + 0.15))

# Fix the display order; `levels` is spelled out instead of relying on
# partial matching of `level`.
strain_avg_freq$STRAIN <- factor(strain_avg_freq$STRAIN, levels = c("B6", "AKR", "ALR", "FVB", "NZB"))
strain_avg_freq$AGE_BIN <- factor(strain_avg_freq$AGE_BIN, levels = c("YOUNG", "OLD"))

```


```{r}

# Strain-averaged mutation frequency along Nd2: one stacked panel per strain,
# colour by strain, open circles for young and filled circles for old.
strain_avg_freq_plot <- ggplot(
  strain_avg_freq,
  aes(x = X_POS, y = AVG_STRAIN, color = STRAIN, shape = AGE_BIN)
) +
  geom_point(size = 0.15) +
  facet_wrap(STRAIN ~ ., nrow = 5) +
  labs(x = "Position (bp)", y = "Mutation frequency") +
  scale_color_manual(values = c("B6"= "#1d457f","AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) +
  scale_shape_manual(labels = c("Young", "Old"), values = c(1, 19)) +
  #scale_x_continuous(breaks = seq(5168,5190,1), labels = X_LABEL) +
  # print the y axis in scientific notation
  scale_y_continuous(labels = function(x) format(x, scientific = TRUE)) +
  theme_bw(base_size = 6) +
  theme(
    text = element_text(family = "sans"),
    axis.title = element_text(size = 5),
    axis.text.x = element_text(size = 3.5, angle = 45, vjust = 1, hjust = 1),
    axis.text.y = element_text(size = 3.5),
    strip.text.x = element_text(size = 6, vjust = 1),
    strip.text.y = element_text(size = 6, vjust = 1),
    strip.background = element_blank(),
    panel.grid = element_blank(),
    legend.position = "none"
  )

# Write the figure to the figures directory.
pdf(paste0(outdir_figures, "/strain_avg_freq_nd2.pdf"), width = 4, height = 3)
print(strain_avg_freq_plot)
dev.off()

```

FURTHER ZOOMED IN 

```{r}

# Strain averages restricted to the window 4044 < START < 4056, with B6
# labelled separately from every other strain ("Conplastic").
high_freq_peak <- strain_avg_freq %>%
  filter(START > 4044 & START < 4056) %>%
  mutate(STRAIN_LABEL = ifelse(STRAIN != "B6", "Conplastic", "B6"))

```

```{r}

# X-axis labels for the peak plot: reference base followed by position
# (REF pasted in front of START), for the window 4044 < START < 4056.
# Only single-character REF entries are used.
# The labels are sorted by START because the plot pairs them positionally
# with breaks running from 4045 to 4055; relying on the row order of
# supertable could attach a label to the wrong position.
X_LABEL <- (supertable %>%
  filter(START > 4044, START < 4056, nchar(REF) == 1) %>%
  distinct(START, REF) %>%
  arrange(START) %>%
  mutate(X_LABEL = paste0(REF, START)))$X_LABEL

```


```{r}

# Close-up of the high-frequency peak: B6 and the conplastic strains in
# separate stacked panels, colour by strain, open circles for young and
# filled circles for old.
peak_nd2_plot <- ggplot(
  high_freq_peak,
  aes(x = X_POS, y = AVG_STRAIN, color = STRAIN, shape = AGE_BIN)
) +
  geom_point(size = 0.3) +
  facet_wrap(STRAIN_LABEL ~ ., nrow = 2) +
  labs(x = "Position (bp)", y = "Mutation frequency") +
  scale_color_manual(values = c("B6"= "#1d457f","AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) +
  scale_shape_manual(labels = c("Young", "Old"), values = c(1, 19)) +
  # one tick per position, labelled with the entries of X_LABEL in order
  scale_x_continuous(breaks = seq(4045, 4055, 1), labels = X_LABEL) +
  # print the y axis in scientific notation
  scale_y_continuous(labels = function(x) format(x, scientific = TRUE)) +
  theme_bw(base_size = 6) +
  theme(
    text = element_text(family = "sans"),
    axis.title = element_text(size = 5),
    axis.text.x = element_text(size = 3.5, angle = 45, vjust = 1, hjust = 1),
    axis.text.y = element_text(size = 3.5),
    strip.text.x = element_text(size = 6, vjust = 1),
    strip.text.y = element_text(size = 6, vjust = 1),
    strip.background = element_blank(),
    panel.grid = element_blank(),
    legend.position = "none"
  )

# Write the figure to the figures directory.
pdf(paste0(outdir_figures, "/peak_nd2.pdf"), width = 1, height = 1.25)
print(peak_nd2_plot)
dev.off()

```

```{r}

# Breakdown of mutation types (REF>ALT) at position 4050 for every
# STRAIN x TISSUE x AGE_BIN condition, excluding rows at or above the 1e-3
# frequency cut-off.
# The result stays grouped by STRAIN, TISSUE, AGE_BIN, MUTATION.
nd2_peak_mut_type <- supertable %>%
  filter(CONDITION_MUT_FREQ_AT_POS < 1e-3, START == 4050) %>%
  # drop exact duplicate rows before summing
  distinct(
    SAMPLE, STRAIN, TISSUE, AGE_BIN, START, REF, ALT,
    ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS
  ) %>%
  mutate(MUTATION = paste(REF, ALT, sep = ">")) %>%
  # alternate-allele count per condition and mutation type
  group_by(STRAIN, TISSUE, AGE_BIN, START, MUTATION) %>%
  summarise(COND_ALLELE_COUNT = sum(ALT_ALLELE_DEPTH)) %>%
  filter(COND_ALLELE_COUNT > 0) %>%
  # total mutation count per condition, used as the denominator below
  group_by(STRAIN, TISSUE, AGE_BIN) %>%
  mutate(TOTAL_MUTS = sum(COND_ALLELE_COUNT)) %>%
  # count of each mutation type per condition
  group_by(STRAIN, TISSUE, AGE_BIN, MUTATION, TOTAL_MUTS) %>%
  summarise(MUT_TYPE_TOTAL_COUNT = sum(COND_ALLELE_COUNT)) %>%
  # share of the condition's mutations contributed by this mutation type
  mutate(MUT_PROP = MUT_TYPE_TOTAL_COUNT / TOTAL_MUTS)

# Fix the display order; `levels` is spelled out instead of relying on
# partial matching of `level`.
nd2_peak_mut_type$STRAIN <- factor(nd2_peak_mut_type$STRAIN, levels = c("B6", "AKR", "ALR", "FVB", "NZB"))
nd2_peak_mut_type$AGE_BIN <- factor(nd2_peak_mut_type$AGE_BIN, levels = c("YOUNG", "OLD"))
```

```{r}

# Stacked bars of MUT_PROP per strain, filled by mutation type, one panel per
# age bin (rows) and tissue (columns).
nd2_peak_mut_type_plot <- ggplot(
  nd2_peak_mut_type,
  aes(x = STRAIN, y = MUT_PROP, fill = MUTATION)
) +
  geom_col(position = "stack") +
  facet_grid(AGE_BIN ~ TISSUE) +
  labs(x = "Strain", y = "Allele mutation frequency") +
  theme_bw(base_size = 10) +
  theme(
    strip.background = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
  )

# Write the figure to the figures directory.
pdf(paste0(outdir_figures, "/mut_type_nd2_plot.pdf"), width = 5, height = 5)
print(nd2_peak_mut_type_plot)
dev.off()
```