---
title: "R Notebook"
output: html_notebook
---


```{r}
library(tidyverse)
library(ggplot2)
library(PNWColors)
library(gridExtra)
```

```{r}
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/")

outdir_figures = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/figures/"
outdir_files = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/files/"

supertable_file ="/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt"
supertable = read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE)

coordinates_file = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/Mouse_Mt_Genome_Coordinates"
coordinates = read.table(coordinates_file, stringsAsFactors = FALSE, sep = "\t")
```

```{r}
#adding the D-Loop coordinates to our file 
coordinates[nrow(coordinates) + 1, ] <- c("D-Loop", 15422, 16299)

#adding column names to dataframe
colnames(coordinates) <- c("GENE", "START", "END")

coordinates$START <- as.numeric(coordinates$START)
coordinates$END <- as.numeric(coordinates$END)

#so things don't get wonky if the coordinates are read as strings -- also we convert our coordinates to be 0-indexed 
coordinates$START <- as.numeric(coordinates$START) - 1
coordinates$END <- as.numeric(coordinates$END) - 1

coordinates = coordinates %>%
  mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA",
                          grepl("mt-T", GENE) ~ "tRNA", 
                          GENE == "D-Loop" ~ "D-Loop",
                          GENE == "mt-OL" ~ "OriL",
                          grepl("mt-[N|A|C]",GENE) ~ "Protein",
                          TRUE ~ "Intergene"))
```

We need to figure out the length of each region in order to normalize by length. 

```{r}

length_of_region = coordinates %>%
  mutate(LENGTH_OF_GENE = (END - START) + 1) %>%
  filter(TYPE != "Intergene") %>%
  select(TYPE, LENGTH_OF_GENE) %>%
  group_by(TYPE) %>%
  summarise(LENGTH_OF_TYPE = sum(LENGTH_OF_GENE))

```

Now, we need to figure out the average read depth of each region for every condition 

```{r}

avg_read_depth_per_region = supertable %>% 
  select(SAMPLE, STRAIN, TISSUE, AGE_BIN, GENE, START, READ_DEPTH_AT_POS) %>% 
  #unique here so that we eliminate redundancy from multiple mutation types
  unique() %>% 
  select(STRAIN, TISSUE, AGE_BIN, GENE, START, READ_DEPTH_AT_POS) %>% 
  group_by(STRAIN, TISSUE, AGE_BIN, GENE, START) %>% 
  summarise(COND_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>% 
  ungroup() %>% 
  select(STRAIN, TISSUE, AGE_BIN, GENE, START, COND_READ_DEPTH_AT_POS) %>% 
  mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA",
                          grepl("mt-T", GENE) ~ "tRNA", 
                          GENE == "D-Loop" ~ "D-Loop",
                          GENE == "mt-OL" ~ "OriL",
                          grepl("mt-[N|A|C]",GENE) ~ "Protein",
                          TRUE ~ "Intergene")) %>% 
  #filtering out those few bp that exist between these regions
  filter(TYPE != "Intergene") %>%
  select(STRAIN, TISSUE, AGE_BIN, TYPE, COND_READ_DEPTH_AT_POS) %>% 
  group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>% 
  summarise(AVG_READ_DEPTH_REGION = mean(COND_READ_DEPTH_AT_POS))

```

We want to normalize for sequencing depth across our conditions (i.e. we wouldn't capture mutations if we didn't sequence to the level that we did in some samples): 

```{r}
norm_seq_depth = supertable %>% 
  select(SAMPLE, STRAIN, TISSUE, AGE_BIN, GENE, START, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>%
  group_by(SAMPLE, STRAIN, TISSUE, AGE_BIN, GENE, START, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% 
  summarise(SAMPLE_MUT_COUNT_AT_POS = sum(ALT_ALLELE_DEPTH)) %>% 
  ungroup() %>% 
  select(STRAIN, TISSUE, AGE_BIN, GENE, START, SAMPLE_MUT_COUNT_AT_POS, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% 
  group_by(STRAIN, TISSUE, AGE_BIN, GENE, START, CONDITION_MUT_FREQ_AT_POS) %>% 
  summarise(CONDITION_MUT_COUNT_AT_POS = sum(SAMPLE_MUT_COUNT_AT_POS), CONDITION_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>%
  ungroup() %>% 
  group_by(GENE, START) %>% 
  mutate(MIN_READ_DEPTH_AT_POS = min(CONDITION_READ_DEPTH_AT_POS)) %>% 
  #this is the lowest mutation frequency we would be able to get given the smallest read depth at a position across our conditions
  mutate(FLOOR_MIN_MUT_FREQ_AT_POS = 1/MIN_READ_DEPTH_AT_POS) %>% 
  #if our mutation frequency is lower than the floor we set, we reset our condition mutation count to 0 --> in essence we wouldn't have been able to capture these mutations without the sequencing depth we had 
  mutate(CONDITION_MUT_COUNT_AT_POS = ifelse(CONDITION_MUT_FREQ_AT_POS < FLOOR_MIN_MUT_FREQ_AT_POS, 0, CONDITION_MUT_COUNT_AT_POS)) 
  
  
```  
  
Process our condition mutation count now that we've normalized for sequencing depth: 
```{r}

region_mut_freq_df = norm_seq_depth %>% 
  ungroup() %>% 
  select(STRAIN, TISSUE, AGE_BIN, GENE, CONDITION_MUT_COUNT_AT_POS) %>% 
  mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA",
                          grepl("mt-T", GENE) ~ "tRNA", 
                          GENE == "D-Loop" ~ "D-Loop",
                          GENE == "mt-OL" ~ "OriL",
                          grepl("mt-[N|A|C]",GENE) ~ "Protein",
                          TRUE ~ "Intergene")) %>% 
  #filtering out those few bp that exist between these regions
  filter(TYPE != "Intergene") %>%
  select(STRAIN, TISSUE, AGE_BIN, TYPE, CONDITION_MUT_COUNT_AT_POS) %>%
  group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>% 
  summarise(CONDITION_MUT_COUNT_REGION = sum(CONDITION_MUT_COUNT_AT_POS))


```

Processing our condition mutation count WO identified outliers 

```{r}

wo_hfps_region_mut_freq_df = norm_seq_depth %>% 
  filter(CONDITION_MUT_FREQ_AT_POS < 0.001) %>% 
  ungroup() %>% 
  select(STRAIN, TISSUE, AGE_BIN, GENE, CONDITION_MUT_COUNT_AT_POS) %>% 
  mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA",
                          grepl("mt-T", GENE) ~ "tRNA", 
                          GENE == "D-Loop" ~ "D-Loop",
                          GENE == "mt-OL" ~ "OriL",
                          grepl("mt-[N|A|C]",GENE) ~ "Protein",
                          TRUE ~ "Intergene")) %>% 
  #filtering out those few bp that exist between these regions
  filter(TYPE != "Intergene") %>%
  select(STRAIN, TISSUE, AGE_BIN, TYPE, CONDITION_MUT_COUNT_AT_POS) %>%
  group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>% 
  summarise(WO_HFPS_CONDITION_MUT_COUNT_REGION = sum(CONDITION_MUT_COUNT_AT_POS))
  
```

```{r}
rm(supertable)
```


Now combining all of our dataframes to make one large df with our info -- our summary df for the info without HFPs

```{r}

summary_df = wo_hfps_region_mut_freq_df %>% 
  full_join(region_mut_freq_df, by = c("STRAIN", "TISSUE", "AGE_BIN", "TYPE")) %>% 
  left_join(length_of_region, by = "TYPE") %>% 
  left_join(avg_read_depth_per_region, by =  c("STRAIN", "TISSUE", "AGE_BIN", "TYPE")) %>% 
  mutate(WO_HFPS_PERC_REGION_MUTATED = WO_HFPS_CONDITION_MUT_COUNT_REGION/(LENGTH_OF_TYPE*AVG_READ_DEPTH_REGION)) %>%
  mutate(HFPS_PERC_REGION_MUTATED = CONDITION_MUT_COUNT_REGION/(LENGTH_OF_TYPE*AVG_READ_DEPTH_REGION))

write.table(summary_df, file = paste(outdir_files,"/mut_freq_per_region.txt", sep = ""), sep = "\t", quote = F, row.names = T)

```

Calculating statistics for our comparisons: 

1) Avg % region mutated in OriL compared to D-Loop 

```{r}

summary_df %>% 
  ungroup() %>% 
  select(STRAIN, TISSUE, AGE_BIN, TYPE, WO_HFPS_PERC_REGION_MUTATED) %>% 
  filter(TYPE == "D-Loop" | TYPE == "OriL") %>% 
  mutate(TYPE = recode(TYPE, "D-Loop" = "DLOOP")) %>% 
  pivot_wider(names_from = TYPE, values_from = WO_HFPS_PERC_REGION_MUTATED) %>% 
  mutate(FOLD_CHANGE = OriL/DLOOP) %>%
  select(STRAIN, TISSUE, AGE_BIN, FOLD_CHANGE) %>% 
  group_by(AGE_BIN) %>% 
  summarise(AVG_FOLD_CHANGE = mean(FOLD_CHANGE), MEDIAN_FOLD_CHANGE = median(FOLD_CHANGE))


```

2) Statistical difference in mutation frequency between D-Loop at RNA coding regions

```{r}

t_test_df = summary_df %>% 
  ungroup() %>% 
  filter(TYPE != "OriL") %>% 
  mutate(TYPE = recode(TYPE, "D-Loop" = "DLOOP")) %>% 
  mutate(TYPE = ifelse(TYPE == "DLOOP", TYPE, "RNA_CODING")) %>% 
  group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>%
  summarise(SUM_MUT_COUNT_WO_HFPS = sum(WO_HFPS_CONDITION_MUT_COUNT_REGION), AVG_DEPTH = mean(AVG_READ_DEPTH_REGION), SUM_LENGTH_OF_REGIONS = sum(LENGTH_OF_TYPE)) %>% 
  mutate(PERC_REGION_MUT = SUM_MUT_COUNT_WO_HFPS/(AVG_DEPTH * SUM_LENGTH_OF_REGIONS)) %>% 
  select(STRAIN, TISSUE, AGE_BIN, TYPE, PERC_REGION_MUT) %>%
  pivot_wider(values_from = PERC_REGION_MUT, names_from = TYPE)

avg_perc_mut = summary_df %>% 
  ungroup() %>% 
  filter(TYPE != "OriL") %>% 
  mutate(TYPE = recode(TYPE, "D-Loop" = "DLOOP")) %>% 
  mutate(TYPE = ifelse(TYPE == "DLOOP", TYPE, "RNA_CODING")) %>% 
  group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>%
  summarise(SUM_MUT_COUNT_WO_HFPS = sum(WO_HFPS_CONDITION_MUT_COUNT_REGION), AVG_DEPTH = mean(AVG_READ_DEPTH_REGION), SUM_LENGTH_OF_REGIONS = sum(LENGTH_OF_TYPE)) %>% 
  mutate(PERC_REGION_MUT = SUM_MUT_COUNT_WO_HFPS/(AVG_DEPTH * SUM_LENGTH_OF_REGIONS)) %>% 
  select(STRAIN, TISSUE, AGE_BIN, TYPE, PERC_REGION_MUT) %>% 
  group_by(AGE_BIN, TYPE) %>%
  summarise(AVG = mean(PERC_REGION_MUT))

young = t_test_df %>% 
  filter(AGE_BIN == "YOUNG")

old = t_test_df %>% 
  filter(AGE_BIN == "OLD")
  
print(t_test_df)
print(avg_perc_mut)

t.test(young$DLOOP, young$RNA_CODING, paired = TRUE, alternative = "two.sided")
t.test(old$DLOOP, old$RNA_CODING, paired = TRUE, alternative = "two.sided")
```

Plotting the dot plots wo outliers

Colors: 

```{r}
library(RColorBrewer)
library(PNWColors)

lake_pal <- pnw_palette(name="Lake", type="discrete")
spectral = brewer.pal(11,"Spectral")
PRGn = brewer.pal(11,"PRGn")
set1 = brewer.pal(3,"Set1")
PuRd = brewer.pal(11,"PuRd")
Blues = brewer.pal(9, "Blues")

colors = c(Blues[9], lake_pal[6], set1[1], PuRd[6],PRGn[4])
colors_in_order = c(set1[1], Blues[9], PRGn[4], lake_pal[6], PuRd[6] )

```
Percent of region mutated across conditions 

```{r}

summary_df$AGE_BIN = factor(summary_df$AGE_BIN, level = c("YOUNG", "OLD"))
summary_df$TYPE= factor(summary_df$TYPE, level=c("OriL","D-Loop","tRNA", "Protein", "rRNA")) 
summary_df$STRAIN= factor(summary_df$STRAIN, level=c("B6", "AKR", "ALR", "FVB", "NZB")) 

summary_df = summary_df %>%
  mutate(TISSUE = recode(TISSUE, "Brain" = "B", "Heart" = "H", "Liver" = "L"))

mut_freq_per_region_wo_hfps=ggplot(summary_df)

mut_freq_per_region_wo_hfps=mut_freq_per_region_wo_hfps+
  geom_point(aes(y=WO_HFPS_PERC_REGION_MUTATED, size=WO_HFPS_PERC_REGION_MUTATED, x=STRAIN, color=TYPE,shape= AGE_BIN, group= TISSUE), alpha=0.8)+
  facet_wrap(TYPE~TISSUE,nrow=1, strip.position = "bottom")+
  #facet_wrap(TISSUE~TYPE,nrow=1, strip.position = "bottom")+
  #facet_wrap(~TYPE,nrow=1)+
  theme_bw(base_size=6)+
  theme(strip.background = element_blank())+
  scale_shape_manual(name = "Age", labels = c("Young", "Old"), values=c(1,19))+
  #theme(legend.position="None")+
  theme(panel.grid = element_blank())+
  theme(axis.text.x=element_text(angle=90, hjust=1, vjust=1))+
  scale_x_discrete("Strain", position = "top")+
  scale_y_continuous("P(mutated)")+
  #scale_alpha_manual(values=c(0.6,0.8,1))+
  scale_size_continuous(name = "Frequency", range=c(.1,4))+
  scale_color_manual(values = colors_in_order) +
  #theme(legend.position = "none") +
  theme(axis.text.x=element_text(size=3.5),
        axis.text.y=element_text(size=7),
        axis.title.y = element_text(size = 8),
        axis.title.x = element_text(size = 8.25),
        strip.text.x = element_text(size = 8), 
        legend.text = element_text(size = 6),
        legend.position = "none")

pdf(paste(outdir_figures,"/wo_outliers_mut_freq_per_region.pdf",sep=""), width=8.5,height=2)
print(mut_freq_per_region_wo_hfps)
dev.off()

pdf(paste(outdir_figures,"/leg_wo_outliers_mut_freq_per_region.pdf",sep=""), width=8.5,height=3)
print(mut_freq_per_region_wo_hfps + theme(legend.position = "bottom"))
dev.off()


```

```{r}

summary_df$AGE_BIN = factor(summary_df$AGE_BIN, level = c("YOUNG", "OLD"))
summary_df$TYPE= factor(summary_df$TYPE, level=c("OriL","D-Loop","tRNA", "Protein", "rRNA")) 
summary_df$STRAIN= factor(summary_df$STRAIN, level=c("B6", "AKR", "ALR", "FVB", "NZB")) 

summary_df = summary_df %>%
  mutate(TISSUE = recode(TISSUE, "Brain" = "B", "Heart" = "H", "Liver" = "L"))

#t$TISSUE=factor(t$TISSUE, levels=c("Liver","Heart","Brain")) 

#g=ggplot(t %>% filter(TISSUE == "Brain"))
log_mut_freq_per_region_wo_hfps=ggplot(summary_df)

log_mut_freq_per_region_wo_hfps=log_mut_freq_per_region_wo_hfps+geom_point(aes(y=WO_HFPS_PERC_REGION_MUTATED,size=WO_HFPS_PERC_REGION_MUTATED,x=STRAIN,color=TYPE,shape=AGE_BIN,group=TISSUE),alpha=0.8)+
  facet_wrap(TYPE~TISSUE,nrow=1)+
  #facet_wrap(TISSUE~TYPE,nrow=3)+
  #facet_wrap(~TYPE,nrow=1)+
  theme_bw(base_size=6)+
  theme(strip.background = element_blank())+
  scale_shape_manual(values=c(1,19))+
  #theme(legend.position="None")+
  theme(panel.grid = element_blank())+
  theme(axis.text.x=element_text(angle=90, hjust=1, vjust=1))+
  scale_x_discrete("Strain")+
  #scale_y_continuous("P(mutated)")+
  scale_y_log10("P(mutated)") +
  #scale_alpha_manual(values=c(0.6,0.8,1))+
  scale_size_continuous(range=c(.1,4))+
  scale_color_manual(values = colors_in_order) +
  #theme(legend.position = "none") +
  theme(axis.text.x=element_text(size=3.5),
        axis.text.y=element_text(size=7),
        axis.title = element_text(size = 9),
        strip.text.x = element_text(size = 8), 
        legend.position = "none")

pdf(paste(outdir_figures,"/log_wo_outliers_mut_freq_per_region.pdf",sep=""), width=8.5,height=3)
print(log_mut_freq_per_region_wo_hfps)
dev.off()


```

Summary figure of delta values without high frequency positions

```{r}
summary_wo_hfps_spread  = summary_df %>% 
  filter(!(STRAIN == "B6" & TISSUE == "L")) %>% 
  select(STRAIN, TISSUE, AGE_BIN, TYPE, WO_HFPS_PERC_REGION_MUTATED) %>% 
  pivot_wider(names_from = AGE_BIN, values_from = WO_HFPS_PERC_REGION_MUTATED) %>% 
  mutate(DELTA = OLD - YOUNG)

```

```{r}

summary_wo_hfps_spread$TISSUE=factor(summary_wo_hfps_spread$TISSUE, levels=c("L","B","H")) 

g=ggplot(summary_wo_hfps_spread)
g=g+
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") + 
  stat_summary(aes(x=DELTA,y=TISSUE,color=TYPE,group=TISSUE,shape=TISSUE), size = 0.3)+
  theme_bw(base_size=6)+
  facet_grid(~TYPE)+
  ylab("Tissue") + 
  xlab("Delta % mutated per region (old - young)") + 
  scale_color_manual(values = colors_in_order) +
  scale_shape_manual(values = c(15, 19, 17)) + 
  theme(strip.background = element_blank())+
  theme(legend.position="None")+
  theme(panel.grid = element_blank())+
  theme(axis.text.x=element_text(size = 6, angle = 45, vjust = 1, hjust = 1), 
        axis.text.y =element_text(size = 6),
        axis.title = element_text(size = 7), 
        strip.text.x = element_text(size = 6), 
        text = element_text(family = "sans"))

pdf(paste(outdir_figures,"/wo_outliers_mut_freq_per_region_summary.pdf",sep=""),width= 7,height=1)
print(g)
dev.off()

```