---
title: "duplex_read_depth"
output: html_notebook
---

This figure contributes to Fig 1d. 

```{r}
library(tidyverse)
library(ggplot2)
library(scales)
library(ggbeeswarm)
```

```{r}

outdir_figures <- "~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/experimental_design/figures"

outdir_files <- "~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/experimental_design/files"

supertable_file <-"~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt"

supertable <- read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE)

```

We calculate the average read depth across the mt-genome for each sample. This average represents the average number of duplex mt-genomes sequenced given that each mt-genome has a unique molecular identifier. 

```{r}
sample_depth = supertable %>%
  select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS) %>%
  unique() %>% 
  group_by(SAMPLE, STRAIN, TISSUE, AGE_BIN) %>%
  summarise(AVG_DUP_DEPTH = mean(READ_DEPTH_AT_POS))
```


```{r}

sample_depth_plot = ggplot(sample_depth, aes(x = AVG_DUP_DEPTH)) +
  geom_histogram(aes(y=..count../sum(..count..)), bins = 25) +
  xlab("Duplex mt-genomes Profiled Per Sample") +
  ylab("Proportion") + 
  theme_bw() + 
  theme(axis.text = element_text(size=10), 
        axis.title = element_text(size=11),
        axis.text.x = element_text(size = 10), 
        axis.text.y = element_text(size = 10),
        strip.background= element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black")) 

pdf(paste(outdir_figures, "/dcs_read_depth_per_sample.pdf", sep = ""), width = 4, height = 3)
print(sample_depth_plot)
dev.off()
```

Quantifying the total bp depth per condition (strain x age x tissue) --> sum of the avg_bp_depth for all samples in a condition. 
```{r}

cond_depth = supertable %>%
  select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS) %>%
  #we want to filter any redundant positions (i.e. positions that are in two genes)
  unique() %>%
  select(STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS) %>%
  group_by(STRAIN, TISSUE, AGE_BIN, START) %>%
  summarise(COND_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>%
  ungroup() %>%
  select(STRAIN, TISSUE, AGE_BIN, COND_DEPTH_AT_POS) %>%
  group_by(STRAIN, TISSUE, AGE_BIN) %>%
  summarise(COND_AVG_DEPTH = mean(COND_DEPTH_AT_POS))

```

```{r}
median_per_cond = median(cond_depth$COND_AVG_DEPTH)
total_mt_genomes = floor(sum(cond_depth$COND_AVG_DEPTH))

print(median_per_cond)
print(total_mt_genomes)
range(cond_depth$COND_AVG_DEPTH)

#the lowest we can sequence is 
1/max(cond_depth$COND_AVG_DEPTH)
```


```{r}

cond_depth = cond_depth %>% 
  mutate(STRAIN = recode(STRAIN, "F" = "FVB"))

cond_depth$STRAIN = factor(cond_depth$STRAIN, levels = c("B6", "AKR", "ALR", "FVB", "NZB"))

beeswarm_fig = ggplot(cond_depth) + 
  geom_beeswarm(aes(x = STRAIN, y = COND_AVG_DEPTH, color = STRAIN)) +
  annotate("text", x = 2.45, y = 5.3e+05, label = "Total: 2.5M", size = 3.75) + 
  annotate("text", x = 3, y = 4.6e+05, label = "Median: 74,764", size = 3.75) + 
  xlab("Strains") + 
  ylab("Count of duplex mt-genomes") +
   scale_color_manual(values = c("B6"= "#1d457f","AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) + 
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = scales::trans_format("log10", math_format(10^.x))) + 
  theme_bw(base_size = 16) + 
  theme(axis.text = element_text(size=11), 
        axis.title = element_text(size=12),
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
        axis.text.y = element_text(size = 11),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"), 
        strip.background=element_blank(),
        strip.text = element_text(face = "bold"), 
        text = element_text(family = "sans"), 
        legend.position = "none")

pdf(paste(outdir_figures, "/beeswarm_duplex_depth_per_cond.pdf", sep = ""), width = 2.35, height = 4)
print(beeswarm_fig)
dev.off()
                        
                      
```

Writing out files:
1) Sample duplex depth
2) Condition duplex depth 

```{r}

write.table(sample_depth, file = paste(outdir_files,"/sample_duplex_depth.txt", sep = ""), sep = "\t", quote = F, row.names = F)

write.table(cond_depth, file = paste(outdir_files,"/experimental_condition_duplex_depth.txt", sep = ""), sep = "\t", quote = F, row.names = F)

```

~~~~~~SCRATCH CODE~~~~~~~~~~

This is a second method for estimating duplex depth from the raw number of duplex sequences. Note if our reads overlap this would give ~ 2-fold higher estimate.  

```{r}

dcs_read_depth_file <-"~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/experimental_design/files/duplex_read_depth.csv"

dcs_read_depth <- read.table(dcs_read_depth_file, header=TRUE, stringsAsFactors = FALSE)

```

Filtering out the samples we will not be using in our analysis: 

```{r}

filtered_dcs_read_depth = dcs_read_depth %>%
  filter(AGE_BIN != "MID") %>%
  filter(!grepl("B6_Y3_Heart", sample)) %>%
  filter(!grepl("B6_Y._Liver", sample))
```

```{r}

#we multiply by the number of bp our reads covered (150x PE reads) and then divide by the length of the mtGenome to obtain the avg_bp_depth per sample 
setwd(outdir_figures)
filtered_dcs_read_depth$avg_bp_depth = (filtered_dcs_read_depth$read_count * 300) / 16299

cleaned_dcs_read_depth = filtered_dcs_read_depth %>% 
  separate(sample, c("STRAIN", "AGE", "TISSUE"), extra = "drop", sep = "_")

mean_per_exp = mean(cleaned_dcs_read_depth$avg_bp_depth)
print(mean_per_exp)
range(cleaned_dcs_read_depth$avg_bp_depth)
sum(cleaned_dcs_read_depth$avg_bp_depth)
```