---
title: "R Notebook"
output: html_notebook
---
```{r}
library(tidyverse)
library(ggplot2)
```
```{r}
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/checking_NUMT")
outdir_figures = "figures/"
outdir_files = "files/"
read_depth_file = "files/NZB_remapping_merged_read_depth.txt"
read_depth = read.table(read_depth_file, header = FALSE, stringsAsFactors = FALSE)
#this file contains the read depth in the NUMT region
supertable_file = "../input_files/supertable.txt"
supertable = read.table(supertable_file, header = TRUE, stringsAsFactors = FALSE)
```
Calculating the average read depth in the chrM region that mimics the NUMT
```{r}
#filter for only NZB since our anchors are in NZB
average_depth_chrM_NUMT_region = supertable %>%
select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS) %>%
unique() %>%
select(STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS) %>%
group_by(STRAIN, TISSUE, AGE_BIN, START) %>%
#aggregating the read depth across samples
summarise(COND_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>%
#filtering for just the NUMT region
filter(START >= 6394 & START <= 11042) %>%
ungroup() %>%
#calculating the average read depth in the chrM NUMT region
select(STRAIN, TISSUE, AGE_BIN, COND_READ_DEPTH_AT_POS) %>%
group_by(STRAIN, TISSUE, AGE_BIN) %>%
summarise(AVG_READ_DEPTH = mean(COND_READ_DEPTH_AT_POS)) %>%
filter(STRAIN == "NZB")
```
```{r}
rm(supertable)
```
Aggregating our sample information for the read depth at the chr1 NUMT and chrM region
```{r}
colnames(read_depth) = c("CHRM", "START", "DEPTH", "FILENAME")
read_depth = read_depth %>%
separate(FILENAME, c("STRAIN", "AGE", "TISSUE"), sep = "_", extra = "drop", fill = "right")
```
```{r}
#this file contains the read depth for reads mapped to the NZB ref genome and chr1 NUMT and underwent a strict filter for quality and high matches
read_depth = read_depth %>%
mutate(CHRM = ifelse(CHRM == "NZB_chrM", "chrM", CHRM)) %>%
mutate(AGE_BIN = ifelse(grepl("O", AGE), "OLD", "YOUNG")) %>%
select(STRAIN, TISSUE, AGE_BIN, CHRM, START, DEPTH) %>%
group_by(STRAIN, TISSUE, AGE_BIN, CHRM, START) %>%
summarise(COND_DEPTH = sum(DEPTH))
```
Creating a column that labels our anchor points -- this will be super hard coded :/
```{r}
#63 bp apart
chr1_anchor_end_1 = 24616171 + 10
chr1_anchor_start_1 = 24616108 - 10
#45 bp apart
chr1_anchor_end_2 = 24616003 + 10
chr1_anchor_start_2 = 24615958 - 10
#36 bp apart
chr1_anchor_start_3 = 24611996 - 10
chr1_anchor_end_3 = 24612032 + 10
#spans 208 bp but has multiple haplotype sites in this region
chr1_anchor_start_4 = 24612980 - 10
chr1_anchor_end_4 = 24613188 + 10
#15 bp apart
chr1_anchor_start_5 = 24613427 - 10
chr1_anchor_end_5 = 24613442 + 10
#6 bp apart
chr1_anchor_start_6 = 24613715 - 10
chr1_anchor_end_6 = 24613721 + 10
#Finding these anchor points in chrM
chrM_anchor_start_1 = 6407 - 10
chrM_anchor_end_1 = 6470 + 10
chrM_anchor_start_2 = 6575 - 10
chrM_anchor_end_2 = 6620 + 10
#36 bp apart
chrM_anchor_end_3 = 10583 + 10
chrM_anchor_start_3 = 10547 - 10
#208 bp apart
chrM_anchor_end_4 = 9599 + 10
chrM_anchor_start_4 = 9391 - 10
#15 bp apart
chrM_anchor_end_5 = 9152 + 10
chrM_anchor_start_5 = 9137 - 10
#6 bp apart
chrM_anchor_end_6 = 8864 + 10
chrM_anchor_start_6 = 8858 - 10
```
```{r}
anchor_points_id = function(position){
if(position > chr1_anchor_start_1 & position < chr1_anchor_end_1) {label = "chr1_anchor_1"}
else if (position > chr1_anchor_start_2 & position < chr1_anchor_end_2) {label = "chr1_anchor_2"}
else if (position > chr1_anchor_start_3 & position < chr1_anchor_end_3) {label = "chr1_anchor_3"}
else if (position > chr1_anchor_start_4 & position < chr1_anchor_end_4) {label = "chr1_anchor_4"}
else if (position > chr1_anchor_start_5 & position < chr1_anchor_end_5) {label = "chr1_anchor_5"}
else if (position > chr1_anchor_start_6 & position < chr1_anchor_end_6) {label = "chr1_anchor_6"}
else if (position > chrM_anchor_start_1 & position < chrM_anchor_end_1) {label = "chrM_anchor_1"}
else if (position > chrM_anchor_start_2 & position < chrM_anchor_end_2) {label = "chrM_anchor_2"}
else if (position > chrM_anchor_start_3 & position < chrM_anchor_end_3) {label = "chrM_anchor_3"}
else if (position > chrM_anchor_start_4 & position < chrM_anchor_end_4) {label = "chrM_anchor_4"}
else if (position > chrM_anchor_start_5 & position < chrM_anchor_end_5) {label = "chrM_anchor_5"}
else if (position > chrM_anchor_start_6 & position < chrM_anchor_end_6) {label = "chrM_anchor_6"}
else {label = "not_anchor"}
}
```
```{r}
labeled_read_depth = read_depth %>%
rowwise() %>%
mutate(ANCHOR = anchor_points_id(START))
```
```{r}
anchor_points_depth = labeled_read_depth %>%
ungroup() %>%
filter(ANCHOR != "not_anchor") %>%
select(STRAIN, TISSUE, AGE_BIN, CHRM, ANCHOR, COND_DEPTH) %>%
group_by(STRAIN, TISSUE, AGE_BIN, CHRM, ANCHOR) %>%
summarise(AVG_READ_DEPTH_AT_ANCHOR = mean(COND_DEPTH))
```
Now to merge the average read depth for the NUMT region in chr1 and chrM
```{r}
cont_est_anchor_pts = anchor_points_depth %>%
left_join(average_depth_chrM_NUMT_region, by = c("STRAIN", "TISSUE", "AGE_BIN")) %>%
mutate(CONT_EST = AVG_READ_DEPTH_AT_ANCHOR/AVG_READ_DEPTH) %>%
select(STRAIN, TISSUE, AGE_BIN, CHRM, ANCHOR, CONT_EST)
```
```{r}
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/checking_NUMT")
#this file contains the prop of reads mapped to the junctions without strict filtering
junction_prop_file = "files/cons_junction_cont_est.txt"
junction_prop = read.table(junction_prop_file, header = TRUE, stringsAsFactors = FALSE)
```
```{r}
plotting_junctions = junction_prop %>%
filter(STRAIN == "NZB") %>%
mutate(X_LABEL = paste0(TISSUE, "_", AGE_BIN))
```
```{r}
plotting_cont_anchors = cont_est_anchor_pts %>%
mutate(X_LABEL = paste0(TISSUE, "_", AGE_BIN))
```
```{r}
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/checking_NUMT")
comparison_NZBanchors_NZBjunction = ggplot(plotting_cont_anchors) +
geom_boxplot(aes(x = X_LABEL, y = CONT_EST, color = CHRM)) +
geom_point(data = plotting_junctions, aes(x = X_LABEL, y = JUNCTION_BEFORE), color = "#990033", size = 0.7) +
geom_point(data = plotting_junctions, aes(x = X_LABEL, y = JUNCTION_AFTER), color = "#990033", size = 0.8) +
scale_y_log10() +
xlab("Condition") +
ylab("Proportion of Reads Mapped \n(Haploptype Sites: chrM)") +
scale_color_manual(name = "Chromosome", values = c("cornflowerblue", "black")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 6),
text = element_text(family = "sans"),
axis.title.x = element_text(size = 8.25),
axis.title.y = element_text(size = 8),
axis.text.y=element_text(size = 6.5),
legend.position = "None")
pdf(paste(outdir_figures,"/comparison_NZBanchors_NZBjunction.pdf",sep=""),width=3.25,height=3)
print(comparison_NZBanchors_NZBjunction)
dev.off()
pdf(paste(outdir_figures,"/leg_comparison_NZBanchors_NZBjunction.pdf",sep=""),width=3.5,height=3.5)
print(comparison_NZBanchors_NZBjunction + theme(legend.position = "left"))
dev.off()
```