---
title: "R Notebook"
output: html_notebook
---
```{r}
library(tidyverse)
library(ggplot2)
```
```{r}
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/")
outdir_figures = "figures/"
outdir_files = "files/"
#this file contains REVERSION haplotype sites (ALT > B6 REF) wihtout any correction for NUMT contamination
haplotype_reversions_file = "../input_files/haplotype_mutations.vcf"
haplotype_reversions = read.table(haplotype_reversions_file, header = TRUE, stringsAsFactors = FALSE)
#this file contains the estimated chr1 NUMT contamination
NUMT_mapping_prop_file = "../checking_NUMT/files/cons_junction_cont_est.txt"
NUMT_mapping_prop = read.table(NUMT_mapping_prop_file, header = TRUE, stringsAsFactors = FALSE)
```
We need to label positions that are in the NUMT region in our haplotype reversion file -- we will implement the correction for all positions in the NUMT regardless if it's a haplotype position or not
```{r}
cond_haplotype_reversions = haplotype_reversions %>%
mutate(STRAIN = ifelse(STRAIN == "F", "FVB", STRAIN)) %>%
select(STRAIN, TISSUE, AGE_BIN, START, REF_ALLELE_DEPTH, READ_DEPTH_AT_POS) %>%
group_by(STRAIN, TISSUE, AGE_BIN, START) %>%
summarise(COND_REV_ALLELE_DEPTH = sum(REF_ALLELE_DEPTH), COND_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS))
```
```{r}
#building our labeller
NUMT_labeler = function(position) {
#remember we are in 0-indexed coords
start = 6393
end = 11041
if(position >= start & position <= end) {
label = "NUMT"}
else{label = "NOT_NUMT"}
return(label)
}
```
Labeling our NUMT region
```{r}
labeled_positions = cond_haplotype_reversions %>%
rowwise() %>%
mutate(REGION = NUMT_labeler(START))
```
```{r}
NUMT_mapping_prop = NUMT_mapping_prop %>%
mutate(REGION = "NUMT") %>%
mutate(STRAIN = ifelse(STRAIN == "F", "FVB", STRAIN))
NUMT_contamination = NUMT_mapping_prop %>%
filter(CHRM == "chr1") %>%
#let's choose our contamination perc == max(JUNCTION_BEFORE, JUNCTION_AFTER)
select(STRAIN, TISSUE, AGE_BIN, REGION, JUNCTION_BEFORE, JUNCTION_AFTER) %>%
group_by(STRAIN, TISSUE, AGE_BIN, REGION) %>%
mutate(CONT_PERC = max(JUNCTION_BEFORE, JUNCTION_AFTER))
```
Merging our mut_freq info with our contamination info
```{r}
merged_df = labeled_positions %>%
left_join(NUMT_contamination, by = c("STRAIN", "TISSUE", "AGE_BIN", "REGION")) %>%
mutate(CONT_PERC = ifelse(is.na(CONT_PERC), 0, CONT_PERC)) %>%
select(STRAIN, TISSUE, AGE_BIN, START, COND_REV_ALLELE_DEPTH, COND_READ_DEPTH_AT_POS, CONT_PERC)
```
```{r}
rm(NUMT_contamination, haplotype_reversions, labeled_positions)
```
Now calculating our correction factor
We assume that every read that comes from chr1 contains the B6 allele; thus, we subtract the # of reads that we calculate as coming from ch1 from both the read depth and the allele depth
```{r}
NUMT_corrected_mut_freq = merged_df %>%
mutate(EST_READS_MAPPED_TO_CHRM1 = COND_READ_DEPTH_AT_POS *CONT_PERC) %>%
mutate(CORR_READ_DEPTH = COND_READ_DEPTH_AT_POS - EST_READS_MAPPED_TO_CHRM1,
CORR_REV_ALLELE_DEPTH = COND_REV_ALLELE_DEPTH - EST_READS_MAPPED_TO_CHRM1) %>%
mutate(FLOORED_CORR_REV_ALLELE_DEPTH = ifelse(CORR_REV_ALLELE_DEPTH < 0, 0, CORR_REV_ALLELE_DEPTH))
```
Writing this file out for future analyses
```{r}
setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/")
write.table(NUMT_corrected_mut_freq, file = paste(outdir_files,"corrected_reversion_counts.txt", sep = ""), sep = "\t", quote = F, row.names = T)
```