--- title: "R Notebook" output: html_notebook --- ```{r} library(tidyverse) library(ggplot2) ``` ```{r} setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/") outdir_figures = "figures/" outdir_files = "files/" #this file contains REVERSION haplotype sites (ALT > B6 REF) wihtout any correction for NUMT contamination haplotype_reversions_file = "../input_files/haplotype_mutations.vcf" haplotype_reversions = read.table(haplotype_reversions_file, header = TRUE, stringsAsFactors = FALSE) #this file contains the estimated chr1 NUMT contamination NUMT_mapping_prop_file = "../checking_NUMT/files/cons_junction_cont_est.txt" NUMT_mapping_prop = read.table(NUMT_mapping_prop_file, header = TRUE, stringsAsFactors = FALSE) ``` We need to label positions that are in the NUMT region in our haplotype reversion file -- we will implement the correction for all positions in the NUMT regardless if it's a haplotype position or not ```{r} cond_haplotype_reversions = haplotype_reversions %>% mutate(STRAIN = ifelse(STRAIN == "F", "FVB", STRAIN)) %>% select(STRAIN, TISSUE, AGE_BIN, START, REF_ALLELE_DEPTH, READ_DEPTH_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, START) %>% summarise(COND_REV_ALLELE_DEPTH = sum(REF_ALLELE_DEPTH), COND_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) ``` ```{r} #building our labeller NUMT_labeler = function(position) { #remember we are in 0-indexed coords start = 6393 end = 11041 if(position >= start & position <= end) { label = "NUMT"} else{label = "NOT_NUMT"} return(label) } ``` Labeling our NUMT region ```{r} labeled_positions = cond_haplotype_reversions %>% rowwise() %>% mutate(REGION = NUMT_labeler(START)) ``` ```{r} NUMT_mapping_prop = NUMT_mapping_prop %>% mutate(REGION = "NUMT") %>% mutate(STRAIN = ifelse(STRAIN == "F", "FVB", STRAIN)) NUMT_contamination = NUMT_mapping_prop %>% filter(CHRM == "chr1") %>% #let's choose our contamination perc == max(JUNCTION_BEFORE, JUNCTION_AFTER) select(STRAIN, TISSUE, AGE_BIN, REGION, JUNCTION_BEFORE, JUNCTION_AFTER) %>% group_by(STRAIN, TISSUE, AGE_BIN, REGION) %>% mutate(CONT_PERC = max(JUNCTION_BEFORE, JUNCTION_AFTER)) ``` Merging our mut_freq info with our contamination info ```{r} merged_df = labeled_positions %>% left_join(NUMT_contamination, by = c("STRAIN", "TISSUE", "AGE_BIN", "REGION")) %>% mutate(CONT_PERC = ifelse(is.na(CONT_PERC), 0, CONT_PERC)) %>% select(STRAIN, TISSUE, AGE_BIN, START, COND_REV_ALLELE_DEPTH, COND_READ_DEPTH_AT_POS, CONT_PERC) ``` ```{r} rm(NUMT_contamination, haplotype_reversions, labeled_positions) ``` Now calculating our correction factor We assume that every read that comes from chr1 contains the B6 allele; thus, we subtract the # of reads that we calculate as coming from ch1 from both the read depth and the allele depth ```{r} NUMT_corrected_mut_freq = merged_df %>% mutate(EST_READS_MAPPED_TO_CHRM1 = COND_READ_DEPTH_AT_POS *CONT_PERC) %>% mutate(CORR_READ_DEPTH = COND_READ_DEPTH_AT_POS - EST_READS_MAPPED_TO_CHRM1, CORR_REV_ALLELE_DEPTH = COND_REV_ALLELE_DEPTH - EST_READS_MAPPED_TO_CHRM1) %>% mutate(FLOORED_CORR_REV_ALLELE_DEPTH = ifelse(CORR_REV_ALLELE_DEPTH < 0, 0, CORR_REV_ALLELE_DEPTH)) ``` Writing this file out for future analyses ```{r} setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/") write.table(NUMT_corrected_mut_freq, file = paste(outdir_files,"corrected_reversion_counts.txt", sep = ""), sep = "\t", quote = F, row.names = T) ```