--- title: "R Notebook" output: html_notebook --- ```{r} library(tidyverse) library(ggplot2) library(PNWColors) library(gridExtra) ``` ```{r} setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/") outdir_figures = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/figures/" outdir_files = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/files/" supertable_file ="/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt" supertable = read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE) coordinates_file = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/Mouse_Mt_Genome_Coordinates" coordinates = read.table(coordinates_file, stringsAsFactors = FALSE, sep = "\t") ``` ```{r} #adding the D-Loop coordinates to our file coordinates[nrow(coordinates) + 1, ] <- c("D-Loop", 15422, 16299) #adding column names to dataframe colnames(coordinates) <- c("GENE", "START", "END") coordinates$START <- as.numeric(coordinates$START) coordinates$END <- as.numeric(coordinates$END) #so things don't get wonky if the coordinates are read as strings -- also we convert our coordinates to be 0-indexed coordinates$START <- as.numeric(coordinates$START) - 1 coordinates$END <- as.numeric(coordinates$END) - 1 coordinates = coordinates %>% mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA", grepl("mt-T", GENE) ~ "tRNA", GENE == "D-Loop" ~ "D-Loop", GENE == "mt-OL" ~ "OriL", grepl("mt-[N|A|C]",GENE) ~ "Protein", TRUE ~ "Intergene")) ``` We need to figure out the length of each region in order to normalize by length. ```{r} length_of_region = coordinates %>% mutate(LENGTH_OF_GENE = (END - START) + 1) %>% filter(TYPE != "Intergene") %>% select(TYPE, LENGTH_OF_GENE) %>% group_by(TYPE) %>% summarise(LENGTH_OF_TYPE = sum(LENGTH_OF_GENE)) ``` Now, we need to figure out the average read depth of each region for every condition ```{r} avg_read_depth_per_region = supertable %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, GENE, START, READ_DEPTH_AT_POS) %>% #unique here so that we eliminate redundancy from multiple mutation types unique() %>% select(STRAIN, TISSUE, AGE_BIN, GENE, START, READ_DEPTH_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, GENE, START) %>% summarise(COND_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, GENE, START, COND_READ_DEPTH_AT_POS) %>% mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA", grepl("mt-T", GENE) ~ "tRNA", GENE == "D-Loop" ~ "D-Loop", GENE == "mt-OL" ~ "OriL", grepl("mt-[N|A|C]",GENE) ~ "Protein", TRUE ~ "Intergene")) %>% #filtering out those few bp that exist between these regions filter(TYPE != "Intergene") %>% select(STRAIN, TISSUE, AGE_BIN, TYPE, COND_READ_DEPTH_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>% summarise(AVG_READ_DEPTH_REGION = mean(COND_READ_DEPTH_AT_POS)) ``` We want to normalize for sequencing depth across our conditions (i.e. we wouldn't capture mutations if we didn't sequence to the level that we did in some samples): ```{r} norm_seq_depth = supertable %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, GENE, START, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% group_by(SAMPLE, STRAIN, TISSUE, AGE_BIN, GENE, START, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% summarise(SAMPLE_MUT_COUNT_AT_POS = sum(ALT_ALLELE_DEPTH)) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, GENE, START, SAMPLE_MUT_COUNT_AT_POS, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, GENE, START, CONDITION_MUT_FREQ_AT_POS) %>% summarise(CONDITION_MUT_COUNT_AT_POS = sum(SAMPLE_MUT_COUNT_AT_POS), CONDITION_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>% ungroup() %>% group_by(GENE, START) %>% mutate(MIN_READ_DEPTH_AT_POS = min(CONDITION_READ_DEPTH_AT_POS)) %>% #this is the lowest mutation frequency we would be able to get given the smallest read depth at a position across our conditions mutate(FLOOR_MIN_MUT_FREQ_AT_POS = 1/MIN_READ_DEPTH_AT_POS) %>% #if our mutation frequency is lower than the floor we set, we reset our condition mutation count to 0 --> in essence we wouldn't have been able to capture these mutations without the sequencing depth we had mutate(CONDITION_MUT_COUNT_AT_POS = ifelse(CONDITION_MUT_FREQ_AT_POS < FLOOR_MIN_MUT_FREQ_AT_POS, 0, CONDITION_MUT_COUNT_AT_POS)) ``` Process our condition mutation count now that we've normalized for sequencing depth: ```{r} region_mut_freq_df = norm_seq_depth %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, GENE, CONDITION_MUT_COUNT_AT_POS) %>% mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA", grepl("mt-T", GENE) ~ "tRNA", GENE == "D-Loop" ~ "D-Loop", GENE == "mt-OL" ~ "OriL", grepl("mt-[N|A|C]",GENE) ~ "Protein", TRUE ~ "Intergene")) %>% #filtering out those few bp that exist between these regions filter(TYPE != "Intergene") %>% select(STRAIN, TISSUE, AGE_BIN, TYPE, CONDITION_MUT_COUNT_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>% summarise(CONDITION_MUT_COUNT_REGION = sum(CONDITION_MUT_COUNT_AT_POS)) ``` Processing our condition mutation count WO identified outliers ```{r} wo_hfps_region_mut_freq_df = norm_seq_depth %>% filter(CONDITION_MUT_FREQ_AT_POS < 0.001) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, GENE, CONDITION_MUT_COUNT_AT_POS) %>% mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA", grepl("mt-T", GENE) ~ "tRNA", GENE == "D-Loop" ~ "D-Loop", GENE == "mt-OL" ~ "OriL", grepl("mt-[N|A|C]",GENE) ~ "Protein", TRUE ~ "Intergene")) %>% #filtering out those few bp that exist between these regions filter(TYPE != "Intergene") %>% select(STRAIN, TISSUE, AGE_BIN, TYPE, CONDITION_MUT_COUNT_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>% summarise(WO_HFPS_CONDITION_MUT_COUNT_REGION = sum(CONDITION_MUT_COUNT_AT_POS)) ``` ```{r} rm(supertable) ``` Now combining all of our dataframes to make one large df with our info -- our summary df for the info without HFPs ```{r} summary_df = wo_hfps_region_mut_freq_df %>% full_join(region_mut_freq_df, by = c("STRAIN", "TISSUE", "AGE_BIN", "TYPE")) %>% left_join(length_of_region, by = "TYPE") %>% left_join(avg_read_depth_per_region, by = c("STRAIN", "TISSUE", "AGE_BIN", "TYPE")) %>% mutate(WO_HFPS_PERC_REGION_MUTATED = WO_HFPS_CONDITION_MUT_COUNT_REGION/(LENGTH_OF_TYPE*AVG_READ_DEPTH_REGION)) %>% mutate(HFPS_PERC_REGION_MUTATED = CONDITION_MUT_COUNT_REGION/(LENGTH_OF_TYPE*AVG_READ_DEPTH_REGION)) write.table(summary_df, file = paste(outdir_files,"/mut_freq_per_region.txt", sep = ""), sep = "\t", quote = F, row.names = T) ``` Calculating statistics for our comparisons: 1) Avg % region mutated in OriL compared to D-Loop ```{r} summary_df %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, TYPE, WO_HFPS_PERC_REGION_MUTATED) %>% filter(TYPE == "D-Loop" | TYPE == "OriL") %>% mutate(TYPE = recode(TYPE, "D-Loop" = "DLOOP")) %>% pivot_wider(names_from = TYPE, values_from = WO_HFPS_PERC_REGION_MUTATED) %>% mutate(FOLD_CHANGE = OriL/DLOOP) %>% select(STRAIN, TISSUE, AGE_BIN, FOLD_CHANGE) %>% group_by(AGE_BIN) %>% summarise(AVG_FOLD_CHANGE = mean(FOLD_CHANGE), MEDIAN_FOLD_CHANGE = median(FOLD_CHANGE)) ``` 2) Statistical difference in mutation frequency between D-Loop at RNA coding regions ```{r} t_test_df = summary_df %>% ungroup() %>% filter(TYPE != "OriL") %>% mutate(TYPE = recode(TYPE, "D-Loop" = "DLOOP")) %>% mutate(TYPE = ifelse(TYPE == "DLOOP", TYPE, "RNA_CODING")) %>% group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>% summarise(SUM_MUT_COUNT_WO_HFPS = sum(WO_HFPS_CONDITION_MUT_COUNT_REGION), AVG_DEPTH = mean(AVG_READ_DEPTH_REGION), SUM_LENGTH_OF_REGIONS = sum(LENGTH_OF_TYPE)) %>% mutate(PERC_REGION_MUT = SUM_MUT_COUNT_WO_HFPS/(AVG_DEPTH * SUM_LENGTH_OF_REGIONS)) %>% select(STRAIN, TISSUE, AGE_BIN, TYPE, PERC_REGION_MUT) %>% pivot_wider(values_from = PERC_REGION_MUT, names_from = TYPE) avg_perc_mut = summary_df %>% ungroup() %>% filter(TYPE != "OriL") %>% mutate(TYPE = recode(TYPE, "D-Loop" = "DLOOP")) %>% mutate(TYPE = ifelse(TYPE == "DLOOP", TYPE, "RNA_CODING")) %>% group_by(STRAIN, TISSUE, AGE_BIN, TYPE) %>% summarise(SUM_MUT_COUNT_WO_HFPS = sum(WO_HFPS_CONDITION_MUT_COUNT_REGION), AVG_DEPTH = mean(AVG_READ_DEPTH_REGION), SUM_LENGTH_OF_REGIONS = sum(LENGTH_OF_TYPE)) %>% mutate(PERC_REGION_MUT = SUM_MUT_COUNT_WO_HFPS/(AVG_DEPTH * SUM_LENGTH_OF_REGIONS)) %>% select(STRAIN, TISSUE, AGE_BIN, TYPE, PERC_REGION_MUT) %>% group_by(AGE_BIN, TYPE) %>% summarise(AVG = mean(PERC_REGION_MUT)) young = t_test_df %>% filter(AGE_BIN == "YOUNG") old = t_test_df %>% filter(AGE_BIN == "OLD") print(t_test_df) print(avg_perc_mut) t.test(young$DLOOP, young$RNA_CODING, paired = TRUE, alternative = "two.sided") t.test(old$DLOOP, old$RNA_CODING, paired = TRUE, alternative = "two.sided") ``` Plotting the dot plots wo outliers Colors: ```{r} library(RColorBrewer) library(PNWColors) lake_pal <- pnw_palette(name="Lake", type="discrete") spectral = brewer.pal(11,"Spectral") PRGn = brewer.pal(11,"PRGn") set1 = brewer.pal(3,"Set1") PuRd = brewer.pal(11,"PuRd") Blues = brewer.pal(9, "Blues") colors = c(Blues[9], lake_pal[6], set1[1], PuRd[6],PRGn[4]) colors_in_order = c(set1[1], Blues[9], PRGn[4], lake_pal[6], PuRd[6] ) ``` Percent of region mutated across conditions ```{r} summary_df$AGE_BIN = factor(summary_df$AGE_BIN, level = c("YOUNG", "OLD")) summary_df$TYPE= factor(summary_df$TYPE, level=c("OriL","D-Loop","tRNA", "Protein", "rRNA")) summary_df$STRAIN= factor(summary_df$STRAIN, level=c("B6", "AKR", "ALR", "FVB", "NZB")) summary_df = summary_df %>% mutate(TISSUE = recode(TISSUE, "Brain" = "B", "Heart" = "H", "Liver" = "L")) mut_freq_per_region_wo_hfps=ggplot(summary_df) mut_freq_per_region_wo_hfps=mut_freq_per_region_wo_hfps+ geom_point(aes(y=WO_HFPS_PERC_REGION_MUTATED, size=WO_HFPS_PERC_REGION_MUTATED, x=STRAIN, color=TYPE,shape= AGE_BIN, group= TISSUE), alpha=0.8)+ facet_wrap(TYPE~TISSUE,nrow=1, strip.position = "bottom")+ #facet_wrap(TISSUE~TYPE,nrow=1, strip.position = "bottom")+ #facet_wrap(~TYPE,nrow=1)+ theme_bw(base_size=6)+ theme(strip.background = element_blank())+ scale_shape_manual(name = "Age", labels = c("Young", "Old"), values=c(1,19))+ #theme(legend.position="None")+ theme(panel.grid = element_blank())+ theme(axis.text.x=element_text(angle=90, hjust=1, vjust=1))+ scale_x_discrete("Strain", position = "top")+ scale_y_continuous("P(mutated)")+ #scale_alpha_manual(values=c(0.6,0.8,1))+ scale_size_continuous(name = "Frequency", range=c(.1,4))+ scale_color_manual(values = colors_in_order) + #theme(legend.position = "none") + theme(axis.text.x=element_text(size=3.5), axis.text.y=element_text(size=7), axis.title.y = element_text(size = 8), axis.title.x = element_text(size = 8.25), strip.text.x = element_text(size = 8), legend.text = element_text(size = 6), legend.position = "none") pdf(paste(outdir_figures,"/wo_outliers_mut_freq_per_region.pdf",sep=""), width=8.5,height=2) print(mut_freq_per_region_wo_hfps) dev.off() pdf(paste(outdir_figures,"/leg_wo_outliers_mut_freq_per_region.pdf",sep=""), width=8.5,height=3) print(mut_freq_per_region_wo_hfps + theme(legend.position = "bottom")) dev.off() ``` ```{r} summary_df$AGE_BIN = factor(summary_df$AGE_BIN, level = c("YOUNG", "OLD")) summary_df$TYPE= factor(summary_df$TYPE, level=c("OriL","D-Loop","tRNA", "Protein", "rRNA")) summary_df$STRAIN= factor(summary_df$STRAIN, level=c("B6", "AKR", "ALR", "FVB", "NZB")) summary_df = summary_df %>% mutate(TISSUE = recode(TISSUE, "Brain" = "B", "Heart" = "H", "Liver" = "L")) #t$TISSUE=factor(t$TISSUE, levels=c("Liver","Heart","Brain")) #g=ggplot(t %>% filter(TISSUE == "Brain")) log_mut_freq_per_region_wo_hfps=ggplot(summary_df) log_mut_freq_per_region_wo_hfps=log_mut_freq_per_region_wo_hfps+geom_point(aes(y=WO_HFPS_PERC_REGION_MUTATED,size=WO_HFPS_PERC_REGION_MUTATED,x=STRAIN,color=TYPE,shape=AGE_BIN,group=TISSUE),alpha=0.8)+ facet_wrap(TYPE~TISSUE,nrow=1)+ #facet_wrap(TISSUE~TYPE,nrow=3)+ #facet_wrap(~TYPE,nrow=1)+ theme_bw(base_size=6)+ theme(strip.background = element_blank())+ scale_shape_manual(values=c(1,19))+ #theme(legend.position="None")+ theme(panel.grid = element_blank())+ theme(axis.text.x=element_text(angle=90, hjust=1, vjust=1))+ scale_x_discrete("Strain")+ #scale_y_continuous("P(mutated)")+ scale_y_log10("P(mutated)") + #scale_alpha_manual(values=c(0.6,0.8,1))+ scale_size_continuous(range=c(.1,4))+ scale_color_manual(values = colors_in_order) + #theme(legend.position = "none") + theme(axis.text.x=element_text(size=3.5), axis.text.y=element_text(size=7), axis.title = element_text(size = 9), strip.text.x = element_text(size = 8), legend.position = "none") pdf(paste(outdir_figures,"/log_wo_outliers_mut_freq_per_region.pdf",sep=""), width=8.5,height=3) print(log_mut_freq_per_region_wo_hfps) dev.off() ``` Summary figure of delta values without high frequency positions ```{r} summary_wo_hfps_spread = summary_df %>% filter(!(STRAIN == "B6" & TISSUE == "L")) %>% select(STRAIN, TISSUE, AGE_BIN, TYPE, WO_HFPS_PERC_REGION_MUTATED) %>% pivot_wider(names_from = AGE_BIN, values_from = WO_HFPS_PERC_REGION_MUTATED) %>% mutate(DELTA = OLD - YOUNG) ``` ```{r} summary_wo_hfps_spread$TISSUE=factor(summary_wo_hfps_spread$TISSUE, levels=c("L","B","H")) g=ggplot(summary_wo_hfps_spread) g=g+ geom_vline(xintercept = 0, linetype = "dashed", color = "gray") + stat_summary(aes(x=DELTA,y=TISSUE,color=TYPE,group=TISSUE,shape=TISSUE), size = 0.3)+ theme_bw(base_size=6)+ facet_grid(~TYPE)+ ylab("Tissue") + xlab("Delta % mutated per region (old - young)") + scale_color_manual(values = colors_in_order) + scale_shape_manual(values = c(15, 19, 17)) + theme(strip.background = element_blank())+ theme(legend.position="None")+ theme(panel.grid = element_blank())+ theme(axis.text.x=element_text(size = 6, angle = 45, vjust = 1, hjust = 1), axis.text.y =element_text(size = 6), axis.title = element_text(size = 7), strip.text.x = element_text(size = 6), text = element_text(family = "sans")) pdf(paste(outdir_figures,"/wo_outliers_mut_freq_per_region_summary.pdf",sep=""),width= 7,height=1) print(g) dev.off() ```