--- title: "R Notebook" output: html_notebook --- ```{r} setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/") outdir_figures = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/figures/" outdir_files = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/files/" supertable_file ="/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt" supertable = read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE) ``` ```{r} supertable %>% filter(START > 9820, START < 9827) ``` Normalize for sequencing depth: ```{r} norm_seq_depth = supertable %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, REF, ALT, START, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% #unique here to get rid of redundant mutations present at pos that overlap in genes unique() %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% group_by(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% summarise(SAMPLE_MUT_COUNT_AT_POS = sum(ALT_ALLELE_DEPTH)) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, START, SAMPLE_MUT_COUNT_AT_POS, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, START, CONDITION_MUT_FREQ_AT_POS) %>% summarise(CONDITION_MUT_COUNT_AT_POS = sum(SAMPLE_MUT_COUNT_AT_POS), CONDITION_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>% ungroup() %>% group_by(START) %>% mutate(MIN_READ_DEPTH_AT_POS = min(CONDITION_READ_DEPTH_AT_POS)) %>% mutate(FLOOR_MIN_MUT_FREQ_AT_POS = 1/MIN_READ_DEPTH_AT_POS) %>% mutate(CONDITION_MUT_COUNT_AT_POS = ifelse(CONDITION_MUT_FREQ_AT_POS < FLOOR_MIN_MUT_FREQ_AT_POS, 0, CONDITION_MUT_COUNT_AT_POS)) ``` Plotting time: Color palette ```{r} library(PNWColors) bay_pal <- pnw_palette(name="Bay", type="discrete") ``` Pinpointing the high frequency region to the tRNA ```{r} trna_reg = norm_seq_depth %>% #this was the arbitrary frequency we define as being a high frequency position filter(CONDITION_MUT_FREQ_AT_POS < 0.001) %>% mutate(NORM_CONDITION_MUT_FREQ_AT_POS = CONDITION_MUT_COUNT_AT_POS/CONDITION_READ_DEPTH_AT_POS) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, START, NORM_CONDITION_MUT_FREQ_AT_POS) %>% filter(START > 9400, START < 10200) %>% filter(NORM_CONDITION_MUT_FREQ_AT_POS > 0) trna_reg$STRAIN = factor(trna_reg$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB")) trna_reg$AGE_BIN = factor(trna_reg$AGE_BIN, level = c("YOUNG", "OLD")) ``` ```{r} trna_reg_plot = ggplot(trna_reg %>% mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Young", "OLD" = "Old")), aes(x = START, y = NORM_CONDITION_MUT_FREQ_AT_POS, color = TISSUE)) trna_reg_plot = trna_reg_plot + geom_point(size = 0.5, alpha = 0.7) + geom_segment(x = 9807, xend = 9876, y = 0, yend = 0, color = "black") + theme_bw() + facet_grid(STRAIN~AGE_LABEL) + ylab("Mutation frequency at position") + xlab("Position on the mt-genome (bp)") + scale_color_manual(name = "Tissue", values= bay_pal[c(1,5,4)]) + guides(color = guide_legend(override.aes = list(size = 3))) + theme(strip.background=element_blank(), panel.grid=element_blank(), text = element_text(family = "sans"), axis.title = element_text(size = 10), strip.text.x = element_text(size = 10, vjust = 1), strip.text.y = element_text(size = 10, vjust = 1), axis.text.y=element_text(size = 8), axis.text.x=element_text(size = 8, angle = 45, vjust = 1, hjust = 1), legend.position = "bottom") pdf(paste(outdir_figures,"/mut_freq_nd3_trna_nd4_reg.pdf",sep=""),width=6,height=4) print(trna_reg_plot) dev.off() ``` Zooming into the tRNA region to identify high frequency sites ```{r} trna_arg = norm_seq_depth %>% #this was the arbitrary frequency we define as being a high frequency position filter(CONDITION_MUT_FREQ_AT_POS < 0.001) %>% mutate(NORM_CONDITION_MUT_FREQ_AT_POS = CONDITION_MUT_COUNT_AT_POS/CONDITION_READ_DEPTH_AT_POS) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, START, NORM_CONDITION_MUT_FREQ_AT_POS) %>% filter(START > 9816, START < 9830) trna_arg$STRAIN = factor(trna_arg$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB")) trna_arg$AGE_BIN = factor(trna_arg$AGE_BIN, level = c("YOUNG", "OLD")) ``` ```{r} trna_arg_plot = ggplot(trna_arg %>% mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Young", "OLD" = "Old")), aes(x = START, y = NORM_CONDITION_MUT_FREQ_AT_POS, color = TISSUE)) trna_arg_plot = trna_arg_plot + geom_point(size = 0.5, alpha = 0.7) + #geom_point(x = 9818, y = 0.00075, shape = 8, size = 0.7, color = "magenta") + theme_bw() + facet_grid(STRAIN~AGE_LABEL) + ylab("Mutation frequency at position") + xlab("Position on the mt-genome (bp)") + scale_color_manual(name = "Tissue", values= bay_pal[c(1,5,4)]) + scale_x_continuous(breaks = seq(9816,9830,1)) + guides(color = guide_legend(override.aes = list(size = 3))) + theme(strip.background=element_blank(), panel.grid=element_blank(), text = element_text(family = "sans"), axis.title = element_text(size = 8), strip.text.x = element_text(size = 8, vjust = 1), strip.text.y = element_text(size = 8, vjust = 1), axis.text.y=element_text(size = 6), axis.text.x=element_text(size = 6, angle = 45, vjust = 1, hjust = 1), legend.position = "bottom") print(trna_arg_plot) pdf(paste(outdir_figures,"/mut_freq_trna_arg.pdf",sep=""),width=4,height=3) print(trna_arg_plot) dev.off() ``` ```{r} main_fig_trna_arg = trna_arg %>% ungroup() %>% filter(START > 9819, START < 9828) %>% group_by(STRAIN, AGE_BIN, START) %>% summarise(STRAIN_AVG_FREQ = mean(NORM_CONDITION_MUT_FREQ_AT_POS)) %>% filter(STRAIN_AVG_FREQ > 0) %>% mutate(STRAIN_LABEL = ifelse(STRAIN == "B6", "B6", "Conplastic")) %>% mutate(X_POS = ifelse(AGE_BIN == "YOUNG", START - 0.15, START + 0.15)) ``` ```{r} X_LABEL = (supertable %>% filter(START > 9815, START < 9831) %>% select(START, REF) %>% unique() %>% filter(nchar(REF) == 1) %>% mutate(X_LABEL = paste(REF, START, sep = "")))$X_LABEL ``` ```{r} main_fig_trna_arg_plot = ggplot(main_fig_trna_arg, aes(x = X_POS, y = STRAIN_AVG_FREQ, color = STRAIN, shape = AGE_BIN)) main_fig_trna_arg_plot = main_fig_trna_arg_plot + geom_point(size = 0.3) + #geom_point(x = 9818, y = 0.00075, shape = 8, size = 0.7, color = "magenta") + theme_bw(base_size = 6) + facet_wrap(STRAIN_LABEL~., nrow = 2) + ylab("Mutation frequency") + xlab("Position (bp)") + scale_color_manual(values = c("B6"= "#1d457f","AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) + scale_shape_manual(labels = c("Young", "Old") , values = c(1,19)) + scale_x_continuous(breaks = seq(9816,9830,1), labels = X_LABEL) + theme(strip.background=element_blank(), panel.grid=element_blank(), text = element_text(family = "sans"), axis.title = element_text(size = 5), strip.text.x = element_text(size = 6, vjust = 1), strip.text.y = element_text(size = 6, vjust = 1), axis.text.y=element_text(size = 3.5), axis.text.x=element_text(size = 3.5, angle = 45, vjust = 1, hjust = 1), legend.position = "none") print(main_fig_trna_arg_plot) pdf(paste(outdir_figures,"/main_fig_trna_arg_plot.pdf",sep=""),width=1,height=1.25) print(main_fig_trna_arg_plot) dev.off() pdf(paste(outdir_figures,"/leg_main_fig_trna_arg_plot.pdf",sep=""),width=3,height=1.75) print(main_fig_trna_arg_plot + guides(shape = guide_legend(override.aes = list(size = 2)), color = guide_legend(override.aes = list(size = 2))) + theme(legend.position = "right", legend.key.size = unit(0.25, "cm"))) dev.off() ``` We consolidate the mutation frequency at each position to a mutation frequency across the region since we can't with certainty say where the mutations are occurring in the repeat region ```{r} region_mut_freq = norm_seq_depth %>% #this was the arbitrary frequency we define as being a high frequency position filter(CONDITION_MUT_FREQ_AT_POS < 0.001) %>% ungroup() %>% #zoom into the hotspot region filter(START > 9819, START < 9828) %>% select(STRAIN, TISSUE, AGE_BIN, CONDITION_MUT_COUNT_AT_POS, CONDITION_READ_DEPTH_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN) %>% summarise(REGION_MUT_COUNT = sum(CONDITION_MUT_COUNT_AT_POS), REGION_READ_DEPTH = sum(CONDITION_READ_DEPTH_AT_POS)) %>% mutate(REGION_MUT_FREQ = REGION_MUT_COUNT/REGION_READ_DEPTH) %>% ungroup() %>% select(STRAIN, AGE_BIN, REGION_MUT_FREQ) %>% group_by(STRAIN, AGE_BIN) %>% #averaging across tissues in a strain summarise(AVG_REGION_MUT_FREQ = mean(REGION_MUT_FREQ)) %>% filter(AVG_REGION_MUT_FREQ > 0) %>% mutate(STRAIN_LABEL = ifelse(STRAIN == "B6", "B6", "Conplastic")) %>% mutate(X_POS = ifelse(AGE_BIN == "YOUNG", 0.95, 1.05)) region_mut_freq$STRAIN = factor(region_mut_freq$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB")) region_mut_freq$AGE_BIN = factor(region_mut_freq$AGE_BIN, level = c("YOUNG", "OLD")) ``` ```{r} region_mut_freq_plot = ggplot(region_mut_freq, aes(x = X_POS, y = AVG_REGION_MUT_FREQ, color = STRAIN, shape = AGE_BIN)) region_mut_freq_plot = region_mut_freq_plot + geom_point(size = 0.3) + #geom_point(x = 9818, y = 0.00075, shape = 8, size = 0.7, color = "magenta") + theme_bw(base_size = 6) + facet_wrap(STRAIN_LABEL~., nrow = 2) + ylab("Mutation frequency") + xlab("A-Repeat Region (7)\n (Indels)") + scale_color_manual(values = c("B6"= "#1d457f","AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) + scale_shape_manual(labels = c("Young", "Old") , values = c(1,19)) + scale_x_continuous(limits = c(0.9,1.1), breaks = c(0.95, 1.05), labels = c("Y", "O")) + theme(strip.background=element_blank(), panel.grid=element_blank(), text = element_text(family = "sans"), axis.title = element_text(size = 5), strip.text.x = element_text(size = 6, vjust = 1), strip.text.y = element_text(size = 6, vjust = 1), axis.text.y=element_text(size = 3.5), axis.text.x=element_text(size = 3.5), legend.position = "none") print(region_mut_freq_plot) pdf(paste(outdir_figures,"/region_trna_arg_plot.pdf",sep=""),width=1,height=1.25) print(region_mut_freq_plot) dev.off() pdf(paste(outdir_figures,"/leg_region_trna_arg_plot.pdf",sep=""),width=3,height=1.75) print(region_mut_freq_plot + guides(shape = guide_legend(override.aes = list(size = 2)), color = guide_legend(override.aes = list(size = 2))) + theme(legend.position = "right", legend.key.size = unit(0.25, "cm"))) dev.off() ``` ```{r} avg_across_strains_trna_arg = trna_arg %>% ungroup() %>% filter(START > 9819, START < 9828) %>% mutate(STRAIN_LABEL = ifelse(STRAIN == "B6", "B6", "Conplastic")) %>% filter(STRAIN_LABEL == "Conplastic") %>% select(STRAIN, AGE_BIN, START, NORM_CONDITION_MUT_FREQ_AT_POS) %>% group_by(STRAIN, AGE_BIN, START) %>% summarise(AVG_FREQ = mean(NORM_CONDITION_MUT_FREQ_AT_POS)) ``` ```{r} avg_across_strains_trna_plot = ggplot(avg_across_strains_trna_arg, aes(x = START, y = AVG_FREQ, shape = AGE_BIN)) avg_across_strains_trna_plot = avg_across_strains_trna_plot + geom_point(size = 0.25, position = position_dodge(width = 0.65)) + #geom_point(x = 9818, y = 0.00075, shape = 8, size = 0.7, color = "magenta") + theme_bw(base_size = 6) + facet_wrap(STRAIN~., nrow = 4) + ylab("Mutation frequency") + xlab("Position (bp)") + scale_color_manual(values = c("B6"= "#1d457f","Conplastic" = "magenta")) + scale_shape_manual(labels = c("Young", "Old") , values = c(1,19)) + scale_x_continuous(breaks = seq(9816,9830,1)) + theme(strip.background=element_blank(), panel.grid=element_blank(), text = element_text(family = "sans"), axis.title = element_text(size = 5), strip.text.x = element_text(size = 6, vjust = 1), strip.text.y = element_text(size = 6, vjust = 1), axis.text.y=element_text(size = 4.5), axis.text.x=element_text(size = 4.5, angle = 45, vjust = 1, hjust = 1), legend.position = "none") pdf(paste(outdir_figures,"/avg_freq_age_trna_arg_plot.pdf",sep=""),width=1,height=2) print(avg_across_strains_trna_plot) dev.off() ``` Types of mutations at the tRNA high frequency region ```{r} trn_mut_type = supertable %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, REF, ALT, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% unique() %>% filter(CONDITION_MUT_FREQ_AT_POS < 1e-3) %>% filter(START > 9819, START < 9828) %>% mutate(MUTATION = paste(REF,ALT, sep = ">")) %>% select(STRAIN, TISSUE, AGE_BIN, START, MUTATION, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, START, MUTATION) %>% summarise(COND_ALLELE_COUNT = sum(ALT_ALLELE_DEPTH), COND_READ_DEPTH = sum(READ_DEPTH_AT_POS)) %>% filter(COND_ALLELE_COUNT > 0) %>% group_by(STRAIN, TISSUE, AGE_BIN) %>% mutate(TOTAL_MUTS = sum(COND_ALLELE_COUNT)) %>% select(STRAIN, TISSUE, AGE_BIN, MUTATION, COND_ALLELE_COUNT, TOTAL_MUTS) %>% ungroup() %>% group_by(STRAIN, TISSUE, AGE_BIN, MUTATION, TOTAL_MUTS) %>% #here we count how many of each allele is present across the entire region summarise(MUT_TYPE_TOTAL_COUNT = sum(COND_ALLELE_COUNT)) %>% mutate(MUT_PROP = MUT_TYPE_TOTAL_COUNT/TOTAL_MUTS) trn_mut_type$STRAIN = factor(trn_mut_type$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB")) trn_mut_type$AGE_BIN = factor(trn_mut_type$AGE_BIN, level = c("YOUNG", "OLD")) ``` ```{r} mut_type_plot = ggplot(trn_mut_type, aes(x = STRAIN, y = MUT_PROP, fill = MUTATION)) + geom_bar(position = "stack", stat = "identity") + facet_grid(AGE_BIN~TISSUE) + xlab("Strain") + ylab("Allele mutation frequency") + theme_bw(base_size = 10) + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), strip.background = element_blank()) pdf(paste(outdir_figures,"/mut_type_trna_arg_plot.pdf",sep=""),width=5,height=5) print(mut_type_plot) dev.off() ``` The high HFP position in tRNA is 9819 in B6 and CIS conplastic strains ```{r} supertable %>% filter(CONDITION_MUT_FREQ_AT_POS > 0.001) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, START, CONDITION_MUT_FREQ_AT_POS) %>% filter(START > 9807, START < 9876) %>% select(STRAIN, TISSUE, AGE_BIN, START) %>% unique() supertable %>% filter(CONDITION_MUT_FREQ_AT_POS > 0.001) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, START, CONDITION_MUT_FREQ_AT_POS) %>% filter(START == 9819) ```