--- title: "R Notebook" output: html_notebook --- ```{r} library(tidyverse) library(ggplot2) library(PNWColors) library(gridExtra) library(scales) ``` ```{r} setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/") outdir_figures = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/figures/" outdir_files = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutation_frequency_per_region/files/" supertable_file ="/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt" supertable = read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE) coordinates_file <- "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/Mouse_Mt_Genome_Coordinates" coordinates <- read.table(coordinates_file, stringsAsFactors = FALSE, sep = "\t") ``` Formatting the coordinates file: ```{r} #adding the D-Loop coordinates to our file coordinates[nrow(coordinates) + 1, ] <- c("D-Loop", 15422, 16299) #adding column names to dataframe colnames(coordinates) <- c("GENE", "START", "END") coordinates$START <- as.numeric(coordinates$START) coordinates$END <- as.numeric(coordinates$END) #so things don't get wonky if the coordinates are read as strings -- also we convert our coordinates to be 0-indexed coordinates$START <- as.numeric(coordinates$START) - 1 coordinates$END <- as.numeric(coordinates$END) - 1 coordinates = coordinates %>% mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA", grepl("mt-T", GENE) ~ "tRNA", GENE == "D-Loop" ~ "D-Loop", GENE == "mt-OL" ~ "OriL", TRUE ~ "Protein")) ``` Normalize for sequencing depth: ```{r} norm_seq_depth = supertable %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, REF, ALT, START, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% #unique here to get rid of redundant mutations present at pos that overlap in genes unique() %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% group_by(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% summarise(SAMPLE_MUT_COUNT_AT_POS = sum(ALT_ALLELE_DEPTH)) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, START, SAMPLE_MUT_COUNT_AT_POS, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, START, CONDITION_MUT_FREQ_AT_POS) %>% summarise(CONDITION_MUT_COUNT_AT_POS = sum(SAMPLE_MUT_COUNT_AT_POS), CONDITION_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>% ungroup() %>% group_by(START) %>% mutate(MIN_READ_DEPTH_AT_POS = min(CONDITION_READ_DEPTH_AT_POS)) %>% mutate(FLOOR_MIN_MUT_FREQ_AT_POS = 1/MIN_READ_DEPTH_AT_POS) %>% mutate(CONDITION_MUT_COUNT_AT_POS = ifelse(CONDITION_MUT_FREQ_AT_POS < FLOOR_MIN_MUT_FREQ_AT_POS, 0, CONDITION_MUT_COUNT_AT_POS)) ``` Calculating the average mutation frequency in 150 bp windows without the high frequency positions ```{r} mut_freq_sliding_window_avg = norm_seq_depth %>% #this was the arbitrary frequency we define as being a high frequency position filter(CONDITION_MUT_FREQ_AT_POS < 0.001) %>% mutate(NORM_CONDITION_MUT_FREQ_AT_POS = CONDITION_MUT_COUNT_AT_POS/CONDITION_READ_DEPTH_AT_POS) %>% ungroup() %>% group_by(STRAIN, TISSUE, AGE_BIN) %>% mutate(WINDOW_MUT_AVG = RcppRoll::roll_mean(NORM_CONDITION_MUT_FREQ_AT_POS,150,fill=NA)) %>% select(STRAIN, TISSUE, AGE_BIN, START, NORM_CONDITION_MUT_FREQ_AT_POS, WINDOW_MUT_AVG) ``` We identify the high frequency peaks that are shared across conplastic strains First, we create our gene labeller function ```{r} gene_labeller = function(position){ gene_label = coordinates[((coordinates$START <= position) & (coordinates$END >= position)),]$GENE if (length(gene_label) < 1){ gene_label = "intergene_region" } #in the case of the overlapping genes, assign the position to the first gene if (length(gene_label) > 1){ gene_label = gene_label[1] } return(gene_label) } ``` Finding mutation peaks > 1e-5 and locating peaks that are in > 3 strains ```{r} peak_ids = mut_freq_sliding_window_avg %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, START, WINDOW_MUT_AVG) %>% #we want peaks that are not the D-Loop filter(START < 15421) %>% #we want windows that are above this arbitrary threshold -- we define these areas as peaks filter(WINDOW_MUT_AVG > 1e-5) %>% rowwise() %>% mutate(GENE = gene_labeller(START)) %>% ungroup() %>% select(STRAIN, GENE) %>% unique() ``` Plotting time Color palette ```{r} library(PNWColors) bay_pal <- pnw_palette(name="Bay", type="discrete") ``` ```{r} grid_text = norm_seq_depth %>% ungroup() %>% select(STRAIN, AGE_BIN) %>% unique() %>% mutate(x = ifelse(STRAIN == "B6", 25, 300), y = 4e-5) ``` ```{r} peak_labels = bind_cols(data.frame(STRAIN = rep(c("AKR", "ALR", "B6", "FVB", "NZB"), 3)), data.frame(GENE = rep(c("mt-ND2", "OriL", "mt-Tr"), 5)), data.frame(X_POS = rep(c(4000, 5180, 9810), 5)) ) peak_labels = peak_labels %>% mutate(Y_POS = ifelse(STRAIN == "NZB", 3.95e-5, 3e-5)) %>% mutate(Y_POS = ifelse(GENE == "mt-ND2", 2e-5, Y_POS)) ``` Plotting our sliding windows with our peak labels ```{r} mut_freq_sliding_window_avg$STRAIN = factor(mut_freq_sliding_window_avg$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB")) mut_freq_sliding_window_avg$AGE_BIN = factor(mut_freq_sliding_window_avg$AGE_BIN, level = c("YOUNG", "OLD")) grid_text$STRAIN = factor(grid_text$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB")) peak_labels$STRAIN = factor(peak_labels$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB")) mutation_freq_windows_plot = ggplot(mut_freq_sliding_window_avg %>% mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Young", "OLD" = "Old"))) mutation_freq_windows_plot = mutation_freq_windows_plot + geom_line(aes(x=START, y=WINDOW_MUT_AVG, color=TISSUE), size= 0.3, alpha = 0.8) + geom_text(data = grid_text, aes(x = x, y = y, label = STRAIN), size = 3.25) + geom_text(data = peak_labels, aes(x = X_POS, y = Y_POS, label = GENE), fontface = "italic", size=2.6) + scale_color_manual(name = "Tissue", values= bay_pal[c(1,5,4)], guide = "none") + theme_bw(base_size= 16) + ylab("Average Mutation Frequency (150 bp)") + xlab("Position on the mt-genome (bp)") + facet_grid(STRAIN~AGE_LABEL) + scale_x_continuous("") + ylim(0, 4.5e-5) + theme(strip.background=element_blank(), strip.text=element_blank(), panel.grid=element_blank(), text = element_text(family = "sans"), axis.title = element_text(size = 9.25), axis.ticks.x = element_blank(), axis.text.x = element_blank(), strip.text.x = element_text(size = 10, vjust = 1), axis.text.y=element_text(size = 8.25)) print(mutation_freq_windows_plot) pdf(paste(outdir_figures,"/leg_mut_freq_sliding_windows.pdf",sep=""),width=9,height=8) print(mutation_freq_windows_plot + theme(legend.position="bottom") + guides(color = guide_legend(override.aes = list(size = 3)))) dev.off() ``` Creating the mt-genome map that we'll place at the bottom of the figure ```{r} num_segs = length(coordinates$GENE) #important part here is to include the very last coordinate on the mt-genome map otherwise we miss our "last x coord" at the end of our map x = c(coordinates$START, 16299) xs = c(x[1:num_segs], x[1:num_segs], x[-1], x[-1], x[1:num_segs]) ys = c(rep(0,num_segs), rep(1,num_segs), rep(1,num_segs), rep(0,num_segs), rep(0,num_segs)) idx = rep(seq(1,num_segs),5) type = rep(coordinates$TYPE, 5) mt_genome_map = data.frame(x_pos = xs, y = ys, idx = idx, type = type) %>% arrange(idx) mt_genome_map = rbind(mt_genome_map %>% mutate(AGE_BIN="Young"),mt_genome_map %>% mutate(AGE_BIN="Old")) #alternating the thickness of the protein regions to match the circular mt-genome map mt_genome_map = mt_genome_map %>% mutate(y = ifelse(type == "Protein", ifelse(y == 0, -0.25, 1.25), y)) ``` Plotting the linear map of the mt-genome ```{r} library(RColorBrewer) lake_pal <- pnw_palette(name="Lake", type="discrete") spectral = brewer.pal(11,"Spectral") PRGn = brewer.pal(11,"PRGn") set1 = brewer.pal(3,"Set1") PuRd = brewer.pal(11,"PuRd") Blues = brewer.pal(9, "Blues") colors = c(Blues[9], set1[1], lake_pal[6], PuRd[6],PRGn[4]) #colors = c(blues[10],blues[3],spectral[1],spectral[8],spectral[11]) #lake_pal[c(3,7,1,6,5)] #colors = c(lake_pal[c(3,7)],set1[1],lake_pal[6],PRGn[4]) linear_map = ggplot(mt_genome_map) linear_map = linear_map + geom_polygon(aes(x=x_pos, y=y, group=idx, fill=type)) + theme_bw(base_size=16) + theme(legend.position="None", axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid=element_blank()) + scale_y_continuous("") + scale_x_continuous("Position") + scale_fill_manual(name = "Genomic Region", values = colors) + facet_grid(.~AGE_BIN) + theme(strip.background = element_blank(), strip.text=element_blank(), text = element_text(family = "sans"), panel.border = element_blank(), axis.title.x = element_text(size = 10), axis.text.x = element_text(size = 9.5), legend.position = "none") print(linear_map) ``` ```{r} #margin is top, r, b, l gA <- ggplotGrob(mutation_freq_windows_plot+theme(plot.margin = unit(c(1,1,0,1), "cm"))) gB <- ggplotGrob(linear_map+theme(plot.margin = unit(c(-.5,1,.5,1), "cm"))) maxWidth = grid::unit.pmax(gA$widths[2:5], gB$widths[2:5]) gA$widths[2:5] <- as.list(maxWidth) gB$widths[2:5] <- as.list(maxWidth) pdf(paste(outdir_figures,"/mut_freq_sliding_windows.pdf",sep=""),width=8,height=5) grid.arrange(gA,gB,heights=c(6,0.85)) dev.off() ```