--- title: "R Notebook" output: html_notebook --- ```{r} library(tidyverse) library(ggplot2) library(scales) library(ggridges) ``` ```{r} setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/") outdir_figures = "figures/" outdir_files = "files/" #this file contains REVERSION haplotype sites (ALT > B6 REF) #normalized_mut_freq_per_pos_file = "../files/normalized_mut_freq_per_pos_w_reversions.txt" normalized_mut_freq_per_pos_file = "files/corr_NUMT_freqs_normalized_mut_freq_per_pos_w_reversions.txt" normalized_mut_freq_per_pos = read.table(normalized_mut_freq_per_pos_file, header = TRUE, stringsAsFactors = FALSE) #we're using this table to get the reversion mutation type at the haplotype positions contingency_table_entries_file = "files/contingency_tables_entries.txt" contingency_table_entries = read.table(contingency_table_entries_file, header = TRUE, stringsAsFactors = FALSE) %>% filter(AGE_BIN == "YOUNG") ``` I took these out of NZB because they are the ins haplotype sites in NZB ```{r} filtered_normalized_mut_freq_pos = normalized_mut_freq_per_pos %>% filter(!(STRAIN == "NZB" & START == 9819)) %>% filter(!(STRAIN == "NZB" & START == 5203)) ``` We use this file to label the genes the reversions are in and to create our mt-genome map ```{r} coordinates_file <- "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/Mouse_Mt_Genome_Coordinates" coordinates <- read.table(coordinates_file, stringsAsFactors = FALSE, sep = "\t") ``` Formatting the coordinates file: ```{r} #adding the D-Loop coordinates to our file coordinates[nrow(coordinates) + 1, ] <- c("D-Loop", 15422, 16299) #adding column names to dataframe colnames(coordinates) <- c("GENE", "START", "END") coordinates$START <- as.numeric(coordinates$START) coordinates$END <- as.numeric(coordinates$END) #so things don't get wonky if the coordinates are read as strings -- also we convert our coordinates to be 0-indexed coordinates$START <- as.numeric(coordinates$START) - 1 coordinates$END <- as.numeric(coordinates$END) - 1 coordinates = coordinates %>% mutate(TYPE = case_when(grepl("mt-R", GENE) ~ "rRNA", grepl("mt-T", GENE) ~ "tRNA", GENE == "D-Loop" ~ "D-Loop", GENE == "mt-OL" ~ "OriL", TRUE ~ "Protein")) ``` Creating the fields that will hold the info we want to include in the figure ```{r} gene_labeller = function(position){ gene_label = coordinates[((coordinates$START <= position) & (coordinates$END >= position)),]$GENE if (length(gene_label) < 1){ gene_label = "intergene_region" } #in the case of the overlapping genes, assign the position to the first gene if (length(gene_label) > 1){ gene_label = gene_label[1] } return(gene_label) } ``` Pivot the table so that we can calculate the mut freq difference per position with age ```{r} delta_df = filtered_normalized_mut_freq_pos %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, START, SITE_TYPE, NORM_MUT_FREQ_AT_POS) %>% pivot_wider(names_from = AGE_BIN, values_from = NORM_MUT_FREQ_AT_POS) %>% mutate(DELTA = OLD - YOUNG) %>% filter(DELTA != 0) %>% mutate(CHANGE_TYPE = ifelse(DELTA > 0, "Increase", "Decrease")) %>% #we don't need B6 in this analysis since we're interested in the conplastic haplotype sites filter(STRAIN != "B6") delta_df$STRAIN = factor(delta_df$STRAIN, level = c("AKR", "ALR", "FVB", "NZB")) delta_df = delta_df %>% mutate(DUMMY_AGE_LABEL = recode(TISSUE, "Brain" = 1, "Heart" = 2, "Liver" = 3)) ``` We need to find the empirical p-value for all of the haplotype sites that change in freq with age ```{r} haplo_sites_delta = delta_df %>% filter(SITE_TYPE == "HAPLO_SITE") ``` 1) We calculate the total number of background sites that increase or decrease in freq with age ```{r} total_sites_per_change_type = delta_df %>% filter(SITE_TYPE == "NOT_HAPLO") %>% select(STRAIN, TISSUE, CHANGE_TYPE) %>% group_by(STRAIN, TISSUE, CHANGE_TYPE) %>% summarise(TOTAL_COUNT = n()) ``` ```{r} delta_background = delta_df %>% filter(SITE_TYPE != "HAPLO_SITE") ``` Now let's create a function that will take the delta and condition info from the haplotype delta df and output the p-value ```{r} p_value_calculator = function(delta, strain, tissue, change){ #this part works :) total_count = (total_sites_per_change_type %>% filter(STRAIN == strain, TISSUE == tissue, CHANGE_TYPE == change))$TOTAL_COUNT df = delta_background %>% filter(STRAIN == strain, TISSUE == tissue, CHANGE_TYPE == change) if(change == "Decrease"){ count_above_haplo_delta = (df %>% mutate(FLAG = ifelse(DELTA < delta, 1, 0)) %>% select(FLAG) %>% summarise(count_above_haplo_delta = sum(FLAG)))$count_above_haplo_delta} if(change == "Increase"){ count_above_haplo_delta = (df %>% mutate(FLAG = ifelse(DELTA > delta, 1, 0)) %>% select(FLAG) %>% summarise(count_above_haplo_delta = sum(FLAG)))$count_above_haplo_delta} #this works! tested with individual cases p_val = count_above_haplo_delta/total_count return(p_val) } ``` ```{r} haplo_sites_p_vals = haplo_sites_delta %>% select(STRAIN, TISSUE, START, DELTA, CHANGE_TYPE) %>% rowwise() %>% mutate(P_VAL = p_value_calculator(DELTA, STRAIN, TISSUE, CHANGE_TYPE)) ``` ```{r} adjusted_haplo_sites_p_vals = haplo_sites_p_vals %>% mutate(p_adj = p.adjust(P_VAL, method = "BH")) %>% mutate(SIG_STATUS = ifelse(p_adj < 0.01, "SIG", "N.S.")) ``` ```{r} setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/") write.table(adjusted_haplo_sites_p_vals, file = paste(outdir_files,"sig_rev_change_in_freq_w_age.txt", sep = ""), sep = "\t", quote = F, row.names = T) ``` ```{r} plotting_df = adjusted_haplo_sites_p_vals %>% rename(POS = START) %>% left_join(contingency_table_entries, by = c("STRAIN", "TISSUE", "POS")) %>% select(STRAIN, TISSUE, POS, DELTA, SIG_STATUS, CHANGE_TYPE, ORG_CONPLASTIC_ALLELE, REV_B6_ALLELE) %>% rowwise() %>% mutate(GENE = gene_labeller(POS)) %>% mutate(MUT_TYPE = paste(ORG_CONPLASTIC_ALLELE, REV_B6_ALLELE, sep = ">")) %>% mutate(MUT_TYPE_LABEL = recode(MUT_TYPE, "G>A" = "G>A/C>T", "G>C" = "G>C/C>G", "G>T" = "G>T/C>A", "T>C" = "T>C/A>G", "T>G" = "T>G/A>C", "T>A" = "T>A/A>T")) ``` Text for the increasing alleles ```{r} inc_label_text = plotting_df %>% filter(CHANGE_TYPE == "Increase", SIG_STATUS == "SIG") %>% select(STRAIN, TISSUE, GENE, POS, DELTA, MUT_TYPE_LABEL) %>% group_by(STRAIN, POS, GENE, MUT_TYPE_LABEL) %>% mutate(Y_POS = max(DELTA)) %>% ungroup() %>% select(STRAIN, GENE, POS, Y_POS, MUT_TYPE_LABEL) %>% group_by(STRAIN, GENE, POS, Y_POS, MUT_TYPE_LABEL) %>% summarise(TISSUE_COUNT = n()) %>% mutate(X_POS = POS) %>% mutate(LABEL = ifelse(TISSUE_COUNT > 2, paste(STRAIN, GENE, MUT_TYPE_LABEL, paste("(", TISSUE_COUNT,")"), sep = "\n"), "")) %>% mutate(Y_POS = ifelse(STRAIN == "NZB" & GENE == "mt-Co2", 1e-1, Y_POS)) ``` ```{R} #to note what other genes had a significant increasing reversion (these were tissue specific rev) inc_sec_label_text = plotting_df %>% filter(CHANGE_TYPE == "Increase", SIG_STATUS == "SIG") %>% select(STRAIN, TISSUE, GENE, POS, DELTA, MUT_TYPE_LABEL) %>% group_by(STRAIN, POS, GENE, MUT_TYPE_LABEL) %>% mutate(Y_POS = max(DELTA)) %>% ungroup() %>% select(STRAIN, GENE, POS, Y_POS, MUT_TYPE_LABEL) %>% group_by(STRAIN, GENE, POS, Y_POS, MUT_TYPE_LABEL) %>% summarise(TISSUE_COUNT = n()) %>% mutate(X_POS = POS) %>% mutate(SEC_LABEL = ifelse(TISSUE_COUNT < 2, paste(GENE), "")) %>% ungroup() %>% select(X_POS, Y_POS, SEC_LABEL) %>% distinct(SEC_LABEL, .keep_all = TRUE) %>% mutate(Y_POS = ifelse(SEC_LABEL == "D-Loop", 1.1e-4, Y_POS - 5e-5)) %>% mutate(X_POS = ifelse(SEC_LABEL == "D-Loop", 15500, X_POS )) ``` Now doing increasing pops ```{r} setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/") library(PNWColors) bay_pal <- pnw_palette(name="Bay", type="discrete") inc_rev = ggplot(plotting_df %>% filter(DELTA > 0)) + geom_point(aes(x = POS, y = log10(DELTA), color = STRAIN, shape = SIG_STATUS, size = log10(DELTA)/4)) + geom_text(data = inc_label_text, aes(x = ifelse(STRAIN == "AKR", X_POS + 500, X_POS - 800), y = log10(Y_POS), label = LABEL), size = 2.5) + geom_text(data = inc_sec_label_text, aes(x = X_POS, y = log10(Y_POS), label = SEC_LABEL), size = 2.5) + scale_shape_manual(name = "Significance", labels = c("N.S.", "Sig"), values = c(1, 19)) + scale_color_manual(values = c("AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) + #scale_y_continuous(limits = c(-6,0), labels=c("-6" = "1e-6", "-5" = "1e-5", "-4" = "1e-4", "-3" = "1e-3", "2" = "1e-2", "-1" = "1e-1", "0" = "0")) + scale_y_continuous(limits = c(-7,0), breaks = seq(-6, 0 , by = 2), labels=c("-6" = "1e-6", "-4" = "1e-4", "2" = "1e-2", "0" = "")) + #scale_y_log10() + coord_cartesian(expand = FALSE, xlim = c(0, 16299)) + ylab("Delta Mutation Frequency \n (Old - Young) [log10]") + xlab("") + theme_bw() + theme(legend.position = "none", strip.background=element_blank(), text = element_text(family = "sans"), axis.ticks.x = element_blank()) pdf(paste(outdir_figures,"increasing_reversions.pdf",sep=""),width=9,height=2.5) print(inc_rev) dev.off() setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/") pdf(paste(outdir_figures,"leg_reversions.pdf",sep=""),width=9,height=6) print(inc_rev + guides(color = guide_legend(override.aes = list(size = 3)), shape = guide_legend(override.aes = list(size = 3))) + theme(legend.position = "right")) dev.off() ``` ```{r} dec_rev = ggplot(plotting_df %>% filter(DELTA < 0)) + geom_point(aes(x = POS, y = log10(abs(DELTA)), color = STRAIN, shape = SIG_STATUS, size = log10(abs(DELTA))/4)) + #no significant decreases :/ scale_shape_manual(name = "Significance", labels = c("N.S.", "Sig"), values = c(1, 19)) + scale_color_manual(values = c("AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) + scale_y_reverse(limits = c(0, -7), breaks = c(-6, -4, -2, 0), labels = c("-6" = "1e-6", "-4" = "1e-4", "-2" = "1e-2", "0" = "") ) + #labels=c("-7" = "1e-7", "-5" = "1e-5", "-4.5" = "2.75e-5", "-4" = "1e-4", "-3.5" = "2.75e-4", "0" = "0") coord_cartesian(expand = FALSE, xlim = c(0, 16299)) + scale_x_continuous(position = "top") + ylab("Delta Mutation Frequency \n (Old - Young) [log10]") + xlab("") + theme_bw() + theme(legend.position = "none", strip.background=element_blank(), text = element_text(family = "sans"), axis.ticks.x = element_blank()) setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/") pdf(paste(outdir_figures, "decreasing_reversions.pdf",sep=""),width=9,height=2.5) print(dec_rev) dev.off() ``` Map of mt-genome for the aesthetic ```{r} num_segs = length(coordinates$GENE) #important part here is to include the very last coordinate on the mt-genome map otherwise we miss our "last x coord" at the end of our map x = c(coordinates$START, 16299) xs = c(x[1:num_segs], x[1:num_segs], x[-1], x[-1], x[1:num_segs]) ys = c(rep(0,num_segs), rep(1,num_segs), rep(1,num_segs), rep(0,num_segs), rep(0,num_segs)) idx = rep(seq(1,num_segs),5) type = rep(coordinates$TYPE, 5) mt_genome_map = data.frame(x_pos = xs, y = ys, idx = idx, type = type) %>% arrange(idx) #alternating the thickness of the protein regions to match the circular mt-genome map mt_genome_map = mt_genome_map %>% mutate(y = ifelse(type == "Protein", ifelse(y == 0, -0.25, 1.25), y)) ``` Plotting the linear map of the mt-genome ```{r} setwd("~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/reversions/") library(RColorBrewer) lake_pal <- pnw_palette(name="Lake", type="discrete") spectral = brewer.pal(11,"Spectral") PRGn = brewer.pal(11,"PRGn") set1 = brewer.pal(3,"Set1") PuRd = brewer.pal(11,"PuRd") Blues = brewer.pal(9, "Blues") colors = c(Blues[9], set1[1], lake_pal[6], PuRd[6],PRGn[4]) #colors = c(blues[10],blues[3],spectral[1],spectral[8],spectral[11]) #lake_pal[c(3,7,1,6,5)] #colors = c(lake_pal[c(3,7)],set1[1],lake_pal[6],PRGn[4]) linear_map = ggplot(mt_genome_map) linear_map = linear_map + geom_polygon(aes(x=x_pos, y=y, group=idx, fill=type)) + theme_bw(base_size=16) + theme(legend.position="None", axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid=element_blank()) + scale_y_continuous("") + scale_x_continuous("Position", breaks=seq(0,16000,4000)) + scale_fill_manual(name = "Genomic Region", values = colors) + theme(strip.background = element_blank(), strip.text=element_blank(), text = element_text(family = "sans"), panel.border = element_blank(), axis.title.x = element_text(size = 14), legend.position = "none") pdf(paste(outdir_figures,"linear_mtdna_map.pdf",sep=""),width=9,height=1) print(linear_map) dev.off() ```