---
title: "R Notebook"
output: html_notebook
---
Change in mutation frequency with age per mutation type -- HFPs are not included in this analysis
```{r}
library(tidyverse)
library(ggplot2)
library(broom)
library(ggsignif)
library(PNWColors)
```
```{r}
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/")
outdir_figures = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/change_in_mut_freq/figures/"
outdir_files = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/change_in_mut_freq/files"
mut_freq_per_type_file = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/mut_freq_per_type.csv"
mut_freq_per_type = read.table(mut_freq_per_type_file, header=TRUE, stringsAsFactors = FALSE)
supertable_file ="/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt"
supertable = read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE)
```
Denominator for our average mutation frequencies -- also helps normalize for sequencing depth
We:
1) Filter out the mid point in B6
2) Filter the df so that it's all A,C,G,T ref bases and the total duplex depth, which is used as the denominator for indels
3) Recode mutation types so that we group complimentary mutation types and then sum the duplex depth (i.e. A+T and C+G ref base depth are summed). The totals across samples are summed together.
4) Relabel Total to Indel mutation for merging purposes
```{r}
#MUTATION_TYPE: The exact mutation type (i.e. G>A, C>T, etc)
#MUTATION_CLASS: SNV, INS, or DEL
#DENOMINATOR: Duplex depth for that ref base
#we don't unique because each sample has their own duplex depth to contribute
ref_base_seq_depth = mut_freq_per_type %>%
#filter out samples not included in the analysis
filter(AGE_BIN != "MID") %>%
filter(!grepl("B6_Y3_Heart", SAMPLE)) %>%
filter(!grepl("B6_Y._Liver", SAMPLE)) %>%
filter(MUTATION_CLASS == "SNV") %>%
select(STRAIN, TISSUE, AGE_BIN, MUTATION_TYPE, MUTATION_CLASS, DENOMINATOR)
#consolidating complimentary mutation types
#sum the depth across complimentary ref bases
ref_base_seq_depth = ref_base_seq_depth %>%
mutate(MUTATION = recode(ref_base_seq_depth$MUTATION_TYPE, "A>T" = "T>A",
"A>C" = "T>G",
"A>G" = "T>C",
"C>A" = "G>T",
"C>G" = "G>C",
"C>T" = "G>A")) %>%
select(STRAIN, TISSUE, AGE_BIN, MUTATION, DENOMINATOR) %>%
group_by(STRAIN, TISSUE, AGE_BIN, MUTATION) %>%
summarise(REF_BASE_DEPTH = sum(DENOMINATOR)) %>%
mutate(MUTATION = ifelse(MUTATION == "Total", "INDEL", MUTATION)) %>%
mutate(STRAIN = ifelse(STRAIN == "F", "FVB", STRAIN))
```
Now, we want to calculate the mutation type count per condition. For this analysis, we remove the high frequency positions:
So we,
1) Created the mutation type based on the ref and alt alleles
2) Recoded the SNV mutation types to consolidate complimentary mutation types
3) Relabeled mutations as INS and DEL in the mutation categories for grouping and merging purposes
4) Group by strain, tissue, age bin, and mutation type to count how many of each mutation type there is in a condition
5) The last mutate is 100% for merging purposes so that we can get the indel denominator for both INS and DEL
```{r}
#MUTATION_TYPE: The exact mutation type (i.e. G>A, C>T, etc)
#VARIANT_TYPE: SNV or INDEL
mutation_type_counts = supertable %>%
#we unique to remove any redundancy due to positions being in multiple gene regions
select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, REF, ALT, VARIANT_TYPE, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS, HFP_THRESHOLD) %>%
unique() %>%
#filter out the HFP positions from our data
filter(CONDITION_MUT_FREQ_AT_POS < HFP_THRESHOLD) %>%
filter(VARIANT_TYPE != "no_mutation") %>%
select(STRAIN, TISSUE, AGE_BIN, REF, ALT, VARIANT_TYPE, ALT_ALLELE_DEPTH) %>%
mutate(MUTATION_TYPE = paste(REF, ALT, sep = ">")) %>%
select(STRAIN, TISSUE, AGE_BIN, VARIANT_TYPE, MUTATION_TYPE, ALT_ALLELE_DEPTH)
#consolidating complimentary mutation types
#we also recode the MUTATION_TYPE to be either INS or DEL rather than the specific mutation
#we then create a new column MUTATION (same name as in our ref_base_depth df) that has the exact point mutations and indel for merging information
mutation_type_counts = mutation_type_counts %>%
mutate(MUTATION_TYPE = recode(mutation_type_counts$MUTATION_TYPE, "A>T" = "T>A",
"A>C" = "T>G",
"A>G" = "T>C",
"C>A" = "G>T",
"C>G" = "G>C",
"C>T" = "G>A")) %>%
select(STRAIN, TISSUE, AGE_BIN, VARIANT_TYPE, MUTATION_TYPE, ALT_ALLELE_DEPTH) %>%
mutate(MUTATION_TYPE = ifelse(VARIANT_TYPE == "DEL", "DEL", MUTATION_TYPE)) %>%
mutate(MUTATION_TYPE = ifelse(VARIANT_TYPE == "INS", "INS", MUTATION_TYPE)) %>%
select(STRAIN, TISSUE, AGE_BIN, MUTATION_TYPE, ALT_ALLELE_DEPTH) %>%
group_by(STRAIN, TISSUE, AGE_BIN, MUTATION_TYPE) %>%
#calculating the alt allele count across samples and the redundant mutation types
summarise(MUT_COUNT = sum(ALT_ALLELE_DEPTH)) %>%
mutate(MUTATION = ifelse(MUTATION_TYPE == "INS"| MUTATION_TYPE == "DEL", "INDEL", MUTATION_TYPE))
```
Merge our ref base and mutation counts tables to calculate frequencies and proportions
```{r}
summary_df = mutation_type_counts %>%
left_join(ref_base_seq_depth, by = c("STRAIN", "TISSUE", "AGE_BIN", "MUTATION")) %>%
select(STRAIN, TISSUE, AGE_BIN, MUTATION_TYPE, MUT_COUNT, REF_BASE_DEPTH) %>%
ungroup() %>%
mutate(FREQ = MUT_COUNT/REF_BASE_DEPTH) %>%
group_by(STRAIN, TISSUE, AGE_BIN) %>%
mutate(TOTAL_MUT_COUNT_IN_COND = sum(MUT_COUNT)) %>%
mutate(PROPORTION = MUT_COUNT/TOTAL_MUT_COUNT_IN_COND)
summary_df$STRAIN = factor(summary_df$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB"))
summary_df$AGE_BIN = factor(summary_df$AGE_BIN, level = c("YOUNG", "OLD"))
#rearranged from highest to lowest mutation proportion
summary_df$MUTATION_TYPE = factor(summary_df$MUTATION_TYPE, level = c("G>A", "G>T", "T>C", "G>C", "DEL", "T>A", "INS", "T>G"))
```
Write out the mutation type freq calculations
```{r}
outdir_files = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/change_in_mut_freq/files"
#summary df of mutation type frequencies without the HFPs in our data
write.table(summary_df, file = paste(outdir_files,"/mut_freq_per_type_wo_HFPs.txt", sep = ""), sep = "\t", quote = F, row.names = T)
```
Back to the drawing board with a good ole fisher's exact test -- to test if there's an association with mutation type counts and age
```{r}
fe_test_df = summary_df %>%
filter(!(STRAIN == "B6" & TISSUE == "Liver")) %>%
ungroup() %>%
select(AGE_BIN, MUTATION_TYPE, MUT_COUNT) %>%
group_by(AGE_BIN, MUTATION_TYPE) %>%
summarise(TYPE_MUT_COUNT = sum(MUT_COUNT)) %>%
ungroup() %>%
group_by(AGE_BIN) %>%
mutate(TOTAL_MUTS_IN_AGE = sum(TYPE_MUT_COUNT)) %>%
#super cool!!
pivot_wider(names_from = AGE_BIN, values_from = c(TYPE_MUT_COUNT, TOTAL_MUTS_IN_AGE)) %>%
mutate(OTHER_MUTS_YOUNG = TOTAL_MUTS_IN_AGE_YOUNG - TYPE_MUT_COUNT_YOUNG,
OTHER_MUTS_OLD = TOTAL_MUTS_IN_AGE_OLD - TYPE_MUT_COUNT_OLD) %>%
select(MUTATION_TYPE, TYPE_MUT_COUNT_OLD, OTHER_MUTS_OLD, TYPE_MUT_COUNT_YOUNG, OTHER_MUTS_YOUNG) %>%
mutate(MUT_TYPE_CHAR = as.character(MUTATION_TYPE)) %>%
select(MUT_TYPE_CHAR, TYPE_MUT_COUNT_OLD, OTHER_MUTS_OLD, TYPE_MUT_COUNT_YOUNG, OTHER_MUTS_YOUNG)
```
Implementing fisher's exact test
```{r}
#this loop will run through our contingency table -- where each row has information for the contingency table
#1) builds the tble
#2) runs the fisher's exact test on the tbl
#3) adds result to df
# recall: matrices need to have the same type of info and we have mixed info we're saving here
#also extracting the pval does weird things to the column name so we don't use rbind given that it assumes the same column name
iterations = nrow(fe_test_df)
#initializing our df
output = data.frame(MUTATION_TYPE = character(),
PVAL = numeric(),
OR = numeric())
for(i in 1:iterations){
#creating contingency table
row_of_info = fe_test_df[i,]
#capturing info that we want to keep in our df
mut_type_label = row_of_info[1]
mut_type_old = row_of_info[2]
other_type_old = row_of_info[3]
mut_type_young = row_of_info[4]
other_type_young = row_of_info[5]
#constructed our 2x2 df
tbl = matrix(as.numeric(row_of_info[2:5]), ncol=2, byrow=F)
#running fisher's exact on the df
fishers_results = fisher.test(tbl, alternative = "two.sided")
pval = fishers_results$p.value[1]
OR = fishers_results$estimate
#appending to our df
results = c(mut_type_label, pval, OR)
output[nrow(output) + 1,] = results
}
```
Multiple hypothesis test correcting
```{r}
output_bh_corr = output %>%
mutate(ADJ_PVAL=p.adjust(PVAL, method="BH"))
```
Plotting the ~~classic~~ visualization of the mutational spectra proportions:
Figure setting(s)
```{r}
bay = pnw_palette("Bay",8,type="continuous")
moth = pnw_palette("Moth",12,type="continuous")
star = pnw_palette("Starfish", 7, type = "continuous")
x_axis_labs <- c("B", "H", "L")
refactored_spec_colors = c(star[2], bay[4], moth[6], star[4], star[1], star[5], moth[9], moth[4])
```
```{r}
strain_comparison = ggplot(summary_df %>% mutate(MUT_LABEL = recode(MUTATION_TYPE, "G>A" = "G>A/C>T",
"G>C" = "G>C/C>G",
"G>T" = "G>T/C>A",
"T>A" = "T>A/A>T",
"T>C" = "T>C/A>G",
"T>G" = "T>G/A>C"
)), aes(x = STRAIN, y = PROPORTION, fill = MUT_LABEL, alpha = AGE_BIN))
strain_comparison = strain_comparison +
geom_bar(stat="identity", color="black", position=position_dodge()) +
facet_grid(TISSUE ~ MUT_LABEL, scales = "free_y") +
ylab("Propotion of Mutation Type") +
xlab("Tissue") +
theme_bw() +
scale_fill_manual(name = "Mutation Type" , values = refactored_spec_colors) +
scale_alpha_manual(name = "Age", labels = c("Young", "Old"), values = c(0.55, 1)) +
theme(strip.background=element_blank(),
strip.text = element_text(),
text = element_text(family = "sans"),
axis.text.x = element_text(angle = 90, size = 11),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
strip.text.y = element_text(size = 12),
strip.text.x = element_text(size = 10),
axis.text.y=element_text(size = 10),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position="none")
pdf(paste(outdir_figures,"/classical_spectrum_strain_comparison.pdf",sep=""), width=8,height=5)
print(strain_comparison)
dev.off()
```
Plotting the change in mutation frequency
```{r}
change_mut_freq = ggplot(summary_df %>%
mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Y", "OLD" = "O")) %>%
mutate(MUT_LABEL = recode(MUTATION_TYPE, "G>A" = "G>A/C>T",
"G>C" = "G>C/C>G",
"G>T" = "G>T/C>A",
"T>A" = "T>A/A>T",
"T>C" = "T>C/A>G",
"T>G" = "T>G/A>C")), aes(x = AGE_LABEL, y = FREQ, color = STRAIN, group = STRAIN))
change_mut_freq = change_mut_freq +
geom_line() +
geom_point() +
facet_grid(MUT_LABEL ~ TISSUE, scales = "free_y") +
scale_color_manual(values = c("B6" = "#1d457f" , "AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d"), guide = "none") +
xlab("Age") +
ylab("Average Mutation Frequency") +
theme_bw(base_size = 12) +
theme(strip.background=element_blank(),
text = element_text(family = "sans"),
axis.title.x = element_text(size = 6.5),
axis.title.y = element_text( size = 5.5),
strip.text.x = element_text(size = 6.25),
strip.text.y = element_text(size = 6),
axis.text.y = element_text(size = 5.5),
axis.text.x = element_text(size = 5.5),
legend.position = "none")
pdf(paste(outdir_figures,"/change_in_freq_mut_types.pdf",sep=""), width=2.25,height=4.25)
print(change_mut_freq)
dev.off()
```
```{r}
summary_df$TISSUE = factor(summary_df$TISSUE, level= c("Heart", "Brain", "Liver"))
flipped_change_mut_freq = ggplot(summary_df %>%
mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Y", "OLD" = "O")) %>%
mutate(MUT_LABEL = recode(MUTATION_TYPE, "G>A" = "G>A/C>T",
"G>C" = "G>C/C>G",
"G>T" = "G>T/C>A",
"T>A" = "T>A/A>T",
"T>C" = "T>C/A>G",
"T>G" = "T>G/A>C")), aes(y = AGE_LABEL, x = FREQ, color = STRAIN, group = STRAIN))
flipped_change_mut_freq = flipped_change_mut_freq +
geom_line() +
geom_point() +
facet_grid(TISSUE ~ MUT_LABEL, scales = "free_x") +
scale_color_manual(values = c("B6" = "#1d457f" , "AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d"), guide = "none") +
ylab("Age") +
xlab("Average Mutation Frequency") +
theme_bw() +
theme_bw(base_size = 12) +
theme(strip.background=element_blank(),
text = element_text(family = "sans"),
axis.title.x = element_text(size = 6),
axis.title.y = element_text( size = 5.5),
strip.text.x = element_text(size = 6),
strip.text.y = element_text(size = 6),
axis.text.y = element_text(size = 5),
axis.text.x = element_text(size = 5, angle = 45, vjust = 1, hjust = 1),
legend.position = "none")
pdf(paste(outdir_figures,"/flipped_change_in_freq_mut_types.pdf",sep=""), width=6,height=2.25)
print(flipped_change_mut_freq)
dev.off()
```
```{r}
min_max_df = summary_df %>%
select(STRAIN, TISSUE, AGE_BIN, MUTATION_TYPE, FREQ) %>%
ungroup() %>%
group_by(AGE_BIN, MUTATION_TYPE) %>%
mutate(MIN_FREQ = min(FREQ), MAX_FREQ = max(FREQ), MEDIAN_FREQ = median(FREQ)) %>%
select(AGE_BIN, MUTATION_TYPE, MIN_FREQ, MAX_FREQ, MEDIAN_FREQ) %>%
mutate(DUMMY_AGE = ifelse(AGE_BIN == "OLD", 0 , 1))
min_max_df$MUTATION_TYPE = factor(min_max_df$MUTATION_TYPE, level = c("G>A", "G>T", "T>C", "G>C", "DEL", "T>A","INS", "T>G"))
min_max_df$AGE_BIN = factor(min_max_df$AGE_BIN, level = c("OLD", "YOUNG"))
```
```{r}
range_freq_mut_types = ggplot(min_max_df) +
geom_segment(aes(x = MIN_FREQ, xend = MAX_FREQ, y = DUMMY_AGE, yend = DUMMY_AGE)) +
geom_point(aes(x = MEDIAN_FREQ, y = DUMMY_AGE), shape = 19, size = 1, color = "black") +
facet_wrap(. ~ MUTATION_TYPE, ncol = 1, strip.position = "right") +
xlab("Average \nMutation Frequency") +
ylab("Age") +
#scale_shape_manual(name = "Age", values = c(19, 1), label = c("Old", "Young"), guide = "none") +
theme_bw(base_size = 14) +
scale_y_continuous(limits = c(-1, 2), labels = c("", "O", "Y",""), position = "right") +
coord_cartesian(expand = FALSE,
xlim = c(0, 1e-5)) +
theme(strip.background=element_blank(),
strip.text.y = element_blank(),
text = element_text(family = "sans"),
axis.title.x = element_text(size = 8),
axis.text.x = element_text(size = 6, angle = 90, hjust = 1, vjust = 1),
axis.title.y = element_blank(),
axis.text.y = element_text(size = 7),
axis.ticks.y = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line.x = element_line(colour = "black"),
legend.position = "none"
)
pdf(paste(outdir_figures,"/range_freq_mut_types.pdf",sep=""), width=1.5, height=4.75)
print(range_freq_mut_types)
dev.off()
pdf(paste(outdir_figures,"/leg_range_freq_mut_types.pdf",sep=""), width=2.25, height=5.5)
print(range_freq_mut_types + scale_shape_manual(name = "Age", values = c(1, 19), label = c("Young", "Old")) + guides(shape = guide_legend(override.aes = list(size = 2.5))) + theme(legend.position = "bottom"))
dev.off()
```
Calculating the delta in mutation frequency for each type
```{r}
delta_fig_df = summary_df %>%
filter(!(STRAIN == "B6" & TISSUE == "Liver")) %>%
ungroup()%>%
select(STRAIN, TISSUE, AGE_BIN, MUTATION_TYPE, FREQ) %>%
pivot_wider(values_from = FREQ, names_from = AGE_BIN) %>%
mutate(BETA = OLD - YOUNG)
delta_fig_df$TISSUE = factor(delta_fig_df$TISSUE, level = c("Heart", "Brain", "Liver"))
```
```{r}
delta_fig = ggplot(delta_fig_df, aes(x = BETA, y = STRAIN, color = STRAIN, shape = TISSUE))
delta_fig = delta_fig +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
geom_point() +
facet_grid(TISSUE ~ MUTATION_TYPE, scales = "free_x") +
xlab("Beta Mutation Frequency\n (Old - Young)") +
ylab("Strain") +
scale_color_manual(values = c("B6" = "#1d457f" , "AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d"), guide = "none") +
scale_shape_manual(values = c(17, 19, 15)) +
theme_bw(base_size = 16) +
theme(strip.background=element_blank(),
strip.text = element_text(),
text = element_text(family = "sans"),
axis.text.x = element_text(angle = 90, size = 8),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
strip.text.y = element_text(size = 10),
strip.text.x = element_text(size = 9),
axis.text.y=element_text(size = 8),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position="none")
pdf(paste(outdir_figures,"/beta_mut_types.pdf",sep=""), width=6,height=3)
print(delta_fig)
dev.off()
```