---
title: "R Notebook"
output: html_notebook
---
```{r}
library(sigfit)
```
```{r}
setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/")
outdir_figures = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutational_spectra/figures/"
outdir_files = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/mutational_spectra/files/"
supertable_file ="/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt"
supertable = read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE)
```
Now, we want to calculate the mutation type count per condition. For this analysis, we remove the high frequency positions:
So we,
1) Created the mutation type based on the ref and alt alleles
2) Recoded the SNV mutation types to consolidate complimentary mutation types
3) Relabeled mutations as INS and DEL in the mutation categories for grouping and merging purposes
4) Group by strain, tissue, age bin, and mutation type to count how many of each mutation type there is in a condition
5) The last mutate is 100% for merging purposes so that we can get the indel denominator for both INS and DEL
```{r}
#MUTATION_TYPE: The exact mutation type (i.e. G>A, C>T, etc)
#VARIANT_TYPE: SNV or INDEL
mutation_type_counts = supertable %>%
#we unique to remove any redundancy due to positions being in multiple gene regions
select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, REF, ALT, VARIANT_TYPE, ALT_ALLELE_DEPTH, READ_DEPTH_AT_POS, CONDITION_MUT_FREQ_AT_POS, HFP_THRESHOLD) %>%
unique() %>%
#filter out the HFP positions from our data
filter(CONDITION_MUT_FREQ_AT_POS < HFP_THRESHOLD) %>%
filter(VARIANT_TYPE != "no_mutation") %>%
select(STRAIN, TISSUE, AGE_BIN, REF, ALT, VARIANT_TYPE, ALT_ALLELE_DEPTH) %>%
mutate(MUTATION_TYPE = paste(REF, ALT, sep = ">")) %>%
select(STRAIN, TISSUE, AGE_BIN, VARIANT_TYPE, MUTATION_TYPE, ALT_ALLELE_DEPTH)
#consolidating complimentary mutation types
#we also recode the MUTATION_TYPE to be either INS or DEL rather than the specific mutation
#we then create a new column MUTATION (same name as in our ref_base_depth df) that has the exact point mutations and indel for merging information
mutation_type_counts = mutation_type_counts %>%
mutate(MUTATION_TYPE = recode(mutation_type_counts$MUTATION_TYPE, "A>T" = "T>A",
"A>C" = "T>G",
"A>G" = "T>C",
"C>A" = "G>T",
"C>G" = "G>C",
"C>T" = "G>A")) %>%
select(STRAIN, TISSUE, AGE_BIN, VARIANT_TYPE, MUTATION_TYPE, ALT_ALLELE_DEPTH) %>%
mutate(MUTATION_TYPE = ifelse(VARIANT_TYPE == "DEL", "DEL", MUTATION_TYPE)) %>%
mutate(MUTATION_TYPE = ifelse(VARIANT_TYPE == "INS", "INS", MUTATION_TYPE)) %>%
select(STRAIN, TISSUE, AGE_BIN, MUTATION_TYPE, ALT_ALLELE_DEPTH) %>%
group_by(STRAIN, TISSUE, AGE_BIN, MUTATION_TYPE) %>%
#calculating the alt allele count across samples and the redundant mutation types
summarise(MUT_COUNT = sum(ALT_ALLELE_DEPTH)) %>%
mutate(MUTATION = ifelse(MUTATION_TYPE == "INS"| MUTATION_TYPE == "DEL", "INDEL", MUTATION_TYPE))
```
Now we want to combine the labels for strain, tissue, and age bin to create a condition label
```{r}
counts_per_cond = mutation_type_counts %>%
unite(COND, c("STRAIN", "TISSUE", "AGE_BIN")) %>%
select(COND, MUTATION_TYPE, MUT_COUNT) %>%
pivot_wider(names_from = MUTATION_TYPE, values_from = MUT_COUNT)
counts_per_cond_df = as.data.frame(counts_per_cond)
rownames(counts_per_cond_df) = counts_per_cond_df[,1]
counts_per_cond_df[,1] = NULL
matrix_counts = as.matrix(counts_per_cond_df)
```
Writing out the mutation type count matrix that is used as input for sigfit
```{r}
write.table(counts_per_cond, file = paste(outdir_files,"/sigfit_mut_counts_input.txt", sep = ""), sep = "\t", quote = F, row.names = F)
```
```{r}
rm(supertable)
```
Now to extract signatures
First we figure out the optimal number of signatures:
warmup (helps identify parameters is half of iteration value)
```{r}
nsig_extraction = extract_signatures(counts = matrix_counts,
nsignatures = 1:7,
iter = 1000,
seed = 1756)
```
```{r}
pdf(paste(outdir_figures,"/gof_nsigs.pdf",sep=""), width=3.5,height=3.5)
plot_gof(nsig_extraction)
dev.off()
#best number of features is 2
```
Now to refit the model using the best number of signatures
```{r}
model_refit = extract_signatures(counts = matrix_counts,
nsignatures = 2,
iter = 10000,
seed = 1756)
```
```{r}
signatures <- retrieve_pars(model_refit,
par = "signatures")
#this signatures object has the mean mutation probability and the upper and lower tails surrounding this mean
sig_mean = as.data.frame(t(signatures$mean)) %>%
rownames_to_column() %>%
pivot_longer(cols = c("Signature A", "Signature B"), names_to = "SIGS", values_to = "MEAN")
sig_lci = as.data.frame(t(signatures$lower_95)) %>%
rownames_to_column() %>%
pivot_longer(cols = c("Signature A", "Signature B"), names_to = "SIGS", values_to = "LCI")
sig_uci = as.data.frame(t(signatures$upper_95)) %>%
rownames_to_column() %>%
pivot_longer(cols = c("Signature A", "Signature B"), names_to = "SIGS", values_to = "UCI")
summary_sig_comp = sig_mean %>%
full_join(sig_uci, by = c("rowname", "SIGS")) %>%
full_join(sig_lci, by = c("rowname", "SIGS")) %>%
mutate(MUTATION_TYPE = rowname) %>%
select(MUTATION_TYPE, SIGS, MEAN, LCI, UCI) %>%
mutate(MUT_LABEL = recode(MUTATION_TYPE, "G>A" = "G>A/C>T",
"G>C" = "G>C/C>G",
"G>T" = "G>T/C>A",
"T>A" = "T>A/A>T",
"T>C" = "T>C/A>G",
"T>G" = "T>G/A>C",
"INS" = "Ins",
"DEL" = "Del"))
rm(sig_mean, sig_lci, sig_uci)
```
Colors!
```{r}
bay = pnw_palette("Bay",8,type="continuous")
moth = pnw_palette("Moth",12,type="continuous")
star = pnw_palette("Starfish", 7, type = "continuous")
spec_colors = c(star[1], star[2], star[4], bay[4], moth[9], star[5], moth[6],moth[4])
snv_spec_colors = c(star[2], bay[4], star[4], moth[4], moth[6], moth[9])
```
```{r}
sig_comp_fig = ggplot(summary_sig_comp)
sig_comp_fig = sig_comp_fig +
geom_bar(aes(x = MUT_LABEL, y = MEAN, fill = MUTATION_TYPE), stat="identity") +
geom_errorbar(aes(x=MUT_LABEL, ymin= LCI, ymax= UCI), width=0.4, size=0.5) +
xlab("Mutation Type") +
ylab("Mutation \nProbability") +
facet_wrap(SIGS ~ ., strip.position = "top", ncol = 2) +
scale_fill_manual(name = "Mutation Type" , values = spec_colors) +
theme_bw(base_size = 12) +
theme(text = element_text(family = "sans"),
axis.text.x = element_text(angle = 45, size = 6, vjust = 1, hjust = 1),
axis.text.y=element_text(size = 6.5),
strip.background=element_blank(),
panel.background = element_blank(),
axis.title.x = element_text(size = 7.5),
axis.title.y = element_text(size = 7.5),
strip.text.x = element_text(size = 7.5),
strip.text.y = element_text(size = 7),
legend.position = "none")
pdf(paste(outdir_figures,"/sigfit_sig_composition.pdf",sep=""), width=2.75,height=2)
print(sig_comp_fig)
dev.off()
```
```{r}
exposures = as.data.frame(retrieve_pars(model_refit,
par = "exposures")) %>%
rownames_to_column()
```
```{r}
sig_to_sig = exposures %>%
select(rowname, mean.Signature.A, mean.Signature.B) %>%
separate(rowname, c("STRAIN", "TISSUE", "AGE_BIN"), sep = "_") %>%
mutate(SHAPE = paste(TISSUE, AGE_BIN, sep = "_"))
sig_to_sig$AGE_BIN = factor(sig_to_sig$AGE_BIN, level = c("YOUNG", "OLD"))
```
```{r}
sig_to_sig_fig = ggplot(sig_to_sig) +
geom_point(aes(x = mean.Signature.B, y = mean.Signature.A, shape = SHAPE, color = STRAIN), size = 2) +
theme_bw(base_size = 16) +
xlab("Signature B") +
ylab("Signature A") +
scale_color_manual(values = c("B6" = "#1d457f" , "AKR" = "#cc5c76", "ALR" = "#625a94", "FVB" = "#f57946", "NZB" = "#f7c22d")) +
scale_shape_manual(values = c(19, 1, 17, 2, 15, 0)) +
theme( text = element_text(family = "sans"),
axis.title.x = element_text(size = 11),
axis.title.y = element_text(size = 11),
axis.text.y=element_text(size = 10),
axis.text.x =element_text(size = 10),
legend.position = "none")
pdf(paste(outdir_figures,"/sigfit_sig_to_sig.pdf",sep=""), width=3,height=3)
print(sig_to_sig_fig)
dev.off()
```
```{r}
contribution = sig_to_sig %>%
select(STRAIN, TISSUE, AGE_BIN, mean.Signature.A, mean.Signature.B) %>%
pivot_longer(cols = c("mean.Signature.A", "mean.Signature.B"), names_to = "SIG", values_to = "CONTRIBUTION")
contribution$STRAIN = factor(contribution$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB"))
contribution$AGE_BIN = factor(contribution$AGE_BIN, level = c("YOUNG", "OLD"))
contribution$TISSUE = factor(contribution$TISSUE, level = c("Brain", "Heart", "Liver"))
```
```{r}
contribution_sig_fig = ggplot(contribution %>%
mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Y", "OLD" = "O")), aes(x = AGE_LABEL, y = CONTRIBUTION, fill = SIG ))
contribution_sig_fig = contribution_sig_fig +
geom_bar(position="stack", stat="identity") +
facet_grid(TISSUE ~ STRAIN) +
ylab("Contribution of Signature") +
xlab("Age") +
scale_fill_manual(name = "Signature", labels = c("Sig A", "Sig B"), values = c("#5d74a5", "#b0cbe7")) +
theme_bw(base_size = 12) +
theme( strip.background=element_blank(),
text = element_text(family = "sans"),
axis.title.x = element_text(size = 8.5),
axis.title.y = element_text(size = 8.5),
axis.text.y=element_text(size = 7),
axis.text.x =element_text(size = 7.5),
strip.text.x = element_text(size = 7.5),
strip.text.y = element_text(size = 7.5),
legend.position = "none")
pdf(paste(outdir_figures,"/sigfit_sig_contributions.pdf",sep=""), width=3,height=2.5)
print(contribution_sig_fig)
dev.off()
pdf(paste(outdir_figures,"/leg_sigfit_sig_contributions.pdf",sep=""), width=4,height=4.5)
print(contribution_sig_fig + theme(legend.position = "bottom"))
dev.off()
```