--- title: "duplex_read_depth" output: html_notebook --- This figure contributes to Fig 1d. ```{r} library(tidyverse) library(ggplot2) library(scales) library(ggbeeswarm) ``` ```{r} outdir_figures <- "~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/experimental_design/figures" outdir_files <- "~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/experimental_design/files" supertable_file <-"~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt" supertable <- read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE) ``` We calculate the average read depth across the mt-genome for each sample. This average represents the average number of duplex mt-genomes sequenced given that each mt-genome has a unique molecular identifier. ```{r} sample_depth = supertable %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS) %>% unique() %>% group_by(SAMPLE, STRAIN, TISSUE, AGE_BIN) %>% summarise(AVG_DUP_DEPTH = mean(READ_DEPTH_AT_POS)) ``` ```{r} sample_depth_plot = ggplot(sample_depth, aes(x = AVG_DUP_DEPTH)) + geom_histogram(aes(y=..count../sum(..count..)), bins = 25) + xlab("Duplex mt-genomes Profiled Per Sample") + ylab("Proportion") + theme_bw() + theme(axis.text = element_text(size=10), axis.title = element_text(size=11), axis.text.x = element_text(size = 10), axis.text.y = element_text(size = 10), strip.background= element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) pdf(paste(outdir_figures, "/dcs_read_depth_per_sample.pdf", sep = ""), width = 4, height = 3) print(sample_depth_plot) dev.off() ``` Quantifying the total bp depth per condition (strain x age x tissue) --> sum of the avg_bp_depth for all samples in a condition. ```{r} cond_depth = supertable %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS) %>% #we want to filter any redundant positions (i.e. positions that are in two genes) unique() %>% select(STRAIN, TISSUE, AGE_BIN, START, READ_DEPTH_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, START) %>% summarise(COND_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, COND_DEPTH_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN) %>% summarise(COND_AVG_DEPTH = mean(COND_DEPTH_AT_POS)) ``` ```{r} median_per_cond = median(cond_depth$COND_AVG_DEPTH) total_mt_genomes = floor(sum(cond_depth$COND_AVG_DEPTH)) print(median_per_cond) print(total_mt_genomes) range(cond_depth$COND_AVG_DEPTH) #the lowest we can sequence is 1/max(cond_depth$COND_AVG_DEPTH) ``` ```{r} cond_depth = cond_depth %>% mutate(STRAIN = recode(STRAIN, "F" = "FVB")) cond_depth$STRAIN = factor(cond_depth$STRAIN, levels = c("B6", "AKR", "ALR", "FVB", "NZB")) beeswarm_fig = ggplot(cond_depth) + geom_beeswarm(aes(x = STRAIN, y = COND_AVG_DEPTH, color = STRAIN)) + annotate("text", x = 2.45, y = 5.3e+05, label = "Total: 2.5M", size = 3.75) + annotate("text", x = 3, y = 4.6e+05, label = "Median: 74,764", size = 3.75) + xlab("Strains") + ylab("Count of duplex mt-genomes") + scale_color_manual(values = c("B6"= "#1d457f","AKR" = "#cc5c76", "ALR" = "#2f9e23", "FVB" = "#f57946", "NZB" = "#f7c22d")) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", math_format(10^.x))) + theme_bw(base_size = 16) + theme(axis.text = element_text(size=11), axis.title = element_text(size=12), axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), axis.text.y = element_text(size = 11), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), strip.background=element_blank(), strip.text = element_text(face = "bold"), text = element_text(family = "sans"), legend.position = "none") pdf(paste(outdir_figures, "/beeswarm_duplex_depth_per_cond.pdf", sep = ""), width = 2.35, height = 4) print(beeswarm_fig) dev.off() ``` Writing out files: 1) Sample duplex depth 2) Condition duplex depth ```{r} write.table(sample_depth, file = paste(outdir_files,"/sample_duplex_depth.txt", sep = ""), sep = "\t", quote = F, row.names = F) write.table(cond_depth, file = paste(outdir_files,"/experimental_condition_duplex_depth.txt", sep = ""), sep = "\t", quote = F, row.names = F) ``` ~~~~~~SCRATCH CODE~~~~~~~~~~ This is a second method for estimating duplex depth from the raw number of duplex sequences. Note if our reads overlap this would give ~ 2-fold higher estimate. ```{r} dcs_read_depth_file <-"~/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/experimental_design/files/duplex_read_depth.csv" dcs_read_depth <- read.table(dcs_read_depth_file, header=TRUE, stringsAsFactors = FALSE) ``` Filtering out the samples we will not be using in our analysis: ```{r} filtered_dcs_read_depth = dcs_read_depth %>% filter(AGE_BIN != "MID") %>% filter(!grepl("B6_Y3_Heart", sample)) %>% filter(!grepl("B6_Y._Liver", sample)) ``` ```{r} #we multiply by the number of bp our reads covered (150x PE reads) and then divide by the length of the mtGenome to obtain the avg_bp_depth per sample setwd(outdir_figures) filtered_dcs_read_depth$avg_bp_depth = (filtered_dcs_read_depth$read_count * 300) / 16299 cleaned_dcs_read_depth = filtered_dcs_read_depth %>% separate(sample, c("STRAIN", "AGE", "TISSUE"), extra = "drop", sep = "_") mean_per_exp = mean(cleaned_dcs_read_depth$avg_bp_depth) print(mean_per_exp) range(cleaned_dcs_read_depth$avg_bp_depth) sum(cleaned_dcs_read_depth$avg_bp_depth) ```