--- title: "R Notebook" output: html_notebook --- Calculating the observed hNhS ratio from our data. ```{r} library(tidyverse) library(ggplot2) ``` ```{r} setwd("/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/") outdir_files = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/files/" outdir_figures = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/figures/" B6_sims_wo_hfps_file = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/files/simulator_output/B6_wo_hfp_simulated_annotation_counts" AKR_sims_wo_hfps_file = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/files/simulator_output/AKR_wo_hfp_simulated_annotation_counts" ALR_sims_wo_hfps_file = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/files/simulator_output/ALR_wo_hfp_simulated_annotation_counts" FVB_sims_wo_hfps_file = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/files/simulator_output/F_wo_hfp_simulated_annotation_counts" NZB_sims_wo_hfps_file = "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/files/simulator_output/NZB_wo_hfp_simulated_annotation_counts" B6_sims_wo_hfps = read.table(B6_sims_wo_hfps_file, header=TRUE, stringsAsFactors = FALSE) AKR_sims_wo_hfps = read.table(AKR_sims_wo_hfps_file, header=TRUE, stringsAsFactors = FALSE) ALR_sims_wo_hfps = read.table(ALR_sims_wo_hfps_file, header=TRUE, stringsAsFactors = FALSE) FVB_sims_wo_hfps = read.table(FVB_sims_wo_hfps_file, header=TRUE, stringsAsFactors = FALSE) NZB_sims_wo_hfps = read.table(NZB_sims_wo_hfps_file, header=TRUE, stringsAsFactors = FALSE) ``` Combining our files from the simulation ```{r} #the F in the strain name was read at False when downloaded into R studio? Recoded the strain name for downstream analysis FVB_sims_wo_hfps = FVB_sims_wo_hfps %>% mutate(STRAIN = "FVB") simulations_wo_hfps = bind_rows(B6_sims_wo_hfps, AKR_sims_wo_hfps, ALR_sims_wo_hfps, FVB_sims_wo_hfps, NZB_sims_wo_hfps) rm(B6_sims_wo_hfps, AKR_sims_wo_hfps, ALR_sims_wo_hfps, FVB_sims_wo_hfps, NZB_sims_wo_hfps) ``` Reading in the other files we'll need ```{r} supertable_file ="/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt" supertable = read.table(supertable_file, header=TRUE, stringsAsFactors = FALSE) all_possible_muts_annotation_file ="/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/files/annotated_all_possible_variants.txt" all_possible_muts_annotation = read.table(all_possible_muts_annotation_file, header=TRUE, stringsAsFactors = FALSE) ``` ```{r} possible_sites = all_possible_muts_annotation %>% filter(ANNOTATION != "non_coding_transcript_exon_variant") %>% select(STRAIN, ANNOTATION, GENE) %>% mutate(STRAIN = recode(STRAIN, "F" = "FVB")) %>% group_by(STRAIN, GENE, ANNOTATION) %>% summarise(SITE_COUNT = n()/3) %>% pivot_wider(names_from = ANNOTATION, values_from = SITE_COUNT) %>% mutate(NONSYN_SITE_COUNT = missense_variant, SYN_SITE_COUNT = synonymous_variant) %>% select(STRAIN, GENE, NONSYN_SITE_COUNT, SYN_SITE_COUNT) ``` ```{r} average_read_depth = supertable %>% #to filter out tRNAs filter(!grepl("mt-T", GENE)) %>% filter(GENE != "mt-OL", GENE != "intergene_region") %>% select(SAMPLE, STRAIN, TISSUE, AGE_BIN, START, GENE, VARIANT_TYPE, ANNOTATION, READ_DEPTH_AT_POS) %>% filter(ANNOTATION != "non_coding_transcript_exon_variant" & ANNOTATION != "INDEL") %>% group_by(STRAIN, TISSUE, AGE_BIN, GENE, START) %>% summarise(COND_READ_DEPTH_AT_POS = sum(READ_DEPTH_AT_POS)) %>% ungroup() %>% select(STRAIN, TISSUE, AGE_BIN, GENE, COND_READ_DEPTH_AT_POS) %>% group_by(STRAIN, TISSUE, AGE_BIN, GENE) %>% summarise(AVG_READ_DEPTH = mean(COND_READ_DEPTH_AT_POS)) ``` Merging the number of sites with the average read depth -- this calculation will give us the total number of sites. ```{r} total_num_sites = average_read_depth %>% left_join(possible_sites, by = c("STRAIN", "GENE")) %>% mutate(TOTAL_NONSYN_SITE_COUNT = AVG_READ_DEPTH*NONSYN_SITE_COUNT, TOTAL_SYN_SITE_COUNT = AVG_READ_DEPTH*SYN_SITE_COUNT) rm(possible_sites, average_read_depth) ``` Now calculating our observed hNhS ratio ```{r} observed_mut_counts = supertable %>% select(STRAIN, TISSUE, AGE_BIN, GENE, VARIANT_TYPE, ANNOTATION, ALT_ALLELE_DEPTH) %>% filter(VARIANT_TYPE == "SNV") %>% filter(ANNOTATION != "non_coding_transcript_exon_variant") %>% select(STRAIN, TISSUE, AGE_BIN, GENE, ANNOTATION, ALT_ALLELE_DEPTH) %>% group_by(STRAIN, TISSUE, AGE_BIN, GENE, ANNOTATION) %>% summarise(MUT_COUNT = sum(ALT_ALLELE_DEPTH)) %>% pivot_wider(names_from = ANNOTATION, values_from = MUT_COUNT) %>% mutate(OBS_NONSYN_MUT_COUNT = missense_variant, OBS_SYN_MUT_COUNT = synonymous_variant) %>% select(STRAIN, TISSUE, AGE_BIN, GENE, OBS_NONSYN_MUT_COUNT, OBS_SYN_MUT_COUNT) observed_mut_counts[is.na(observed_mut_counts)] <- 0 ``` ```{r} #we reformat our simulations long format to wide format in order to allow for easy calculations of the ratio simulations_reformated = simulations_wo_hfps %>% pivot_wider(names_from = ANNOTATION, values_from = COUNT) %>% mutate(SIM_NONSYN_MUT_COUNT = missense_variant, SIM_SYN_MUT_COUNT = synonymous_variant) %>% select(STRAIN, TISSUE, AGE_BIN, GENE, SIM_RUN, SIM_NONSYN_MUT_COUNT, SIM_SYN_MUT_COUNT) simulations_reformated[is.na(simulations_reformated)] <- 0 rm(simulations_wo_hfps) ``` ```{r} sims_data = simulations_reformated %>% left_join(total_num_sites, by = c("STRAIN", "TISSUE", "AGE_BIN", "GENE")) obs_data = observed_mut_counts %>% left_join(total_num_sites, by = c("STRAIN", "TISSUE", "AGE_BIN", "GENE")) rm(simulations_reformated, observed_mut_counts, total_num_sites, supertable, all_possible_muts_annotation) ``` ```{r} sims_hNhS_ratios = sims_data %>% mutate(sim_hN = SIM_NONSYN_MUT_COUNT/TOTAL_NONSYN_SITE_COUNT, sim_hS = SIM_SYN_MUT_COUNT/TOTAL_SYN_SITE_COUNT) %>% mutate(SIM_HNHS = sim_hN/sim_hS) %>% #calculated hNhS as Inf because there are 0 syn mutations in the gene --> cannot calculate the hNhS ratio filter(SIM_HNHS != "Inf") obs_hNhS_ratios = obs_data %>% mutate(obs_hN = OBS_NONSYN_MUT_COUNT/TOTAL_NONSYN_SITE_COUNT, obs_hS = OBS_SYN_MUT_COUNT/TOTAL_SYN_SITE_COUNT) %>% mutate(OBS_HNHS = obs_hN/obs_hS) %>% #calculated hNhS as Inf because there are 0 syn mutations in the gene --> cannot calculate the hNhS ratio filter(OBS_HNHS != "Inf") rm(obs_data, sims_data) ``` Plotting AHHHHH ```{r} obs_hNhS_ratios$STRAIN = factor(obs_hNhS_ratios$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB")) obs_hNhS_ratios$AGE_BIN = factor(obs_hNhS_ratios$AGE_BIN, level = c("YOUNG", "OLD")) sims_hNhS_ratios$STRAIN = factor(sims_hNhS_ratios$STRAIN, level = c("B6", "AKR", "ALR", "FVB", "NZB")) sims_hNhS_ratios$AGE_BIN = factor(sims_hNhS_ratios$AGE_BIN, level = c("YOUNG", "OLD")) hnhs_wo_hfps_plot = ggplot(obs_hNhS_ratios) hnhs_wo_hfps_plot = hnhs_wo_hfps_plot + geom_violin(data = sims_hNhS_ratios, aes(x = log10(SIM_HNHS), y = GENE, fill = AGE_BIN), alpha = 0.7, size = 0, scale = "width") + geom_point(aes(x = log10(OBS_HNHS), y = GENE, color = AGE_BIN), size = 1 , shape = 8) + xlab("log10(hN/hS)") + ylab("Gene") + facet_grid(STRAIN~TISSUE) + scale_color_manual(name = "Age", labels = c("Young", "Old"), values = c("grey57", "black")) + scale_fill_manual(name = "Age", labels = c("Young", "Old"), values = c("grey80", "black")) + theme_bw(base_size = 15) + theme(strip.background=element_blank(), strip.text = element_text(), text = element_text(family = "sans"), legend.position = "bottom") + coord_cartesian(xlim = c(-5, 5)) pdf(paste(outdir_figures,"/hnhs_wo_hfps.pdf",sep=""), width=8,height=10.5) print(hnhs_wo_hfps_plot) dev.off() ``` We want to find the significance of the observed hN/hS values. ```{r} obs = obs_hNhS_ratios %>% select(STRAIN, TISSUE, AGE_BIN, GENE, OBS_HNHS) sims = sims_hNhS_ratios %>% select(STRAIN, TISSUE, AGE_BIN, GENE, SIM_HNHS) %>% group_by(STRAIN, TISSUE, AGE_BIN, GENE) %>% summarise(MEDIAN = median(SIM_HNHS), AVG = mean(SIM_HNHS), SD = sd(SIM_HNHS), LOWER = quantile(SIM_HNHS, probs = 0.025, na.rm = T), HIGHER = quantile(SIM_HNHS, probs = 0.975, na.rm = T)) merged_data = sims %>% left_join(obs, by = c("STRAIN", "TISSUE", "AGE_BIN", "GENE")) rm(obs, sims) ``` ```{r} merged_data_w_sim_data = sims_hNhS_ratios %>% select(STRAIN, TISSUE, AGE_BIN, SIM_RUN, GENE, SIM_HNHS) %>% left_join(merged_data, by = c("STRAIN", "TISSUE", "AGE_BIN", "GENE")) rm(sims_hNhS_ratios) ``` ```{r} merged_data_w_sim_data = merged_data_w_sim_data %>% select(STRAIN, TISSUE, AGE_BIN, SIM_RUN, GENE, SIM_HNHS, OBS_HNHS, AVG) %>% mutate(Z_SIM = SIM_HNHS - AVG, Z_OBS = OBS_HNHS - AVG) p_values = merged_data_w_sim_data %>% select(STRAIN, TISSUE, AGE_BIN, GENE, OBS_HNHS, Z_SIM, Z_OBS) %>% mutate(POS_Z_OBS = abs(Z_OBS), NEG_Z_OBS = -1*POS_Z_OBS) %>% #how many data points do we have for each gene -- remember we filtered out the Infs group_by(STRAIN, TISSUE, AGE_BIN, GENE, OBS_HNHS, Z_OBS) %>% mutate(SIM_TOTAL = n()) %>% ungroup() %>% #identify if our simulations generate values passed our obs pt in their respective upper or lower tails -- this is based on the Z_OBS sign --> greater or lower than the expected mutate(COMP_Z_OBS = ifelse(Z_OBS < 0, ifelse(NEG_Z_OBS >= Z_SIM, 1, 0), ifelse(POS_Z_OBS <= Z_SIM, 1, 0))) %>% #mutate(COMP_POS_Z_OBS = ifelse(POS_Z_OBS <= Z_SIM, 1, 0), COMP_NEG_Z_OBS = ifelse(NEG_Z_OBS >= Z_SIM, 1, 0)) %>% #we need to count how many points are more extreme than our obs value under the null select(STRAIN, TISSUE, AGE_BIN, SIM_TOTAL, GENE, OBS_HNHS, Z_OBS, COMP_Z_OBS) %>% group_by(STRAIN, TISSUE, AGE_BIN, SIM_TOTAL, GENE, OBS_HNHS, Z_OBS) %>% #summarise(COUNT_SIMS_GREATER = sum(COMP_POS_Z_OBS), COUNT_SIMS_LOWER = sum(COMP_NEG_Z_OBS)) %>% summarise(SUM_COMPARISON = sum(COMP_Z_OBS)) %>% #mutate(SUM_COMPARISON = COUNT_SIMS_GREATER + COUNT_SIMS_LOWER) %>% #multiplying by 2 to create a 2-tailed test --> assuming that we have a symmetric distribution; this will allow for a stricter significance level & more reliable/conservative results mutate(P_VALUE = 2*(SUM_COMPARISON/SIM_TOTAL)) %>% #we need to multiple hypothesis correct mutate(P_ADJ = p.adjust(P_VALUE, method="BH")) %>% na.omit() %>% mutate(COLOR_LABEL = ifelse(P_ADJ < 0.01, as.character(STRAIN), "INSIG")) #want to export this table for the site by site conservation analysis p_values %>% group_by(Z_OBS>0,P_ADJ<0.05) %>% summarize(n=n()) ``` Making a summary plot! Color codes: https://www.nceas.ucsb.edu/sites/default/files/2020-04/colorPaletteCheatsheet.pdf ```{r} p_values$AGE_BIN = factor(p_values$AGE_BIN, level = c( "YOUNG", "OLD")) sig_points = p_values %>% filter(P_ADJ <= 0.01) %>% mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Y", "OLD" = "O")) write.table(sig_points, file = paste(outdir_files,"/hNhS_wo_HFPs_sig_hits.txt", sep = ""), sep = "\t", quote = F, row.names = T) insig_points = p_values %>% filter(P_ADJ > 0.01) %>% mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Y", "OLD" = "O")) summary_hNhS = ggplot(p_values %>% mutate(AGE_LABEL = recode(AGE_BIN, "YOUNG" = "Y", "OLD" = "O"))) summary_hNhS = summary_hNhS + geom_point(data = insig_points, aes(x = log10(OBS_HNHS), y = GENE, shape = TISSUE, color = COLOR_LABEL), alpha = 0.9, size = 0.85) + geom_point(data = sig_points, aes(x = log10(OBS_HNHS), y = GENE, shape = TISSUE, color = COLOR_LABEL), alpha = 0.9) + facet_wrap(AGE_LABEL~., strip.position = "right", ncol = 1) + xlab("log(hN/hS)") + ylab("Age") + scale_color_manual(values = c("B6" = "#1d457f" , "AKR" = "#cc5c76", "ALR" = "#625a94", "FVB" = "#f57946", "NZB" = "#f7c22d", "INSIG" = "gray90")) + scale_shape_manual(values = c(19, 17, 15)) + theme_bw(base_size = 16) + theme(strip.background=element_blank(), strip.text.y = element_text(angle = 0), text = element_text(family = "sans"), legend.position = "none", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()) pdf(paste(outdir_figures,"/summary_hnhs_wo_hfps.pdf",sep=""), width=6,height=6) print(summary_hNhS) dev.off() pdf(paste(outdir_figures,"/summary_hnhs_wo_hfps_leg.pdf",sep=""), width=10,height=6) print(summary_hNhS + theme(legend.position = "bottom")) dev.off() ```