---
title: "R Notebook"
output: html_notebook
---

```{r}
library(tidyverse)
```

```{r}
# Every path used in this notebook is absolute, so the working directory is
# left untouched (no setwd()).
outdir_files <- "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/genomewide_selection_scan/files/"
supertable_file <- "/Users/isabelserrano/Documents/Science/Analyses/Conplastic_Strains/files_and_analyses/input_files/supertable.txt"

supertable <- read.table(
  supertable_file,
  header = TRUE,
  stringsAsFactors = FALSE
)
```

Helpers shared by the count and proportion tables:

```{r}
# Columns used to de-duplicate mutation calls.
mutation_cols <- c(
  "SAMPLE", "STRAIN", "TISSUE", "AGE_BIN", "START", "REF", "ALT",
  "VARIANT_TYPE", "ALT_ALLELE_DEPTH"
)
# Extra columns needed (and included in the de-duplication) when HFPs are
# filtered out.
hfp_cols <- c("CONDITION_MUT_FREQ_AT_POS", "HFP_THRESHOLD")

# Return the de-duplicated SNV rows of `calls`.
#
# calls:     data frame with the supertable columns listed above.
# drop_hfps: when TRUE, additionally keep only rows where
#            CONDITION_MUT_FREQ_AT_POS < HFP_THRESHOLD.
distinct_snvs <- function(calls, drop_hfps = FALSE) {
  keep_cols <- if (drop_hfps) c(mutation_cols, hfp_cols) else mutation_cols

  snvs <- calls %>%
    # reduce redundancy in mutations if mutations are located in multiple
    # gene regions
    select(all_of(keep_cols)) %>%
    unique() %>%
    # filter out indels
    filter(VARIANT_TYPE == "SNV")

  if (drop_hfps) {
    snvs <- filter(snvs, CONDITION_MUT_FREQ_AT_POS < HFP_THRESHOLD)
  }
  snvs
}

# Recode the FVB strain label.
# this is needed so that the FVB strain name matches across files
recode_fvb <- function(tbl) {
  mutate(tbl, STRAIN = recode(STRAIN, "FVB" = "F"))
}

# Sum ALT_ALLELE_DEPTH per STRAIN x TISSUE x AGE_BIN into HET_COUNT.
# Returns an ungrouped table.
count_mutations <- function(snvs) {
  snvs %>%
    group_by(STRAIN, TISSUE, AGE_BIN) %>%
    summarise(HET_COUNT = sum(ALT_ALLELE_DEPTH), .groups = "drop") %>%
    recode_fvb()
}

# Per STRAIN x TISSUE x AGE_BIN, compute for each MUTATION_TYPE ("REF>ALT"):
#   MUT_TYPE_COUNT = summed ALT_ALLELE_DEPTH for that type,
#   MUT_TOTAL      = summed MUT_TYPE_COUNT over the condition,
#   HET_PROP       = MUT_TYPE_COUNT / MUT_TOTAL.
# Returns an ungrouped table.
mutation_proportions <- function(snvs) {
  snvs %>%
    mutate(MUTATION_TYPE = paste(REF, ALT, sep = ">")) %>%
    group_by(STRAIN, TISSUE, AGE_BIN, MUTATION_TYPE) %>%
    summarise(MUT_TYPE_COUNT = sum(ALT_ALLELE_DEPTH), .groups = "drop") %>%
    group_by(STRAIN, TISSUE, AGE_BIN) %>%
    mutate(
      MUT_TOTAL = sum(MUT_TYPE_COUNT),
      HET_PROP = MUT_TYPE_COUNT / MUT_TOTAL
    ) %>%
    ungroup() %>%
    recode_fvb()
}

# Append a zero-count row for every mutation type that occurs somewhere in
# `props` but is absent from a given STRAIN x TISSUE x AGE_BIN condition.
# The appended rows carry the condition's own MUT_TOTAL, a MUT_TYPE_COUNT of 0
# and a HET_PROP of 0; existing rows keep their order and values.
add_missing_mutation_types <- function(props) {
  absent <- props %>%
    tidyr::expand(nesting(STRAIN, TISSUE, AGE_BIN, MUT_TOTAL), MUTATION_TYPE) %>%
    anti_join(props, by = c("STRAIN", "TISSUE", "AGE_BIN", "MUTATION_TYPE")) %>%
    mutate(MUT_TYPE_COUNT = 0L, HET_PROP = 0)

  bind_rows(props, absent)
}
```

Mutation count with HFPs:

```{r}
mut_count <- supertable %>%
  distinct_snvs() %>%
  count_mutations()
```

Mutation count without HFPs:

```{r}
mut_count_wo_hfps <- supertable %>%
  # filter out indels and any HFPs from the analysis
  distinct_snvs(drop_hfps = TRUE) %>%
  # process as before: counting the number of mutations in a condition
  count_mutations()
```

Calculating the proportions of each mutation type within a given condition:

```{r}
mut_prop <- supertable %>%
  distinct_snvs() %>%
  mutation_proportions()
```

Calculating mutation proportions without the HFPs:

```{r}
mut_prop_wo_hfps <- supertable %>%
  # filter out indels and any HFPs from the analysis
  distinct_snvs(drop_hfps = TRUE) %>%
  mutation_proportions() %>%
  # FVB Brain Young is missing a mutation type (T>G) after filtering out HFP
  # positions; add it back with a count of 0 instead of hard-coding the row
  add_missing_mutation_types()
```

Output files:

```{r}
mut_count_file_path <- paste0(outdir_files, "mut_count.txt")
mut_count_wo_hfps_file_path <- paste0(outdir_files, "mut_count_wo_hfps.txt")
mut_prop_file_path <- paste0(outdir_files, "mut_prop.txt")
mut_prop_wo_hfps_file_path <- paste0(outdir_files, "mut_prop_wo_hfps.txt")

write.table(mut_count, mut_count_file_path, row.names = FALSE, sep = "\t")
write.table(
  mut_count_wo_hfps, mut_count_wo_hfps_file_path,
  row.names = FALSE, sep = "\t"
)
write.table(mut_prop, mut_prop_file_path, row.names = FALSE, sep = "\t")
write.table(
  mut_prop_wo_hfps, mut_prop_wo_hfps_file_path,
  row.names = FALSE, sep = "\t"
)
```

Figuring out which condition was missing a mutation type after filtering out HFPs: FVB Brain Young T>G

```{r}
# Number of mutation-type rows per condition. At this point the missing types
# have already been added back to mut_prop_wo_hfps.
mut_prop_wo_hfps %>%
  count(STRAIN, TISSUE, AGE_BIN, name = "COUNTS")

# STRAIN has been recoded from "FVB" to "F" in both tables, so the filter must
# use the recoded label; filtering on "FVB" returns no rows.
mut_prop_wo_hfps %>%
  filter(STRAIN == "F", TISSUE == "Brain", AGE_BIN == "YOUNG")

mut_prop %>%
  filter(STRAIN == "F", TISSUE == "Brain", AGE_BIN == "YOUNG")
```