import pandas as pd import numpy as np import io import os import argparse from Bio import SeqIO #we will first parse our argument -strain from input and define our strain variable parser = argparse.ArgumentParser() parser.add_argument("-strain", help = "Conplastic Strain Name") parser.add_argument("-mut_count_file_path", help = "File path to the mut count file you are referring to") parser.add_argument("-mut_prop_file_path", help = "File path to the mut prop file you are referring to") parser.add_argument("-wo_hfp", action = "store_true", help = "Includes wo_hfp in output") args = parser.parse_args() strain = args.strain if args.wo_hfp: print("wo_hfps") else: print("w_hfps") #read in our mut_prop and mut_total_count files mut_prop_file = args.mut_prop_file_path mut_total_count_file = args.mut_count_file_path mut_type_prop = pd.read_csv(mut_prop_file, sep = "\t") mut_total_count = pd.read_csv(mut_total_count_file, sep = "\t") #loading in our annotation file of all possible variants annotation_file = "files/annotated_all_possible_variants.txt" annotations = pd.read_csv(annotation_file, sep = "\t") #read in our chrM sequence and create our dictionary of positions chrM_file = "files/chrM.fa" ref_seq = SeqIO.read(chrM_file, "fasta") #merging our files to have the information we need in one position per_condition_info = pd.merge(mut_type_prop, mut_total_count, how = "left", on = ["STRAIN",\ "TISSUE",\ "AGE_BIN"]) #file needed to make the haplotype dictionary haplotype_info_file = "files/haplotype_mutations.vcf" haplotype_info = pd.read_csv(haplotype_info_file, sep = "\t") #only keep SNVs haplotype_info = haplotype_info[haplotype_info["VARIANT_TYPE"] == "SNV"] haplotype_info = haplotype_info.drop_duplicates(subset = ["STRAIN", "START"], keep = "first")[["STRAIN", \ "START", "REF", "ALT"]] strains = list(haplotype_info["STRAIN"].unique()) haplotype_dict = {} for conplastic in strains: #take the position, ref, and alt alleles for all haploptypes for a given strain & place into one list of lists haplotypes = haplotype_info[haplotype_info["STRAIN"] == conplastic] haplotypes = haplotypes[["START", "REF", "ALT"]].values.tolist() #add the list of lists as a value to the strain keys haplotype_dict[conplastic] = haplotypes #functions we will be using def get_ref_genome(strain, haplotype_dict): #we need to convert from a seq to a mutable object in order to mutate our sequence strain_ref_genome = ref_seq.seq.tomutable() haplotypes = haplotype_dict[strain] #we loop through each haplotype and mutate the ref sequence from mm10 accordingly for index in range(0, len(haplotypes)): site = haplotypes[index][0] allele = haplotypes[index][2] strain_ref_genome[site] = allele #convert back to a seq object in order to access all the methods strain_ref_genome = strain_ref_genome.toseq() return strain_ref_genome def get_position_dict(strain_ref_genome): #create a dictionary to hold the positions that our reference alleles are in #this will be used to choose which places in the genome are randomly mutated position_dict = {} nucleotides = ['A','C','G','T'] #create the position_dict which the following format {"nucleotide": [lst of positions this nucleotide is located]} for base in nucleotides: position_dict[base] = [index for index in range(len(strain_ref_genome)) if strain_ref_genome[index] == base] return position_dict simulated_variants_list = [] #obtain the conplastic strain reference genome if strain == "B6": strain_ref_genome = ref_seq else: strain_ref_genome = get_ref_genome(strain, haplotype_dict) #create the position_dict based on the conplastic strain ref genome position_dict = get_position_dict(strain_ref_genome) for age in ["YOUNG", "OLD"]: for tissue in ["Brain", "Heart", "Liver"]: condition_df = per_condition_info[(per_condition_info["STRAIN"] == strain) &\ (per_condition_info["AGE_BIN"] == age) &\ (per_condition_info["TISSUE"] == tissue)] mut_count = condition_df[(condition_df["AGE_BIN"] == age) \ & (condition_df["TISSUE"] == tissue)]["HET_COUNT"].unique() if len(mut_count) > 1: print("ERROR OCCURRED IN CALCULATING MUT_COUNT") mut_count = mut_count[0] #creating a dictionary of our mutation type and heterplasmy prop to call in in our multinomal sampling mut_type_prop_dict = dict(zip(condition_df.MUTATION_TYPE, condition_df.HET_PROP)) #make sure that round off error does not break multinomial draw sum_mut_type_prop = sum(mut_type_prop_dict.values()) #normalizes our mutation type proportions to ensure that our proportions sum to 1 mut_type_prop_dict.update((key, (value/sum_mut_type_prop)) for key,value in mut_type_prop_dict.items()) #it's redundant I KNOOOOW but I want to make sure we keep track of the prop wrt mutation type sim_mut_draws = np.random.multinomial(mut_count, [mut_type_prop_dict["A>C"], mut_type_prop_dict["A>G"], mut_type_prop_dict["A>T"],\ mut_type_prop_dict["C>A"], mut_type_prop_dict["C>G"], mut_type_prop_dict["C>T"],\ mut_type_prop_dict["G>A"], mut_type_prop_dict["G>C"], mut_type_prop_dict["G>T"],\ mut_type_prop_dict["T>A"], mut_type_prop_dict["T>C"], mut_type_prop_dict["T>G"]],\ 10000) list_of_mutations = list(mut_type_prop_dict.keys()) #generate the mutations given in each simulation counter = 0 for sim_index in range(len(sim_mut_draws)): #for every possible reference allele create a dataframe that includes: #the mutation type, strain, age, tissue, simulation A_ref = pd.DataFrame() A_ref["MUT_TYPE"] = pd.Series(list_of_mutations[0:3]).repeat(sim_mut_draws[sim_index][0:3]) A_ref["STRAIN"] = pd.Series(strain).repeat(sum(sim_mut_draws[sim_index][0:3])).values A_ref["AGE_BIN"] = pd.Series(age).repeat(sum(sim_mut_draws[sim_index][0:3])).values A_ref["TISSUE"] = pd.Series(tissue).repeat(sum(sim_mut_draws[sim_index][0:3])).values A_ref["SIM_RUN"] = pd.Series(sim_index).repeat(sum(sim_mut_draws[sim_index][0:3])).values C_ref = pd.DataFrame() C_ref["MUT_TYPE"] = pd.Series(list_of_mutations[3:6]).repeat(sim_mut_draws[sim_index][3:6]) C_ref["STRAIN"] = pd.Series(strain).repeat(sum(sim_mut_draws[sim_index][3:6])).values C_ref["AGE_BIN"] = pd.Series(age).repeat(sum(sim_mut_draws[sim_index][3:6])).values C_ref["TISSUE"] = pd.Series(tissue).repeat(sum(sim_mut_draws[sim_index][3:6])).values C_ref["SIM_RUN"] = pd.Series(sim_index).repeat(sum(sim_mut_draws[sim_index][3:6])).values G_ref = pd.DataFrame() G_ref["MUT_TYPE"] = pd.Series(list_of_mutations[6:9]).repeat(sim_mut_draws[sim_index][6:9]) G_ref["STRAIN"] = pd.Series(strain).repeat(sum(sim_mut_draws[sim_index][6:9])).values G_ref["AGE_BIN"] = pd.Series(age).repeat(sum(sim_mut_draws[sim_index][6:9])).values G_ref["TISSUE"] = pd.Series(tissue).repeat(sum(sim_mut_draws[sim_index][6:9])).values G_ref["SIM_RUN"] = pd.Series(sim_index).repeat(sum(sim_mut_draws[sim_index][6:9])).values T_ref = pd.DataFrame() T_ref["MUT_TYPE"] = pd.Series(list_of_mutations[9:12]).repeat(sim_mut_draws[sim_index][9:12]) T_ref["STRAIN"] = pd.Series(strain).repeat(sum(sim_mut_draws[sim_index][9:12])).values T_ref["AGE_BIN"] = pd.Series(age).repeat(sum(sim_mut_draws[sim_index][9:12])).values T_ref["TISSUE"] = pd.Series(tissue).repeat(sum(sim_mut_draws[sim_index][9:12])).values T_ref["SIM_RUN"] = pd.Series(sim_index).repeat(sum(sim_mut_draws[sim_index][9:12])).values #drawing uniformly with replacement from the position vectors A_ref_positions_mutated = [np.random.choice(position_dict["A"], \ sim_mut_draws[sim_index][draw]) \ for draw in range(0,3)] A_ref["POS"] = np.concatenate(A_ref_positions_mutated) C_ref_positions_mutated = [np.random.choice(position_dict["C"], \ sim_mut_draws[sim_index][draw]) \ for draw in range(3,6)] C_ref["POS"] = np.concatenate(C_ref_positions_mutated) G_ref_positions_mutated = [np.random.choice(position_dict["G"], \ sim_mut_draws[sim_index][draw]) \ for draw in range(6,9)] G_ref["POS"] = np.concatenate(G_ref_positions_mutated) T_ref_positions_mutated = [np.random.choice(position_dict["T"], \ sim_mut_draws[sim_index][draw]) \ for draw in range(9,12)] T_ref["POS"] = np.concatenate(T_ref_positions_mutated) #appending the above dataframes together simulated_variants_for_condition = pd.concat([A_ref, C_ref, G_ref, T_ref]) #joined_variants = A_ref.append(C_ref).append(G_ref).append(T_ref).reset_index(drop = True) simulated_variants_for_condition[["REF","ALT"]] = simulated_variants_for_condition["MUT_TYPE"].str.split(">", expand = True) simulated_variants_list.append(simulated_variants_for_condition) if counter % 1000 == 0: print(strain, age, tissue, sim_index) counter += 1 simulated_variants_df = pd.concat(simulated_variants_list) filtering_noncoding_regions = simulated_variants_df[(simulated_variants_df["POS"] >= 2750) &\ (simulated_variants_df["POS"] <= 15288)] annotated_simulated_variants = pd.merge(filtering_noncoding_regions, annotations, \ how = "left", \ on = ["STRAIN", "REF", "ALT", "POS"])[["STRAIN", "TISSUE", "AGE_BIN",\ "SIM_RUN", "POS", "REF", "ALT",\ "MUT_TYPE", "ANNOTATION", "GENE",\ "CODON_INDEX", "CODON_POS", "REF_AA", "ALT_AA"]] protein_coding_annotated_simulated_variants = annotated_simulated_variants[annotated_simulated_variants["ANNOTATION"] != "non_coding_transcript_exon_variant"] annotation_counts = protein_coding_annotated_simulated_variants[["STRAIN", "TISSUE", "AGE_BIN",\ "SIM_RUN", "GENE", "MUT_TYPE", \ "ANNOTATION"]].groupby(["STRAIN", "TISSUE", "AGE_BIN", "SIM_RUN", "GENE", "ANNOTATION"]).size() annotation_counts = annotation_counts.reset_index() annotation_counts.rename(columns = {0: "COUNT"}, inplace = True) if args.wo_hfp: print("wo hfp") output_annotated_simulated_variants = "output/{}_wo_hfp_simulated_annotation_counts".format(strain) else: print("has hfp") output_annotated_simulated_variants = "output/{}_simulated_annotation_counts".format(strain) annotation_counts.to_csv(output_annotated_simulated_variants, header = True, index = False, sep = "\t")