library(TwoSampleMR) library(dplyr) library(data.table) library(ieugwasr) library(optparse) opt_parser <- OptionParser(option_list = list( make_option("--exposure", type = "character", help = "Exposure dataset ID"), make_option("--outcome", type = "character", help = "Outcome dataset ID"), make_option("--plink-path", type = "character", help = "Path of the plink1.9 executable") )) opt <- parse_args(opt_parser) clumped_data <- here::here(sprintf("output/analysis/clumped_data/%s__%s", opt$exposure, opt$outcome)) output_folder <- here::here(sprintf("output/analysis/mr/%s__%s", opt$exposure, opt$outcome)) dir.create(output_folder, recursive = TRUE, showWarnings = FALSE) exposure_file <- here::here(sprintf("output/data/%s.feather", opt$exposure)) outcome_file <- here::here(sprintf("output/data/%s.feather", opt$outcome)) if (opt$exposure == "sibling_income") { # Use main income GWAS to identify SNPs for sibling-adjusted exposure identification_file <- here::here("output/data/income_kweon.feather") } else { identification_file <- here::here(sprintf("output/data/%s.feather", opt$exposure)) } # exposure_file <- "output/data/income_kweon.feather" # outcome_file <- "output/data/ieu-a-835.feather" # identification_file <- "output/data/income_kweon.feather" output_file <- here::here(sprintf("output/analysis/clumped_data/%s__%s.rds", opt$exposure, opt$outcome)) print(paste("Exposure dataset: ", exposure_file)) print(paste("Outcome dataset: ", outcome_file)) print(paste("Identification dataset:", identification_file)) print(paste("Output file: ", output_file)) exposure_data <- arrow::read_feather(exposure_file) outcome_data <- arrow::read_feather(outcome_file) identification_data <- arrow::read_feather(identification_file) # Identify significant SNPs in the identification dataset that are also present # in the exposure and outcome datasets shared_significant_rsids <- Reduce(intersect, list( identification_data[p < 5e-8, rsid], exposure_data$rsid, outcome_data$rsid )) # Clump SNPs using p values from the identification dataset clumped <- ld_clump( identification_data[shared_significant_rsids, .(rsid, pval = p)], clump_kb = 10000, clump_r2 = 0.001, clump_p = 0.99, bfile = "input/ld_ref_panel/EUR", plink_bin = opt$`plink-path` ) clumped_exposure <- exposure_data[clumped$rsid] clumped_outcome <- outcome_data[clumped$rsid] # Estimate R^2 for continuous phenotypes if (clumped_exposure$model[[1]] == "continuous") { clumped_exposure[, r := get_r_from_bsen(b = beta, se = se, n = n)] } else { clumped_exposure[, r := NA] } if (clumped_outcome$model[[1]] == "continuous") { clumped_outcome[, r := get_r_from_bsen(b = beta, se = se, n = n)] } else { clumped_outcome[, r := NA] } setnames( clumped_exposure, c("rsid", "study_id", "phenotype", "beta", "se", "p", "eaf", "a1", "a2", "r"), c("SNP", "id.exposure", "exposure", "beta.exposure", "se.exposure", "pval.exposure", "eaf.exposure", "effect_allele.exposure", "other_allele.exposure", "rsq.exposure"), skip_absent = TRUE ) setnames( clumped_outcome, c("rsid", "study_id", "phenotype", "beta", "se", "p", "eaf", "a1", "a2", "r"), c("SNP", "id.outcome", "outcome", "beta.outcome", "se.outcome", "pval.outcome", "eaf.outcome", "effect_allele.outcome", "other_allele.outcome", "rsq.outcome"), skip_absent = TRUE ) harmonised_data <- harmonise_data( exposure_dat = clumped_exposure, outcome_dat = clumped_outcome ) %>% mutate(steiger_dir = rsq.exposure > rsq.outcome) saveRDS(harmonised_data, output_file)