welch16
2019-09-10 fad2d50d4084cd610692123903bbe72cb6abc2fc
adding emca4 interaction analysis to see if it improves prediction performance of models
3 files added
229 ■■■■■ changed files
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/1_create_interaction_blocks.R 137 ●●●●● patch | view | raw | blame | history
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2019_08_15_elastic_nets_emca4_interactions_models_condor_submit 16 ●●●●● patch | view | raw | blame | history
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2_collect_results.R 76 ●●●●● patch | view | raw | blame | history
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/1_create_interaction_blocks.R
New file
@@ -0,0 +1,137 @@
library(magrittr)
library(tidyverse)
library(furrr)
plan(multiprocess, workers = 8)
param_dr <- here::here("results","2019_05_28_clean_complete_data","model_parameters")
model_dr <- here::here("bin","2019_05_28_prepare_predictive_models","models")
data_dr <- here::here("results","2019_07_24_segregated_data")
cv_dr <- here::here("results","2019_05_28_clean_complete_data")
snp_dr <- here::here("results","2019_08_15_Emca4_interactions","datasets")
dir.create(snp_dr,showWarnings = FALSE)
out_dr <- here::here("results","2019_08_15_Emca4_interactions","predictions")
dir.create(out_dr,showWarnings = FALSE)
dir.create(file.path(out_dr,"logs"),showWarnings = FALSE)
dir.create(file.path(out_dr,"err"),showWarnings = FALSE)
dir.create(file.path(out_dr,"out"),showWarnings = FALSE)
params <- list.files(param_dr,full.names = TRUE)
models <- list.files(model_dr,full.names = TRUE)
models <- tibble(models)
### need to modify if we add more more models
models %<>%
    mutate(
        params = case_when(
            str_detect(models,"decision_trees") ~ "dont.rds",
            str_detect(models,"lasso") ~ str_subset(params,"lasso"),
            str_detect(models,"ridge") ~ str_subset(params, "ridge"),
            str_detect(models,"elastic") ~ str_subset(params, "glmnet"),
            str_detect(models,"gbm") ~ str_subset(params,"gbm"),
            str_detect(models,"logistic") ~ "dont.rds",
            str_detect(models,"random_forest") ~ str_subset(params,"rf"),
            str_detect(models,"svm_linear") ~ str_subset(params,"svm_linear"),
            str_detect(models,"svm_poly") ~ str_subset(params,"svm_poly"),
            str_detect(models,"svm_radial") ~ str_subset(params,"svm_radial")
        )
    ) %>%
    filter(!str_detect(models,"gbm")) %>%
    filter(!str_detect(models,"svm")) %>%
    filter(!str_detect(models,"logistic")) %>%
    filter(!str_detect(models,"adaptive"))
models %<>% filter(str_detect(basename(models),"elastic"))
data_sets <- tibble(
    data_file = list.files(data_dr,full.names = TRUE,pattern ="rds")) %>%
    mutate(
        phase = if_else(str_detect(data_file,"aim1"), "aim1","aim2"),
        phase_dr = file.path(cv_dr,paste0(phase,"_cv")))
data_sets %<>% filter(str_detect(basename(data_file),"aim1"))
data_sets %<>% filter(str_detect(basename(data_file),"geno"))
data_sets %<>% filter(str_detect(basename(data_file),"emca4"))
phase_dr <- data_sets$phase_dr %>% unique()
### add the couple of all_snps, all_snps_survey datasets
only_snps   <- here::here("results/2019_05_28_clean_complete_data/aim1_only_snps_geno.rds")
snps_survey <- here::here("results/2019_05_28_clean_complete_data/aim1_snps_survey_geno.rds")
data_sets %<>% add_row(data_file = only_snps,phase = "aim1",phase_dr)
data_sets %<>% add_row(data_file = snps_survey, phase = "aim1",phase_dr)
#### add the interactions:
emca4_list <- c("rs2935776","rs3870371","rs1562430","rs1456315","rs6470588","rs1264202","rs7014346","rs9792269","rs284489")
build_interactions <- function(data,snps)
{
    data_mat <- data %>% select(-person_id,-status) %>% select_if(is.numeric)
    data_mat <- as.matrix(data_mat)
    all_pairs <- t(combn(snps,2))
    nms <- apply(all_pairs, 1, FUN = function(x)str_c(x[1],x[2],sep = "_"))
    sp_all_pairs <- all_pairs %>%
        as.data.frame() %>%
        { split(.,seq_len(nrow(.)))}
    inter_list <- map(sp_all_pairs, ~ data_mat[,c(.$V1,.$V2)])
    inter_list %<>% map(matrixStats::rowProds)
    interactions <- bind_cols(inter_list) %>%
        as_tibble() %>%
        set_colnames(nms)
    bind_cols(data,interactions)
}
data_sets %<>%
    mutate(
        data = future_map(data_file,readRDS),
        data_int = future_map(data,build_interactions,emca4_list))
data_sets %<>%
    select(-data) %>%
    mutate(
        data_file = basename(data_file) %>% str_replace(".rds","_inter.rds"),
        data_file = file.path(snp_dr,data_file),
        data_int = map2(data_int,data_file,saveRDS)) %>%
    select(-data_int)
data_sets %<>% nest(-phase_dr,-phase)
data_sets %<>%
    mutate(
        cv_files = map(phase_dr,list.files,full.names = TRUE),
        cv_files = map(cv_files , ~ tibble(cv_files = . )),
        all_files = map2(data,cv_files,crossing)
    )
data_sets %<>% select(phase,all_files) %>% unnest()
data_sets %<>% crossing(models)
data_sets %<>% select(models,data_file,cv_files,params)
data_sets %<>% mutate_all(list( aux = ~ basename(.) %>% str_remove(".rds") %>%
    str_remove_all("_parameters") %>% str_remove_all(".R"))) %>%
    mutate(
        outdir = paste(data_file_aux,cv_files_aux,models_aux,sep = "_")
    )
data_sets %<>% select(-contains("aux"))
data_sets %>% write_csv(file.path(out_dr,"2019_08_15_predictions_Emca4_inter_elastic_nets_params.csv"),
                col_names = FALSE)
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2019_08_15_elastic_nets_emca4_interactions_models_condor_submit
New file
@@ -0,0 +1,16 @@
universe         = vanilla
executable       = /s/bin/Rscript
arguments        = $(script) --data_file $(data) --cv_idx $(cv) --param $(param) --train_p .75 --cv_fold 5 --cores 8 --seed 12345 --outdir $(basedir)/$(outdr)
output           = $(basedir)/out/$(CLUSTER).$(PROCESS)_$(outdr).out
error            = $(basedir)/err/$(CLUSTER).$(PROCESS)_$(outdr).err
log              = $(basedir)/logs/$(CLUSTER).$(PROCESS)_$(outdr).log
request_memory   = 12G
request_cpus     = 8
on_exit_hold     = (ExitBySignal == True) || (ExitCode != 0)
periodic_release = (NumJobStarts < 3) && ((CurrentTime - EnteredCurrentStatus) > 120)
basedir          = ./results/2019_08_15_Emca4_interactions/predictions
queue script, data, cv, param, outdr from $(basedir)/2019_08_15_predictions_Emca4_inter_elastic_nets_params.csv
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2_collect_results.R
New file
@@ -0,0 +1,76 @@
library(magrittr)
library(tidyverse)
library(BiocParallel)
mp = MulticoreParam(workers = 8)
results_dr <- here::here("results/2019_08_15_Emca4_interactions/predictions")
all_files = results_dr %>% list.files(pattern = "rds", full.names = TRUE)
results <- tibble( file = all_files) %>%
    mutate(
        aux = basename(all_files),
        type = case_when(
            str_detect(aux, "model_fit") ~ "model",
            str_detect(aux, "train_metrics") ~ "train_metrics",
            str_detect(aux, "test_metrics") ~ "test_metrics",
            str_detect(aux, "train_prob") ~ "train_prob",
            str_detect(aux, "test_prob") ~ "test_prob"
        ))
results %<>%
    mutate(
        aim = if_else(str_detect(aux,"aim1"),"aim1","aim2"),
        geno = if_else(str_detect(aux,"geno"),"genotype","risk_allele"),
        cov = str_split(aux,"_cvfold") %>% map_chr(1) %>%
            str_split("\\_") %>% map_chr( ~ paste(.[-1],collapse = "_")),
        adaptive = if_else(str_detect(file,"adaptive"),"yes","no"),
        fit = case_when(
            str_detect(aux,"decision") ~ "decision_trees",
            str_detect(aux,"random") ~ "random_forests",
            str_detect(aux,"logis") ~ "logistic_regression",
            str_detect(aux,"elasti") ~ "elastic_nets",
            str_detect(aux,"lasso") ~ "lasso_regression",
            str_detect(aux,"ridge") ~ "ridge_regression"),
        aux = str_split(aux,"\\_"),
        time = map_chr(aux, ~ .[nchar(.) == 1]),
        cv_fold = map_chr(aux, ~ .[nchar(.) == 2]))
results  %<>% select(-aux)  %>% spread(type,file)
read_filelist <- function(files,mp)
{
    bplapply(files, readRDS, BPPARAM = mp)
}
results  %<>%
    mutate_at(vars(contains("prob")), list( ~ read_filelist(.,mp))) %>%
    mutate_at(vars(contains("metric")), list( ~ read_filelist(.,mp)))
auc <- function(prob_estimate,mp)
{
    estimate_df <- bplapply(prob_estimate,
        function(x)yardstick::roc_auc(x,status,prob),BPPARAM = mp)
    map_dbl(estimate_df,".estimate")
}
aupr <- function(prob_estimate,mp)
{
    estimate_df <- bplapply(prob_estimate,
        function(x)yardstick::pr_auc(x,status,prob),BPPARAM = mp)
    map_dbl(estimate_df,".estimate")
}
results %<>%
    mutate_at(vars(contains("prob")),
        list( ~ auc(.,mp),~ aupr(.,mp)))
results  %>% saveRDS(file.path(dirname(results_dr),"2019_08_15_predictive_results.rds"))