From fad2d50d4084cd610692123903bbe72cb6abc2fc Mon Sep 17 00:00:00 2001
From: welch16 <welch16@gmail.com>
Date: Tue, 10 Sep 2019 10:16:56 -0500
Subject: [PATCH] adding emca4 interaction analysis to see if it improves prediction performance of models

---
 WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2019_08_15_elastic_nets_emca4_interactions_models_condor_submit |   16 ++++
 WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2_collect_results.R                                             |   76 +++++++++++++++++++
 WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/1_create_interaction_blocks.R                                   |  137 ++++++++++++++++++++++++++++++++++
 3 files changed, 229 insertions(+), 0 deletions(-)

diff --git a/WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/1_create_interaction_blocks.R b/WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/1_create_interaction_blocks.R
new file mode 100644
index 0000000..25a0350
--- /dev/null
+++ b/WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/1_create_interaction_blocks.R
@@ -0,0 +1,137 @@
+
+library(magrittr)
+library(tidyverse)
+library(furrr)
+
+plan(multiprocess, workers = 8)
+
+
+param_dr <- here::here("results","2019_05_28_clean_complete_data","model_parameters")
+model_dr <- here::here("bin","2019_05_28_prepare_predictive_models","models")
+data_dr <- here::here("results","2019_07_24_segregated_data")
+cv_dr <- here::here("results","2019_05_28_clean_complete_data")
+
+snp_dr <- here::here("results","2019_08_15_Emca4_interactions","datasets")
+
+dir.create(snp_dr,showWarnings = FALSE)
+
+out_dr <- here::here("results","2019_08_15_Emca4_interactions","predictions")
+
+dir.create(out_dr,showWarnings = FALSE)
+dir.create(file.path(out_dr,"logs"),showWarnings = FALSE)
+dir.create(file.path(out_dr,"err"),showWarnings = FALSE)
+dir.create(file.path(out_dr,"out"),showWarnings = FALSE)
+
+params <- list.files(param_dr,full.names = TRUE)
+models <- list.files(model_dr,full.names = TRUE)
+
+models <- tibble(models)
+
+### need to modify if we add more more models
+models %<>% 
+	mutate(
+        params = case_when(
+    		str_detect(models,"decision_trees") ~ "dont.rds",
+            str_detect(models,"lasso") ~ str_subset(params,"lasso"), 
+            str_detect(models,"ridge") ~ str_subset(params, "ridge"),
+			str_detect(models,"elastic") ~ str_subset(params, "glmnet"),
+            str_detect(models,"gbm") ~ str_subset(params,"gbm"),
+            str_detect(models,"logistic") ~ "dont.rds",
+            str_detect(models,"random_forest") ~ str_subset(params,"rf"),
+            str_detect(models,"svm_linear") ~ str_subset(params,"svm_linear"),
+            str_detect(models,"svm_poly") ~ str_subset(params,"svm_poly"),
+            str_detect(models,"svm_radial") ~ str_subset(params,"svm_radial")
+        )
+    ) %>% 
+    filter(!str_detect(models,"gbm")) %>% 
+    filter(!str_detect(models,"svm")) %>% 
+    filter(!str_detect(models,"logistic")) %>% 
+    filter(!str_detect(models,"adaptive"))
+
+models %<>% filter(str_detect(basename(models),"elastic"))
+
+
+data_sets <- tibble(
+    data_file = list.files(data_dr,full.names = TRUE,pattern ="rds")) %>% 
+    mutate(
+        phase = if_else(str_detect(data_file,"aim1"), "aim1","aim2"),
+        phase_dr = file.path(cv_dr,paste0(phase,"_cv")))
+        
+data_sets %<>% filter(str_detect(basename(data_file),"aim1"))
+data_sets %<>% filter(str_detect(basename(data_file),"geno"))
+
+data_sets %<>% filter(str_detect(basename(data_file),"emca4"))
+
+phase_dr <- data_sets$phase_dr %>% unique()
+
+### add the couple of all_snps, all_snps_survey datasets
+only_snps   <- here::here("results/2019_05_28_clean_complete_data/aim1_only_snps_geno.rds")
+snps_survey <- here::here("results/2019_05_28_clean_complete_data/aim1_snps_survey_geno.rds")
+
+data_sets %<>% add_row(data_file = only_snps,phase = "aim1",phase_dr)
+data_sets %<>% add_row(data_file = snps_survey, phase = "aim1",phase_dr)
+
+#### add the interactions:
+
+emca4_list <- c("rs2935776","rs3870371","rs1562430","rs1456315","rs6470588","rs1264202","rs7014346","rs9792269","rs284489")
+
+build_interactions <- function(data,snps)
+{
+	data_mat <- data %>% select(-person_id,-status) %>% select_if(is.numeric)
+	data_mat <- as.matrix(data_mat)
+	
+	all_pairs <- t(combn(snps,2))
+	nms <- apply(all_pairs, 1, FUN = function(x)str_c(x[1],x[2],sep = "_"))
+	sp_all_pairs <- all_pairs %>%
+		as.data.frame() %>% 
+		{ split(.,seq_len(nrow(.)))}
+	inter_list <- map(sp_all_pairs, ~ data_mat[,c(.$V1,.$V2)])		
+	inter_list %<>% map(matrixStats::rowProds)
+	
+	interactions <- bind_cols(inter_list) %>% 
+		as_tibble() %>% 
+		set_colnames(nms)
+	
+	bind_cols(data,interactions) 
+		
+}
+
+data_sets %<>% 
+	mutate(
+		data = future_map(data_file,readRDS),
+		data_int = future_map(data,build_interactions,emca4_list))
+
+data_sets %<>% 
+	select(-data) %>% 
+	mutate(
+		data_file = basename(data_file) %>% str_replace(".rds","_inter.rds"),
+		data_file = file.path(snp_dr,data_file),
+		data_int = map2(data_int,data_file,saveRDS)) %>% 
+	select(-data_int)
+
+data_sets %<>% nest(-phase_dr,-phase)
+data_sets %<>% 
+	mutate(
+        cv_files = map(phase_dr,list.files,full.names = TRUE),
+        cv_files = map(cv_files , ~ tibble(cv_files = . )),
+        all_files = map2(data,cv_files,crossing)
+    )
+
+data_sets %<>% select(phase,all_files) %>% unnest()
+
+data_sets %<>% crossing(models)
+
+data_sets %<>% select(models,data_file,cv_files,params)
+
+
+data_sets %<>% mutate_all(list( aux = ~ basename(.) %>% str_remove(".rds") %>% 
+	str_remove_all("_parameters") %>% str_remove_all(".R"))) %>% 
+    mutate(
+        outdir = paste(data_file_aux,cv_files_aux,models_aux,sep = "_")
+    )
+
+data_sets %<>% select(-contains("aux"))
+
+
+data_sets %>% write_csv(file.path(out_dr,"2019_08_15_predictions_Emca4_inter_elastic_nets_params.csv"),
+				col_names = FALSE)
\ No newline at end of file
diff --git a/WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2019_08_15_elastic_nets_emca4_interactions_models_condor_submit b/WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2019_08_15_elastic_nets_emca4_interactions_models_condor_submit
new file mode 100644
index 0000000..1376f12
--- /dev/null
+++ b/WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2019_08_15_elastic_nets_emca4_interactions_models_condor_submit
@@ -0,0 +1,16 @@
+
+universe         = vanilla
+executable       = /s/bin/Rscript
+arguments        = $(script) --data_file $(data) --cv_idx $(cv) --param $(param) --train_p .75 --cv_fold 5 --cores 8 --seed 12345 --outdir $(basedir)/$(outdr)
+output           = $(basedir)/out/$(CLUSTER).$(PROCESS)_$(outdr).out
+error            = $(basedir)/err/$(CLUSTER).$(PROCESS)_$(outdr).err
+log              = $(basedir)/logs/$(CLUSTER).$(PROCESS)_$(outdr).log
+request_memory   = 12G
+request_cpus     = 8
+
+on_exit_hold     = (ExitBySignal == True) || (ExitCode != 0)
+periodic_release = (NumJobStarts < 3) && ((CurrentTime - EnteredCurrentStatus) > 120)
+
+basedir          = ./results/2019_08_15_Emca4_interactions/predictions
+
+queue script, data, cv, param, outdr from $(basedir)/2019_08_15_predictions_Emca4_inter_elastic_nets_params.csv
diff --git a/WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2_collect_results.R b/WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2_collect_results.R
new file mode 100644
index 0000000..8c1fabe
--- /dev/null
+++ b/WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2_collect_results.R
@@ -0,0 +1,76 @@
+
+library(magrittr)
+library(tidyverse)
+
+library(BiocParallel)
+
+mp = MulticoreParam(workers = 8)
+
+results_dr <- here::here("results/2019_08_15_Emca4_interactions/predictions")
+
+all_files = results_dr %>% list.files(pattern = "rds", full.names = TRUE)
+
+
+results <- tibble( file = all_files) %>% 
+	mutate(
+        aux = basename(all_files),
+        type = case_when(
+            str_detect(aux, "model_fit") ~ "model",
+            str_detect(aux, "train_metrics") ~ "train_metrics",
+            str_detect(aux, "test_metrics") ~ "test_metrics",
+            str_detect(aux, "train_prob") ~ "train_prob",
+            str_detect(aux, "test_prob") ~ "test_prob"
+        )) 
+
+results %<>% 
+	mutate(
+        aim = if_else(str_detect(aux,"aim1"),"aim1","aim2"),
+        geno = if_else(str_detect(aux,"geno"),"genotype","risk_allele"),
+        cov = str_split(aux,"_cvfold") %>% map_chr(1) %>% 
+        	str_split("\\_") %>% map_chr( ~ paste(.[-1],collapse = "_")),
+        adaptive = if_else(str_detect(file,"adaptive"),"yes","no"),
+        fit = case_when(
+            str_detect(aux,"decision") ~ "decision_trees",
+            str_detect(aux,"random") ~ "random_forests",
+            str_detect(aux,"logis") ~ "logistic_regression",
+            str_detect(aux,"elasti") ~ "elastic_nets",
+            str_detect(aux,"lasso") ~ "lasso_regression",
+            str_detect(aux,"ridge") ~ "ridge_regression"),
+        aux = str_split(aux,"\\_"),
+        time = map_chr(aux, ~ .[nchar(.) == 1]),
+        cv_fold = map_chr(aux, ~ .[nchar(.) == 2]))
+
+results  %<>% select(-aux)  %>% spread(type,file)
+
+
+read_filelist <- function(files,mp)
+{
+    bplapply(files, readRDS, BPPARAM = mp)
+}
+
+results  %<>%   
+	mutate_at(vars(contains("prob")), list( ~ read_filelist(.,mp))) %>% 
+    mutate_at(vars(contains("metric")), list( ~ read_filelist(.,mp)))
+
+
+auc <- function(prob_estimate,mp)
+{
+	estimate_df <- bplapply(prob_estimate, 
+    	function(x)yardstick::roc_auc(x,status,prob),BPPARAM = mp)
+    map_dbl(estimate_df,".estimate")
+}
+
+aupr <- function(prob_estimate,mp)
+{
+    estimate_df <- bplapply(prob_estimate, 
+    	function(x)yardstick::pr_auc(x,status,prob),BPPARAM = mp)
+    map_dbl(estimate_df,".estimate")
+}
+
+results %<>% 
+	mutate_at(vars(contains("prob")), 
+		list( ~ auc(.,mp),~ aupr(.,mp))) 
+
+
+results  %>% saveRDS(file.path(dirname(results_dr),"2019_08_15_predictive_results.rds"))
+

--
Gitblit v1.8.0