welch16
2019-09-10 fad2d50d4084cd610692123903bbe72cb6abc2fc
commit | author | age
fad2d5 1
W 2 library(magrittr)
3 library(tidyverse)
4 library(furrr)
5
6 plan(multiprocess, workers = 8)
7
8
9 param_dr <- here::here("results","2019_05_28_clean_complete_data","model_parameters")
10 model_dr <- here::here("bin","2019_05_28_prepare_predictive_models","models")
11 data_dr <- here::here("results","2019_07_24_segregated_data")
12 cv_dr <- here::here("results","2019_05_28_clean_complete_data")
13
14 snp_dr <- here::here("results","2019_08_15_Emca4_interactions","datasets")
15
16 dir.create(snp_dr,showWarnings = FALSE)
17
18 out_dr <- here::here("results","2019_08_15_Emca4_interactions","predictions")
19
20 dir.create(out_dr,showWarnings = FALSE)
21 dir.create(file.path(out_dr,"logs"),showWarnings = FALSE)
22 dir.create(file.path(out_dr,"err"),showWarnings = FALSE)
23 dir.create(file.path(out_dr,"out"),showWarnings = FALSE)
24
25 params <- list.files(param_dr,full.names = TRUE)
26 models <- list.files(model_dr,full.names = TRUE)
27
28 models <- tibble(models)
29
30 ### need to modify if we add more more models
31 models %<>% 
32     mutate(
33         params = case_when(
34             str_detect(models,"decision_trees") ~ "dont.rds",
35             str_detect(models,"lasso") ~ str_subset(params,"lasso"), 
36             str_detect(models,"ridge") ~ str_subset(params, "ridge"),
37             str_detect(models,"elastic") ~ str_subset(params, "glmnet"),
38             str_detect(models,"gbm") ~ str_subset(params,"gbm"),
39             str_detect(models,"logistic") ~ "dont.rds",
40             str_detect(models,"random_forest") ~ str_subset(params,"rf"),
41             str_detect(models,"svm_linear") ~ str_subset(params,"svm_linear"),
42             str_detect(models,"svm_poly") ~ str_subset(params,"svm_poly"),
43             str_detect(models,"svm_radial") ~ str_subset(params,"svm_radial")
44         )
45     ) %>% 
46     filter(!str_detect(models,"gbm")) %>% 
47     filter(!str_detect(models,"svm")) %>% 
48     filter(!str_detect(models,"logistic")) %>% 
49     filter(!str_detect(models,"adaptive"))
50
51 models %<>% filter(str_detect(basename(models),"elastic"))
52
53
54 data_sets <- tibble(
55     data_file = list.files(data_dr,full.names = TRUE,pattern ="rds")) %>% 
56     mutate(
57         phase = if_else(str_detect(data_file,"aim1"), "aim1","aim2"),
58         phase_dr = file.path(cv_dr,paste0(phase,"_cv")))
59         
60 data_sets %<>% filter(str_detect(basename(data_file),"aim1"))
61 data_sets %<>% filter(str_detect(basename(data_file),"geno"))
62
63 data_sets %<>% filter(str_detect(basename(data_file),"emca4"))
64
65 phase_dr <- data_sets$phase_dr %>% unique()
66
67 ### add the couple of all_snps, all_snps_survey datasets
68 only_snps   <- here::here("results/2019_05_28_clean_complete_data/aim1_only_snps_geno.rds")
69 snps_survey <- here::here("results/2019_05_28_clean_complete_data/aim1_snps_survey_geno.rds")
70
71 data_sets %<>% add_row(data_file = only_snps,phase = "aim1",phase_dr)
72 data_sets %<>% add_row(data_file = snps_survey, phase = "aim1",phase_dr)
73
74 #### add the interactions:
75
76 emca4_list <- c("rs2935776","rs3870371","rs1562430","rs1456315","rs6470588","rs1264202","rs7014346","rs9792269","rs284489")
77
78 build_interactions <- function(data,snps)
79 {
80     data_mat <- data %>% select(-person_id,-status) %>% select_if(is.numeric)
81     data_mat <- as.matrix(data_mat)
82     
83     all_pairs <- t(combn(snps,2))
84     nms <- apply(all_pairs, 1, FUN = function(x)str_c(x[1],x[2],sep = "_"))
85     sp_all_pairs <- all_pairs %>%
86         as.data.frame() %>% 
87         { split(.,seq_len(nrow(.)))}
88     inter_list <- map(sp_all_pairs, ~ data_mat[,c(.$V1,.$V2)])        
89     inter_list %<>% map(matrixStats::rowProds)
90     
91     interactions <- bind_cols(inter_list) %>% 
92         as_tibble() %>% 
93         set_colnames(nms)
94     
95     bind_cols(data,interactions) 
96         
97 }
98
99 data_sets %<>% 
100     mutate(
101         data = future_map(data_file,readRDS),
102         data_int = future_map(data,build_interactions,emca4_list))
103
104 data_sets %<>% 
105     select(-data) %>% 
106     mutate(
107         data_file = basename(data_file) %>% str_replace(".rds","_inter.rds"),
108         data_file = file.path(snp_dr,data_file),
109         data_int = map2(data_int,data_file,saveRDS)) %>% 
110     select(-data_int)
111
112 data_sets %<>% nest(-phase_dr,-phase)
113 data_sets %<>% 
114     mutate(
115         cv_files = map(phase_dr,list.files,full.names = TRUE),
116         cv_files = map(cv_files , ~ tibble(cv_files = . )),
117         all_files = map2(data,cv_files,crossing)
118     )
119
120 data_sets %<>% select(phase,all_files) %>% unnest()
121
122 data_sets %<>% crossing(models)
123
124 data_sets %<>% select(models,data_file,cv_files,params)
125
126
127 data_sets %<>% mutate_all(list( aux = ~ basename(.) %>% str_remove(".rds") %>% 
128     str_remove_all("_parameters") %>% str_remove_all(".R"))) %>% 
129     mutate(
130         outdir = paste(data_file_aux,cv_files_aux,models_aux,sep = "_")
131     )
132
133 data_sets %<>% select(-contains("aux"))
134
135
136 data_sets %>% write_csv(file.path(out_dr,"2019_08_15_predictions_Emca4_inter_elastic_nets_params.csv"),
137                 col_names = FALSE)