welch16
2019-09-10 fad2d50d4084cd610692123903bbe72cb6abc2fc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
 
library(magrittr)
library(tidyverse)
library(furrr)
 
plan(multiprocess, workers = 8)
 
 
param_dr <- here::here("results","2019_05_28_clean_complete_data","model_parameters")
model_dr <- here::here("bin","2019_05_28_prepare_predictive_models","models")
data_dr <- here::here("results","2019_07_24_segregated_data")
cv_dr <- here::here("results","2019_05_28_clean_complete_data")
 
snp_dr <- here::here("results","2019_08_15_Emca4_interactions","datasets")
 
dir.create(snp_dr,showWarnings = FALSE)
 
out_dr <- here::here("results","2019_08_15_Emca4_interactions","predictions")
 
dir.create(out_dr,showWarnings = FALSE)
dir.create(file.path(out_dr,"logs"),showWarnings = FALSE)
dir.create(file.path(out_dr,"err"),showWarnings = FALSE)
dir.create(file.path(out_dr,"out"),showWarnings = FALSE)
 
params <- list.files(param_dr,full.names = TRUE)
models <- list.files(model_dr,full.names = TRUE)
 
models <- tibble(models)
 
### need to modify if we add more more models
models %<>% 
    mutate(
        params = case_when(
            str_detect(models,"decision_trees") ~ "dont.rds",
            str_detect(models,"lasso") ~ str_subset(params,"lasso"), 
            str_detect(models,"ridge") ~ str_subset(params, "ridge"),
            str_detect(models,"elastic") ~ str_subset(params, "glmnet"),
            str_detect(models,"gbm") ~ str_subset(params,"gbm"),
            str_detect(models,"logistic") ~ "dont.rds",
            str_detect(models,"random_forest") ~ str_subset(params,"rf"),
            str_detect(models,"svm_linear") ~ str_subset(params,"svm_linear"),
            str_detect(models,"svm_poly") ~ str_subset(params,"svm_poly"),
            str_detect(models,"svm_radial") ~ str_subset(params,"svm_radial")
        )
    ) %>% 
    filter(!str_detect(models,"gbm")) %>% 
    filter(!str_detect(models,"svm")) %>% 
    filter(!str_detect(models,"logistic")) %>% 
    filter(!str_detect(models,"adaptive"))
 
models %<>% filter(str_detect(basename(models),"elastic"))
 
 
data_sets <- tibble(
    data_file = list.files(data_dr,full.names = TRUE,pattern ="rds")) %>% 
    mutate(
        phase = if_else(str_detect(data_file,"aim1"), "aim1","aim2"),
        phase_dr = file.path(cv_dr,paste0(phase,"_cv")))
        
data_sets %<>% filter(str_detect(basename(data_file),"aim1"))
data_sets %<>% filter(str_detect(basename(data_file),"geno"))
 
data_sets %<>% filter(str_detect(basename(data_file),"emca4"))
 
phase_dr <- data_sets$phase_dr %>% unique()
 
### add the couple of all_snps, all_snps_survey datasets
only_snps   <- here::here("results/2019_05_28_clean_complete_data/aim1_only_snps_geno.rds")
snps_survey <- here::here("results/2019_05_28_clean_complete_data/aim1_snps_survey_geno.rds")
 
data_sets %<>% add_row(data_file = only_snps,phase = "aim1",phase_dr)
data_sets %<>% add_row(data_file = snps_survey, phase = "aim1",phase_dr)
 
#### add the interactions:
 
emca4_list <- c("rs2935776","rs3870371","rs1562430","rs1456315","rs6470588","rs1264202","rs7014346","rs9792269","rs284489")
 
build_interactions <- function(data,snps)
{
    data_mat <- data %>% select(-person_id,-status) %>% select_if(is.numeric)
    data_mat <- as.matrix(data_mat)
    
    all_pairs <- t(combn(snps,2))
    nms <- apply(all_pairs, 1, FUN = function(x)str_c(x[1],x[2],sep = "_"))
    sp_all_pairs <- all_pairs %>%
        as.data.frame() %>% 
        { split(.,seq_len(nrow(.)))}
    inter_list <- map(sp_all_pairs, ~ data_mat[,c(.$V1,.$V2)])        
    inter_list %<>% map(matrixStats::rowProds)
    
    interactions <- bind_cols(inter_list) %>% 
        as_tibble() %>% 
        set_colnames(nms)
    
    bind_cols(data,interactions) 
        
}
 
data_sets %<>% 
    mutate(
        data = future_map(data_file,readRDS),
        data_int = future_map(data,build_interactions,emca4_list))
 
data_sets %<>% 
    select(-data) %>% 
    mutate(
        data_file = basename(data_file) %>% str_replace(".rds","_inter.rds"),
        data_file = file.path(snp_dr,data_file),
        data_int = map2(data_int,data_file,saveRDS)) %>% 
    select(-data_int)
 
data_sets %<>% nest(-phase_dr,-phase)
data_sets %<>% 
    mutate(
        cv_files = map(phase_dr,list.files,full.names = TRUE),
        cv_files = map(cv_files , ~ tibble(cv_files = . )),
        all_files = map2(data,cv_files,crossing)
    )
 
data_sets %<>% select(phase,all_files) %>% unnest()
 
data_sets %<>% crossing(models)
 
data_sets %<>% select(models,data_file,cv_files,params)
 
 
data_sets %<>% mutate_all(list( aux = ~ basename(.) %>% str_remove(".rds") %>% 
    str_remove_all("_parameters") %>% str_remove_all(".R"))) %>% 
    mutate(
        outdir = paste(data_file_aux,cv_files_aux,models_aux,sep = "_")
    )
 
data_sets %<>% select(-contains("aux"))
 
 
data_sets %>% write_csv(file.path(out_dr,"2019_08_15_predictions_Emca4_inter_elastic_nets_params.csv"),
                col_names = FALSE)