library(magrittr) library(tidyverse) library(furrr) plan(multiprocess, workers = 8) param_dr <- here::here("results","2019_05_28_clean_complete_data","model_parameters") model_dr <- here::here("bin","2019_05_28_prepare_predictive_models","models") data_dr <- here::here("results","2019_07_24_segregated_data") cv_dr <- here::here("results","2019_05_28_clean_complete_data") snp_dr <- here::here("results","2019_08_15_Emca4_interactions","datasets") dir.create(snp_dr,showWarnings = FALSE) out_dr <- here::here("results","2019_08_15_Emca4_interactions","predictions") dir.create(out_dr,showWarnings = FALSE) dir.create(file.path(out_dr,"logs"),showWarnings = FALSE) dir.create(file.path(out_dr,"err"),showWarnings = FALSE) dir.create(file.path(out_dr,"out"),showWarnings = FALSE) params <- list.files(param_dr,full.names = TRUE) models <- list.files(model_dr,full.names = TRUE) models <- tibble(models) ### need to modify if we add more more models models %<>% mutate( params = case_when( str_detect(models,"decision_trees") ~ "dont.rds", str_detect(models,"lasso") ~ str_subset(params,"lasso"), str_detect(models,"ridge") ~ str_subset(params, "ridge"), str_detect(models,"elastic") ~ str_subset(params, "glmnet"), str_detect(models,"gbm") ~ str_subset(params,"gbm"), str_detect(models,"logistic") ~ "dont.rds", str_detect(models,"random_forest") ~ str_subset(params,"rf"), str_detect(models,"svm_linear") ~ str_subset(params,"svm_linear"), str_detect(models,"svm_poly") ~ str_subset(params,"svm_poly"), str_detect(models,"svm_radial") ~ str_subset(params,"svm_radial") ) ) %>% filter(!str_detect(models,"gbm")) %>% filter(!str_detect(models,"svm")) %>% filter(!str_detect(models,"logistic")) %>% filter(!str_detect(models,"adaptive")) models %<>% filter(str_detect(basename(models),"elastic")) data_sets <- tibble( data_file = list.files(data_dr,full.names = TRUE,pattern ="rds")) %>% mutate( phase = if_else(str_detect(data_file,"aim1"), "aim1","aim2"), phase_dr = file.path(cv_dr,paste0(phase,"_cv"))) data_sets %<>% filter(str_detect(basename(data_file),"aim1")) data_sets %<>% filter(str_detect(basename(data_file),"geno")) data_sets %<>% filter(str_detect(basename(data_file),"emca4")) phase_dr <- data_sets$phase_dr %>% unique() ### add the couple of all_snps, all_snps_survey datasets only_snps <- here::here("results/2019_05_28_clean_complete_data/aim1_only_snps_geno.rds") snps_survey <- here::here("results/2019_05_28_clean_complete_data/aim1_snps_survey_geno.rds") data_sets %<>% add_row(data_file = only_snps,phase = "aim1",phase_dr) data_sets %<>% add_row(data_file = snps_survey, phase = "aim1",phase_dr) #### add the interactions: emca4_list <- c("rs2935776","rs3870371","rs1562430","rs1456315","rs6470588","rs1264202","rs7014346","rs9792269","rs284489") build_interactions <- function(data,snps) { data_mat <- data %>% select(-person_id,-status) %>% select_if(is.numeric) data_mat <- as.matrix(data_mat) all_pairs <- t(combn(snps,2)) nms <- apply(all_pairs, 1, FUN = function(x)str_c(x[1],x[2],sep = "_")) sp_all_pairs <- all_pairs %>% as.data.frame() %>% { split(.,seq_len(nrow(.)))} inter_list <- map(sp_all_pairs, ~ data_mat[,c(.$V1,.$V2)]) inter_list %<>% map(matrixStats::rowProds) interactions <- bind_cols(inter_list) %>% as_tibble() %>% set_colnames(nms) bind_cols(data,interactions) } data_sets %<>% mutate( data = future_map(data_file,readRDS), data_int = future_map(data,build_interactions,emca4_list)) data_sets %<>% select(-data) %>% mutate( data_file = basename(data_file) %>% str_replace(".rds","_inter.rds"), data_file = file.path(snp_dr,data_file), data_int = map2(data_int,data_file,saveRDS)) %>% select(-data_int) data_sets %<>% nest(-phase_dr,-phase) data_sets %<>% mutate( cv_files = map(phase_dr,list.files,full.names = TRUE), cv_files = map(cv_files , ~ tibble(cv_files = . )), all_files = map2(data,cv_files,crossing) ) data_sets %<>% select(phase,all_files) %>% unnest() data_sets %<>% crossing(models) data_sets %<>% select(models,data_file,cv_files,params) data_sets %<>% mutate_all(list( aux = ~ basename(.) %>% str_remove(".rds") %>% str_remove_all("_parameters") %>% str_remove_all(".R"))) %>% mutate( outdir = paste(data_file_aux,cv_files_aux,models_aux,sep = "_") ) data_sets %<>% select(-contains("aux")) data_sets %>% write_csv(file.path(out_dr,"2019_08_15_predictions_Emca4_inter_elastic_nets_params.csv"), col_names = FALSE)