4 files added
4 files modified
460 ■■■■■ changed files
RFEST_runs/results/2019_09_10_calculate_risk_alleles/2019_09_10_snp_odds_ratio.R 143 ●●●●● patch | view | raw | blame | history
WWHS_BreastCancer/bin/2019_08_08_AMIA_paper_analysis/2019_08_08_aim1_elastic_nets_ROC_comparison.R 35 ●●●● patch | view | raw | blame | history
WWHS_BreastCancer/bin/2019_08_08_AMIA_paper_analysis/2019_08_13_plot_performance_curves.R 53 ●●●● patch | view | raw | blame | history
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/1_create_interaction_blocks.R 137 ●●●●● patch | view | raw | blame | history
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2019_08_15_elastic_nets_emca4_interactions_models_condor_submit 16 ●●●●● patch | view | raw | blame | history
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2_collect_results.R 76 ●●●●● patch | view | raw | blame | history
WWHS_BreastCancer/figs/2019_08_15_AMIA_paper_submission/2019_08_12_ROC_comparison_ord1.pdf patch | view | raw | blame | history
WWHS_BreastCancer/figs/2019_08_15_AMIA_paper_submission/2019_08_12_ROC_comparison_survey.pdf patch | view | raw | blame | history
RFEST_runs/results/2019_09_10_calculate_risk_alleles/2019_09_10_snp_odds_ratio.R
New file
@@ -0,0 +1,143 @@
library(magrittr)
library(tidyverse)
metadata <- read_tsv(here::here("rawdata","metadata","emca4_snps.tsv")) %>%
    rename(rs_id = SNP)
indr <- here::here("data","filtered")
outdr <- here::here("data","clean")
cases <- file.path(indr,"genotype_CGEMS_CASE.rds") %>% readRDS()
controls <- file.path(indr,"genotype_CGEMS_CONTROL.rds") %>% readRDS()
rename_variables <- function(data)
{
    set_names(data, c("rs_id","specimen","individual","allele1","allele2"))
}
## remove specimen column because it is not necessary for our
## purposes
cases %<>% rename_variables() %>% select(-specimen)
controls %<>% rename_variables() %>% select(-specimen)
cases %>% group_by(rs_id) %>% tally()
controls %>% group_by(rs_id) %>% tally()
cases %>% group_by(individual) %>% tally() %>% nrow()
controls %>% group_by(individual) %>% tally() %>% nrow()
## calculate allele frequency to determine the risk allele
## defined as the minor allele
## parse case and control information as a binary variables
cases %<>% mutate( status = 1)
controls %<>% mutate( status = 0)
cgems <- bind_rows(cases,controls)
## as expected, in this df, the allele 1 NA's are the same individuals
## than in allele2
cgems %<>%
    nest(-rs_id, .key = "ind_data") %>%
    mutate(
        alleles = map(ind_data, ~ unique(c(.$allele1,.$allele2))) %>% map(sort),
        allele1 = map_chr(alleles,1),
        allele2 = map_chr(alleles,2),
        n = map_int(ind_data,nrow),
        n_NA = map(ind_data, filter,is.na(allele1)) %>% map_int(nrow),
        n_AA = map2(ind_data, alleles, ~ filter(.x, allele1 == .y[1] & allele2 == .y[2])) %>%
            map_int(nrow),
        n_aa = map2(ind_data, alleles, ~ filter(.x, allele1 == .y[2] & allele2 == .y[2])) %>%
            map_int(nrow),
        n_Aa = n - n_NA - n_AA - n_aa,
        allele1_freq = ( 2 * n_AA + n_Aa) / (2 * n),
        risk_allele = if_else(allele1_freq > 0.5, allele2, allele1)) %>%
    select(-alleles,-allele1,-allele2)
cgems_summary <- cgems %>%
    select(-ind_data) %>%
    mutate( raf = pmin(allele1_freq, 1 - allele1_freq)) %>%
    select(rs_id,risk_allele,raf,n,n_NA)
cgems_summary %<>% left_join(metadata,by = "rs_id") %>% arrange(-n_NA) %>%
    mutate(
        alleles2 = str_split(alleles,"\\/") %>% map(unlist),
        is_neg_strand = ! map2_lgl(risk_allele,alleles2, ~ .x %in% .y),
        is_neg_strand = if_else(is_neg_strand,"yes","no")) %>%
    select(-alleles2)
cgems %<>% select(rs_id,ind_data,risk_allele) %>% unnest()
cgems %<>%
    mutate(
        contains_risk_allele = allele1 == risk_allele | allele2 == risk_allele,
        contains_risk_allele = case_when(
            is.na(contains_risk_allele) ~ NA_real_,
            contains_risk_allele ~ 1,
            ! contains_risk_allele ~ 0
            ))
cgems_mat <- cgems %>%
    select(rs_id,individual,status,contains_risk_allele) %>%
    spread(rs_id,contains_risk_allele)
##
cgems_snps <- cgems_mat %>% as.data.frame() %>%
    column_to_rownames("individual") %>%
    select(-status) %>% as.matrix()
cgems_summary %>% mutate( p_NA = n_NA / n) %>% head(2)
cgems_summary %>% filter(is_neg_strand == "yes") %>% as.data.frame %>%
    select(rs_id,risk_allele,alleles)
cgems_summary %>% write_tsv(file.path(indr,"2019_07_22_snp_summary.tsv"))
snps_to_keep <- cgems_summary %>% filter(!(n_NA  > 400 | raf  <= .05) ) %>%
    pull(rs_id)
## remove snps with low risk allele frequency or higher number of missing values
cgems_mat %<>% select(individual,status,one_of(snps_to_keep))
cgems_snps <- cgems_snps[,snps_to_keep]
nsnps <- ncol(cgems_snps)
pmiss_sample <- apply( cgems_snps , 1 , function(x)sum(is.na(x))) / nsnps
### only keep samples with less than 10% of missing genotypes
## this means we will remove 35 samples
samples_to_keep <- names(pmiss_sample)[pmiss_sample < .1]
cgems_mat %<>% filter(individual %in% samples_to_keep)
cgems_snps <- cgems_snps[samples_to_keep,]
cgems_mat %>% count(status)
#### still similar
# # A tibble: 2 x 2
#   status     n
#    <dbl> <int>
# 1      0  1189
# 2      1  1210
cgems_mat_by_snps <- cgems_mat %>% gather(snp,allele,-status,-individual)
cgems_mat_by_snps %<>% na.omit() %>% nest(-snp)
cgems_mat_by_snps %<>%
    mutate(
        incidence_table = map(data, select,-individual) %>% map(table),
        fisher = map(incidence_table,fisher.test),
        fisher = map(fisher,broom::tidy),
        odds_ratio = map_dbl(fisher,"estimate"),
        risk_allele = if_else(odds_ratio > 1,"yes","no"))
cgems_mat_by_snps %>%
    select(-data,-incidence_table,-fisher) %>% saveRDS(file.path(outdr,"2019_09_10_snp_odds_ratio.rds"))
WWHS_BreastCancer/bin/2019_08_08_AMIA_paper_analysis/2019_08_08_aim1_elastic_nets_ROC_comparison.R
@@ -49,18 +49,32 @@
    ),
    fit = fct_rev(fit)  ) 
indr <- here::here("results/2019_08_15_Emca4_interactions")
results_inter = file.path(indr,"2019_08_15_predictive_results.rds") %>% readRDS()
results_inter %<>%
    mutate(
        cov = str_replace(cov,"_geno_inter","_Emca4 interactions"))
results %<>% mutate_if(is.factor, list( ~ as.character(.)))
results_prev %<>% mutate_if(is.factor, list( ~ as.character(.)))
results_inter %<>% mutate_if(is.factor, list( ~ as.character(.)))
results3 = 
  bind_rows(
    results %>% mutate( data_group = "segregated"),
    results_prev %>% mutate( data_group = "pooled together"))
    results_prev %>% mutate( data_group = "pooled together"),
    results_inter %>% mutate( data_group = "Emca4 interactions"))
results3 %<>%
  mutate(
      cov = str_replace(cov,"\\_"," + "),
      cov = str_split(cov," + ") %>% map(str_to_title) %>% map_chr( ~ str_c(.,collapse = "_")),
    cov = fct_reorder(cov,test_prob_auc),
    cov = fct_rev(cov),
    cov = fct_relabel(cov, .fun = function(x)str_replace_all(x,"_"," + ")))
pal = RColorBrewer::brewer.pal(3,"Pastel1")
@@ -83,8 +97,9 @@
    colour = "Data group",
    fill = "Data group"
  )+
  scale_color_manual(values  = c("navyblue","darkred"))+
  scale_fill_manual(values = rev(pal[1:2]))
  scale_color_manual(values  = c("forestgreen","navyblue","darkred")) +
  scale_fill_manual(values = rev(pal[1:3]))
ggsave(
  filename = file.path(figsdr,"2019_08_08_AUC_comparison.pdf"),
@@ -110,22 +125,24 @@
        t_test(test_prob_auc ~ cov,order = groups,alternative = alt)
}
tidy_wilcox <- function(data,groups,alt = "greater")
tidy_wilcox <- function(data,groups,alt = "greater",cf = 0.95)
{
    data %<>% 
        filter(cov %in% groups) %>% 
        mutate( cov = factor(cov))
        
    wilcox.test( test_prob_auc ~ cov , data = data , alternative = alt) %>%
    wilcox.test( test_prob_auc ~ cov , data = data , alternative = alt,conf.int = TRUE,
        conf.level = cf) %>%
        broom::tidy()
}
tidy_wilcox_lvl <- function(data,val,group,alt = "greater")
tidy_wilcox_lvl <- function(data,val,group,alt = "greater",cf = 0.95)
{
    data %<>% 
        filter(cov == group) 
        
    wilcox.test(x = data$test_prob_auc,  mu = val, alternative = alt) %>%
    wilcox.test(x = data$test_prob_auc,  mu = val, alternative = alt,
        conf.int = TRUE,conf.level = cf) %>%
        broom::tidy()
}
@@ -146,12 +163,12 @@
tidy_t_test(plot_results,c("Gwas + Emca4 + Density + Survey","Snps + Survey"))
tidy_wilcox(plot_results,c("Gwas + Emca4 + Survey","Gwas + Emca4 + Density + Survey"))
tidy_wilcox(plot_results,c("Gwas + Emca4 + Density + Survey","Snps + Survey"),alt = "less")
tidy_wilcox(plot_results,c("Gwas + Emca4 + Density + Survey","Snps + Survey"),alt = "less",cf = .1)
tidy_t_test(plot_results,c("Gwas","Survey"),"less")
tidy_wilcox(plot_results,c("Snps","Survey"),alt = "less")
tidy_wilcox(plot_results,c("Snps","Survey"),alt = "two.sided",cf = .05)
tidy_t_test(plot_results,c("Snps","Survey"),alt = "greater")
WWHS_BreastCancer/bin/2019_08_08_AMIA_paper_analysis/2019_08_13_plot_performance_curves.R
@@ -70,10 +70,14 @@
test_roc %<>% 
    mutate(
        roc = future_map(prob, ~ yardstick::roc_curve(data = ., prob,truth = status)),
        pr = future_map(prob, ~ yardstick::pr_curve(data = ., prob , truth = status)))
        pr = future_map(prob, ~ yardstick::pr_curve(data = ., prob , truth = status)),
        auroc = future_map(prob, ~ yardstick::roc_auc(data = ., prob, truth = status)),
        aupr = future_map(prob, ~ yardstick::pr_auc(data = .,prob,truth = status)))
test_roc %<>% 
    mutate(
        auroc = map_dbl(auroc,".estimate"),
        aupr = map_dbl(aupr,".estimate"),
        cov = str_remove(cov,"_geno") %>% str_remove("_risk"),
        cov = str_split(cov,"_") %>% map(str_to_title) %>% map_chr( ~ str_c(.,collapse = "_")))
@@ -108,8 +112,6 @@
            
}
test_roc$cov %>% unique
comparison_order1 <- test_roc %>% 
    filter(geno == "genotype") %>% 
    filter(cov %in% c("Emca4","Density","Gwas","Only_Snps")) %>%
@@ -117,10 +119,25 @@
    group_by(cov) %>% 
    summarize(
        roc = list(merge_curve_list(roc)),
        pr = list(merge_curve_list(pr)))
        pr = list(merge_curve_list(pr)),
        auroc_sd = sd(auroc),
        aupr_sd = sd(aupr),
        n = n())
        # ,
        # auroc_q05 = quantile(auroc,prob = .05),
        # auroc_q95 = quantile(auroc,prob = .95),
        # aupr_q05 = quantile(aupr,prob = 0.05),
        # aupr_q95 = quantile(aupr,prob = 0.95))
    
test_roc %>%
    filter(geno == "genotype") %>%
    filter(cov %in% c("Emca4","Density","Gwas","Only_Snps")) %>%
    mutate( cov = if_else(cov == "Only_Snps","Snps",cov)) %>%
    group_by(cov) %>%
    summarize(
        roc = list(conf_int(roc)))
library(ggsci)
roc1 <- comparison_order1 %>% 
@@ -175,7 +192,15 @@
    group_by(cov) %>% 
    summarize(
        roc = list(merge_curve_list(roc)),
        pr = list(merge_curve_list(pr)))
        pr = list(merge_curve_list(pr)),
        auroc_sd = sd(auroc),
        aupr_sd = sd(aupr),
        n = n())
        # ,
        # auroc_q05 = quantile(auroc,prob = .05),
        # auroc_q95 = quantile(auroc,prob = .95),
        # aupr_q05 = quantile(aupr,prob = 0.05),
        # aupr_q95 = quantile(aupr,prob = 0.95))
roc_wsurvey <- comparison_wsurvey %>% 
    select(-pr) %>% 
@@ -229,3 +254,19 @@
        auroc = map_dbl(roc, ~ -pracma::trapz(1 - .$specificity, .$sensitivity)),
        aupr = map(pr,na.omit) %>% map_dbl(~ -pracma::trapz(.$recall,.$precision)))
z = qnorm(.95)
z
comparison_order1 %>%
    select_if(negate(is.list)) %>%
    mutate(
        auc_ci_m = auroc - z * auroc_sd / sqrt(n),
        auc_ci_M = auroc + z * auroc_sd / sqrt(n),
        aupr_ci_m = aupr - z * aupr_sd / sqrt(n),
        aupr_ci_M = aupr + z * aupr_sd / sqrt(n))
comparison_wsurvey %>%
    select_if(negate(is.list)) %>%
    mutate(
        auc_ci_m = auroc - z * auroc_sd / sqrt(n),
        auc_ci_M = auroc + z * auroc_sd / sqrt(n),
        aupr_ci_m = aupr - z * aupr_sd / sqrt(n),
        aupr_ci_M = aupr + z * aupr_sd / sqrt(n))
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/1_create_interaction_blocks.R
New file
@@ -0,0 +1,137 @@
library(magrittr)
library(tidyverse)
library(furrr)
plan(multiprocess, workers = 8)
param_dr <- here::here("results","2019_05_28_clean_complete_data","model_parameters")
model_dr <- here::here("bin","2019_05_28_prepare_predictive_models","models")
data_dr <- here::here("results","2019_07_24_segregated_data")
cv_dr <- here::here("results","2019_05_28_clean_complete_data")
snp_dr <- here::here("results","2019_08_15_Emca4_interactions","datasets")
dir.create(snp_dr,showWarnings = FALSE)
out_dr <- here::here("results","2019_08_15_Emca4_interactions","predictions")
dir.create(out_dr,showWarnings = FALSE)
dir.create(file.path(out_dr,"logs"),showWarnings = FALSE)
dir.create(file.path(out_dr,"err"),showWarnings = FALSE)
dir.create(file.path(out_dr,"out"),showWarnings = FALSE)
params <- list.files(param_dr,full.names = TRUE)
models <- list.files(model_dr,full.names = TRUE)
models <- tibble(models)
### need to modify if we add more more models
models %<>%
    mutate(
        params = case_when(
            str_detect(models,"decision_trees") ~ "dont.rds",
            str_detect(models,"lasso") ~ str_subset(params,"lasso"),
            str_detect(models,"ridge") ~ str_subset(params, "ridge"),
            str_detect(models,"elastic") ~ str_subset(params, "glmnet"),
            str_detect(models,"gbm") ~ str_subset(params,"gbm"),
            str_detect(models,"logistic") ~ "dont.rds",
            str_detect(models,"random_forest") ~ str_subset(params,"rf"),
            str_detect(models,"svm_linear") ~ str_subset(params,"svm_linear"),
            str_detect(models,"svm_poly") ~ str_subset(params,"svm_poly"),
            str_detect(models,"svm_radial") ~ str_subset(params,"svm_radial")
        )
    ) %>%
    filter(!str_detect(models,"gbm")) %>%
    filter(!str_detect(models,"svm")) %>%
    filter(!str_detect(models,"logistic")) %>%
    filter(!str_detect(models,"adaptive"))
models %<>% filter(str_detect(basename(models),"elastic"))
data_sets <- tibble(
    data_file = list.files(data_dr,full.names = TRUE,pattern ="rds")) %>%
    mutate(
        phase = if_else(str_detect(data_file,"aim1"), "aim1","aim2"),
        phase_dr = file.path(cv_dr,paste0(phase,"_cv")))
data_sets %<>% filter(str_detect(basename(data_file),"aim1"))
data_sets %<>% filter(str_detect(basename(data_file),"geno"))
data_sets %<>% filter(str_detect(basename(data_file),"emca4"))
phase_dr <- data_sets$phase_dr %>% unique()
### add the couple of all_snps, all_snps_survey datasets
only_snps   <- here::here("results/2019_05_28_clean_complete_data/aim1_only_snps_geno.rds")
snps_survey <- here::here("results/2019_05_28_clean_complete_data/aim1_snps_survey_geno.rds")
data_sets %<>% add_row(data_file = only_snps,phase = "aim1",phase_dr)
data_sets %<>% add_row(data_file = snps_survey, phase = "aim1",phase_dr)
#### add the interactions:
emca4_list <- c("rs2935776","rs3870371","rs1562430","rs1456315","rs6470588","rs1264202","rs7014346","rs9792269","rs284489")
build_interactions <- function(data,snps)
{
    data_mat <- data %>% select(-person_id,-status) %>% select_if(is.numeric)
    data_mat <- as.matrix(data_mat)
    all_pairs <- t(combn(snps,2))
    nms <- apply(all_pairs, 1, FUN = function(x)str_c(x[1],x[2],sep = "_"))
    sp_all_pairs <- all_pairs %>%
        as.data.frame() %>%
        { split(.,seq_len(nrow(.)))}
    inter_list <- map(sp_all_pairs, ~ data_mat[,c(.$V1,.$V2)])
    inter_list %<>% map(matrixStats::rowProds)
    interactions <- bind_cols(inter_list) %>%
        as_tibble() %>%
        set_colnames(nms)
    bind_cols(data,interactions)
}
data_sets %<>%
    mutate(
        data = future_map(data_file,readRDS),
        data_int = future_map(data,build_interactions,emca4_list))
data_sets %<>%
    select(-data) %>%
    mutate(
        data_file = basename(data_file) %>% str_replace(".rds","_inter.rds"),
        data_file = file.path(snp_dr,data_file),
        data_int = map2(data_int,data_file,saveRDS)) %>%
    select(-data_int)
data_sets %<>% nest(-phase_dr,-phase)
data_sets %<>%
    mutate(
        cv_files = map(phase_dr,list.files,full.names = TRUE),
        cv_files = map(cv_files , ~ tibble(cv_files = . )),
        all_files = map2(data,cv_files,crossing)
    )
data_sets %<>% select(phase,all_files) %>% unnest()
data_sets %<>% crossing(models)
data_sets %<>% select(models,data_file,cv_files,params)
data_sets %<>% mutate_all(list( aux = ~ basename(.) %>% str_remove(".rds") %>%
    str_remove_all("_parameters") %>% str_remove_all(".R"))) %>%
    mutate(
        outdir = paste(data_file_aux,cv_files_aux,models_aux,sep = "_")
    )
data_sets %<>% select(-contains("aux"))
data_sets %>% write_csv(file.path(out_dr,"2019_08_15_predictions_Emca4_inter_elastic_nets_params.csv"),
                col_names = FALSE)
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2019_08_15_elastic_nets_emca4_interactions_models_condor_submit
New file
@@ -0,0 +1,16 @@
universe         = vanilla
executable       = /s/bin/Rscript
arguments        = $(script) --data_file $(data) --cv_idx $(cv) --param $(param) --train_p .75 --cv_fold 5 --cores 8 --seed 12345 --outdir $(basedir)/$(outdr)
output           = $(basedir)/out/$(CLUSTER).$(PROCESS)_$(outdr).out
error            = $(basedir)/err/$(CLUSTER).$(PROCESS)_$(outdr).err
log              = $(basedir)/logs/$(CLUSTER).$(PROCESS)_$(outdr).log
request_memory   = 12G
request_cpus     = 8
on_exit_hold     = (ExitBySignal == True) || (ExitCode != 0)
periodic_release = (NumJobStarts < 3) && ((CurrentTime - EnteredCurrentStatus) > 120)
basedir          = ./results/2019_08_15_Emca4_interactions/predictions
queue script, data, cv, param, outdr from $(basedir)/2019_08_15_predictions_Emca4_inter_elastic_nets_params.csv
WWHS_BreastCancer/bin/2019_08_15_Emca4_interactions/2_collect_results.R
New file
@@ -0,0 +1,76 @@
library(magrittr)
library(tidyverse)
library(BiocParallel)
mp = MulticoreParam(workers = 8)
results_dr <- here::here("results/2019_08_15_Emca4_interactions/predictions")
all_files = results_dr %>% list.files(pattern = "rds", full.names = TRUE)
results <- tibble( file = all_files) %>%
    mutate(
        aux = basename(all_files),
        type = case_when(
            str_detect(aux, "model_fit") ~ "model",
            str_detect(aux, "train_metrics") ~ "train_metrics",
            str_detect(aux, "test_metrics") ~ "test_metrics",
            str_detect(aux, "train_prob") ~ "train_prob",
            str_detect(aux, "test_prob") ~ "test_prob"
        ))
results %<>%
    mutate(
        aim = if_else(str_detect(aux,"aim1"),"aim1","aim2"),
        geno = if_else(str_detect(aux,"geno"),"genotype","risk_allele"),
        cov = str_split(aux,"_cvfold") %>% map_chr(1) %>%
            str_split("\\_") %>% map_chr( ~ paste(.[-1],collapse = "_")),
        adaptive = if_else(str_detect(file,"adaptive"),"yes","no"),
        fit = case_when(
            str_detect(aux,"decision") ~ "decision_trees",
            str_detect(aux,"random") ~ "random_forests",
            str_detect(aux,"logis") ~ "logistic_regression",
            str_detect(aux,"elasti") ~ "elastic_nets",
            str_detect(aux,"lasso") ~ "lasso_regression",
            str_detect(aux,"ridge") ~ "ridge_regression"),
        aux = str_split(aux,"\\_"),
        time = map_chr(aux, ~ .[nchar(.) == 1]),
        cv_fold = map_chr(aux, ~ .[nchar(.) == 2]))
results  %<>% select(-aux)  %>% spread(type,file)
read_filelist <- function(files,mp)
{
    bplapply(files, readRDS, BPPARAM = mp)
}
results  %<>%
    mutate_at(vars(contains("prob")), list( ~ read_filelist(.,mp))) %>%
    mutate_at(vars(contains("metric")), list( ~ read_filelist(.,mp)))
auc <- function(prob_estimate,mp)
{
    estimate_df <- bplapply(prob_estimate,
        function(x)yardstick::roc_auc(x,status,prob),BPPARAM = mp)
    map_dbl(estimate_df,".estimate")
}
aupr <- function(prob_estimate,mp)
{
    estimate_df <- bplapply(prob_estimate,
        function(x)yardstick::pr_auc(x,status,prob),BPPARAM = mp)
    map_dbl(estimate_df,".estimate")
}
results %<>%
    mutate_at(vars(contains("prob")),
        list( ~ auc(.,mp),~ aupr(.,mp)))
results  %>% saveRDS(file.path(dirname(results_dr),"2019_08_15_predictive_results.rds"))
WWHS_BreastCancer/figs/2019_08_15_AMIA_paper_submission/2019_08_12_ROC_comparison_ord1.pdf
Binary files differ
WWHS_BreastCancer/figs/2019_08_15_AMIA_paper_submission/2019_08_12_ROC_comparison_survey.pdf
Binary files differ