From db8fcaa6e55028efe111653924b301ccf7892813 Mon Sep 17 00:00:00 2001
From: welch16 <welch16@gmail.com>
Date: Tue, 10 Sep 2019 10:51:33 -0500
Subject: [PATCH] adding script to calculate odds ratio of snp models only

---
 RFEST_runs/results/2019_09_10_calculate_risk_alleles/2019_09_10_snp_odds_ratio.R |  146 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 146 insertions(+), 0 deletions(-)

diff --git a/RFEST_runs/results/2019_09_10_calculate_risk_alleles/2019_09_10_snp_odds_ratio.R b/RFEST_runs/results/2019_09_10_calculate_risk_alleles/2019_09_10_snp_odds_ratio.R
new file mode 100644
index 0000000..b1c6867
--- /dev/null
+++ b/RFEST_runs/results/2019_09_10_calculate_risk_alleles/2019_09_10_snp_odds_ratio.R
@@ -0,0 +1,146 @@
+
+library(magrittr)
+library(tidyverse)
+
+### we are going to 
+
+metadata <- read_tsv(here::here("rawdata","metadata","emca4_snps.tsv")) %>% 
+	rename(rs_id = SNP)
+
+indr <- here::here("data","filtered")
+outdr <- here::here("data","clean")
+
+cases <- file.path(indr,"genotype_CGEMS_CASE.rds") %>% readRDS()
+controls <- file.path(indr,"genotype_CGEMS_CONTROL.rds") %>% readRDS()
+
+rename_variables <- function(data)
+{
+	set_names(data, c("rs_id","specimen","individual","allele1","allele2"))	
+}
+
+## remove specimen column because it is not necessary for our 
+## purposes
+cases %<>% rename_variables() %>% select(-specimen)
+controls %<>% rename_variables() %>% select(-specimen)
+
+cases %>% group_by(rs_id) %>% tally()
+controls %>% group_by(rs_id) %>% tally()
+
+cases %>% group_by(individual) %>% tally() %>% nrow()
+controls %>% group_by(individual) %>% tally() %>% nrow()
+
+## calculate allele frequency to determine the risk allele
+## defined as the minor allele
+
+## parse case and control information as a binary variables
+cases %<>% mutate( status = 1)
+controls %<>% mutate( status = 0)
+cgems <- bind_rows(cases,controls)
+
+## as expected, in this df, the allele 1 NA's are the same individuals 
+## than in allele2
+
+cgems %<>% 
+	nest(-rs_id, .key = "ind_data") %>% 
+	mutate(
+		alleles = map(ind_data, ~ unique(c(.$allele1,.$allele2))) %>% map(sort),
+		allele1 = map_chr(alleles,1),
+		allele2 = map_chr(alleles,2),
+		n = map_int(ind_data,nrow),
+		n_NA = map(ind_data, filter,is.na(allele1)) %>% map_int(nrow),
+		n_AA = map2(ind_data, alleles, ~ filter(.x, allele1 == .y[1] & allele2 == .y[2])) %>% 
+			map_int(nrow),
+		n_aa = map2(ind_data, alleles, ~ filter(.x, allele1 == .y[2] & allele2 == .y[2])) %>% 
+			map_int(nrow),
+		n_Aa = n - n_NA - n_AA - n_aa,
+		allele1_freq = ( 2 * n_AA + n_Aa) / (2 * n),
+		risk_allele = if_else(allele1_freq > 0.5, allele2, allele1)) %>%   
+	select(-alleles,-allele1,-allele2) 
+
+cgems_summary <- cgems %>% 
+	select(-ind_data) %>% 
+	mutate( raf = pmin(allele1_freq, 1 - allele1_freq)) %>% 
+	select(rs_id,risk_allele,raf,n,n_NA)
+
+cgems_summary %<>% left_join(metadata,by = "rs_id") %>% arrange(-n_NA) %>% 
+	mutate(
+		alleles2 = str_split(alleles,"\\/") %>% map(unlist),
+		is_neg_strand = ! map2_lgl(risk_allele,alleles2, ~ .x %in% .y),
+		is_neg_strand = if_else(is_neg_strand,"yes","no")) %>% 
+	select(-alleles2)
+	
+cgems_summary
+
+cgems %<>% select(rs_id,ind_data,risk_allele) %>% unnest()
+
+cgems
+
+cgems %<>% 
+	mutate(
+		contains_risk_allele = allele1 == risk_allele | allele2 == risk_allele,
+		contains_risk_allele = case_when(
+			is.na(contains_risk_allele) ~ NA_real_,
+			contains_risk_allele ~ 1,
+			! contains_risk_allele ~ 0
+			))
+
+cgems_mat <- cgems %>%
+	select(rs_id,individual,status,contains_risk_allele) %>% 
+	spread(rs_id,contains_risk_allele)
+	
+## 
+cgems_snps <- cgems_mat %>% as.data.frame() %>% 
+	column_to_rownames("individual") %>% 
+	select(-status) %>% as.matrix()
+
+cgems_summary %>% mutate( p_NA = n_NA / n) %>% head(2)
+
+cgems_summary %>% filter(is_neg_strand == "yes") %>% as.data.frame %>% 
+	select(rs_id,risk_allele,alleles)
+
+cgems_summary %>% write_tsv(file.path(indr,"2019_07_22_snp_summary.tsv"))
+
+snps_to_keep <- cgems_summary %>% filter(!(n_NA  > 400 | raf  <= .05) ) %>% 
+	pull(rs_id)
+	
+## remove snps with low risk allele frequency or higher number of missing values
+cgems_mat %<>% select(individual,status,one_of(snps_to_keep))
+cgems_snps <- cgems_snps[,snps_to_keep]
+
+nsnps <- ncol(cgems_snps)
+
+pmiss_sample <- apply( cgems_snps , 1 , function(x)sum(is.na(x))) / nsnps
+
+### only keep samples with less than 10% of missing genotypes
+## this means we will remove 35 samples
+samples_to_keep <- names(pmiss_sample)[pmiss_sample < .1]
+
+cgems_mat %<>% filter(individual %in% samples_to_keep)
+cgems_snps <- cgems_snps[samples_to_keep,]
+
+cgems_mat %>% count(status)
+
+#### still similar
+
+# # A tibble: 2 x 2
+#   status     n
+#    <dbl> <int>
+# 1      0  1189
+# 2      1  1210
+
+cgems_mat_by_snps <- cgems_mat %>% gather(snp,allele,-status,-individual)
+cgems_mat_by_snps %<>% na.omit() %>% nest(-snp)
+
+cgems_mat_by_snps %<>% 
+	mutate(
+		model_snp = map(data, ~ glm(status ~ 0 + allele, data = . ,family = binomial(link = "logit"))))
+		
+cgems_mat_by_snps %<>% 
+	mutate(
+		snp_coeff = map_dbl(model_snp, ~ coef(.)[1]),
+		odds_ratio = exp(snp_coeff),
+		risk_allele = if_else(odds_ratio  > 1,"yes","no"))
+		
+cgems_mat_by_snps %>% count(risk_allele)
+
+cgems_mat_by_snps %>% select(-data,-model_snp) %>% saveRDS(file.path(outdr,"2019_09_10_snp_odds_ratio.rds"))
\ No newline at end of file

--
Gitblit v1.8.0