####=======================================================================================#### #### #### BUHMBOX v0.38 #### Copy Right: Buhm Han #### This code was written based on R v3.1 #### #### Usage: Rscript BUHMBOX.R [Arguments] #### Argument 1: SNP file (col 1: rsid, col 2: risk allele, col 3: frequency, col 4: OR) #### col 3,4 can contain "NA", which will be mean-value-imputed #### Argument 2: PLINK dosage raw format of the risk alleles -- cases. #### Argument 3: PLINK dosage raw format of the risk alleles -- controls. (optinal: NA can be put here if control data is not available.) #### Argument 4: Method mode. Two letters representing freq mode, and OR mode. #### YY: use both freq and OR; YN: use only freq; #### NY: use only OR; NN: use neither; #### If freq / OR are not used, default values 0.5 / 1.2 are used for all SNPs. #### Argument 5: Estimation mode Y/N #### Argument 6: Perform Mendelian Randomization (risk score regression) or not. Y/N #### Argument 7: Output file #### Argument 8: PC file (optional) #### #### Development history: #### 9/8/14: Initial launch #### 10/13/14: v0.2: Regression technique for very low LD #### 11/6/14: v0.3: Added option to Mendelian Randomization. #### 12/1/14: v0.31: bug fix -- fixed error that MR doesn't work when PC file exists. #### 12/23/14: v0.32: control raw format becomes optional. #### 1/13/15: v0.33: variance GC control implemented based on controls. #### 1/27/15: v0.34: Automatic conversion of OR<1 (input doesn't have to be all risk alleles) #### 4/22/15: v0.35: Now report weighted average correlation, as well as correlation and weight matrices in separate files #### 7/25/15: v0.36: (1) Variance GC control is deprecated. Instead, we use delta correlation between cases and controls. #### (2) Many more output files, including delta correlation and GRS. #### (3) PCs are now regressed out after combining cases and controls. #### (4) Default filtering of r2<0.1 is added. (ignore elements based on control cor matrix) #### 10/13/15: v0.37: Bug fix when flipping OR<1. #### 2/3/16: v0.38: Bug inflating GRS p-value is fixed. #### ####=======================================================================================#### ## Initialize log logcnt <- 1 counter <- function() { counter <- sprintf("[%d]", logcnt) logcnt <<- logcnt + 1 ## Globally increase counter counter } logs <- "==============================LOG===============================" print.log <- function(..., cnt=TRUE) { msg <- paste(...) if (cnt) { msg <- paste(counter(), msg) } logs <<- paste(logs, msg, sep="\n") ## Globally add log write(msg, stderr()) } ## Read arguments args <- commandArgs(trailingOnly=TRUE) print.log(paste("Command line arguments:", paste(args, collapse=" "))) snp.file <- args[1] case.file <- args[2] control.file <- args[3] method.mode <- args[4] method.mode.freq <- substr(method.mode, 1, 1) method.mode.OR <- substr(method.mode, 2, 2) stopifnot(method.mode.freq == "N" || method.mode.freq == "Y") stopifnot(method.mode.OR == "N" || method.mode.OR == "Y") estimation.mode <- args[5] stopifnot(estimation.mode == "Y" || estimation.mode == "N") mendelian.mode <- args[6] stopifnot(mendelian.mode == "Y" || mendelian.mode == "N") out.file <- args[7] pc.file <- NA if (length(args) >= 8) { pc.file <- args[8] } use.control <- TRUE if (control.file == "NA") { use.control <- FALSE print.log("Control file is NA, therefore Mendelian randomization mode is set to \"N\"") mendelian.mode <- "N" } ## Read data print.log("Reading files and performing QC....") snpinfo <- as.matrix(read.table(snp.file)) case.data <- read.table(case.file, header=T) IDs <- paste(case.data[,1], case.data[,2]) N <- length(IDs) dosage <- as.matrix(case.data[, -(1:6)]) rsids.with.postfix <- colnames(dosage) rsids <- strtrim(rsids.with.postfix, nchar(rsids.with.postfix)-2) p <- length(rsids) ## number of SNPs defined in dosage file ## Read control data if (use.control) { control.data <- read.table(control.file, header=T) control.IDs <- paste(control.data[,1], control.data[,2]) control.N <- length(control.IDs) control.dosage <- as.matrix(control.data[, -(1:6)]) rsids.with.postfix <- colnames(control.dosage) control.rsids <- strtrim(rsids.with.postfix, nchar(rsids.with.postfix)-2) if (!identical(rsids, control.rsids)) { print.log("SNP sets are different for case and control files") quit() } } ## Match SNP info by rsid matching. ## QC1: Remove SNPs not in SNP file rafs <- rep(0, p) ORs <- rep(0, p) keep <- rep(TRUE, p) for (i in 1:p) { j <- which(snpinfo[,1] == rsids[i])[1] if (length(j) > 0 && !is.na(j)) { rafs[i] <- as.numeric(snpinfo[j, 3]) ORs[i] <- as.numeric(snpinfo[j, 4]) # if (!is.na(ORs[i]) && ORs[i] < 1) { # ORs[i] <- NA # print.log("SNP", rsids[i], " OR is <1: set to NA.") # } } else { print.log("SNP", rsids[i], "is removed from plink file due to not existing in SNP file") keep[i] <- FALSE } } rafs <- rafs[keep] ORs <- ORs[keep] dosage <- dosage[, keep] if (use.control) { control.dosage <- control.dosage[, keep] } rsids <- rsids[keep] p <- length(rsids) ## Deal with ORs < 1 for (i in 1:p) { if (!is.na(ORs[i]) && ORs[i] < 1) { print.log("OR is < 1 for SNP ",rsids[i],": risk allele is set opposite") ORs[i] <- 1/ORs[i] rafs[i] <- 1-rafs[i] dosage[,i] <- 2-dosage[,i] if (use.control) { control.dosage[,i] <- 2-control.dosage[,i] } } } ## Numerical conversion and naive mean-value-imputation suppressWarnings(class(dosage) <- "numeric") if (use.control) { suppressWarnings(class(control.dosage) <- "numeric") } suppressWarnings(class(rafs) <- "numeric") suppressWarnings(class(ORs) <- "numeric") if (method.mode.freq == "N") { ## Use default value, if "N" mode print.log("Method mode (freq) is N, we set every freq to default value 0.5") rafs <- rep(0.5, p) } else { ## Impute RAF if (sum(is.na(rafs)) > 0) { mean.raf <- mean(rafs, na.rm=T) print.log("Imputation:", paste(rsids[is.na(rafs)], collapse=","), "RAF was imputed to mean value", mean.raf) rafs[is.na(rafs)] <- mean.raf } } if (method.mode.OR == "N") { ## Use default value, if "N" mode print.log("Method mode (OR) is N, we set every OR to default value 1.2") ORs <- rep(1.2, p) } else { ## Impute OR if (sum(is.na(ORs)) > 0) { mean.OR <- mean(ORs, na.rm=T) print.log("Imputation:", paste(rsids[is.na(ORs)], collapse=","), "OR was imputed to mean value", mean.OR) ORs[is.na(ORs)] <- mean.OR } } ## QC2: remove individuals having NA na.count <- apply(is.na(dosage), 1, sum) if (sum(na.count > 0) > 0) { dosage <- dosage[na.count == 0, ] IDs <- IDs[na.count == 0] N <- length(IDs) print.log(sum(na.count > 0), "Case individuals were removed due to having at least one NA") print.log(paste(IDs[na.count > 0], collapse=":"), cnt=F) } if (use.control) { na.count <- apply(is.na(control.dosage), 1, sum) if (sum(na.count > 0) > 0) { control.dosage <- control.dosage[na.count == 0, ] control.IDs <- control.IDs[na.count == 0] control.N <- length(control.IDs) print.log(sum(na.count > 0), "Control individuals were removed due to having at least one NA") print.log(paste(control.IDs[na.count > 0], collapse=":"), cnt=F) } } ## QC3: remove nonpolymorphic SNPs poly <- (apply(dosage, 2, var) != 0) if (use.control) { poly <- poly & (apply(control.dosage, 2, var) != 0) } if (sum(!poly) > 0) { print.log(sum(!poly), "SNPs were removed due to being-non-polymorphic in cases or controls") print.log(rsids[!poly], cnt=F) rafs <- rafs[poly] ORs <- ORs[poly] dosage <- dosage[, poly] if (use.control) { control.dosage <- control.dosage[, poly] } rsids <- rsids[poly] p <- length(rsids) } ## Report post-QC results print.log("After QC,", N, "cases, ") if (use.control) { print.log(control.N, " controls, ") } print.log("and", p, "SNPs are left. The SNPs left are:") print.log(paste(rsids, collapse=":"), cnt=F) ## I will perform two kinds of regression cleaning. (PC and use of control) ## Before that, I will store the current data ## so that I can use it for plain Mendelian randomization. dosage.before.regression.clean <- dosage if (use.control) { control.dosage.before.regression.clean <- control.dosage } ## RegressionCleaaning1: Regress out PC from dosage pc <- NULL control.pc <- NULL if (!is.na(pc.file)) { print.log("Regressing out PCs.........") pc.data <- read.table(pc.file) pc.all <- as.matrix(pc.data[,-(1:2)]) pcIDs <- paste(pc.data[,1], pc.data[,2]) pc <- matrix(0, N, ncol(pc.all)) for (i in 1:N) { j <- which(pcIDs == IDs[i]) if (length(j) > 0 && !is.na(j)) { pc[i,] <- pc.all[j,] } else { print.log("Can't find ID", IDs[i], "in PC file. Quitting...") quit() } } class(pc) <- "numeric" if (use.control) { control.pc <- matrix(0, control.N, ncol(pc.all)) for (i in 1:control.N) { j <- which(pcIDs == control.IDs[i]) if (length(j) > 0 && !is.na(j)) { control.pc[i,] <- pc.all[j,] } else { print.log("Can't find ID", control.IDs[i], "in PC file. Quitting...") quit() } } class(control.pc) <- "numeric" } if (!use.control) { for (i in 1:p) { dosage.line <- dosage[,i] lm.rst <- lm(dosage.line ~ pc) dosage[,i] <- resid(lm.rst) } } else { ## If use.control, we combine data first before regressing out PCs. (7/25/15, v0.36) combined.dosage = rbind(dosage, control.dosage) combined.pc = rbind(pc, control.pc) for (i in 1:p) { combined.dosage.line <- combined.dosage[,i] lm.rst <- lm(combined.dosage.line ~ combined.pc) combined.dosage[,i] = resid(lm.rst) } dosage = combined.dosage[1:N,] control.dosage = combined.dosage[-(1:N),] } } ## RegressionCleaining2: Calculate control z-score variance and correct for residual LD. ## Deprecated: not used anymore since v0.36 R <- cor(dosage) if (use.control) { control.R <- cor(control.dosage) control.Rz <- control.R * sqrt(control.N) control.Rz.var <- var(as.vector(control.Rz[upper.tri(R)])) print.log("Control correlation z-score's variance = ", control.Rz.var) } ##-------------------------------------------------------## ## At this moment, what variables are cleaned, updated, ## and available? ## - dosage, control.dosage ## - IDs, control.IDs ## - N, control.N ## - rafs, ORs, rsids, p ## ## Also for Mendelian randomization, ## - dosage.before.regression.clean ## - control.dosage.before.regression.clean ## - pc, control.pc (if PC file was given) ##-------------------------------------------------------## ## Variables to store Results titles <- NULL results <- NULL ##==============================================## ## Part 1. Testing ## ##==============================================## print.log("Executing Test...") ## Capital letter RAFs are rafs in the "affected" subgroup RAFs <- ORs*rafs/((ORs-1)*rafs+1) ## Target null hypothesis is identity matrix. I <- diag(p) ## Get weights of pairs for optimal method. W <- matrix(0, p, p) for (i in 1:p) { for (j in 1:p) { if (i < j) { p1 <- rafs[i] p2 <- rafs[j] r1 <- ORs[i] r2 <- ORs[j] DcorDphi <- sqrt((1-p1)*p1*(1-p2)*p2)*(r1-1)*(r2-1) / (1+p1*(r1-1)) / (1+p2*(r2-1)) W[i,j] <- DcorDphi ## Fill in only Upper triangle } } } ###################------------------- ## New Filtering based on r2 -- 7/8/15 W[control.R**2 > 0.1] <- 0 sqrt.sum.W.W <- sqrt(sum(W*W)) ## For later use. ## Calculate correlation of dosage ##R <- cor(dosage) ## Calculate z-scores U <- sqrt(N)*(R-I) if (use.control) { #Rzt=0.5*log((1+(R-I))/(1-(R-I))) ###------------------ try Fisher trans. 7/8/15 #control.Rzt=0.5*log((1+(control.R-I))/(1-(control.R-I))) #U <- sqrt(1/(1/(N-3)+1/(control.N-3)))*(Rzt-control.Rzt) #U <- sqrt(control.N)*(control.R-I) U <- sqrt(1/(1/N+1/control.N))*(R-control.R) } U[lower.tri(U)] <- 0 ## Make it upper triangle zscore <- sum(W*U) / sqrt.sum.W.W #### 1/13/14 v0.33: genomic-control z-score by control value variance. if (use.control) { #zscore <- zscore / sqrt(control.Rz.var) } log10pvalue <- pnorm(zscore, lower.tail=F, log.p=T)/log(10) pvalue <- 10^log10pvalue weighted.r <- sum(W*R)/sum(W) weighted.r.SE <- sqrt.sum.W.W / sum(W) / sqrt(N) #titles <- c(titles, "PVALUE", "LOG10P", "N", "ZSCORE", "WEIGHTED.R", "WEIGHTED.R.STDERR") #results <- c(results, pvalue, log10pvalue, N, zscore, weighted.r, weighted.r.SE) titles <- c(titles, "PVALUE", "LOG10P", "N", "NCASE", "NCONT", "ZSCORE", "NLOCUS") results <- c(results, pvalue, log10pvalue, N+control.N, N, control.N, zscore, p) ##==============================================## ## Part 2. Estimation ## ##==============================================## ## First, I should define likelihood functions. ## Full likelihood function ---- (slow) ll.fun <- function(x) { Q <- I ## Q denotes true R based on model for (i in 1:p) { for (j in 1:p) { if (i < j) { vari <- 2*(1-x)*rafs[i]*(1-rafs[i]) + 2*x*RAFs[i]*(1-RAFs[i]) + 4*x*(1-x)*(rafs[i]-RAFs[i])^2 varj <- 2*(1-x)*rafs[j]*(1-rafs[j]) + 2*x*RAFs[j]*(1-RAFs[j]) + 4*x*(1-x)*(rafs[j]-RAFs[j])^2 meani <- 2*(1-x)*rafs[i] + 2*x*RAFs[i] meanj <- 2*(1-x)*rafs[j] + 2*x*RAFs[j] meanij <- 4*(1-x)*rafs[i]*rafs[j] + 4*x*RAFs[i]*RAFs[j] covij <- meanij - meani*meanj corij <- covij / sqrt(vari*varj) Q[i,j] <- Q[j,i] <- corij } } } Q.inv <- solve(Q) T <- I + Q*Q.inv T.inv <- solve(T) q <- p*(p-1)/2 Sigma.inv <- matrix(0, q, q) ij <- 1 for (i in 1:p) { for (j in 1:p) { if (i < j) { km <- 1 for (k in 1:p) { for (m in 1:p) { if (k < m) { Sigma.inv[ij, km] <- Q.inv[i,k]*Q.inv[j,m] + Q.inv[i,m]*Q.inv[j,k] - Q.inv[i,j]*(T.inv[i,k] + T.inv[j,k] + T.inv[i,m] + T.inv[j,m])*Q.inv[k,m] km <- km + 1 } } } ij <- ij + 1 } } } det.Sigma <- 1/det(Sigma.inv) ## write(det.Sigma, stderr()) Y <- rep(0, q) ## Observed correlations in 1-d vector mu <- rep(0, q) ## Mean correlations of model in 1-d vector ij <- 1 for (i in 1:p) { for (j in 1:p) { if (i < j) { Y[ij] <- R[i,j] mu[ij] <- Q[i,j] ij <- ij + 1 } } } ll <- -q*log(2*pi)/2 - log(det.Sigma)/2 + q*log(N)/2 - (1/2)*(Y-mu)%*%(Sigma.inv*N)%*%(Y-mu) ll } ## Approximating likelihood function ---- (fast) ll.approx.fun <- function(x) { Q <- I ## Q denotes true R based on model for (i in 1:p) { for (j in 1:p) { if (i < j) { vari <- 2*(1-x)*rafs[i]*(1-rafs[i]) + 2*x*RAFs[i]*(1-RAFs[i]) + 4*x*(1-x)*(rafs[i]-RAFs[i])^2 varj <- 2*(1-x)*rafs[j]*(1-rafs[j]) + 2*x*RAFs[j]*(1-RAFs[j]) + 4*x*(1-x)*(rafs[j]-RAFs[j])^2 meani <- 2*(1-x)*rafs[i] + 2*x*RAFs[i] meanj <- 2*(1-x)*rafs[j] + 2*x*RAFs[j] meanij <- 4*(1-x)*rafs[i]*rafs[j] + 4*x*RAFs[i]*RAFs[j] covij <- meanij - meani*meanj corij <- covij / sqrt(vari*varj) Q[i,j] <- Q[j,i] <- corij } } } Q.inv <- solve(Q) T <- I + Q*Q.inv T.inv <- solve(T) Y <- R - Q Q.inv.Y <- Q.inv%*%Y approx.ll <- -sum(diag(Q.inv.Y%*%Q.inv.Y))/2 + diag(Q.inv.Y)%*%T.inv%*%diag(Q.inv.Y) approx.ll } ## Search function based on likelihood function. (Grid + Newton) search.mle <- function(bins, lower, upper) { lls <- rep(0, length(bins)) for (i in 1:length(bins)) { ## lls[i] <- ll.fun(bins[i]) lls[i] <- ll.approx.fun(bins[i]) } j <- which(lls == max(lls)) maxbin <- bins[j] optim.rst <- optim(par=maxbin, ## fn=ll.fun, fn=ll.approx.fun, method="L-BFGS-B", lower=lower, upper=upper, control=list(fnscale=-1)) mle.x <- optim.rst$par[1] ml <- optim.rst$value list(MLE=mle.x, LL=ml) } ## Execute estimation!! titles <- c(titles, "MLE.LEFT", "LL.LEFT", "MLE.RIGHT", "LL.RIGHT") if (estimation.mode == "Y") { print.log("Estimation mode is Y: Estimating proportion..") left.half <- search.mle(seq(0, 0.5, 0.01), 0, 0.5) right.half <- search.mle(seq(0.5, 1, 0.01), 0.5, 1) results <- c(results, left.half$MLE, left.half$LL, right.half$MLE, right.half$LL) } else { results <- c(results, NA, NA, NA, NA) } ##=======================================================## ## Part 3. Mendelian randomization (risk score approach ##=======================================================## if (mendelian.mode == "Y") { print.log("Mendelian randomization mode is Y: Performing MR..") GRS <- c(dosage.before.regression.clean%*%log(ORs), control.dosage.before.regression.clean%*%log(ORs)) pheno <- c(rep(1, N), rep(0, control.N)) pc.combined <- rbind(pc, control.pc) if (is.null(pc.combined)) { glm.rst <- glm(pheno ~ GRS, family=binomial(logit)) } else { glm.rst <- glm(pheno ~ pc.combined + GRS, family=binomial(logit)) } coeffs <- summary(glm.rst)$coefficients beta <- coeffs["GRS", 1] stderr <- coeffs["GRS", 2] deviance <- anova(glm.rst, test="Chisq")["GRS", 2] log10pvalue <- pchisq(deviance, df=1, lower.tail=F, log.p=T)/log(10) pvalue <- 10^log10pvalue titles <- c(titles, "MR.PVALUE", "MR.LOG10P", "MR.BETA", "MR.STDERR") results <- c(results, pvalue, log10pvalue, beta, stderr) write.table(dosage.before.regression.clean%*%log(ORs), paste(out.file, ".GRS", sep=""), quote=F, row.names=F, col.names=F) write.table(control.dosage.before.regression.clean%*%log(ORs), paste(out.file, ".cGRS", sep=""), quote=F, row.names=F, col.names=F) } ############################################ ## Print results ############################################ print.log("Finished, writing results...") write.table(rbind(titles, results), paste(out.file, ".BBrst", sep=""), quote=F, row.names=F, col.names=F) colnames(R) = rownames(R) = colnames(W) = rownames(W) = rsids write.table(R, paste(out.file, ".caseR", sep="")) if (use.control) { write.table(control.R, paste(out.file, ".contR", sep="")) write.table(R-control.R, paste(out.file, ".diffR", sep="")) } write.table(W, paste(out.file, ".weights", sep="")) cat(logs, file=paste(out.file, ".log", sep=""))