####=======================================================================================####
####
#### BUHMBOX v0.38
####   Copy Right: Buhm Han
####   This code was written based on R v3.1
####
#### Usage: Rscript BUHMBOX.R [Arguments]
#### Argument 1: SNP file (col 1: rsid, col 2: risk allele, col 3: frequency, col 4: OR)
####             col 3,4 can contain "NA", which will be mean-value-imputed
#### Argument 2: PLINK dosage raw format of the risk alleles -- cases.
#### Argument 3: PLINK dosage raw format of the risk alleles -- controls. (optinal: NA can be put here if control data is not available.)
#### Argument 4: Method mode. Two letters representing freq mode, and OR mode.
####             YY: use both freq and OR; YN: use only freq;
####             NY: use only OR; NN: use neither;
####             If freq / OR are not used, default values 0.5 / 1.2 are used for all SNPs.
#### Argument 5: Estimation mode Y/N
#### Argument 6: Perform Mendelian Randomization (risk score regression) or not. Y/N
#### Argument 7: Output file
#### Argument 8: PC file (optional)
####
#### Development history:
####   9/8/14: Initial launch
####   10/13/14: v0.2: Regression technique for very low LD
####   11/6/14: v0.3: Added option to Mendelian Randomization.
####   12/1/14: v0.31: bug fix -- fixed error that MR doesn't work when PC file exists.
####   12/23/14: v0.32: control raw format becomes optional.
####   1/13/15: v0.33: variance GC control implemented based on controls.
####   1/27/15: v0.34: Automatic conversion of OR<1 (input doesn't have to be all risk alleles)
####   4/22/15: v0.35: Now report weighted average correlation, as well as correlation and weight matrices in separate files
####   7/25/15: v0.36: (1) Variance GC control is deprecated. Instead, we use delta correlation between cases and controls. 
####                   (2) Many more output files, including delta correlation and GRS.
####                   (3) PCs are now regressed out after combining cases and controls.
####                   (4) Default filtering of r2<0.1 is added. (ignore elements based on control cor matrix)
####   10/13/15: v0.37: Bug fix when flipping OR<1. 
####   2/3/16: v0.38: Bug inflating GRS p-value is fixed.
####
####=======================================================================================####

## Initialize log
logcnt <- 1
counter <- function() {
    counter <- sprintf("[%d]", logcnt)
    logcnt <<- logcnt + 1 ## Globally increase counter
    counter
}
logs <- "==============================LOG==============================="
print.log <- function(..., cnt=TRUE) {
    msg <- paste(...)
    if (cnt) {
        msg <- paste(counter(), msg)
    }
    logs <<- paste(logs, msg, sep="\n") ## Globally add log
    write(msg, stderr())
}

## Read arguments
args <- commandArgs(trailingOnly=TRUE)
print.log(paste("Command line arguments:", paste(args, collapse=" ")))
snp.file <- args[1]
case.file <- args[2]
control.file <- args[3]
method.mode <- args[4]
method.mode.freq <- substr(method.mode, 1, 1)
method.mode.OR <- substr(method.mode, 2, 2)
stopifnot(method.mode.freq == "N" || method.mode.freq == "Y")
stopifnot(method.mode.OR == "N" || method.mode.OR == "Y")
estimation.mode <- args[5]
stopifnot(estimation.mode == "Y" || estimation.mode == "N")
mendelian.mode <- args[6]
stopifnot(mendelian.mode == "Y" || mendelian.mode == "N")
out.file <- args[7]
pc.file <- NA
if (length(args) >= 8) {
    pc.file <- args[8]
}
use.control <- TRUE
if (control.file == "NA") {
    use.control <- FALSE
    print.log("Control file is NA, therefore Mendelian randomization mode is set to \"N\"")
    mendelian.mode <- "N"
}

## Read data
print.log("Reading files and performing QC....")
snpinfo <- as.matrix(read.table(snp.file))
case.data <- read.table(case.file, header=T)
IDs <- paste(case.data[,1], case.data[,2])
N <- length(IDs)
dosage <- as.matrix(case.data[, -(1:6)])
rsids.with.postfix <- colnames(dosage)
rsids <- strtrim(rsids.with.postfix, nchar(rsids.with.postfix)-2)
p <- length(rsids) ## number of SNPs defined in dosage file

## Read control data
if (use.control) {
    control.data <- read.table(control.file, header=T)
    control.IDs <- paste(control.data[,1], control.data[,2])
    control.N <- length(control.IDs)
    control.dosage <- as.matrix(control.data[, -(1:6)])
    rsids.with.postfix <- colnames(control.dosage)
    control.rsids <- strtrim(rsids.with.postfix, nchar(rsids.with.postfix)-2)
    if (!identical(rsids, control.rsids)) {
       print.log("SNP sets are different for case and control files")
       quit()
    }
}

## Match SNP info by rsid matching.
## QC1: Remove SNPs not in SNP file 
rafs <- rep(0, p)
ORs <- rep(0, p)
keep <- rep(TRUE, p)
for (i in 1:p) {
    j <- which(snpinfo[,1] == rsids[i])[1]
    if (length(j) > 0 && !is.na(j)) {
        rafs[i] <- as.numeric(snpinfo[j, 3])
        ORs[i] <- as.numeric(snpinfo[j, 4])
#        if (!is.na(ORs[i]) && ORs[i] < 1) {
#            ORs[i] <- NA
#            print.log("SNP", rsids[i], " OR is <1: set to NA.")
#        }
    } else {
        print.log("SNP", rsids[i], "is removed from plink file due to not existing in SNP file")
        keep[i] <- FALSE
    }
}
rafs <- rafs[keep]
ORs <- ORs[keep]
dosage <- dosage[, keep]
if (use.control) { control.dosage <- control.dosage[, keep] }
rsids <- rsids[keep]
p <- length(rsids)

## Deal with ORs < 1
for (i in 1:p) {
    if (!is.na(ORs[i]) && ORs[i] < 1) {
        print.log("OR is < 1 for SNP ",rsids[i],": risk allele is set opposite")
        ORs[i] <- 1/ORs[i]
        rafs[i] <- 1-rafs[i]
        dosage[,i] <- 2-dosage[,i]
        if (use.control) { control.dosage[,i] <- 2-control.dosage[,i] }
    }
}

## Numerical conversion and naive mean-value-imputation
suppressWarnings(class(dosage) <- "numeric")
if (use.control) { suppressWarnings(class(control.dosage) <- "numeric") }
suppressWarnings(class(rafs) <- "numeric")
suppressWarnings(class(ORs) <- "numeric")
if (method.mode.freq == "N") { ## Use default value, if "N" mode
    print.log("Method mode (freq) is N, we set every freq to default value 0.5")
    rafs <- rep(0.5, p)
} else {
    ## Impute RAF
    if (sum(is.na(rafs)) > 0) {
        mean.raf <- mean(rafs, na.rm=T)
        print.log("Imputation:", paste(rsids[is.na(rafs)], collapse=","), "RAF was imputed to mean value", mean.raf)
        rafs[is.na(rafs)] <- mean.raf
    }
}
if (method.mode.OR == "N") { ## Use default value, if "N" mode
    print.log("Method mode (OR) is N, we set every OR to default value 1.2")
    ORs <- rep(1.2, p)
} else {
    ## Impute OR
    if (sum(is.na(ORs)) > 0) {
        mean.OR <- mean(ORs, na.rm=T)
        print.log("Imputation:", paste(rsids[is.na(ORs)], collapse=","), "OR was imputed to mean value", mean.OR)
        ORs[is.na(ORs)] <- mean.OR
    }
}

## QC2: remove individuals having NA
na.count <- apply(is.na(dosage), 1, sum)
if (sum(na.count > 0) > 0) {
    dosage <- dosage[na.count == 0, ] 
    IDs <- IDs[na.count == 0]
    N <- length(IDs)
    print.log(sum(na.count > 0),
              "Case individuals were removed due to having at least one NA")
    print.log(paste(IDs[na.count > 0], collapse=":"), cnt=F)
}

if (use.control) {
    na.count <- apply(is.na(control.dosage), 1, sum)
    if (sum(na.count > 0) > 0) {
        control.dosage <- control.dosage[na.count == 0, ] 
        control.IDs <- control.IDs[na.count == 0]
        control.N <- length(control.IDs)
        print.log(sum(na.count > 0),
                  "Control individuals were removed due to having at least one NA")
        print.log(paste(control.IDs[na.count > 0], collapse=":"), cnt=F)
    }
}
## QC3: remove nonpolymorphic SNPs
poly <- (apply(dosage, 2, var) != 0)
if (use.control) {
    poly <- poly & (apply(control.dosage, 2, var) != 0)
}
if (sum(!poly) > 0) {
    print.log(sum(!poly), "SNPs were removed due to being-non-polymorphic in cases or controls")
    print.log(rsids[!poly], cnt=F)
    rafs <- rafs[poly]
    ORs <- ORs[poly]
    dosage <- dosage[, poly]
    if (use.control) { control.dosage <- control.dosage[, poly] }
    rsids <- rsids[poly]
    p <- length(rsids)
}

## Report post-QC results
print.log("After QC,", N, "cases, ")
if (use.control) { print.log(control.N, " controls, ") }
print.log("and", p, "SNPs are left. The SNPs left are:")
print.log(paste(rsids, collapse=":"), cnt=F)

## I will perform two kinds of regression cleaning. (PC and use of control)
## Before that, I will store the current data
## so that I can use it for plain Mendelian randomization.
dosage.before.regression.clean <- dosage
if (use.control) { control.dosage.before.regression.clean <- control.dosage }

## RegressionCleaaning1: Regress out PC from dosage
pc <- NULL
control.pc <- NULL
if (!is.na(pc.file)) {
    print.log("Regressing out PCs.........")
    pc.data <- read.table(pc.file)
    pc.all <- as.matrix(pc.data[,-(1:2)])
    pcIDs <- paste(pc.data[,1], pc.data[,2])
    pc <- matrix(0, N, ncol(pc.all))
    for (i in 1:N) {
        j <- which(pcIDs == IDs[i])
        if (length(j) > 0 && !is.na(j)) {
            pc[i,] <- pc.all[j,]
        } else {
            print.log("Can't find ID", IDs[i], "in PC file. Quitting...")
            quit()
        }
    }
    class(pc) <- "numeric"
    if (use.control) {
        control.pc <- matrix(0, control.N, ncol(pc.all))
        for (i in 1:control.N) {
            j <- which(pcIDs == control.IDs[i])
            if (length(j) > 0 && !is.na(j)) {
                control.pc[i,] <- pc.all[j,]
            } else {
                print.log("Can't find ID", control.IDs[i], "in PC file. Quitting...")
                quit()
            }
        }
        class(control.pc) <- "numeric"
    }
    if (!use.control) {
        for (i in 1:p) {
            dosage.line <- dosage[,i]
            lm.rst <- lm(dosage.line ~ pc)
            dosage[,i] <- resid(lm.rst)
        }
    } else {  ## If use.control, we combine data first before regressing out PCs. (7/25/15, v0.36)
        combined.dosage = rbind(dosage, control.dosage)
        combined.pc = rbind(pc, control.pc)
        for (i in 1:p) {
            combined.dosage.line <- combined.dosage[,i]
            lm.rst <- lm(combined.dosage.line ~ combined.pc)
            combined.dosage[,i] = resid(lm.rst)
        }
        dosage = combined.dosage[1:N,]
        control.dosage = combined.dosage[-(1:N),]
    }
}

## RegressionCleaining2: Calculate control z-score variance and correct for residual LD.
## Deprecated: not used anymore since v0.36
R <- cor(dosage)
if (use.control) {
    control.R <- cor(control.dosage)
    control.Rz <- control.R * sqrt(control.N)
    control.Rz.var <- var(as.vector(control.Rz[upper.tri(R)]))
    print.log("Control correlation z-score's variance = ", control.Rz.var)
}


##-------------------------------------------------------##
## At this moment, what variables are cleaned, updated,
## and available?
## - dosage, control.dosage
## - IDs, control.IDs
## - N, control.N
## - rafs, ORs, rsids, p
##
## Also for Mendelian randomization,
## - dosage.before.regression.clean
## - control.dosage.before.regression.clean
## - pc, control.pc (if PC file was given)
##-------------------------------------------------------##
 
## Variables to store Results
titles <- NULL
results <- NULL

##==============================================##
## Part 1. Testing                              ##
##==============================================##
print.log("Executing Test...")
## Capital letter RAFs are rafs in the "affected" subgroup
RAFs <- ORs*rafs/((ORs-1)*rafs+1) 
## Target null hypothesis is identity matrix.
I <- diag(p) 
## Get weights of pairs for optimal method.
W <- matrix(0, p, p)
for (i in 1:p) {
    for (j in 1:p) {
        if (i < j) {
            p1 <- rafs[i]
            p2 <- rafs[j]
            r1 <- ORs[i]
            r2 <- ORs[j]
            DcorDphi <- sqrt((1-p1)*p1*(1-p2)*p2)*(r1-1)*(r2-1) / (1+p1*(r1-1)) / (1+p2*(r2-1)) 
            W[i,j] <- DcorDphi ## Fill in only Upper triangle
        }
    }
}
###################-------------------
## New Filtering based on r2 -- 7/8/15
W[control.R**2 > 0.1] <- 0

sqrt.sum.W.W <- sqrt(sum(W*W)) ## For later use.
## Calculate correlation of dosage
##R <- cor(dosage)
## Calculate z-scores
U <- sqrt(N)*(R-I)
if (use.control) {
    #Rzt=0.5*log((1+(R-I))/(1-(R-I)))  ###------------------ try Fisher trans. 7/8/15
    #control.Rzt=0.5*log((1+(control.R-I))/(1-(control.R-I)))
    #U <- sqrt(1/(1/(N-3)+1/(control.N-3)))*(Rzt-control.Rzt)
    #U <- sqrt(control.N)*(control.R-I)
    U <- sqrt(1/(1/N+1/control.N))*(R-control.R)
}
U[lower.tri(U)] <- 0 ## Make it upper triangle
zscore <- sum(W*U) / sqrt.sum.W.W

#### 1/13/14 v0.33: genomic-control z-score by control value variance.
if (use.control) {
    #zscore <- zscore / sqrt(control.Rz.var)
}

log10pvalue <- pnorm(zscore, lower.tail=F, log.p=T)/log(10)
pvalue <- 10^log10pvalue

weighted.r <- sum(W*R)/sum(W)
weighted.r.SE <- sqrt.sum.W.W / sum(W) / sqrt(N) 

#titles <- c(titles, "PVALUE", "LOG10P", "N", "ZSCORE", "WEIGHTED.R", "WEIGHTED.R.STDERR")
#results <- c(results,  pvalue, log10pvalue, N, zscore, weighted.r, weighted.r.SE)
titles <- c(titles, "PVALUE", "LOG10P", "N", "NCASE", "NCONT", "ZSCORE", "NLOCUS")
results <- c(results,  pvalue, log10pvalue, N+control.N, N, control.N, zscore, p)

##==============================================##
## Part 2. Estimation                           ##
##==============================================##

## First, I should define likelihood functions.

## Full likelihood function ---- (slow)
ll.fun <- function(x) {
    Q <- I ## Q denotes true R based on model
    for (i in 1:p) {
        for (j in 1:p) {
            if (i < j) {
                vari <- 2*(1-x)*rafs[i]*(1-rafs[i]) + 2*x*RAFs[i]*(1-RAFs[i]) + 4*x*(1-x)*(rafs[i]-RAFs[i])^2
                varj <- 2*(1-x)*rafs[j]*(1-rafs[j]) + 2*x*RAFs[j]*(1-RAFs[j]) + 4*x*(1-x)*(rafs[j]-RAFs[j])^2
                meani <- 2*(1-x)*rafs[i] + 2*x*RAFs[i]
                meanj <- 2*(1-x)*rafs[j] + 2*x*RAFs[j]
                meanij <- 4*(1-x)*rafs[i]*rafs[j] + 4*x*RAFs[i]*RAFs[j]
                covij <- meanij - meani*meanj
                corij <- covij / sqrt(vari*varj)
                Q[i,j] <- Q[j,i] <- corij
            }
        }
    }
    Q.inv <- solve(Q)
    T <- I + Q*Q.inv
    T.inv <- solve(T)
    q <- p*(p-1)/2
    Sigma.inv <- matrix(0, q, q)
    ij <- 1
    for (i in 1:p) {
        for (j in 1:p) {
            if (i < j) {
                km <- 1
                for (k in 1:p) {
                    for (m in 1:p) {
                        if (k < m) {

                            Sigma.inv[ij, km] <- Q.inv[i,k]*Q.inv[j,m] + Q.inv[i,m]*Q.inv[j,k] -
                              Q.inv[i,j]*(T.inv[i,k] + T.inv[j,k] + T.inv[i,m] + T.inv[j,m])*Q.inv[k,m]
                            
                            km <- km + 1
                        }
                    }
                }
                ij <- ij + 1
            }
        }
    }
    det.Sigma <- 1/det(Sigma.inv)
    ## write(det.Sigma, stderr())
    Y <- rep(0, q) ## Observed correlations in 1-d vector
    mu <- rep(0, q) ## Mean correlations of model in 1-d vector
    ij <- 1
    for (i in 1:p) {
        for (j in 1:p) {
            if (i < j) {
                Y[ij] <- R[i,j]
                mu[ij] <- Q[i,j]
                ij <- ij + 1
            }
        }
    }
    ll <- -q*log(2*pi)/2 - log(det.Sigma)/2 + q*log(N)/2 - (1/2)*(Y-mu)%*%(Sigma.inv*N)%*%(Y-mu)
    ll
}

## Approximating likelihood function ---- (fast)
ll.approx.fun <- function(x) {
    Q <- I ## Q denotes true R based on model
    for (i in 1:p) {
        for (j in 1:p) {
            if (i < j) {
                vari <- 2*(1-x)*rafs[i]*(1-rafs[i]) + 2*x*RAFs[i]*(1-RAFs[i]) + 4*x*(1-x)*(rafs[i]-RAFs[i])^2
                varj <- 2*(1-x)*rafs[j]*(1-rafs[j]) + 2*x*RAFs[j]*(1-RAFs[j]) + 4*x*(1-x)*(rafs[j]-RAFs[j])^2
                meani <- 2*(1-x)*rafs[i] + 2*x*RAFs[i]
                meanj <- 2*(1-x)*rafs[j] + 2*x*RAFs[j]
                meanij <- 4*(1-x)*rafs[i]*rafs[j] + 4*x*RAFs[i]*RAFs[j]
                covij <- meanij - meani*meanj
                corij <- covij / sqrt(vari*varj)
                Q[i,j] <- Q[j,i] <- corij
            }
        }
    }
    Q.inv <- solve(Q)
    T <- I + Q*Q.inv
    T.inv <- solve(T)
    Y <- R - Q
    Q.inv.Y <- Q.inv%*%Y
    approx.ll <- -sum(diag(Q.inv.Y%*%Q.inv.Y))/2 + diag(Q.inv.Y)%*%T.inv%*%diag(Q.inv.Y)
    approx.ll
}

## Search function based on likelihood function. (Grid + Newton)
search.mle <- function(bins, lower, upper) {
    lls <- rep(0, length(bins))
    for (i in 1:length(bins)) {
        ## lls[i] <- ll.fun(bins[i]) 
        lls[i] <- ll.approx.fun(bins[i]) 
    }
    j <- which(lls == max(lls))
    maxbin <- bins[j]
    optim.rst <- optim(par=maxbin,
                       ## fn=ll.fun,
                       fn=ll.approx.fun,
                       method="L-BFGS-B",
                       lower=lower, upper=upper, control=list(fnscale=-1))
    mle.x <- optim.rst$par[1]
    ml <- optim.rst$value
    list(MLE=mle.x, LL=ml)
}

## Execute estimation!!
    titles <- c(titles, "MLE.LEFT", "LL.LEFT", "MLE.RIGHT", "LL.RIGHT")
if (estimation.mode == "Y") {
    print.log("Estimation mode is Y: Estimating proportion..")
    left.half <- search.mle(seq(0, 0.5, 0.01), 0, 0.5)
    right.half <- search.mle(seq(0.5, 1, 0.01), 0.5, 1)
    results <- c(results, left.half$MLE, left.half$LL, right.half$MLE, right.half$LL)
} else {
    results <- c(results, NA, NA, NA, NA)
}

##=======================================================##
## Part 3. Mendelian randomization (risk score approach
##=======================================================##

if (mendelian.mode == "Y") {
    print.log("Mendelian randomization mode is Y: Performing MR..")
    GRS <- c(dosage.before.regression.clean%*%log(ORs), 
             control.dosage.before.regression.clean%*%log(ORs))
    pheno <- c(rep(1, N), rep(0, control.N))
    pc.combined <- rbind(pc, control.pc)
    if (is.null(pc.combined)) {
        glm.rst <- glm(pheno ~ GRS, family=binomial(logit))
    } else {
        glm.rst <- glm(pheno ~ pc.combined + GRS, family=binomial(logit))
    }   
    coeffs <- summary(glm.rst)$coefficients
    beta <- coeffs["GRS", 1]
    stderr <- coeffs["GRS", 2]
    deviance <- anova(glm.rst, test="Chisq")["GRS", 2]
    log10pvalue <- pchisq(deviance, df=1, lower.tail=F, log.p=T)/log(10)
    pvalue <- 10^log10pvalue
    titles <- c(titles, "MR.PVALUE", "MR.LOG10P", "MR.BETA", "MR.STDERR")
    results <- c(results, pvalue, log10pvalue, beta, stderr)
    write.table(dosage.before.regression.clean%*%log(ORs), paste(out.file, ".GRS", sep=""), quote=F, row.names=F, col.names=F)
    write.table(control.dosage.before.regression.clean%*%log(ORs), paste(out.file, ".cGRS", sep=""), quote=F, row.names=F, col.names=F)
}

############################################
## Print results
############################################
print.log("Finished, writing results...")
write.table(rbind(titles, results), paste(out.file, ".BBrst", sep=""), quote=F, row.names=F, col.names=F)
colnames(R) = rownames(R) = colnames(W) = rownames(W) = rsids
write.table(R, paste(out.file, ".caseR", sep=""))
if (use.control) {
write.table(control.R, paste(out.file, ".contR", sep=""))
write.table(R-control.R, paste(out.file, ".diffR", sep=""))
}
write.table(W, paste(out.file, ".weights", sep=""))
cat(logs, file=paste(out.file, ".log", sep=""))