## Load source code

## Trailing command-line arguments, one R expression per argument
## (e.g. outstem="path/prefix").
args <- commandArgs(trailingOnly = TRUE)

## Evaluate every supplied expression so the variables it assigns become
## available to the rest of the script.
## NOTE(review): eval(parse()) executes arbitrary text taken from the command
## line; only run this script with trusted arguments.
if (length(args) > 0) {
    for (i in seq_along(args)) {
        eval(parse(text = args[[i]]))
    }
} else {
    print("No arguments supplied.")
}

## Echo the run configuration. Each of these is expected to have been set by
## the arguments above; a missing one stops the script here.
print(tum)
print(fasta)
print(samp)
print(c)
print(hg)
print(outstem)
print(percentident)
print(nalign)
print(refdir)
print(codedir)
print(reffilesdir)

## Load helper functions; codedir is concatenated as-is, so it must end with
## a path separator.
source(paste0(codedir, "functions.R"))
source(paste0(codedir, "MiscFunctions.R"))




## Annotate the tumor insertion candidates of one chromosome and write them to
## "<outstem>.chr<c>.candidates.txt" (tab separated).
##
## Arguments:
##   samp  sample identifier (accepted for interface compatibility; not used here)
##   c     chromosome label used to build the input and output file names
##   hg    genome build label, forwarded to add_annotation()
##
## Reads the globals `outstem` and `fasta`, and calls get_unique_TEs(),
## is_inverted(), get_TEstatus() and add_annotation().
##
## Returns NA (after writing a file that contains only NA) when there are no
## tumor candidates; otherwise returns the value of the final write.table() call.
annotate_candidates = function( samp, c, hg ) {

	outfile = paste(outstem,".chr",c,".candidates.txt",sep="")
	write_result = function(x) {
		write.table(x,file=outfile,quote=FALSE,row.names=FALSE,sep="\t")
	}
	## A cluster table counts as missing when it could not be read, or when it
	## has a single column (which is what a file holding only "NA" reads as).
	is_missing = function(x) inherits(x,"try-error") || length(x) == 1

	TElengths = get_unique_TEs( fasta )

	## Get candidates from Tumor and Normal
	Tumfile = paste(outstem,"-Tumor.clusters.chr",c,".txt",sep="")
	Tum = try ( read.delim( Tumfile, as.is=TRUE, sep="\t" ), silent = TRUE )

	Normfile = paste(outstem,"-Normal.clusters.chr",c,".txt",sep="")
	Norm = try ( read.delim( Normfile, as.is=TRUE, sep="\t" ), silent = TRUE )

	## Without tumor candidates there is nothing to annotate, whatever the
	## state of the normal sample: write the placeholder once and stop.
	if( is_missing(Tum) ) {
		print("No tumor candidates")
		write_result(NA)
		return(NA)
	}

	## add_annotation() treats any length-1 Norm as "no normal data"; pass a
	## plain NA rather than a try-error object.
	if( is_missing(Norm) )
		Norm = NA

	## Strongest candidates (most supporting reads) first
	cand = Tum[ order(Tum[,"nreads"],decreasing=TRUE), ]

	if( nrow(cand) == 0 ) {
		print("No tumor candidates pass filter")
		write_result(NA)
		return(NA)
	}

	## Add TE annotation
	cand = cbind(cand, inverted = NA, TEstatus = NA)
	for( i in seq_len(nrow(cand)) ) {
		cand[i,"inverted"] = is_inverted(cand[i,])
		cand[i,"TEstatus"] = get_TEstatus(TElengths,cand[i,])
	}

	## Add gene annotation
	annotated_cands = add_annotation(cand, Norm, hg)

	write_result(annotated_cands)
}



## Decide whether a candidate is inverted, from the end positions of its two
## read clusters.
##
## Argument:
##   row  one candidate (a one-row data frame, list or named vector) with the
##        fields "has_partner", "maxpos_f" and "maxpos_r"
##
## Returns a scalar logical: NA when "has_partner" is absent or NA, or when
## either position is unavailable; FALSE when maxpos_r is greater than
## maxpos_f; TRUE otherwise.
is_inverted = function ( row ) {
	has_partner = row[["has_partner"]]
	if( length(has_partner) != 1 || is.na(has_partner) )
		return(NA)

	maxpos_f = row[["maxpos_f"]]
	maxpos_r = row[["maxpos_r"]]
	if( length(maxpos_f) != 1 || length(maxpos_r) != 1 )
		return(NA)

	## Plain scalar comparison; NA positions propagate to an NA result
	return( !(maxpos_r > maxpos_f) )
}


## Classify which part of its TE a candidate's read clusters reach.
##
## Arguments:
##   TElengths  lengths indexed by TE name (presumably a named numeric vector,
##              as returned by get_unique_TEs() -- confirm); a name may carry
##              extra "|"-separated fields after the TE name
##   row        one candidate (one-row data frame or list) with the fields
##              "TE", "minpos_f", "maxpos_f", "minpos_r" and "maxpos_r"
##
## With thresh = max(length/6, 100), a strand "reaches the end" when its min or
## max position is greater than length - thresh, and "reaches the start" when
## either position is below thresh. A strand whose min position is NA is ignored.
##
## Returns "FULL_LENGTH" (both), "5' TRUNCATED" (end only), "3' TRUNCATED"
## (start only) or "middle" (neither). Returns NA, with a warning, when no
## length can be found for the TE.
get_TEstatus = function ( TElengths, row ) {
	te_name = as.character(row[["TE"]])

	## Exact name first, then the part of each name before the first "|"
	TElen = TElengths[te_name]
	if( length(TElen) != 1 || is.na(TElen) ) {
		full_names = names(TElengths)
		if( is.null(full_names) )
			full_names = character(0)
		shortTEnames = vapply(strsplit(full_names,split="\\|"), function(x) x[1], character(1))
		TElen = TElengths[which(shortTEnames == te_name)]
	}

	## Zero matches would break the comparisons below and several matches
	## would make them non-scalar: keep the first usable length only.
	TElen = as.numeric(TElen)
	TElen = TElen[!is.na(TElen)]
	if( length(TElen) == 0 ) {
		warning("No length found for TE '", te_name, "'; TEstatus set to NA", call.=FALSE)
		return(NA_character_)
	}
	TElen = TElen[1]
	thresh = max(TElen/6,100)

	## TRUE when the strand has a min position and either of its positions
	## satisfies `test`; NA comparisons count as FALSE instead of reaching if().
	strand_hits = function( minpos, maxpos, test ) {
		length(minpos) == 1 && !is.na(minpos) &&
			( isTRUE(test(minpos)) || isTRUE(test(maxpos)) )
	}
	near_end = function(pos) pos > TElen-thresh
	near_start = function(pos) pos < thresh

	threep = strand_hits(row[["minpos_f"]], row[["maxpos_f"]], near_end) ||
		strand_hits(row[["minpos_r"]], row[["maxpos_r"]], near_end)
	fivep = strand_hits(row[["minpos_f"]], row[["maxpos_f"]], near_start) ||
		strand_hits(row[["minpos_r"]], row[["maxpos_r"]], near_start)

	status = "FULL_LENGTH"
	if( threep && !fivep )
		status = "5' TRUNCATED"
	if( !threep && fivep )
		status = "3' TRUNCATED"
	if( !threep && !fivep )
		status = "middle"

	return(status)
}


## Add annotation columns to a table of insertion candidates.
##
## Arguments:
##   cand  data frame of candidates with the columns "chr", "start", "stop",
##         "bkp_start" and "bkp_stop"
##   Norm  data frame of normal-sample clusters ("chr", "start", "stop",
##         "nreads"), or any length-1 value when no normal data is available
##   hg    genome build label used in the reference file names
##
## Reads the reference tables from the global directory prefix `reffilesdir`.
## NOTE(review): the RepeatMasker file name also uses the global `c`
## (chromosome), which is not a parameter of this function -- confirm callers
## always run with the matching global.
##
## Returns `cand` with nine extra columns: in_normal, nnorm, in_gene,
## gene_region, in_CNV, dbRIP, 1000G, lincRNA and near_refTE.
add_annotation = function ( cand, Norm, hg ) {

	nc = ncol(cand)
	cand = cbind(cand,NA,0,NA,NA,NA,NA,NA,NA,NA)
	colnames(cand)[(nc+1):(nc+9)] = c("in_normal","nnorm","in_gene","gene_region","in_CNV","dbRIP","1000G","lincRNA","near_refTE")

	## Read in annotation information
	refseq = read.delim(paste(reffilesdir,"refseq_",hg,".txt",sep=""),sep="\t",stringsAsFactors=FALSE)
	repeatmasker = read.delim(paste(reffilesdir,"RepeatMasker_",hg,"_chr",c,".BED",sep=""),sep="\t",stringsAsFactors=FALSE)
	CNV = read.delim(paste(reffilesdir,"CNV_",hg,".txt",sep=""),sep="\t",stringsAsFactors=FALSE)
	dbRIP = read.delim(paste(reffilesdir,"EwingKazazRIPs_",hg,".txt",sep=""),as.is=TRUE,sep="\t")
	G1000 = read.delim(paste(reffilesdir,"1000Gevents_",hg,".txt",sep=""),as.is=TRUE,sep="\t")
	lincRNA = read.delim(paste(reffilesdir,"lincRNA_",hg,".txt",sep=""),as.is=TRUE,sep="\t")

	## Describe where a breakpoint lies relative to a gene's exons. All three
	## arguments are numeric. Returns NA when the position cannot be placed.
	gene_region_label = function( exonstarts, exonstops, midbkp ) {
		whichexon = which( exonstops > midbkp & exonstarts < midbkp )
		if( length(whichexon) > 0 )
			return(paste("exon ",whichexon,sep="",collapse=","))

		## Last exon ending before, and first exon starting after, the breakpoint
		afterstop = which( exonstops < midbkp )
		afterstop = afterstop[length(afterstop)]
		beforestart = which( exonstarts > midbkp )[1]

		distbeforestart = exonstarts[beforestart] - midbkp
		distafterstop = midbkp - exonstops[afterstop]
		if( !is.na(distbeforestart) && (length(distafterstop)==0 || isTRUE(distbeforestart < distafterstop)) )
			return(paste(distbeforestart,"bp before exon",beforestart,sep=""))
		if( length(distafterstop) > 0 )
			return(paste(distafterstop,"bp after exon",afterstop,sep=""))
		NA
	}

	# Find location of cands in normals, genes, CNVs
	print("Classifying candidates...")
	matchmarg = 200   # margin (bp) for matches against Norm, 1000G and dbRIP
	genereg = 1000    # margin (bp) added on both sides of a transcript
	have_norm = !( length(Norm)==1 || length(cand)==1 )

	for ( i in seq_len(nrow(cand)) ) {
		chrname = paste("chr",cand[i,"chr"],sep="")
		cstart = cand[i,"start"]
		cstop = cand[i,"stop"]

		## Support for the same event in the normal sample
		in_normal = FALSE
		nnorm = 0
		if( !have_norm ) {
			print("Norm or Turm is NA")
		} else {
			whichnorm = which(Norm[,"chr"]==cand[i,"chr"] & Norm[,"start"]<=cstop+matchmarg & Norm[,"stop"]>=cstart-matchmarg)
			if( length(whichnorm) > 0 ) {
				in_normal = TRUE
				nnorm = sum(Norm[whichnorm,"nreads"])
			}
		}
		cand[i,"in_normal"] = in_normal
		cand[i,"nnorm"] = nnorm

		## Gene overlap (within genereg of a transcript) and position relative to exons
		whichrefseq = which(refseq[,"chrom"]==chrname & (refseq[,"txStart"]-genereg)<=cstop & (refseq[,"txEnd"]+genereg)>=cstart)
		if( length(whichrefseq) > 0 ) {
			whichrefseq = whichrefseq[1] # Just use first gene
			cand[i,"in_gene"] = paste(unique(refseq[whichrefseq,"name2"]),collapse="; ")

			## Exon coordinates arrive as comma-separated text; convert them to
			## numbers before comparing, otherwise ">" and "<" compare strings.
			exonstarts = as.numeric(unlist(strsplit(as.character(refseq[whichrefseq,"exonStarts"]),split=",")))
			exonstops = as.numeric(unlist(strsplit(as.character(refseq[whichrefseq,"exonEnds"]),split=",")))

			## Midpoint of the breakpoint interval. mean(a, b) would pass b as
			## the `trim` argument, so the two values must be combined first.
			midbkp = mean(c(cand[i,"bkp_start"],cand[i,"bkp_stop"]),na.rm=TRUE)
			if( !is.na(midbkp) )
				cand[i,"gene_region"] = gene_region_label(exonstarts,exonstops,midbkp)
		}

		## Overlaps with the remaining reference tables
		idc = which(CNV[,"Chromosome"]==cand[i,"chr"] & CNV[,"Start"]<=cstop & CNV[,"End"]>=cstart)
		idmasker = which(repeatmasker[,"Chr"]==chrname & repeatmasker[,"Start"]<=cstop & repeatmasker[,"Stop"]>=cstart)
		id1000g = which(G1000[,"CHR"]==cand[i,"chr"] & G1000[,"POS"]<=cstop+matchmarg & G1000[,"POS"]>=cstart-matchmarg)
		idlinc = which(lincRNA[,1]==chrname & lincRNA[,2]<=cstop & lincRNA[,3]>=cstart)
		idRIP = which(dbRIP[,"Chr"]==cand[i,"chr"] & dbRIP[,"AvgLoc"]<=cstop+matchmarg & dbRIP[,"AvgLoc"]>=cstart-matchmarg)

		if(length(idc)>0) cand[i,"in_CNV"] = as.character(CNV[idc[1],"ID"])
		if(length(idlinc)>0) cand[i,"lincRNA"] = as.character(paste(lincRNA[idlinc,4],collapse=","))

		## Columns 5 to 7 of each overlapping RepeatMasker row, joined by "#"
		if(length(idmasker)>0)
			cand[i,"near_refTE"] = as.character(paste(sapply(idmasker, function(x) { paste(repeatmasker[x,5:7],collapse="#") }),collapse=", "))

		if(length(id1000g)>0) cand[i,"1000G"] = as.character(paste(G1000[id1000g,"ELEMENT"],collapse=","))
		## Stores the matching row numbers of the dbRIP table
		if(length(idRIP)>0) cand[i,"dbRIP"] = paste(idRIP,collapse=",")

	}

	return(cand)
}



## Script entry point: annotate the candidates for the sample, chromosome and
## genome build supplied on the command line.
annotate_candidates(samp, c, hg)


