## Load source code
## Trailing command-line arguments; each one is evaluated below as R code
## (e.g. an assignment), which is how the run configuration is supplied.
args <- commandArgs(trailingOnly = TRUE)

## Evaluate every supplied argument in turn so that the variables it assigns
## exist for the rest of the script; just report when nothing was passed.
if (length(args) > 0) {
    for (i in seq_along(args)) {
        eval(parse(text = args[[i]]))
    }
} else {
    print("No arguments supplied.")
}

## Echo the run configuration. Every variable printed here must have been
## defined by the arguments above, otherwise the script stops at this point.
print(tum)
print(fasta)
print(fastafile)
print(samp)
print(bam)
print(candfile)
print(assembleddir)
print(type)
print(startcand)
print(endcand)
print(lsfdir)
print(evalue)
print(codedir)
print(hgnum)
print(readsdir)
print(reffilesdir)

## Candidate indices are 1-based: a start of 0 is bumped to 1, then both
## bounds are coerced to integers and echoed again.
if (startcand == 0) startcand <- 1
startcand <- as.integer(startcand)
endcand <- as.integer(endcand)
print(startcand)
print(endcand)

## Helper functions live in two files under codedir.
source(paste0(codedir, "/functions.R"))
source(paste0(codedir, "/MiscFunctions.R"))


##
## Assemble weird reads in candidate regions
##
## Uses the java class CandidateAnalysis.java.
## The java job is launched and must finish before the script moves on.

## Point TMPDIR at a fixed scratch directory for this process and its children.
Sys.setenv(TMPDIR = "/xchip/cga4/home/ehelman/tmp/")

## 
## Launches CandidateAnalysis.java to gather reads for one candidate region in BAM 
## 
## Builds a `java` command line from the arguments, prints it and runs it
## synchronously with system(), so the call returns once the java process exits.
##
## Args used to build the command (in this order on the command line):
##   bam, blacklist, tum, samp, c, startreg, stopreg, breakstart, breakstop,
##   readsfor, readsrev (each wrapped in double quotes), readsdir,
##   assembleddir, type.
## Args accepted but not used by this function (kept so existing callers work):
##   fastafile, refdir, queue, jobname.
## Globals read: reffilesdir (directory holding sam.jar); the working directory
##   is expected to contain a classes/ directory.
##
## Returns the exit status of the java process, invisibly. A non-zero status is
## reported with a warning instead of being silently discarded.
##
AssembleReads = function (tum,samp,fastafile,bam,blacklist,c,refdir,readsdir,assembleddir,queue,startreg,stopreg,breakstart,breakstop,readsfor,readsrev,type,jobname) {

	classpath = paste0(reffilesdir,"sam.jar:",getwd(),"/classes")

	## paste() joins with single spaces, so the quoted read lists keep one space
	## inside each pair of quotes (" reads "); that spacing is left as it was.
	cmd = paste("java -Xmx2g -classpath",classpath,"org.broadinstitute.cga.tools.seq.CandidateAnalysis",
		bam,blacklist,tum,samp,c,startreg,stopreg,breakstart,breakstop,
		"\"",readsfor,"\" \"",readsrev,"\"",readsdir,assembleddir,type)

	print(cmd)
	status = system(cmd)
	if(!isTRUE(status == 0)) {
		warning("CandidateAnalysis exited with status ",status," for chr",c,":",startreg,"-",stopreg,call.=FALSE)
	}
	invisible(status)
}

##
## BLAST a FASTA file against the ActiveFams_TEs nucleotide database.
##
## Does nothing (beyond printing a message) when `blastedfile` already exists.
## Otherwise it builds the BLAST database with makeblastdb if its .nsq index is
## missing, runs blastn with tabular output (-outfmt 6) into
## "<blastedfile>.partial", and renames that file to `blastedfile` afterwards.
##
## Args:
##   blastedfile: path of the final BLAST output.
##   readsfile:   query FASTA passed to blastn.
##   blastedclips, splitfile, fastafile: accepted for compatibility with
##                existing callers; not used by this function.
## Globals read: evalue (passed to blastn as -evalue).
##
## Returns (invisibly) the status message that was printed, or NULL when blastn
## produced no output file. Failures of the external tools are reported with
## warnings; the function never stops the script itself.
##
runBLAST = function(blastedfile,readsfile,blastedclips,splitfile,fastafile) {

	if(file.exists(blastedfile)) {
		return(invisible(print("Blasted file already exists")))
	}

	blastbin = "/xchip/cga/ehelman/ncbi-blast-2.2.27+/bin/"
	largerfasta = "/xchip/cga/ehelman/Transposons/db/ActiveFams_TEs.fasta"

	## Build the nucleotide database once; its presence is detected via the .nsq file
	if(!file.exists(paste0(largerfasta,".nsq"))) {
		cmd = paste0(blastbin,"makeblastdb -dbtype 'nucl' -in ",largerfasta)
		dbstatus = system(cmd)
		if(!isTRUE(dbstatus == 0)) {
			warning("makeblastdb exited with status ",dbstatus," for ",largerfasta,call.=FALSE)
		}
	}

	partialfile = paste0(blastedfile,".partial")
	cmd = paste0(blastbin,"blastn -query ",readsfile," -task blastn -db ",largerfasta," -out ",partialfile," -evalue ",evalue," -outfmt 6 -dbsize 3000000 -dust 'no' -soft_masking 'false' -perc_identity 60 -culling_limit 10")
	print(cmd)
	status = system(cmd)
	if(!isTRUE(status == 0)) {
		warning("blastn exited with status ",status," for ",readsfile,call.=FALSE)
	}

	## Only promote the partial file when blastn actually wrote one
	if(!file.exists(partialfile)) {
		warning("blastn produced no output; ",blastedfile," was not created",call.=FALSE)
		return(invisible(NULL))
	}
	file.rename(partialfile,blastedfile)

	invisible(print("Done blasting reads!"))
}


##
## BLAT a FASTA file against one genomic region of the hg<hgnum> 2bit reference.
##
## Runs `blat` (looked up on the PATH) with blast8 output into
## "<blattedfile>.partial" and renames that file to `blattedfile` afterwards.
##
## Args:
##   blattedfile: path of the final BLAT output.
##   readsfile:   query FASTA passed to blat.
##   chr, start, stop: region of the reference, passed as <db>:chr<chr>:<start>-<stop>.
## Globals read: reffilesdir (directory holding the .2bit file), hgnum.
##
## Returns (invisibly) the printed status message, or NULL when blat produced no
## output file. system() only returns after blat has exited, so a missing
## output file is reported with a warning instead of being waited for forever.
##
runBLAT = function (blattedfile, readsfile, chr, start, stop) {
	print("BLATTING contigs")
	BLAT = "blat"
	db = paste0(reffilesdir,"hg",hgnum,".2bit")
	partialfile = paste0(blattedfile,".partial")
	cmd = paste0(BLAT," ",db,":chr",chr,":",start,"-",stop," ",readsfile," -stepSize=5 -repMatch=2253 -minScore=0 -minIdentity=0 -noTrimA -noHead -out=blast8 ",partialfile)
	print(cmd)
	## intern=TRUE captures blat's standard output so it is not echoed; the result is discarded
	system(cmd, intern=TRUE)
	if(!file.exists(partialfile)) {
		warning("blat produced no output; ",blattedfile," was not created",call.=FALSE)
		return(invisible(NULL))
	}
	file.rename(partialfile,blattedfile)
	invisible(print("Done blatting"))
}


## 
## COMBINE CONTIGS ACROSS POLYAs
##
## Reads the FASTA-like file `contigsfile`, picks the two contigs with the
## largest lengths declared in their header lines and, when one of them starts
## and the other (as is, or reverse-complemented) ends with the same 10-base
## poly-A or poly-T tract, joins them. The contig that starts with the tract
## loses its first 10 bases in the join. The file is then rewritten with the
## combined contig first, followed by every original contig with its sequence
## collapsed onto a single line.
##
## Args:
##   contigsfile: path of the contigs file. Lines containing ">" are headers,
##                expected to look like ">name K: k length: n".
##
## Returns NULL. The file is left untouched when it cannot be read, holds fewer
## than two contigs, or the two longest contigs do not share a tract. A length
## that cannot be parsed from a header counts as "no shared tract".
## Relies on rev.comp(), which is not defined in this file.
##
combine_across_poly = function (contigsfile) {
	contigs = try(read.delim(contigsfile,header=FALSE,as.is=TRUE,sep="\t"),silent=TRUE)
	if(inherits(contigs,"try-error") || nrow(contigs)==0) return()

	headerrows = grep(">",contigs[,1])
	numEls = length(headerrows)
	## Two contigs are needed to combine anything (this also covers a file with no header lines)
	if(numEls < 2) return()

	headers = contigs[headerrows,1]
	contiglens = vapply(headers,function(x) as.numeric(unlist(strsplit(x,split="length: "))[2]),numeric(1),USE.NAMES=FALSE)
	contignames = gsub(">","",vapply(headers,function(x) unlist(strsplit(x,split=" K: "))[1],character(1),USE.NAMES=FALSE))

	## The sequence of contig k spans the rows after its header up to the row
	## before the next header (or the end of the file for the last contig)
	lastrows = c(headerrows[-1]-1,nrow(contigs))
	seqs = vapply(seq_len(numEls),function(k) {
		if(headerrows[k] >= lastrows[k]) return("")	# header without sequence lines
		paste(contigs[(headerrows[k]+1):lastrows[k],1],collapse="")
	},character(1))

	twolongest = order(contiglens,decreasing=TRUE)[1:2]
	longest = seqs[twolongest[1]]
	second = seqs[twolongest[2]]
	longestlen = contiglens[twolongest[1]]
	secondlen = contiglens[twolongest[2]]

	## TRUE when `headseq` starts with a poly-A/poly-T tract and `tailseq`
	## (whose declared length is `taillen`) ends with the same tract
	bridges = function(headseq,tailseq,taillen) {
		for(tract in c("AAAAAAAAAA","TTTTTTTTTT")) {
			if(isTRUE(substr(headseq,1,10) == tract && substr(tailseq,taillen-9,taillen) == tract)) return(TRUE)
		}
		FALSE
	}

	if(bridges(longest,second,secondlen)) {
		## Longest starts with polytract and second ends with polytract
		combinedseq = paste(second,substr(longest,11,longestlen),sep="")
		print("option1")
	} else if(bridges(second,longest,longestlen)) {
		## Longest ends with polytract and second starts with polytract
		## (sep="", not collapse="": the two halves must be joined without a space)
		combinedseq = paste(longest,substr(second,11,secondlen),sep="")
		print("option2")
	} else {
		secondrc = rev.comp(second)
		if(bridges(longest,secondrc,secondlen)) {
			## Longest starts with polytract and REVERSE COMPLEMENT of second ends with polytract
			combinedseq = paste(secondrc,substr(longest,11,longestlen),sep="")
			print("option3")
		} else if(bridges(secondrc,longest,longestlen)) {
			## Longest ends with polytract and REVERSE COMPLEMENT of second starts with polytract
			## (suffix starts at base 11, as in the other three cases)
			combinedseq = paste(longest,substr(secondrc,11,secondlen),sep="")
			print("option4")
		} else
			return()
	}

	combinedheader = paste(">",contignames[twolongest[1]],"+",contignames[twolongest[2]]," K: xx length: ",nchar(combinedseq),sep="")

	## Rewrite the file: header line then sequence line, combined contig first
	outheaders = c(combinedheader,headers)
	outseqs = c(combinedseq,seqs)
	writeLines(as.vector(rbind(outheaders,outseqs)),contigsfile)
	invisible(NULL)
}


##
## Drive the per-candidate pipeline for rows startcand..endcand of `candfile`.
##
## For each candidate row this creates the assembly folder, launches
## AssembleReads() when either contigs file is missing, combines contigs across
## poly-A/T tracts, then runs BLAST and BLAT on the forward and reverse contigs
## unless their output files already exist.
##
## Globals read: candfile, startcand, endcand, samp, fasta, type, assembleddir,
##   readsdir, tum, fastafile, bam, blacklist, refdir.
## Columns read from candfile: chr, start, stop, bkp_start, bkp_stop, Reads_f, Reads_r.
## Relies on ensure_dir_exists(), which is not defined in this file.
##
## Returns NULL invisibly. A range that starts beyond the last candidate is
## reported and skipped (startcand:endcand would otherwise count downwards and
## index rows that do not exist).
##
## NOTE(review): combine_across_poly() runs on every call, including when the
## contigs files already existed, and it rewrites the file it reads -- confirm
## that re-running over an existing assembly folder is intended.
##
Assemble = function () {
	cands = read.delim(candfile,sep="\t",as.is=TRUE)
	if(endcand > nrow(cands)) endcand = nrow(cands)

	if(startcand > endcand) {
		print(paste0("No candidates to process in range ",startcand,"-",endcand))
		return(invisible(NULL))
	}

	queue = "hour"
	jobname = paste(samp,fasta,"CA",sep="_")

	for(i in startcand:endcand) {
		print(paste0("Candidate number ",i))
		chrom = cands[i,"chr"]
		startreg = cands[i,"start"]
		stopreg = cands[i,"stop"]
		breakstart = cands[i,"bkp_start"]
		breakstop = cands[i,"bkp_stop"]
		readsfor = cands[i,"Reads_f"]
		readsrev = cands[i,"Reads_r"]

		## Common "<samp>-<type>.chr<c>.region.<start>-<stop>" stem shared by all per-candidate paths
		regionstem = paste0(samp,"-",type,".chr",chrom,".region.",startreg,"-",stopreg)
		assemblyfold = paste0(assembleddir,regionstem,".assembly/")
		ensure_dir_exists(assemblyfold)
		print(paste0("Assembly in ",assemblyfold))

		contigsf = paste0(assemblyfold,"contigs_f.fa")
		contigsr = paste0(assemblyfold,"contigs_r.fa")
		clippedf = paste0(readsdir,regionstem,"split_forward.txt")
		clippedr = paste0(readsdir,regionstem,"split_reverse.txt")

		## Assemble only when at least one of the two contigs files is missing
		if(!file.exists(contigsf) || !file.exists(contigsr)) {
			AssembleReads(tum,samp,fastafile,bam,blacklist,chrom,refdir,readsdir,assembleddir,queue,startreg,stopreg,breakstart,breakstop,readsfor,readsrev,type,jobname)
		}

		print("Combining contigs across polyA tracts")
		combine_across_poly(contigsf)
		combine_across_poly(contigsr)

		blastedfile_f = paste0(assemblyfold,"blasted_f.txt")
		blastedfile_r = paste0(assemblyfold,"blasted_r.txt")
		blastedclips_f = paste0(assemblyfold,"blastedclips_f.txt")
		blastedclips_r = paste0(assemblyfold,"blastedclips_r.txt")
		blattedfile_f = paste0(assemblyfold,"blatted_f.txt")
		blattedfile_r = paste0(assemblyfold,"blatted_r.txt")

		## Each alignment step is skipped when its output is already present
		if(!file.exists(blastedfile_f)) {
			runBLAST(blastedfile_f,contigsf,blastedclips_f,clippedf,fastafile)
		}
		if(!file.exists(blastedfile_r)) {
			runBLAST(blastedfile_r,contigsr,blastedclips_r,clippedr,fastafile)
		}
		if(!file.exists(blattedfile_f)) {
			runBLAT(blattedfile_f,contigsf,chrom,startreg,stopreg)
		}
		if(!file.exists(blattedfile_r)) {
			runBLAT(blattedfile_r,contigsr,chrom,startreg,stopreg)
		}
	}
	invisible(NULL)
}


## Run the assembly pipeline for the configured candidate range, then report completion.
Assemble()

print("Done assembling")
