## 
#################################################################################
## TRANSPOSEQ v1.0
## 
## Elena Helman
## Broad Institute
#################################################################################
##



#################################################################################
## Load in arguments and code ##
#################################################################################

## Read the trailing command-line arguments. Each argument is parsed and
## evaluated as R code at top level (e.g. samp="X" creates the variable samp).
args <- commandArgs(trailingOnly = TRUE)

if (length(args) > 0) {
    for (i in seq_along(args)) {
        eval(parse(text = args[[i]]))
    }
} else {
    print("No arguments supplied.")
}
## Echo each run parameter to the log, in a fixed order. Looking up a name
## that is not bound anywhere fails with an "object not found" error.
for (paramname in c("tum", "samp", "fastafile", "tumorbam", "normalbam",
                    "hgnum", "fasta", "queue", "nalign", "percentident",
                    "evalue", "nreads", "readsfold", "dir")) {
    print(get(paramname))
}

## Run-level switches: cleanup controls the removal of intermediate files at
## the end of the script; nocandidates is flipped to TRUE when no candidate
## events survive consolidation/filtering.
cleanup <- TRUE
nocandidates <- FALSE

## nreads is used later as a numeric threshold (numreads) for candidate filtering.
nreads <- as.numeric(nreads)

## The helper scripts are loaded from the current working directory.
codedir <- paste0(getwd(), "/")
print(codedir)

source(paste0(codedir, "functions.R"))
source(paste0(codedir, "Filter_Cands.R"))


## Directory layout for this sample. Every path keeps its trailing "/" because
## later code builds file names by plain concatenation.
## NOTE(review): lsfdir is only constructed here, never passed to
## ensure_dir_exists() - confirm it is created elsewhere.
lsfdir <- paste0(dir, "lsf/", samp, "/")

sampdir <- paste0(dir, samp, "/")
ensure_dir_exists(sampdir)

## Per-sample, per-fasta output directory.
DIR <- paste0(sampdir, samp, "_", fasta, "/")
ensure_dir_exists(DIR)
#setwd(DIR)

assembleddir <- paste0(DIR, samp, ".assembly/")
ensure_dir_exists(assembleddir)

## Common prefix of all result files of this sample.
outstem <- paste0(DIR, samp)

## Locations for read files and for the reads used by the assembly step.
readsdir <- paste0(readsfold, samp, "/")
ensure_dir_exists(readsdir)
readsdirassembly <- paste0(readsdir, "assembly/")
ensure_dir_exists(readsdirassembly)

## Reference files live in a fixed, site-specific directory.
reffilesdir <- "/xchip/cga/ehelman/Transposons/TranspoSeq/reffiles/"
print(reffilesdir)

blacklist <- paste0(reffilesdir, "lane_blacklist.txt")
refdir <- paste0(reffilesdir, "hg", hgnum)

## Tab-delimited table of regions; the chr, start and stop columns are read
## by the job-submission loops further down.
centfile <- paste0(reffilesdir, "chrpositions_hg", hgnum, ".txt")
cent <- read.delim(centfile, as.is = TRUE, sep = "\t")

## fast is the part of the fasta label before the first underscore.
fast <- unlist(strsplit(fasta, split = "_"))[1]
hg <- paste0("Hg", hgnum)




## Collect the BAM files to analyse as a named character vector whose names
## are "Tumor" and/or "Normal". A BAM is skipped (with a log message) when its
## path is NA or the file does not exist.
## `||` is used so file.exists() is never evaluated on an NA argument.
bams = character(0)
if (is.na(tumorbam) || !file.exists(tumorbam)) {
    print("Tumor bam was not given or does not exist")
} else {
    bams = c(bams, Tumor = tumorbam)
}
if (is.na(normalbam) || !file.exists(normalbam)) {
    print("Normal bam was not given or does not exist")
} else {
    bams = c(bams, Normal = normalbam)
}
if (length(bams) == 0) {
    ## A bare `stop` only evaluates the function object and does not halt the
    ## script; it has to be called to abort before any job is submitted.
    stop("No existing bams given...Stopping")
}

##
## Define all output file names
##
## Suffixes of the consolidated candidate lists.
candsuffix <- "allcandidates"
onesidedsuffix <- "allonesided"
twosidedsuffix <- "alltwosided"

## Consolidated candidate tables: <outstem>.<suffix>.txt
candfile <- paste0(outstem, ".", candsuffix, ".txt")
onesidedfile <- paste0(outstem, ".", onesidedsuffix, ".txt")
twosidedfile <- paste0(outstem, ".", twosidedsuffix, ".txt")

## Germline / somatic result tables, raw and filtered.
germlinefile <- paste0(outstem, ".germline.txt")
somaticfile <- paste0(outstem, ".somatic.txt")
germlinefilefiltered <- paste0(outstem, ".germline.filtered.txt")
germlinefilt_onesided <- paste0(outstem, ".germline.onesided.txt")
somaticfilefiltered <- paste0(outstem, ".somatic.filtered.txt")
somaticfilt_onesided <- paste0(outstem, ".somatic.onesided.txt")

## Stems and final table used by the assembly steps.
outassemblystem <- paste0(assembleddir, samp)
outreadsstem <- paste0(readsdir, "assembly/", samp)
assembledcandfile <- paste0(outstem, ".allcandidates.assembled.txt")

print(outstem)

## Determine whether assembled file already created
if( file.exists( paste(outstem,".allcandidates.assembled.txt",sep="") ) ) {
       print("Assembled file exists")
} else { 


#################################################################################
## Part 1: GetReads ##
##   Starts jobs for each chromosome arm that
##   1. parse BAM file and write discordant pairs to files
##   2. BLAST these reads to reference fasta 
##   3. parse BLAST output to only what user defined as aligned
#################################################################################

 print("TRANSPOSEQ")
 print("Beginning Get Reads step...")

 setwd(codedir)
 scriptname = "GetReads.R"
 script = paste(codedir,scriptname,sep="")
 #script = scriptname
 ## Every Part 1 job is submitted under this one name so that all of them can
 ## be polled with a single bjobs call below.
 jobname = paste(samp,"1",sep="_")

 #queue = "cga"
 #queue = "week"
 #queue = "hour"
 ## NOTE(review): q is derived from the requested queue, but the bsub command
 ## below hard-codes "-q hour -W 4:00" and never uses q - confirm the intent.
 q = queue
 if(queue == "hour") q = "hour -W 4:00"

 ## One job per (row of cent, BAM) pair. seq_len()/seq_along() keep the loops
 ## from running on the bogus indices c(1, 0) when a table or vector is empty.
 for(j in seq_len(nrow(cent))) {
     c = cent[j,"chr"]
     print(c)
     startpos = cent[j,"start"]
     stoppos = cent[j,"stop"]

     for(i in seq_along(bams)) {
         bam = bams[i]

         outstem_part = paste(outstem,"-",names(bam),sep="")
         parsedfile = paste(outstem,"-",names(bam),".chr",c,".reads.aligned.txtparsed.txt",sep="")
         chrfile = paste(outstem,"-",names(bam),".chr",c,".txt",sep="")

         ## Submit only when neither output of an earlier run is present.
         if( !file.exists(parsedfile) && !file.exists(chrfile) ) {
             readsfile = paste(readsdir,samp,"-",names(bam),".chr",c,".reads.txt",sep="")
             #if (!file.exists(readsfile)) q = "cga"

             ## NOTE(review): the fasta= argument is given fastafile, not fasta -
             ## confirm this is what GetReads.R expects.
             cmd = paste("bsub -q hour -W 4:00 -J ",jobname," -o ",paste(lsfdir,samp,"_",fasta,"_",c,".out",sep=""),
                 " R CMD BATCH --no-save --no-restore '--args tum=\"",tum,"\" samp=\"",samp,"\" fasta=\"",fastafile,
                 "\" bam=\"",bam,"\" blacklist=\"",blacklist,"\" c=\"",c,"\" startpos=\"",startpos,"\" stoppos=\"",stoppos,
                 "\" fastafile=\"",fastafile,"\" refdir=\"",refdir,"\" readsdir=\"",readsdir,"\" lsfdir=\"",lsfdir,"\" outstem=\"",
                 outstem_part,"\" t=\"",names(bam),"\" queue=\"",queue,"\" evalue=\"",evalue,"\" nalign=\"",nalign,"\" percentident=\"",
                 percentident,"\"' ",script," ",paste(lsfdir,scriptname,"_",samp,"_",c,"_",names(bam),".out",sep=""),sep="")

             cat(cmd)
             cat("\n")
             system(cmd)

         }
     }

 }

 ## Wait until bjobs lists no more jobs with this name. The line count from
 ## wc is converted to an integer before the comparison: system(intern=TRUE)
 ## returns text, and comparing text with 0 is a lexicographic comparison that
 ## gives the wrong answer when wc pads its output with blanks.
 while ( isTRUE(as.integer(system(paste("bjobs -J ", jobname, " | wc -l"), intern=TRUE))[1] > 0) )
     {
         Sys.sleep(30)
     }


 print("Done with Get Reads step")

 ## Sanity check: the Tumor chr1p reads file must be readable and non-empty.
 representativereads = try ( read.delim(paste(readsdir,samp,"-Tumor.chr1p.reads.txt",sep=""),as.is=TRUE,sep="\t"), silent = TRUE )
 if( inherits(representativereads, "try-error") || nrow(representativereads) == 0) stop("No Tumor chr1p reads!")

#################################################################################
## Part 2: ProcessReads ##
##   Starts jobs for each chromosome arm that
##   1. Cluster unique pairmates file
##   2. Determine and annotate candidate events
#################################################################################

 print("Beginning Process Read step...")
 scriptname = "ProcessReads.R"
 script = paste(codedir,scriptname,sep="")
 setwd(codedir)
 #script = scriptname
 ## Shared LSF job name for all Part 2 jobs; polled below.
 jobname = paste(samp,fast,"2",sep="_")

 ## NOTE(review): q is set but the bsub command below hard-codes
 ## "-q hour -W 4:00" - confirm the intent.
 if(queue == "hour") q = "hour -W 4:00"

 ## One job per row of cent, skipped when that region's candidate file is
 ## already present. The per-region path is kept in its own variable so the
 ## consolidated candfile name is not overwritten by this loop.
 for(j in seq_len(nrow(cent))) {
     c = cent[j,"chr"]
     print(c)
     startpos = cent[j,"start"]
     stoppos = cent[j,"stop"]
     chrcandfile = paste(outstem,".chr",c,".candidates.txt",sep="")
     if(!file.exists(chrcandfile)) {
         cmd = paste("bsub -q hour -W 4:00 -J ",jobname," -o ",paste(lsfdir,samp,"_",fasta,"_",c,".out",sep="")," R CMD BATCH --no-save --no-restore '--args tum=\"",
             tum,"\" samp=\"",samp,"\" codedir=\"",codedir,"\" fasta=\"",fasta,"\" fastafile=\"",fastafile,"\" blacklist=\"",blacklist,"\" c=\"",c,"\" hg=\"",hg,
             "\" refdir=\"",refdir,"\" reffilesdir=\"",reffilesdir,"\" readsdir=\"",readsdir,"\" lsfdir=\"",lsfdir,"\" outstem=\"",outstem,"\" queue=\"",queue,
             "\" nalign=\"",nalign,"\" evalue=\"",evalue,"\" percentident=\"",percentident,"\"' ",script," ", paste(lsfdir,scriptname,"_",samp,"_",c,"_",fasta,".out",sep=""),sep="")
         print(c)

         system(cmd)
     }
 }


 ## Wait until bjobs lists no more jobs with this name. The wc line count is
 ## converted to an integer first; comparing the raw text with 0 would be a
 ## lexicographic comparison that fails on blank-padded wc output.
 while ( isTRUE(as.integer(system(paste("bjobs -J ", jobname, " | wc -l"), intern=TRUE))[1] > 0) )
     {
         Sys.sleep(30)
     }
 print("Done with Process Reads step")




#################################################################################
##  Consolidate and filter all chromosomes into preliminary candidate list
##     Includes germline,somatic, and one-sided events
#################################################################################


##
## Define all output file names
##
 ## Output file names are (re)built here from outstem.
 candsuffix = "allcandidates"
 onesidedsuffix = "allonesided"
 twosidedsuffix = "alltwosided"
 candfile = paste(outstem,candsuffix,"txt",sep=".")
 onesidedfile =  paste(outstem,onesidedsuffix,"txt",sep=".")
 twosidedfile =  paste(outstem,twosidedsuffix,"txt",sep=".")
 germlinefile = paste(outstem,".germline.txt",sep="")
 somaticfile = paste(outstem,".somatic.txt",sep="")
 germlinefilefiltered = paste(outstem,".germline.filtered.txt",sep="")
 germlinefilt_onesided = paste(outstem,".germline.onesided.txt",sep="")
 somaticfilefiltered = paste(outstem,".somatic.filtered.txt",sep="")
 somaticfilt_onesided = paste(outstem,".somatic.onesided.txt",sep="")

 outassemblystem = paste(assembleddir,samp,sep="")
 outreadsstem = paste(readsdir,"assembly/",samp,sep="")
 assembledcandfile = paste(outstem,".allcandidates.assembled.txt",sep="")

 ## Writes a one-cell NA table to each of the given result files; used when
 ## there is nothing to report.
 write_placeholder_outputs = function(files) {
     for (f in files) {
         write.table(NA,file=f,sep="\t",quote=FALSE,row.names=FALSE)
     }
 }
 placeholderfiles = c(assembledcandfile, somaticfile, germlinefile,
                      germlinefilefiltered, germlinefilt_onesided,
                      somaticfilefiltered, somaticfilt_onesided)

 ## Consolidation is skipped when the combined candidate file already exists
 ## (resumed run).
 if(!file.exists(candfile)) {
     print("Consolidating candidates...")
     candidates = vector()
     for ( j in seq_len(nrow(cent)) ) {
         print(j)
         c = cent[j,"chr"]
         cands = try( read.delim(paste(outstem,".chr",c,".candidates.txt",sep=""),sep="\t",as.is=TRUE ), silent=TRUE )
         ## Skip regions whose file is unreadable or has a single column.
         if( inherits(cands, "try-error") || length(cands) == 1 ) next
         candidates = rbind(candidates,cands)
     }
     if(length(candidates) == 0) {
         print("No candidates!")
         nocandidates = TRUE
         write_placeholder_outputs(placeholderfiles)
     } else {

         candidates = subset(candidates,!is.na(chr))
         candidates = candidates[order(candidates[,"nreads"],decreasing=TRUE),]

         print("Candidates read in..")
         source(paste(codedir,"/Filter_Cands.R",sep=""))

         candidates[,"TE"] = gsub("L1PREC\\|","L1PR\\|",candidates[,"TE"])
         ## First-pass filter candidates for min number of total supporting reads
         filtcands = filter_candidates( candidates, numreads = nreads )
         print(nrow(candidates))
         print(nrow(filtcands))
         candidates = filtcands

         if(nrow(candidates) == 0) {
             print("No filtered candidates!")
             nocandidates = TRUE
             write_placeholder_outputs(placeholderfiles)
         } else {

             write.table(candidates,file=candfile,sep="\t",row.names=FALSE,quote=FALSE)

             ## Two-sided events have at least 2 supporting reads on both the
             ## _f and the _r side; everything else is one-sided.
             onesided = subset(candidates, is.na(nreads_f) | (nreads_f < 2) | is.na(nreads_r) | nreads_r < 2)
             twosided = subset(candidates, !is.na(nreads_f) & nreads_f > 1 & !is.na(nreads_r) & nreads_r > 1)

             write.table(onesided,file=onesidedfile,sep="\t",row.names=FALSE,quote=FALSE)
             write.table(twosided,file=twosidedfile,sep="\t",row.names=FALSE,quote=FALSE)
         }
     }
 }

 print("Candidate file written")

## Abort when consolidation found nothing. The nocandidates flag is tested
## instead of nrow(candidates): candidates does not exist on a resumed run
## (candfile already present) and has no rows attribute when nothing was read,
## both of which made the previous check fail with an unrelated error.
## NOTE(review): because of this stop, the later nocandidates branches are
## never reached with nocandidates == TRUE - confirm aborting is intended.
if (nocandidates) {
    stop("No candidates...?")
}



#################################################################################
## Part 3: AssembleReads ##
##   Starts one job per batch of numperjob candidate events
##   1. parse BAM and write discordant reads within candidate region
##   2. de-novo assemble these reads
##	 3. BLAST contigs from assembly to reference fasta 
#################################################################################

 if(!nocandidates) {
     print("Beginning assembly step...")

     ## WHICH FILE TO ASSEMBLE: all candidates, two-sided candidates, etc
     filetoassemblesuffix = candsuffix
     filetoassemble = candfile
     #filetoassemblesuffix = twosidedsuffix
     #filetoassemble = twosidedfile


     scriptname = "AssembleReads.R"
     setwd(codedir)
     script = paste(codedir,scriptname,sep="")
     #script = scriptname
     jobname = paste(samp,"3",sep="_")

     assembleddir=paste(DIR,samp,".assembly/",sep="")
     ensure_dir_exists(assembleddir)
     readsdirassembly=paste(readsdir,"assembly/",sep="")
     ensure_dir_exists(readsdirassembly)
     toassemble = read.delim(file=filetoassemble,as.is=TRUE,sep="\t")

     outassemblystem = paste(assembleddir,samp,sep="")
     outreadsstem = paste(readsdir,"assembly/",samp,sep="")
     assembledcandfile = paste(outstem,".",filetoassemblesuffix,".assembled.txt",sep="")

     ## Resume heuristic: one randomly chosen candidate is probed; if its Tumor
     ## blasted_f.txt already exists, no assembly jobs are submitted at all.
     randi = sample(seq_len(nrow(toassemble)),1)
     exampleassembledfile = paste(outassemblystem,"-Tumor.chr",toassemble[randi,"chr"],".region.",toassemble[randi,"start"],"-",toassemble[randi,"stop"],".assembly/blasted_f.txt",sep="")

     if(!file.exists(assembledcandfile) && !file.exists(exampleassembledfile) ) {

         for(i in seq_along(bams)) {
             bam = bams[i]
             ## Split the candidates into batches of numperjob rows, where
             ## numperjob = floor(nrow / min(150, nrow)); job k is handed the
             ## row range k*numperjob .. (k+1)*numperjob.
             startcand = endcand = 1
             jobsatatime = min(150,nrow(toassemble))
             numperjob = floor(nrow(toassemble)/jobsatatime)
             k=0
             #queue = "cga"
             for(k in 0:min(5000,floor(nrow(toassemble)/numperjob))) {
                 jobname = paste(samp,"3",k,sep="_")
                 startcand = k*numperjob
                 if(k==0) startcand=1
                 ## A batch is skipped when nfile.txt already exists in the
                 ## assembly directory of its first candidate.
                 assemblepath = paste(assembleddir,samp,"-",names(bams)[i],".chr",toassemble[startcand,"chr"],".region.",toassemble[startcand,"start"],"-",toassemble[startcand,"stop"],".assembly",sep="")
                 assembledf = paste(assemblepath,"/contigs_f.fa",sep="")
                 assembledr = paste(assemblepath,"/contigs_r.fa",sep="")
                 nspan = paste(assemblepath,"/nfile.txt",sep="")
                 if(!file.exists(nspan)) {
                     ## NOTE(review): endcand is never updated, so every job's
                     ## -o log is <lsfdir><samp>_1.out - confirm intended.
                     cmd = paste("bsub -q hour -W 4:00 -J ",jobname," -o ",paste(lsfdir,samp,"_",endcand,".out",sep="")," R CMD BATCH --no-save --no-restore '--args tum=\"",tum,"\" samp=\"",samp,
                     "\" hgnum=\"",hgnum,"\" evalue=\"",evalue,"\" fasta=\"",fasta,"\" fastafile=\"",fastafile,"\" bam=\"",bam,"\" blacklist=\"",blacklist,"\" type=\"",names(bam),"\" hg=\"",hg,"\" refdir=\"",refdir,
                     "\" codedir=\"",codedir,"\" readsdir=\"",readsdirassembly,"\" reffilesdir=\"",reffilesdir,"\" lsfdir=\"",lsfdir,"\" assembleddir=\"",assembleddir,"\" startcand=\"",k*numperjob,"\" endcand=\"",(k+1)*numperjob,"\" outstem=\"",outstem,"\" queue=\"",queue,"\" candfile=\"",
                     candfile,"\"' ",script," ", paste(lsfdir,scriptname,"_",names(bam),"_",k,".out",sep=""),sep="")
                     print(cmd)
                     system(cmd)
                 }
             }
         }
     }

     ## Wait until bjobs lists no more <samp>_3* jobs. The wildcard is quoted so
     ## the shell passes it to bjobs unexpanded, and the wc line count is
     ## converted to an integer before it is compared with 0 (comparing the raw
     ## text would be lexicographic).
     while ( isTRUE(as.integer(system(paste("bjobs -J ",shQuote(paste(samp,"_3*",sep="")), " | wc -l"), intern=TRUE))[1] > 0) )
         {
          Sys.sleep(60)
         }

     print("Done with Assembly step")
 }





#################################################################################
## Part 4: AddAssembly ##
##   1. parse BLAST output and determine which contig to include
##   2. Add all relevant contig,sequence and split read info to candidate file
#################################################################################

 print("Adding back assembly information")

 setwd(codedir)
 scriptname = "AddAssembly.R"
 script = paste(codedir,scriptname,sep="")
 #script = scriptname
 jobname = paste(samp,fast,"4",sep="_")

 ## A single job builds the assembled candidate table; nothing is submitted
 ## when that table already exists.
 if(!file.exists(assembledcandfile)) {

     cmd = paste("bsub -q hour -W 4:00 -J ",jobname," -o ",paste(lsfdir,samp,"_",fasta,".out",sep="")," R CMD BATCH --no-save --no-restore '--args candfile=\"",
         candfile,"\" refdir=\"",refdir,"\" codedir=\"",codedir,"\" assembledcandfile=\"",assembledcandfile,"\" lsfdir=\"",lsfdir,"\" outassemblystem=\"",
         outassemblystem,"\" outreadsstem=\"",outreadsstem,"\"' ",script," ", paste(lsfdir,scriptname,"_",samp,"_",fasta,".out",sep=""),sep="")
     print(cmd)
     system(cmd)

     ## Wait until bjobs no longer lists the job. The wc line count is
     ## converted to an integer first; comparing the raw text with 0 would be
     ## a lexicographic comparison that fails on blank-padded wc output.
     while ( isTRUE(as.integer(system(paste("bjobs -J ", jobname, " | wc -l"), intern=TRUE))[1] > 0) )
         {
         Sys.sleep(30)
         }
 }
 
}

## Pause once for 30 seconds when the assembled candidate table is not there
## yet (presumably to allow for file-system latency - TODO confirm).
if (!file.exists(assembledcandfile)) Sys.sleep(30)

#################################################################################
##  Post-processing filter and separate into different event lists
#################################################################################
## Runs only when candidates were found.
if (!nocandidates) {

    print("Post-processing filtering and separating into event lists")

    cands <- read.delim(assembledcandfile, as.is = TRUE, sep = "\t")

    ## Reload the filter functions before using them.
    source(paste0(codedir, "/Filter_Cands.R"))

    ## Somatic and germline filtering of the assembled candidates; the helpers
    ## are handed the matching output file names.
    filteredsomatic <- filter_somatic(cands, somaticfile, germlinefile)
    filteredgermline <- filter_germline(cands, germlinefile)

    ## One-sided vs. partnered events, for somatic and germline separately.
    filter_onesided(filteredsomatic, somaticfilefiltered, somaticfilt_onesided)
    filter_onesided(filteredgermline, germlinefilefiltered, germlinefilt_onesided)

}

#################################################################################
## Delete non-essential files 
#################################################################################

if (isTRUE(cleanup)) {
    print("Removing intermediate files")

    ## Each keyword is a regular expression matched against the file names in
    ## DIR; all matching files are deleted.
    keywords = c("aligned","p.txt","q.txt","clusters","\\.candidates\\.")
    for(k in keywords) {
        print(paste("Removing files with keyword",k))
        filestoremove = list.files(DIR, pattern = k)
        if(length(filestoremove) > 0) {
            print(paste(length(filestoremove),"files"))
            ## unlink() replaces one shell "rm" per file: paths containing
            ## blanks or shell metacharacters are handled, and (as with plain
            ## rm) directories are left alone because recursive is FALSE.
            unlink(paste(DIR,filestoremove,sep=""))
        }
    }

    ## Remove the assembly reads directory, then the reads directory itself if
    ## anything is left in it. Trailing slashes are stripped before unlink().
    unlink(sub("/+$","",readsdirassembly), recursive = TRUE)
    if(length(dir(readsdir)) > 0) {
        unlink(sub("/+$","",readsdir), recursive = TRUE)
    }
}

#################################################################################
## FINISHED RUNNING TRANSPOSEQ ##
#################################################################################

print("TRANSPOSEQ FINISHED")


