## Read command-line arguments, then load helper functions

# Command-line arguments arrive as a character vector; each element is an R
# assignment (e.g. samp="S1") that is evaluated here so that it defines the
# variable it names.
# NOTE(review): eval(parse()) runs arbitrary code taken from the command line;
# only invoke this script with trusted arguments.
args <- commandArgs(trailingOnly = TRUE)

if (length(args) == 0) {
	print("No arguments supplied.")
} else {
	for (i in seq_along(args)) {
		eval(parse(text = args[[i]]))
	}
}

# Fail early, with a readable message, when a variable the script goes on to
# use was not defined (previously this surfaced as "object not found").
# `dir` is not checked: base R already has a function of that name, so the
# existence test would always pass.
script_env <- environment()
required_vars <- c("assembledcandfile", "germlinefile", "tum", "samp", "fasta")
is_defined <- vapply(required_vars, exists, logical(1), envir = script_env)
if (!all(is_defined)) {
	stop(
		"Missing required argument(s): ",
		paste(required_vars[!is_defined], collapse = ", "),
		call. = FALSE
	)
}

# Echo the run parameters into the log.
print(assembledcandfile)
print(germlinefile)
print(tum)
print(samp)
print(fasta)
print(dir) # prints the base function dir() when no `dir` argument was given
	
source("/xchip/cga1/ehelman/functions.R")

# Output path: the candidate file name with its ".txt" extension replaced by
# ".filtered.txt". The pattern is escaped and anchored so that only a trailing
# ".txt" is removed; the previous unanchored ".txt" regex stripped any
# character followed by "txt" anywhere in the path.
filteredassembledfile <- paste0(sub("\\.txt$", "", assembledcandfile), ".filtered.txt")

# Skip the sample when its filtered output is already present.
# `break` is only valid inside a loop; at top level it raised
# "no loop for break/next", so the script is ended explicitly instead.
# NOTE(review): quit() also closes an interactive session; this script is
# meant to be run in batch mode.
if (file.exists(filteredassembledfile)) {
	print(paste(filteredassembledfile, "file exists"))
	quit(save = "no", status = 0)
}

# Read the assembled candidate table; abort with the underlying reason when it
# cannot be read (missing file, empty file, malformed table, ...).
candidates <- try(read.delim(assembledcandfile, as.is = TRUE, sep = "\t"), silent = TRUE)
if (inherits(candidates, "try-error")) {
	print("Assembled file doesn't exist")
	stop(
		"Cannot read assembled candidate file '", assembledcandfile, "': ",
		conditionMessage(attr(candidates, "condition")),
		call. = FALSE
	)
}

#print( which(is.na(candidates[,"contig1_length"]) & is.na(candidates[,"contig2_length"])) )

	# Append a "Fraction" column: supporting reads (nreads) divided by the total
	# number of reads in the region (nTotRegion).
	candidates <- cbind(
		candidates,
		Fraction = candidates[, "nreads"] / candidates[, "nTotRegion"]
	)

	#cands = subset(candidates,nreads_f > 1 & nreads_r > 1)
	## Only L1HS, AluY and SVA
	#aluy = candidates[union(grep("AluY",candidates[,"contig1_TE"]),grep("AluY",candidates[,"contig2_TE"])),"start"]
	#activecands = subset(candidates, (TE == "L1HS" | TE_alt == "L1HS" | contig1_TE == "L1HS#L1#Homo" | contig2_TE == "L1HS#L1#Homo") |  
	#	(start %in% aluy) |
	#	(contig1_TE == "SVA#SINE#Homo" | contig2_TE == "SVA#SINE#Homo") )



	## Filter contig in reference
	## Drops every candidate whose assembled contig annotation (contig1_TE or
	## contig2_TE) begins with one of the TE names listed in near_refTE.
	#a = activecands
	a <- candidates

	# Split one near_refTE entry on ",", " " or "/" into TE names.
	# An NA entry yields no names. A bare "x" token (the tail of "AluSg/x") is
	# read as "AluSx". as.character() keeps strsplit() working when the whole
	# column was read as logical NA.
	ref_te_names <- function(near_ref) {
		tokens <- unlist(strsplit(as.character(near_ref), split = ",| |/"))
		tokens <- tokens[!is.na(tokens)]
		tokens[tokens == "x"] <- "AluSx"
		tokens
	}

	# TRUE when any name partially matches (is a prefix of) the contig's TE
	# annotation. A missing annotation never matches: pmatch() treats NA as the
	# string "NA", so previously an NA near_refTE "matched" an NA contig TE and
	# the candidate was discarded by mistake.
	names_match_contig <- function(te_names, contig_te) {
		!is.na(contig_te) && any(!is.na(pmatch(te_names, contig_te)))
	}

	# seq_len() keeps the scan empty when there are no candidates
	# (1:nrow(a) would have visited rows 1 and 0).
	in_ref <- vapply(seq_len(nrow(a)), function(j) {
		te_names <- ref_te_names(a[j, "near_refTE"])
		names_match_contig(te_names, a[j, "contig1_TE"]) ||
			names_match_contig(te_names, a[j, "contig2_TE"])
	}, logical(1))

	contiginref <- which(in_ref)
	# Negative indexing with an empty vector would drop every row, hence the guard.
	if (length(contiginref) > 0) {
		a <- a[-contiginref, ]
	}
	
	## Filter out reference
	#TEinREF= apply(candidates, 1, function(x) length(grep(x["TE"],x["near_refTE"])))
	#whichTEnotinREF = which(TEinREF == 0)
	#nonrefcands = candidates[whichTEnotinREF,]

	##
	## Filter number of germline reads
	##
	#totreads = 9
	#maxnorm = 3
	#f = subset(a, nreads > totreads & nnorm < maxnorm)
	
	##
	## Filter allelic fraction
	##
	# Each step keeps the rows whose condition is TRUE; rows where it is FALSE
	# or NA are dropped.
	minfract <- 0.005
	f <- a[with(a, which(Fraction > minfract)), , drop = FALSE]

	##
	## Filter number of bad reads in region (absolute count and proportion)
	##
	maxbad <- 150
	maxprop <- .3
	# alternative tried: nbadregion < maxbad only
	f <- f[with(f, which(nbadregion / nTotRegion < maxprop & nbadregion < maxbad)), , drop = FALSE]

	##
	## Require more than one supporting read on each side, and forward/reverse
	## clusters spanning less than 600
	##
	f <- f[with(f, which(
		nreads_f > 1 & nreads_r > 1 &
			stop_f - start_f < 600 & stop_r - start_r < 600
	)), , drop = FALSE]

	##
	## Filter number of weirds in Normal versus Tumor, then cap the normal count
	##
	tum2normratio <- 0.3
	nnormalweirds <- 25
	# alternative tried: nWeirdsNormal < (nWeirdsTumor/3) | (nWeirdsTumor < 16 & nWeirdsTumor-nWeirdsNormal > 5)
	# alternative tried: additionally nWeirdsTumor < 1000
	f2 <- f[with(f, which(
		(nWeirdsNormal < tum2normratio * nWeirdsTumor |
			(nWeirdsTumor < 16 & nWeirdsTumor - nWeirdsNormal > 5)) &
			nWeirdsNormal < nnormalweirds
	)), , drop = FALSE]

	##
	## Filter contig assemblies: at least one contig longer than 15
	##
	g <- f2[with(f2, which(contig1_length > 15 | contig2_length > 15)), ]

	print(nrow(g))
	
	## 
	##Filter germlines out and add them to germline file
	##
	# Candidates carrying a dbRIP or X1000G annotation are treated as germline:
	# they are appended to the germline file and removed from `g`.
	ingerm <- subset(g, !is.na(dbRIP) | !is.na(X1000G))
	if (nrow(ingerm) > 0) {
		# `sampdir` is not defined anywhere above this point, so rebuilding the
		# path unconditionally either crashed or overrode the `germlinefile`
		# argument. The rebuilt path is kept only when `sampdir` is available.
		if (exists("sampdir")) {
			germlinefile <- paste(sampdir, samp, ".germline.txt", sep = "")
		}
		germline <- try(read.delim(germlinefile, as.is = TRUE, sep = "\t"), silent = TRUE)
		if (inherits(germline, "try-error")) {
			# Previously execution fell through and failed on germline[, "start"].
			print(samp)
			stop("Cannot read germline file '", germlinefile, "'", call. = FALSE)
		}

		# add to germline file: only calls whose start is not already listed
		# NOTE(review): matching uses `start` alone; confirm that positions
		# cannot collide across chromosomes.
		# NOTE(review): 1:35 presumably selects the columns shared with the
		# germline table - verify against the file layout.
		m <- match(ingerm[, "start"], germline[, "start"])
		germline <- rbind(germline, ingerm[which(is.na(m)), 1:35])
		write.table(germline, germlinefile, sep = "\t", quote = FALSE, row.names = FALSE)

		# take away from somatic
		g <- subset(g, is.na(dbRIP) & is.na(X1000G))
	}
	
	##
	## Filter meanq
	##
	# Minimum mean quality required on both sides (meanq_f and meanq_r).
	# Values used previously: 50 for CRC, 20 for others. Only the last
	# assignment ever took effect, so the threshold is set once.
	qualthresh <- 1

	h <- subset(g, meanq_f >= qualthresh & meanq_r >= qualthresh)
	print(nrow(h))

	##
	## Filter location of alignment? -- no 'middles'?
	##
	#p = subset(h, TEstatus != "middle")
	#print(nrow(p))

	# Write the filtered candidates for this sample.
	write.table(h, file = filteredassembledfile, quote = FALSE, row.names = FALSE, sep = "\t")

	# The unmatched closing brace that used to follow (left over from a removed
	# enclosing loop) was a syntax error and has been dropped.



## Germline filter
# For every sample: read <sampdir><samp>.germline.txt, apply the read-support,
# quality and cluster-size filters, and write <samp>.germline.filtered.txt.
# NOTE(review): `samps` and `DIR` are not defined in this file; they are
# presumably set by the sourced helpers or in the calling session - confirm.
# seq_along() makes an empty `samps` a no-op (1:length(samps) would run 1, 0).
for (i in seq_along(samps)) {
	samp <- samps[i]
	sampdir <- paste(DIR, samp, "/", samp, "_", fasta, "/", sep = "")
	assembledfile <- paste(sampdir, samp, ".germline.txt", sep = "")
	candidates <- try(read.delim(assembledfile, as.is = TRUE, sep = "\t"), silent = TRUE)
	if (inherits(candidates, "try-error")) {
		# Log the sample whose germline file could not be read and move on.
		print(samp)
		next
	}

	## Reads supporting
	minreads <- 9
	a <- subset(candidates, nreads > minreads & nreads_f > 1 & nreads_r > 1)

	print(nrow(a))

	##
	## Filter number of bad reads in region (disabled)
	##
	#maxbad = 150
	#maxprop = .3
	#b = subset(a, nbadregion < maxbad)
	#b = subset(a, nbadregion/nTotRegion < maxprop & nbadregion < maxbad)

	# Not near reference (disabled)
	#b = subset(a,is.na(near_refTE))
	#print(nrow(b))
	b <- a

	# Quality is good (strictly greater than the threshold on both sides)
	qualthresh <- 1
	goodq <- subset(b, meanq_f > qualthresh & meanq_r > qualthresh)

	# Cluster size
	d <- subset(goodq, stop_f - start_f < 600 & stop_r - start_r < 600)

	print(nrow(d))
	#write.table(d,file=paste("/xchip/cga4/home/ehelman/Transposons/Analysis/",tum,"/",samp,".germline.filtered.txt",sep=""),quote=F,row.names=F,sep="\t")
	write.table(d, file = paste("/xchip/cga2/ehelman/Transposons/Analysis/postrevision/", tum, "/", samp, ".germline.filtered.txt", sep = ""), quote = FALSE, row.names = FALSE, sep = "\t")
}







## Enter manual review
# For every sample: read <samp>.filtered.txt, add empty `manual` and `notes`
# columns, and write the table out as <samp>.manual.xls (tab-separated).
# NOTE(review): `samps` is not defined in this file - confirm where it is set.
for (i in seq_along(samps)) {

	samp <- samps[i]
	dir <- paste("/xchip/cga4/home/ehelman/Transposons/Analysis/", tum, "/", sep = "")
	filteredfile <- paste(dir, samp, ".filtered.txt", sep = "")
	candidates <- try(read.delim(filteredfile, as.is = TRUE, sep = "\t"))
	if (inherits(candidates, "try-error")) {
		# Log the index of the sample that could not be read and move on.
		print(i)
		next
	}
	print(samp)
	print(nrow(candidates))

	# Sized to the table so that a header-only (zero-row) file also works;
	# cbind(candidates, NA, NA) failed there with "differing number of rows".
	candidates[["manual"]] <- rep(NA, nrow(candidates))
	candidates[["notes"]] <- rep(NA, nrow(candidates))

	print(nrow(candidates))

	write.table(candidates, file = paste(dir, samp, ".manual.xls", sep = ""), quote = FALSE, row.names = FALSE, sep = "\t")

}

# Try to read every sample's existing .manual.xls and log the index of each
# sample whose file cannot be read.
# NOTE(review): `old` is overwritten on every iteration, so only the last
# sample's table is available after the loop - confirm this is intended.
for (i in seq_along(samps)) {
	samp <- samps[i]
	old <- try(read.delim(file = paste(dir, samp, ".manual.xls", sep = ""), as.is = TRUE), silent = TRUE)
	if (inherits(old, "try-error")) {
		print(i)
	}
	#print(nrow(old))
}

# Copy earlier manual calls and notes onto the rows of `candidates` that have
# the same `start`.
if (inherits(old, "try-error")) {
	stop("Previous manual review table could not be read", call. = FALSE)
}
old_rows <- match(old[, "start"], candidates[, "start"])
print(old_rows)
# Rows of `old` without a counterpart give NA indices, which data-frame
# assignment rejects; only matched rows are carried over.
found <- !is.na(old_rows)
if (any(found)) {
	candidates[old_rows[found], "manual"] <- old[found, "manual"]
	candidates[old_rows[found], "notes"] <- old[found, "notes"]
}









