#################################################################################
##  FILTER candidate events
#################################################################################

## Parameters for filtering
## Read-count thresholds
NUMREADS <- 8  ## total reads supporting event
NUMNORMS <- 0  ## max number of reads in normal bam allowed to qualify as somatic event
NUMSIDE  <- 1  ## max number of reads on one side to qualify it as one-sided event

## Region / quality thresholds
MINFRACT      <- 0.005  ## minimum allele fraction
MAXBAD        <- 150    ## maximum number of bad reads (qual=0) in candidate region
MAXPROP       <- 0.5    ## maximum proportion of bad reads in candidate region (relative to total reads)
QUALTHRESH    <- 20     ## minimum mean quality of reads supporting candidate
TUM2NORMRATIO <- 0.4    ## max ratio of number of total discordant reads in tumor versus normal within region
NNORMALWEIRDS <- 100    ## maximum number of discordant pairs in the normal within candidate region

	#qualthresh = 50 #50 for CRC
	#qualthresh = 20 #for others
	#qualthresh = 1


##
## First-pass filter: minimum number of total reads supporting the event,
## 	minimum mean quality of the supporting reads, and removal of candidates
## 	whose TE family is the same as that of a nearby reference TE
##
## filter_candidates: first-pass filtering of candidate events.
##
## Arguments:
## 	cands       data frame of candidates; columns read here are nreads,
## 	            meanq_f, meanq_r, TE and near_refTE
## 	numreads    minimum value of nreads required to keep a candidate
## 	qualthresh  minimum mean quality; applied to meanq_f and meanq_r, where a
## 	            side that is NA is ignored as long as the other side passes
##
## Returns the rows of cands that pass every filter. The row count is printed
## after each filtering step.
filter_candidates <- function(cands, numreads = NUMREADS, qualthresh = QUALTHRESH) {

	print("First-pass filtering")
	##
	## Filter out candidates with minimum number of reads to support
	##
	print(nrow(cands))
	print(numreads)
	cands <- subset(cands, nreads >= numreads)
	print(nrow(cands))
	##
	## Filter out candidates with candidate regions that are too big or too small
	## (currently disabled)
	##
	#cands = subset(cands, stop > start+180)
	#cands = subset(cands, stop < start+1500)

	print("Filtering mean quality")
	##
	## Filter mean quality: both sides must pass, unless one side is NA, in
	## which case the other side alone must pass. Rows where the condition
	## evaluates to NA (e.g. both sides NA) are dropped by subset().
	##
	cands <- subset(cands,
		(is.na(meanq_f) & meanq_r >= qualthresh) |
		(is.na(meanq_r) & meanq_f >= qualthresh) |
		(meanq_f >= qualthresh & meanq_r >= qualthresh))
	print(nrow(cands))

	print("Filtering reference")
	##
	## Filter out reference
	##
	## If TE and any of nearby references are in the same family, remove!
	##
	## Columns are read directly (not via apply(), which would coerce the whole
	## data frame to a character matrix). A column that is absent is treated as
	## all-NA.
	column_or_na <- function(name) {
		if (name %in% colnames(cands)) {
			as.character(cands[[name]])
		} else {
			rep(NA_character_, nrow(cands))
		}
	}
	nearby_refs <- column_or_na("near_refTE")
	inserted_te <- column_or_na("TE")

	differs_from_ref <- vapply(seq_len(nrow(cands)), function(i) {
		## TE family is the second "|"-separated field of the TE annotation
		te_family <- unlist(strsplit(inserted_te[i], split = "\\|"))[2]
		## A missing family cannot be "the same family" as a reference TE.
		## (NA %in% NA is TRUE in R, so this must be checked explicitly.)
		if (is.na(te_family)) {
			return(TRUE)
		}
		## Reference entries are separated by ", "; the family is the third
		## "#"-separated field of each entry (NA when the entry has fewer fields)
		ref_entries <- as.character(unlist(strsplit(nearby_refs[i], split = ",\\s")))
		ref_families <- vapply(strsplit(ref_entries, split = "#"),
			function(parts) parts[3], character(1))
		!(te_family %in% ref_families)
	}, logical(1))
	cands <- cands[differs_from_ref, , drop = FALSE]
	print(nrow(cands))

	## NOTE: no multi-alignment filter is implemented; only the message is printed.
	print("Filtering out multi-aligned candidates")

	print("returning")
	return(cands)
}



## Filter somatic events
## filter_somatic: select somatic events (nnorm <= numnorms), write them to
## somaticfile, then apply further filters and return the survivors.
##
## Arguments:
## 	cands          data frame of candidate events
## 	somaticfile    path of the tab-separated file that receives all somatic
## 	               candidates (written before the further filters are applied)
## 	numnorms       max value of nnorm for an event to count as somatic
## 	minfract       events need Fraction > minfract (NA fractions are kept)
## 	maxbad         accepted but not used (its filter is disabled)
## 	maxprop        events need nbadregion/nTotRegion < maxprop
## 	tum2normratio  accepted but not used (its filter is disabled)
## 	nnormalweirds  events need nWeirdsNormal < nnormalweirds
##
## Returns the filtered data frame, ordered by decreasing nreads, with an added
## "Fraction" column = nSpanClipped / (nSpanClipped + nSpanNormal).
filter_somatic <- function(cands, somaticfile, numnorms = NUMNORMS, minfract = MINFRACT, maxbad = MAXBAD, maxprop = MAXPROP,
	tum2normratio = TUM2NORMRATIO, nnormalweirds = NNORMALWEIRDS) {

	## ---- local helpers for the reference-family filter ----

	## Value of column `name` in row i of the current cands (NA if column absent)
	field <- function(name, i) {
		if (name %in% colnames(cands)) cands[[name]][i] else NA
	}
	## TE family = second "|"-separated field of a TE annotation (NA if absent)
	te_family <- function(te) {
		unlist(strsplit(as.character(te), split = "\\|"))[2]
	}
	## Families of nearby reference TEs: entries are split on ", " and then on
	## whitespace; the family is the third "#"-separated field of each piece.
	## Pieces without a third field yield NA and are dropped, so a missing
	## family can never count as a match.
	ref_families <- function(near_ref) {
		entries <- unlist(strsplit(as.character(near_ref), split = ",\\s"))
		entries <- as.character(unlist(strsplit(entries, split = "\\s")))
		fams <- vapply(strsplit(entries, split = "#"), function(parts) parts[3], character(1))
		fams[!is.na(fams)]
	}
	## TRUE if one side (forward or reverse) has more than one read and its
	## contig TE family, or any of its ambiguous TE families, is in ref_fams.
	side_matches_ref <- function(nreads_side, contig_te, contig_ambig, ref_fams) {
		## compare numerically; read counts must not be compared as strings
		nreads_side <- suppressWarnings(as.numeric(as.character(nreads_side)))
		if (is.na(nreads_side) || nreads_side <= 1) {
			return(FALSE)
		}
		ambigs <- as.character(unlist(strsplit(as.character(contig_ambig), split = ";\\s")))
		fams <- c(te_family(contig_te), vapply(ambigs, te_family, character(1), USE.NAMES = FALSE))
		any(fams[!is.na(fams)] %in% ref_fams)
	}

	## Print out all somatic candidate events
	## 	Filter somatic
	cands <- subset(cands, nnorm <= numnorms)
	cands <- cands[order(cands[, "nreads"], decreasing = TRUE), ]

	## Add allele fraction information
	cands[["Fraction"]] <- as.numeric(cands[, "nSpanClipped"] / (cands[, "nSpanClipped"] + cands[, "nSpanNormal"]))

	write.table(cands, file = somaticfile, quote = FALSE, sep = "\t", row.names = FALSE)

	if (nrow(cands) == 0) {
		return(cands)
	}

	##
	## Filter further
	##

	print("Filtering somatic events")
	print(nrow(cands))
	## Filter allele fraction (events with NA fraction are kept)
	passes_fraction <- is.na(cands[, "Fraction"]) | cands[, "Fraction"] > minfract
	print(paste("w", length(which(passes_fraction))))
	cands <- subset(cands, is.na(Fraction) | Fraction > minfract)
	print(nrow(cands))
	##
	## Filter number of bad reads (as a proportion of all reads in the region)
	##
	print("Filtering number of bad reads")
	cands <- subset(cands, nbadregion / nTotRegion < maxprop)
	# (nbadregion < maxbad | is.na(nbadregion)))
	print(nrow(cands))

	##
	## Filter out events with no contigs
	##
	print("Filtering out events with no contigs")
	cands <- subset(cands, !(is.na(R_contig_TE) & is.na(F_contig_TE)))
	print(nrow(cands))

	## Both contig is entirely reference sequence:
	## an event is kept when, for at least one side, the contig is missing / has
	## no reference alignment, or its reference span is shorter than the contig
	## length minus `margin`.
	print("Filtering out both contigs entirely in reference sequence")
	margin <- 15
	contig_not_all_ref <- function(len, ref_start, ref_end) {
		(is.na(len) | (is.na(ref_end) & is.na(ref_start))) | (ref_end - ref_start < len - margin)
	}
	keep_contig <- contig_not_all_ref(cands[, "R_contig_length"], cands[, "R_contig_ref_start"], cands[, "R_contig_ref_end"]) |
		contig_not_all_ref(cands[, "F_contig_length"], cands[, "F_contig_ref_start"], cands[, "F_contig_ref_end"])
	## which() drops rows whose condition is NA, as subset() does
	cands <- cands[which(keep_contig), , drop = FALSE]
	print(nrow(cands))

	##
	## Filter out reference from contig
	##
	## If TE and any of nearby references are in the same family, remove!
	## (1 = keep, 0 = contig TE family matches a nearby reference TE)
	print("Filtering out reference from contig")
	matchToNearbyRef <- vapply(seq_len(nrow(cands)), function(i) {
		near_ref <- field("near_refTE", i)
		if (is.na(near_ref)) {
			return(1)
		}
		ref_fams <- ref_families(near_ref)
		matched <- side_matches_ref(field("nreads_f", i), field("F_contig_TE", i),
				field("F_contig_TE_ambiguity", i), ref_fams) ||
			side_matches_ref(field("nreads_r", i), field("R_contig_TE", i),
				field("R_contig_TE_ambiguity", i), ref_fams)
		if (matched) 0 else 1
	}, numeric(1))
	print(table(matchToNearbyRef))
	cands <- cands[matchToNearbyRef == 1, , drop = FALSE]
	print(nrow(cands))

	##
	## Filter number of discordant reads in Normal versus Tumor
	##
	print("Filtering number of discordant reads in normal versus tumor...")
	## Disabled alternatives (tum2normratio is only used by these):
	#cands=subset(cands,nWeirdsNormal == 0 | nWeirdsTumor-nWeirdsNormal > 5)
	#cands = subset(cands, nWeirdsNormal < tum2normratio*nWeirdsTumor | (nWeirdsTumor < 16 & nWeirdsTumor-nWeirdsNormal > 5) )
	print(nrow(cands))
	cands <- subset(cands, nWeirdsNormal < nnormalweirds)
	print(nrow(cands))
	cands <- cands[order(cands[, "nreads"], decreasing = TRUE), ]

	return(cands)
}


## Filter germline events
## filter_germline: select germline events (nnorm > numnorms), write them to
## germlinefile, then apply further filters and return the survivors.
##
## Arguments:
## 	cands         data frame of candidate events
## 	germlinefile  path of the tab-separated file that receives all germline
## 	              candidates (written before the further filters are applied)
## 	numnorms      events need nnorm > numnorms to count as germline
## 	minfract      events need Fraction > minfract (NA fractions are kept)
## 	maxbad        accepted but not used (its filter is disabled)
## 	maxprop       events need nbadregion/nTotRegion < maxprop
##
## Returns the filtered data frame with an added "Fraction" column
## = nSpanClipped / (nSpanClipped + nSpanNormal).
filter_germline <- function(cands, germlinefile, numnorms = NUMNORMS, minfract = MINFRACT, maxbad = MAXBAD, maxprop = MAXPROP) {

	## ---- local helpers for the reference-family filter ----

	## Value of column `name` in row i of the current cands (NA if column absent)
	field <- function(name, i) {
		if (name %in% colnames(cands)) cands[[name]][i] else NA
	}
	## TE family = second "|"-separated field of a TE annotation (NA if absent)
	te_family <- function(te) {
		unlist(strsplit(as.character(te), split = "\\|"))[2]
	}
	## Families of nearby reference TEs: entries are split on ", " and then on
	## whitespace; the family is the third "#"-separated field of each piece.
	## Pieces without a third field yield NA and are dropped, so a missing
	## family can never count as a match.
	ref_families <- function(near_ref) {
		entries <- unlist(strsplit(as.character(near_ref), split = ",\\s"))
		entries <- as.character(unlist(strsplit(entries, split = "\\s")))
		fams <- vapply(strsplit(entries, split = "#"), function(parts) parts[3], character(1))
		fams[!is.na(fams)]
	}
	## TRUE if one side (forward or reverse) has more than one read and its
	## contig TE family, or any of its ambiguous TE families, is in ref_fams.
	side_matches_ref <- function(nreads_side, contig_te, contig_ambig, ref_fams) {
		## compare numerically; read counts must not be compared as strings
		nreads_side <- suppressWarnings(as.numeric(as.character(nreads_side)))
		if (is.na(nreads_side) || nreads_side <= 1) {
			return(FALSE)
		}
		ambigs <- as.character(unlist(strsplit(as.character(contig_ambig), split = ";\\s")))
		fams <- c(te_family(contig_te), vapply(ambigs, te_family, character(1), USE.NAMES = FALSE))
		any(fams[!is.na(fams)] %in% ref_fams)
	}

	## Print out all germline candidate events
	## 	Filter germline
	cands <- subset(cands, nnorm > numnorms)
	cands <- cands[order(cands[, "nreads"], decreasing = TRUE), ]

	## Add allele fraction information
	cands[["Fraction"]] <- cands[, "nSpanClipped"] / (cands[, "nSpanClipped"] + cands[, "nSpanNormal"])

	write.table(cands, file = germlinefile, quote = FALSE, sep = "\t", row.names = FALSE)

	if (nrow(cands) == 0) {
		return(cands)
	}
	##
	## Filter further
	##
	print("Filtering germline events")

	##
	## Filter out reference from contig
	##
	## If TE and any of nearby references are in the same family, remove!
	print(nrow(cands))
	differs_from_ref <- vapply(seq_len(nrow(cands)), function(i) {
		near_ref <- field("near_refTE", i)
		if (is.na(near_ref)) {
			return(TRUE)
		}
		ref_fams <- ref_families(near_ref)
		matched <- side_matches_ref(field("nreads_f", i), field("F_contig_TE", i),
				field("F_contig_TE_ambiguity", i), ref_fams) ||
			side_matches_ref(field("nreads_r", i), field("R_contig_TE", i),
				field("R_contig_TE_ambiguity", i), ref_fams)
		!matched
	}, logical(1))
	cands <- cands[differs_from_ref, , drop = FALSE]
	print(nrow(cands))

	## Filter allele fraction (events with NA fraction are kept)
	print(paste(length(which(cands[, "Fraction"] == Inf)), "regions have Inf total reads..."))
	cands <- subset(cands, is.na(Fraction) | Fraction > minfract)
	print(nrow(cands))

	##
	## Filter number of bad reads (as a proportion of all reads in the region)
	##
	#cands = subset(cands, nbadregion < maxbad)
	print("Filtering bad regions in germline")
	cands <- subset(cands, nbadregion / nTotRegion < maxprop)
	print(nrow(cands))
	return(cands)

}


## Filter one-sided and two-sided events
## filter_onesided: split events into two-sided and one-sided events and write
## each set to its own tab-separated file.
##
## Arguments:
## 	events             data frame with columns has_partner, nreads_f, nreads_r
## 	filteredfile       output path for two-sided events
## 	filtfile_onesided  output path for one-sided events
## 	numside            an event is two-sided only if both nreads_f and
## 	                   nreads_r are greater than numside
##
## An event is two-sided when has_partner is not NA and both read counts exceed
## numside. Every other event, including those with an NA read count, is
## one-sided, so each input row is written to exactly one of the two files.
## Returns NULL invisibly.
filter_onesided <- function(events, filteredfile, filtfile_onesided, numside = NUMSIDE) {

	## NA-safe support tests: an NA read count counts as "not enough reads"
	has_partner <- !is.na(events[, "has_partner"])
	fwd_supported <- !is.na(events[, "nreads_f"]) & events[, "nreads_f"] > numside
	rev_supported <- !is.na(events[, "nreads_r"]) & events[, "nreads_r"] > numside
	is_twosided <- has_partner & fwd_supported & rev_supported

	twosided <- events[is_twosided, , drop = FALSE]
	## exact complement of the two-sided set, so no event is silently dropped
	onesided <- events[!is_twosided, , drop = FALSE]

	write.table(twosided, file = filteredfile, sep = "\t", quote = FALSE, row.names = FALSE)
	write.table(onesided, file = filtfile_onesided, sep = "\t", quote = FALSE, row.names = FALSE)

	invisible(NULL)
}





