#!/usr/bin/env perl

#####################################################################################################
#                                                                                                   #
#                                   PriSM License                                                   #
#                                                                                                   #
# PriSM is distributed under the following BSD-style license:                                       #
# Copyright  2011-2014 Broad Institute, Inc.  All rights reserved.                                 #
#                                                                                                   #
# Redistribution and use in source and binary forms, with or without modification, are              #
# permitted provided that the following conditions are met:                                         #
#                                                                                                   #
#  1. Redistributions of source code must retain the above copyright notice, this list              #
#     of conditions and the following disclaimer.                                                   #
#                                                                                                   #
#  2. Redistributions in binary form must reproduce the above copyright notice, this list           #
#     of conditions and the following disclaimer in the documentation and/or other materials        #
#     provided with the distribution.                                                               #
#                                                                                                   #
#  3. Neither the name of the Broad Institute, Inc. nor the names of its contributors may be        #
#     used to endorse or promote products derived from this software without specific prior         #
#     written permission.                                                                           #
#                                                                                                   #
# THIS SOFTWARE IS PROVIDED AS IS.  BROAD MAKES NO EXPRESS OR IMPLIED REPRESENTATIONS OR          #
# WARRANTIES OF ANY KIND REGARDING THE SOFTWARE AND COPYRIGHT, INCLUDING, BUT NOT LIMITED TO,       #
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, CONFORMITY WITH ANY              #
# DOCUMENTATION, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT         #
# DISCOVERABLE. IN NO EVENT SHALL BROAD, THE COPYRIGHT HOLDERS, OR CONTRIBUTORS BE LIABLE FOR       #
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,        #
# BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;    #
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,     #
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE       #
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF, HAVE REASON TO KNOW, OR IN FACT SHALL KNOW OF THE       #
# POSSIBILITY OF SUCH DAMAGE.                                                                       #
#                                                                                                   #
# If, by operation of law or otherwise, any of the aforementioned warranty disclaimers are          #
# determined inapplicable, your sole remedy, regardless of the form of action, including, but       #
# not limited to, negligence and strict liability, shall be replacement of the Software with        #
# an updated version if one exists.                                                                 #
#                                                                                                   #
# Development of PriSM has been funded in whole or in part with federal funds from the National     #
# Institute of Allergy and Infectious Diseases, National Institutes of Health, Department of        #
# Health and Human Services, under Contract No. HHSN266200400001C.                                  #
#                                                                                                   #
# In addition, PriSM is distributed, in part, under and subject to the provisions of licenses for:  #
# Perl, version 5.20.0,  1997-2010 Tom Christiansen, Nathan Torkington (all rights reserved).      #
#####################################################################################################

# ===================================================================================================
# Description:
#   To select valid primers and amplicons, create .tar.gz file to include all output files.
#  
# Usage: 
#    primer_design_web.pl aln_file genomeName genomeFasta multiprimer.conf
#   (aln_file is alignment file generated by CLUSTALW or MUSCLE, must have extension .aln or .muscle)
#   (You can edit multiprimer.conf to customize configure file settings)
#
# Creation Date: 2007-03-02
# Author: Qing Yu
# ===================================================================================================


use strict;
use PrimerDesign;
use MIME::QuotedPrint;
use MIME::Base64;

my $usage = <<USAGE;
Usage: 
    $0 aln_file genoName genomeFasta configureFile

For example:
    $0 den9.aln den9 den9.fasta den9_2008-5-31.conf
    
USAGE
;


if ( scalar(@ARGV) != 4)
{
	print "Wrong Input!!\n\n$usage\n";
    exit(0);
}
    
my $aln_name = $ARGV[0];  # aln filename, for example:Bankok_Thailand_clustalw.aln 
my $name = $ARGV[1];
my $genomeFile = $ARGV[2];
my $conf_file = $ARGV[3];

my $base_dir ="./";

chdir  $base_dir;
print "current primer_design_web dir\n";
`pwd`;
`cp -r $aln_name .`;
`cp -r $genomeFile .`;
`cp -r $conf_file .`;

my @tmp = split(/\//,$aln_name);
$aln_name = pop @tmp;

@tmp = split(/\//,$genomeFile);
$genomeFile = pop @tmp;

@tmp = split(/\//,$conf_file);
$conf_file = pop @tmp;

# get number of aligned sequence in aln file
unless (open(A,$aln_name)) 
{
	warn "<br>Failed to open $aln_name<br>\n";
	exit(1);
}

my %seq=();
my $align_num=0;

# Count the number of alignments
if ($aln_name =~/\.aln/i)   # clustawl output
{
	<A>; # kill the header
    while (<A>) 
	{
		next if /\*/;

		if( !(/\S+/) && $align_num>0)  # stop at empty line
		{
			last;
		}

		if (/(\S+)\s+(\S+)/) 
		{
			$align_num++;
		}
    }
}
elsif ($aln_name =~/\.muscle/i)   # muscle output
{
	$align_num =`grep -c '>' $aln_name`;
    chomp $align_num;
}

# the following files generated by make_all_consensus.pl
my $iupacFile = $aln_name.".cons.iupac";
my $fastaFile  = $aln_name.".cons.all";
my $fasta_fr_file = $aln_name.".cons.freq";
my $gapFile = $aln_name.".cons.gap";

(-e $iupacFile) || die "<br>$iupacFile does not exist! Exit now.<br>\n";
(-e $fastaFile) || die "<br>$fastaFile does not exist! Exit now.<br>\n";
(-e $fasta_fr_file) || die "<br>$fasta_fr_file does not exist! Exit now.<br>\n";
(-e $gapFile) || die "<br>$gapFile does not exist! Exit now.<br>\n";

# default configuration
my $monomers = 4;
my $primerSize = 20;
my $overlapLen = 19;
my $minGCpercent = 45;
my $maxGCpercent = 55;
my $minTM = 56;
my $maxTM = 65;
my $deg_baseNum = 3;
my $dna_c = 50;
my $salt_c = 50;
my $freq_thresh = 0.1; #0.1
my $de_thresh = 0.8; #0.8
my $self_compl=6;     # max allowed number of self complementary bases default 6
#my $hairpin_stem = 8; # max allowed number of bases matched in the same sequence when sequence folded, default 8
my $gap_thresh = 0.8;
my $hit_thresh = 0.8; # primer_hit_count/# of genomes when blast primer against genomes
my $maxAmplLen = 900;
my $minAmplLen = 500;
my $amplOverlapLen = 100;
my $maxCombination = 8;
my $maxPairCompl = 15;
my $align_start=855;  # just initialize, will change according to input genome
my $align_end =9455;  # just initialize, will change according to input genome

# working directory is the directory containing multiprimer.conf and .aln file
if (-e $conf_file)
{
	my $r_params = &get_conf_params($conf_file);
    $minGCpercent = $r_params->{"GCmin"};
    $maxGCpercent = $r_params->{"GCmax"};
    $minTM = $r_params->{"Tmin"};
    $maxTM = $r_params->{"Tmax"};
    $monomers = $r_params->{"MONO"};
    $primerSize = $r_params->{"SIZE"};
    $overlapLen = $r_params->{"OVERLAP"};
    $dna_c = $r_params->{"DNA_C"};
    $salt_c = $r_params->{"SALT_C"};
    $deg_baseNum = $r_params->{"DEGENERATE"};
    $freq_thresh = $r_params->{"FREQ_THRESHOLD"};
    $de_thresh = $r_params->{"DE_THRESHOLD"};
    $self_compl = $r_params->{"SELF_COMPL"};
   # $hairpin_stem = $r_params->{"HAIRPIN_STEM"};
    $gap_thresh = $r_params->{"GAP_THRESHOLD"};
    $hit_thresh = $r_params->{"HIT_THRESHOLD"};
    $maxAmplLen = $r_params->{"MAX_AMPL_LEN"};
    $minAmplLen = $r_params->{"MIN_AMPL_LEN"};
    $amplOverlapLen = $r_params->{"OVERLAP_THRESH"};
    $maxCombination = $r_params->{"COMBIN_THRESH"};
    $maxPairCompl = $r_params->{"PAIR_COMPL_THRESH"};
    $align_start = $r_params->{"ALIGN_START"};
    $align_end = $r_params->{"ALIGN_END"};
}

my $o_primerDesign = new bit::PrimerDesign($fastaFile,
					   $fasta_fr_file,
					   $iupacFile,
					   $gapFile,
					   $name,
					   $minGCpercent,
					   $maxGCpercent,
					   $minTM,
					   $maxTM,
					   $monomers,
					   $primerSize,
					   $overlapLen,
					   $dna_c,
					   $salt_c,
					   $deg_baseNum,
					   $freq_thresh,
					   $align_num,
					   $de_thresh,
					   $self_compl,
#					   $hairpin_stem,
					   $gap_thresh,
					   $hit_thresh,
					   $maxAmplLen,
					   $minAmplLen,
					   $amplOverlapLen,
					   $maxCombination,
					   $maxPairCompl,
					   $align_start,
					   $align_end);
$o_primerDesign->load();
$o_primerDesign->load_freq_file();
$o_primerDesign->load_iupac_file();
$o_primerDesign->load_gap_file();

my $outFile = $fastaFile.".pri";
my $invalid_primerFile = $fastaFile.".pri.invalid"; 
my $out_pri_fasta = $fastaFile.".pri.fasta";
my $out_de_pri_file = $fastaFile.".pri.de";
my $out_pri_conserv = $fastaFile.".pri.iupac.conserv";

my $r_all_gene_pri_ary;
my $r_all_de_pri_ary;    # primers with degenerate bases
my $r_all_iupac_pri_ary; # primers with iupac code
my $r_all_bad_pri_ary; 

# ----- for primer selection -----------------------
print "<br>Generating primer candidates ...<br><br>\n";
($r_all_gene_pri_ary,$r_all_de_pri_ary,$r_all_iupac_pri_ary,$r_all_bad_pri_ary) = $o_primerDesign->get_all_seq_primers();

print "<br>Writing all valid primers info to $outFile<br>\n";
$o_primerDesign->write_all_gene_primer_info($r_all_gene_pri_ary, $outFile);

print "<br>Writing all valid primers with IUPAC code to fasta file $out_pri_fasta and conservation file $out_pri_conserv<br>\n";
$o_primerDesign->write_all_valid_primer_fasta_iupac($r_all_gene_pri_ary, $r_all_de_pri_ary, $out_pri_fasta, $out_pri_conserv);

print "<br>Writing all degenerate primers to $out_de_pri_file<br>\n";
$o_primerDesign->write_degenerate_primer_info($r_all_de_pri_ary,$out_de_pri_file);

print "<br>Writing all invalid primers to $invalid_primerFile<br>\n";
$o_primerDesign->write_all_invalid_primer($r_all_bad_pri_ary,$invalid_primerFile);

print "<br><br>Finding primer candidates completes!<br><br>\n";


# ======== for amplicon selection ===============

my $F_pri_blast_outfile = $fastaFile.".blast_F";
my $R_pri_blast_outfile = $fastaFile.".blast_R";

my $r_genome_seq_hash = $o_primerDesign->load_genome($genomeFile);
my $r_pri_seq_hash = $o_primerDesign->load_genome($out_pri_fasta);

# ----- blast primer against genome -------
print "\n<br>Blasting primers against genome...<br><br>\n\n";

$o_primerDesign->primer_blast_genome_to_file($r_genome_seq_hash,$r_pri_seq_hash,$r_all_gene_pri_ary,$F_pri_blast_outfile,$R_pri_blast_outfile);

# ----- select amplicon output -----------
my $outfile = $fastaFile.".ampl.out";
my $outfile_fasta = $fastaFile.".ampl.fasta";
print "\n<br>Writing amplicon selection result to $outfile...<br><br>\n\n";
$o_primerDesign->amplicon_selection($r_genome_seq_hash, $r_pri_seq_hash,$F_pri_blast_outfile,$R_pri_blast_outfile,$out_pri_conserv,$outfile,$outfile_fasta );


# ----- generate tar file for all results -----------
my $tarFile = $aln_name.".tar.gz";
my $tmp = $aln_name.".cons.all.ampl.out ".$aln_name.".cons.all.ampl.fasta ".$aln_name.".cons.all.pri ".$aln_name.".cons.all.pri.fasta ".$aln_name.".cons.all.pri.de ".$aln_name.".cons.all.pri.invalid ".$aln_name.".cons.all.pri.iupac.conserv ".$aln_name.".cons.major ";
print $tmp."==\n";
my $cmd = "tar -czvPf $tarFile $tmp";
&doSystem($cmd);


exit;

################
sub doSystem 
{
    my ($string) = @_;
    
    my $rc = system($string);
    if ($rc) 
    {
		return "system call '$string' terminated with exit status ($rc)\n";
    }
    return undef;
}

##############################################
sub string_to_file
{
    my ($str, $f) = @_;
    open(OUT_F, ">$f") or die "Can't open $f for writing\n";
    print OUT_F $str;
    close OUT_F; 
}

###########################
# Parses conf file that has
# param/value pairs:
# CG%low=45=
# GC%hi=55=
############################
sub get_conf_params
{
	my ($in_f) = @_;
	my %params = ();
	my $lines = get_lines_from_file($in_f);
	foreach ( @{$lines} )
    {
		my ($param, $value) = split '=';
        if(exists $params{$param}) { die"get_conf_params(): Multiple values for $param\n"; }
        $params{$param} = $value;
    }
	return \%params;
}
