package PrimerDesign;

#####################################################################################################
#                                                                                                   #
#                                   PriSM License                                                   #
#                                                                                                   #
# PriSM is distributed under the following BSD-style license:                                       #
# Copyright  2011-2014 Broad Institute, Inc.  All rights reserved.                                 #
#                                                                                                   #
# Redistribution and use in source and binary forms, with or without modification, are              #
# permitted provided that the following conditions are met:                                         #
#                                                                                                   #
#  1. Redistributions of source code must retain the above copyright notice, this list              #
#     of conditions and the following disclaimer.                                                   #
#                                                                                                   #
#  2. Redistributions in binary form must reproduce the above copyright notice, this list           #
#     of conditions and the following disclaimer in the documentation and/or other materials        #
#     provided with the distribution.                                                               #
#                                                                                                   #
#  3. Neither the name of the Broad Institute, Inc. nor the names of its contributors may be        #
#     used to endorse or promote products derived from this software without specific prior         #
#     written permission.                                                                           #
#                                                                                                   #
# THIS SOFTWARE IS PROVIDED AS IS.  BROAD MAKES NO EXPRESS OR IMPLIED REPRESENTATIONS OR          #
# WARRANTIES OF ANY KIND REGARDING THE SOFTWARE AND COPYRIGHT, INCLUDING, BUT NOT LIMITED TO,       #
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, CONFORMITY WITH ANY              #
# DOCUMENTATION, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT         #
# DISCOVERABLE. IN NO EVENT SHALL BROAD, THE COPYRIGHT HOLDERS, OR CONTRIBUTORS BE LIABLE FOR       #
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,        #
# BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;    #
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,     #
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE       #
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF, HAVE REASON TO KNOW, OR IN FACT SHALL KNOW OF THE       #
# POSSIBILITY OF SUCH DAMAGE.                                                                       #
#                                                                                                   #
# If, by operation of law or otherwise, any of the aforementioned warranty disclaimers are          #
# determined inapplicable, your sole remedy, regardless of the form of action, including, but       #
# not limited to, negligence and strict liability, shall be replacement of the Software with        #
# an updated version if one exists.                                                                 #
#                                                                                                   #
# Development of PriSM has been funded in whole or in part with federal funds from the National     #
# Institute of Allergy and Infectious Diseases, National Institutes of Health, Department of        #
# Health and Human Services, under Contract No. HHSN266200400001C.                                  #
#                                                                                                   #
# In addition, PriSM is distributed, in part, under and subject to the provisions of licenses for:  #
# Perl, version 5.20.0,  1997-2010 Tom Christiansen, Nathan Torkington (all rights reserved).      #
#####################################################################################################

# ===================================================================================================
# Description:
#   Objective Oriented Perl Module for primer design and amplicon selection 
#
# Creation Date: 2007-03-02
# Author: Qing Yu
# ===================================================================================================

use strict;


umask(0);

# -------- global variables -----------
my %ENT = ('AA' , 240,
	'AG' , 208,
	'AN' , 239,
	'CA' , 129,
	'CG' , 278,
	'CN' , 208,
	'GA' , 135,
	'GG' , 266,
	'GN' , 173,
	'TA' , 169,
	'TG' , 129,
	'TN' , 169,
	'NA' , 168,
	'NG' , 220,
	'NN' , 215,
	'AC' , 173,
	'AT' , 239,
	'CC' , 266,
	'CT' , 208,
	'GC' , 267,
	'GT' , 173,
	'TC' , 135,
	'TT' , 240,
	'NC' , 210,
	'NT' , 215);

my %ENTH = ('AA' , 91,
	 'AG',  78,
	 'AN' , 86,
	 'CA' , 58,
	 'CG' , 119,
	 'CN' , 78,
	 'GA' , 56,
	 'GG' , 110,
	 'GN' , 65,
	 'TA' , 60,
	 'TG' , 58,
	 'TN' , 60,
	 'NA' , 81,
	 'NG' , 94,
	 'NN' , 78,
	 'AC' , 65,
	 'AT' , 86,
	 'CC' , 110,
	 'CT' , 78,
	 'GC' , 111,
	 'GT' , 65,
	 'TC' , 56,
	 'TT' , 91,
	 'NC' , 105,
	 'NT' , 78 );



# number of free(unmatched) bases in the loop when sequence folded and matched bases exist at the loop ends. Loops of 4 and 5 nucleotides have been found to be the least destabilizing
my @HAIRPIN_LOOP_BASE_NUM = (4,5);
my $HAIRPIN_STEM_MIN = 8; # hairpin has minimum 8 base pairs in stem, more relax than 2


# ==========================================================
# To create PrimerDesign object.
# Input: 
#  fastafile - (alnName).aln.cons.all, generated by make_all_consensus.pl
#  frequency_file - (alnName).aln.cons.freq, generated by make_all_consensus.pl 
#  iupac_file - (alnName).aln.cons.iupac, generated by make_all_consensus.pl 
#  name - prefix of alignment file, for example, den9
#  minimum GC%, maximumGC%, minimum TM, maximum TM,
#  monomers,
#  primer size,
#  overlap length of two continuous primer candidates,
#  dna_concentration, salt_concentration
#  alignment number in the alignment file
# Output: 
#   PrimerDesign object
# ==========================================================
sub new {
    my $proto = shift;
    my ($filename, $freq_file, $iupac_file, $gap_file, $name, $minGC, $maxGC, $minTM, $maxTM, $monomers, $primerSize, $overlapLEN, $dna_concent, $salt_concent, $deBaseNum, $threshold, $alignNum,$de_threshold,$self_complement,$gap_thresh,$hit_thresh,$maxAmplLen,$minAmplLen,$amplOverlapLen,$maxCombination,$maxPairCompl,$align_start,$align_end ) = @_;
    my $class = ref($proto) || $proto;
    my $self  = {};

    $self->{FILENAME} = $filename;
    $self->{FREQ_FILE} = $freq_file;
    $self->{IUPAC_FILE} = $iupac_file;
    $self->{GAP_FILE} = $gap_file;   # file showing flag at gap position
    $self->{NAME} = $name;
    $self->{MIN_GC_PERCENT} = $minGC;
    $self->{MAX_GC_PERCENT} = $maxGC;
    $self->{MIN_TM} = $minTM;
    $self->{MAX_TM} = $maxTM;
    $self->{MONOMERS} = $monomers;
    $self->{PRIMER_SIZE} = $primerSize;
    $self->{OVERLAP_LENGTH} = $overlapLEN;
    $self->{DNA_CONCENTRATION} = $dna_concent;
    $self->{SALT_CONCENTRATION} = $salt_concent;
    $self->{DE_BASE_NUM} = $deBaseNum;
    $self->{FREQ_THRESHOLD} = $threshold;   # valid primer frequency threshold
    $self->{ALIGN_NUM} = $alignNum;
    $self->{DE_THRESHOLD} = $de_threshold;  # frequency threshold to define a degenerate base
    $self->{SELF_COMPL} = $self_complement; # max allowed number of self complementary bases
  #  $self->{HAIRPIN_STEM} = $hairpin_stem; # max allowed number of bases matched in the same sequence when sequence folded
    $self->{GAP_THRESHOLD} = $gap_thresh;
    $self->{HIT_THRESHOLD} = $hit_thresh;
    $self->{MAX_AMPL_LEN} = $maxAmplLen;
    $self->{MIN_AMPL_LEN} = $minAmplLen;
    $self->{OVERLAP_THRESH} = $amplOverlapLen;
    $self->{COMBIN_THRESH} = $maxCombination;
    $self->{PAIR_COMPL_THRESH} = $maxPairCompl;
    $self->{ALIGN_START} = $align_start; # alignment start pos in consensus
    $self->{ALIGN_END} = $align_end; # alignment end pos in consensus

    bless($self, $class);

    $self->{LOADED} = 0;

    return $self;
}

# =================================================
# To load the fasta file to PrimerDesign object.
# =================================================
sub load {
    my $this = shift;

    my ($desc, $seq);
    my $seq_hash;   # for forward sequence
    my $seq_hash_R; # for reverse sequence

   # print "*** loading fasta file = ".$this->{FILENAME}."\n\n";

    open FASTA, "<$this->{FILENAME}" ||die "Cannot open $this->{FILENAME}\n";

	while (<FASTA>) {
      	if (/^\>.+/) { # the description
	      	s/\s*$//; # remove all trailing whitespace
	      	s/^>//; # remove leading >
			if (defined $desc) {

				if ($seq !~ m/^[0-9\s\t]+$/){
				  $seq =~ s/\s+//g; # get rid of all white space
				}		

				$seq_hash->{$desc} = $seq;
				$seq_hash_R->{$desc} = get_rev_complement($seq);
			}
			$desc = $_;
	      	$seq = "";
	    }
	    else {
			chomp;
			if (!defined $this->{LINE_LENGTH}) {
				$this->{LINE_LENGTH} = length $_;
			}
			$seq .= $_;
	    }
	}
	close FASTA;

	if (defined $desc) {
	    if ($seq !~ m/^[0-9\s\t]+$/){
	      $seq =~ s/\s+//g; # get rid of all white space
	    }

	    $seq_hash->{$desc} = $seq;
	    $seq_hash_R->{$desc} = get_rev_complement($seq);
	}

    $this->{CONTENTS} = $seq_hash;   # for forward sequence
    $this->{CONTENTS_R} = $seq_hash_R;   # for reverse sequence
    $this->{LOADED} = 1;

    return;
}


# =================================================
# To load the frequency file to PrimerDesign object.
# =================================================
sub load_freq_file {
    my $this = shift;

    my $desc_fr;
    my $seq_fr;
    my $r_seq_fr;  # sequence frequency array reference
    my $seq_fr_hash;   # for forward sequence frequency
    my $seq_fr_hash_R;  # for reverse sequence frequency
    my $pos;

	#print "*** loading fasta frequency file = ".$this->{FREQ_FILE}."\n\n";

    open FASTA, "<$this->{FREQ_FILE}" ||die "Cannot open $this->{FREQ_FILE}\n";

	while (<FASTA>) {
		if (/^\>.+/) { # the description
	      	s/^>//; # remove leading >

			$r_seq_fr=&create_new_ary(); # assign a new memory space
			if (defined $desc_fr) {

				my @tmp1 = split(/\s/,$seq_fr);
				for (my $i=$pos; $i<scalar(@tmp1); $i++)
				 {
					$r_seq_fr->[$i] = $tmp1[$i];
					$pos++;
				 }
				$seq_fr_hash->{$desc_fr} = $r_seq_fr;

				my @tmp_a = reverse @{$r_seq_fr};
				$seq_fr_hash_R->{$desc_fr} = \@tmp_a;
			}
			chomp;
			$desc_fr = $_;
			$seq_fr = "";
			$pos=0;
	    }
	    else {
			chomp;
			if (!defined $this->{LINE_LENGTH}) {
				$this->{LINE_LENGTH} = length $_;
			}
			$seq_fr .= $_;
	    }
	}
	close FASTA;

	if (defined $desc_fr) {

		my @tmp2 = split(/\s/,$seq_fr);
		for (my $i=$pos; $i<scalar(@tmp2); $i++){
			$r_seq_fr->[$i] = $tmp2[$i];
			$pos++;
	    }
	    $seq_fr_hash->{$desc_fr} = $r_seq_fr;
		my @tmp_b = reverse @{$r_seq_fr};
		$seq_fr_hash_R->{$desc_fr} = \@tmp_b;

	}

    $this->{FREQ_CONTENTS} = $seq_fr_hash;     # for forward sequence
    $this->{FREQ_CONTENTS_R} = $seq_fr_hash_R; # for reverse sequence  
    $this->{FREQ_LOADED} = 1;

    return;
}


# =================================================
# To load the IUPAC code fasta file to PrimerDesign object.
# =================================================
sub load_iupac_file {
    my $this = shift;

    my ($desc, $seq);
    my $seq_hash;   # for forward sequence
    my $seq_hash_R; # for reverse sequence

	#print "*** loading fasta file = ".$this->{IUPAC_FILE}."\n\n";

    open FASTA, "<$this->{IUPAC_FILE}" ||die "Cannot open $this->{IUPAC_FILE}\n";

	while (<FASTA>) {
		if (/^\>.+/) { # the description
	      	s/\s*$//; # remove all trailing whitespace
	      	s/^>//; # remove leading >
			if (defined $desc) {

				if ($seq !~ m/^[0-9\s\t]+$/){
				  $seq =~ s/\s+//g; # get rid of all white space
				}		
				$seq_hash->{$desc} = $seq;
				$seq_hash_R->{$desc} = get_rev_complement($seq);
			}
			$desc = $_;
	      	$seq = "";
	    }
	    else {
			chomp;
			if (!defined $this->{LINE_LENGTH}) {
				$this->{LINE_LENGTH} = length $_;
			}
			$seq .= $_;
	    }
	}
	close FASTA;

	if (defined $desc) {

	    if ($seq !~ m/^[0-9\s\t]+$/){
	      $seq =~ s/\s+//g; # get rid of all white space
	    }
	    $seq_hash->{$desc} = $seq;
	    $seq_hash_R->{$desc} = get_rev_complement($seq);
	}

    $this->{IUPAC_CONTENTS} = $seq_hash;   # for forward iupac sequence
    $this->{IUPAC_CONTENTS_R} = $seq_hash_R;   # for reverse iupac sequence
    $this->{IUPAC_LOADED} = 1;

    return;
}

# =================================================
# To load the gap fasta file to PrimerDesign object.
# Only one fasta description in one gap file.
# =================================================
sub load_gap_file {
    my $this = shift;

    my $gap_str; # forward major consensus gap record string 

    open FASTA, "<$this->{GAP_FILE}" ||die "Cannot open $this->{GAP_FILE}\n";

    while (<FASTA>) {
		if (/^\>.+/) { # the description
			s/\s*$//; # remove all trailing whitespace
			s/^>//; # remove leading >

			$gap_str = "";
		}
		else {
			chomp;
			s/\s*$//; # remove all trailing whitespace
			$gap_str .= $_;
		}
    }
    close FASTA;

    $this->{GAP_CONTENTS} = $gap_str;   
    $this->{GAP_LOADED} = 1;

    return;
}


# =================================================
# To print the attributes of PrimerDesign object.
# =================================================
sub print_attributes {
	my $this = shift;

    print "filename:".$this->{FILENAME}."\n";
    print "fr_filename:".$this->{FREQ_FILE}."\n";
    print "iupac_filename:".$this->{IUPAC_FILE}."\n";
    print "alignment_name:".$this->{NAME}."\n";
    print "min_GC_percent:".$this->{MIN_GC_PERCENT}."\n";
    print "max_GC_percent:".$this->{MAX_GC_PERCENT}."\n";
    print "min_TM:".$this->{MIN_TM}."\n";
    print "max_TM:".$this->{MAX_TM}."\n";
    print "monomers:".$this->{MONOMERS}."\n";
    print "primer_size:".$this->{PRIMER_SIZE}."\n";
    print "overlap_length:".$this->{OVERLAP_LENGTH}."\n";
    print "dan_concentration:".$this->{DNA_CONCENTRATION}."\n";
    print "salt_concentration:".$this->{SALT_CONCENTRATION}."\n";
    print "allowed degenerated base number: ".$this->{DE_BASE_NUM}."\n";
    print "threshold of degenerated base frequency: ".$this->{FREQ_THRESHOLD}."\n";
    print "alignment number in consense: ".$this->{ALIGN_NUM}."\n";
    print "degenerate threshold: ".$this->{DE_THRESHOLD}."\n";
    print "self-complement: ".$this->{SELF_COMPL}."\n";
   # print "hairpin stem: ".$this->{HAIRPIN_STEM}."\n";
    print "alignment start position: ".$this->{ALIGN_START}."\n";
    print "alignment end position: ".$this->{ALIGN_END}."\n";


}


# =================================================
# To get all primers for each forward and reverse sequence in the input 
#  alignment fasta file ((alnName).aln.cons.all)
# Input: none
# Output: 
#    \@all_seq_primers - reference of array of hash. Each array item is a hash reference containing a valid primer.
#    \@all_de_primers - reference of array of hash. Each array item is a hash reference containing a primer candidate with degenerate bases.
#    \@all_seq_primers_iupac - reference of array of hash. Each array item is a hash reference containing a primer candidate with sequence in IUPAC code.
#    \@all_invalid_primers - reference of array of hash. Each array item is a hash reference containing an invalid primer candidate.
# =================================================
sub get_all_seq_primers {
    my $this = shift;

    if (!defined($this->{LOADED}) || $this->{LOADED} ==0) {
		$this->load();
    }

    if (!defined($this->{FREQ_LOADED}) || $this->{FREQ_LOADED} ==0) {
		$this->load_freq_file();
    }

    if (!defined($this->{IUPAC_LOADED}) || $this->{IUPAC_LOADED} ==0) {
		$this->load_iupac_file();
    }

    if (!defined($this->{GAP_LOADED}) || $this->{GAP_LOADED} ==0) {
		$this->load_gap_file();
    }

    my @all_seq_primers = ();       # all vaild primers
    my @all_de_primers = ();        # all primers with degenerate bases
    my @all_seq_primers_iupac = (); # all vaild primers with iupac code
    my @all_invalid_primers = ();   # all invalid primers
    my $r_seq_hash_F = $this->{CONTENTS};
    my $r_seq_fr_hash_F = $this->{FREQ_CONTENTS};
    my $r_seq_iupac_hash_F = $this->{IUPAC_CONTENTS};
    my $gapString = $this->{GAP_CONTENTS};
    my $r_seq_hash_R = $this->{CONTENTS_R};
    my $r_seq_fr_hash_R = $this->{FREQ_CONTENTS_R};
    my $r_seq_iupac_hash_R = $this->{IUPAC_CONTENTS_R};

    my $seq_len;

    foreach my $a_desc (keys %{$r_seq_hash_F}) {
		my $a_seq = $r_seq_hash_F->{$a_desc};

		my $seq_fr_len = scalar(@{$r_seq_fr_hash_F->{$a_desc}});
        $seq_len = length($a_seq);

		if ($a_seq=~/\.|\-/ig ) { # represent gap 
			print "Warning: The sequence of $a_desc contains . or -, it will not be processed!\n\n";
		}

		if ( length($a_seq) != $seq_fr_len ) {
			print "Warning: The sequence of $a_desc has different length from its frequency, it will not be processed!\n\n";
		}	
    }

    # get forward primers
     my $direct = "F";
     my $r_p_h_ary_F;
     my $r_de_p_h_ary_F;
     my $r_p_h_iupac_ary_F;
     my $r_bad_p_ary_F;
  
    ($r_p_h_ary_F, $r_de_p_h_ary_F, $r_p_h_iupac_ary_F, $r_bad_p_ary_F)= $this->_get_one_seq_primers($seq_len,$r_seq_hash_F, $r_seq_fr_hash_F, $r_seq_iupac_hash_F, $direct, $gapString);

    if (defined($r_p_h_ary_F)){
		push (@all_seq_primers, $r_p_h_ary_F);
    }
    if (defined($r_de_p_h_ary_F)){
		push (@all_de_primers, $r_de_p_h_ary_F);
    }
    if (defined($r_p_h_iupac_ary_F)){
		push (@all_seq_primers_iupac, $r_p_h_iupac_ary_F);
    }
    if (defined($r_bad_p_ary_F)){
		push (@all_invalid_primers, $r_bad_p_ary_F);
    }

    # get reverse primers
     $direct = "R";
     my $r_p_h_ary_R;
     my $r_de_p_h_ary_R;
     my $r_p_h_iupac_ary_R;
     my $r_bad_p_ary_R;
     my $rev_gapString = reverse($gapString);
     ($r_p_h_ary_R,$r_de_p_h_ary_R, $r_p_h_iupac_ary_R,$r_bad_p_ary_R) = $this->_get_one_seq_primers($seq_len,$r_seq_hash_R,$r_seq_fr_hash_R, $r_seq_iupac_hash_R, $direct,$rev_gapString);


    # reverse hash to present items by ascending position

    my @re_p;
    my @re_de;
    my @re_iupac;
    my @re_bad_p;
    if (defined($r_p_h_ary_R)){
		@re_p = reverse @{$r_p_h_ary_R};
		push (@all_seq_primers, \@re_p);
    }
    else{
		print "Warning: Reversive primer hash is not defined in get_all_seq_primers()\n";
    }

    if (defined($r_de_p_h_ary_R)){
		@re_de = reverse @{$r_de_p_h_ary_R};
		push (@all_de_primers, \@re_de);
    }
    else{
		print "Warning: Reversive degenerate primer hash is not defined in get_all_seq_primers()\n";
    }

    if (defined($r_p_h_iupac_ary_R)){
		@re_iupac = reverse @{$r_p_h_iupac_ary_R};
		push (@all_seq_primers_iupac, \@re_iupac);
    }
    else{
		print "Warning: Reversive iupac primer hash is not defined in get_all_seq_primers()\n";
    }

    if (defined($r_bad_p_ary_R)){
		@re_bad_p = reverse @{$r_bad_p_ary_R};
		push (@all_invalid_primers, \@re_bad_p);
	}
    else{
		print "Warning: Reversive invalid primer hash is not defined in get_all_seq_primers()\n";
    }

    return \@all_seq_primers, \@all_de_primers, \@all_seq_primers_iupac, \@all_invalid_primers;
  }


# =================================================
# Search primers by sliding a window along the sequence in certain direction.
# Apply restrictions and find all primers in this direction (F or R).
# Input:
#    sequence length
#    reference of sequence hash
#    reference of frequency hash
#    reference of sequence hash with IUPAC code
#    direction
# Output: 
#    Array reference of valid primer hash
#    Array reference of degenerate primer hash
#    Array reference of valid primer hash in IUPAC code
#    Array reference of invalid primer hash
# =================================================
sub _get_one_seq_primers {
    my $this = shift;

    my ($seqLen, $r_seq_hash,$r_seq_fr_hash, $r_seq_iupac_hash, $direction, $gapString) = @_;

    my $size = $this->{PRIMER_SIZE};
    my $step = $size - $this->{OVERLAP_LENGTH}; 
    my ($key, $seq_iupac) = each(%$r_seq_iupac_hash); # iupac has only one consensus

    my %primer_h = ();
    my $r_pri_h_ary;      # array reference for valid primers 
    my $r_de_pri_ary;     # array reference for primers with degeneratd bases
    my $r_iupac_pri_ary;  # array reference for primers with iupac code
    my $r_bad_pri_ary;    #array reference for invalid primers

    my $start_p;   # primer start position
    my $pri_name;
    my $gap_flag="-"; # 1 - gap exist in original alignment seq for this primer

    my $align_num = int($this->{ALIGN_NUM});

    my $tail_flag = 0; #if primer candidate outside alignment range, tail_flag=1

    for (my $i=0; $i<$seqLen; $i=$i+$step) {
		# primer length will less than $size 
		if ($i+$size-1>=$seqLen) {
			last;
		}

		if($i+$size-1<=$this->{ALIGN_START} ||$i>=$this->{ALIGN_END}) {
			$tail_flag =1;
		}
		else {
			$tail_flag =0;
		}

		foreach my $desc (sort keys %{$r_seq_fr_hash}) {
			$gap_flag="-";
			my $subseq_check = substr($r_seq_hash->{$desc}, $i,$size);
			my @subseq_fr=(); # freq array for a subseq 
		
			$subseq_check =~ tr/[a-z]/[A-Z]/;
			my $subseq = $subseq_check;
			my @tmpSubseq = split//,$subseq;

			# treat differently on the two tails outside alignment
			if($tail_flag!=1) { 
				for(my $k=0; $k<$size; $k++) {
					$subseq_fr[$k] = $r_seq_fr_hash->{$desc}->[$k+$i];

					# make degenerated base lower case
					if ($subseq_fr[$k] != $align_num && $subseq_fr[$k]/$align_num<$this->{DE_THRESHOLD}) {
						$tmpSubseq[$k] =~ tr/[A-Z]/[a-z]/;
					}
			   }
			   $subseq = join('',@tmpSubseq);
			}
			my $subseq_iupac = substr($seq_iupac, $i,$size);

			# check gap existence in original alignment
			my $sub_gapStr = substr($gapString, $i,$size);
			if ($sub_gapStr =~/1/) {
				$gap_flag = "y";
			}

			if ($direction eq "F") {
				$start_p = $i+1; # postion starts at 1
				$pri_name = $this->{NAME}."_".$start_p."_".$direction;	
			}
			else {
				$start_p = $seqLen-$i; # pos at forward stand from 5' to 3'
				$pri_name = $this->{NAME}."_".$start_p."_".$direction;
			}

			# check primer size
			# designed primer cannot be less than size
			if (length($subseq_check)<$size) {
				last;
			}	

			# check degenerate base number and frequence
			# no constrains of number of degenerated base in a primer
			# frequence of degenerate base mush over threshold (such as 80%)
			my $de_flag =0;
			my $r_de_freq  = &create_new_ary();     # array reference for the frequence values of all "defined" degenerate at this pos. If frequence of real degenerate is over DE_THRESHOLD, it is not treated as degenerate
			my $r_real_de_freq  = &create_new_ary();     # array reference for the frequence values of all real degenerate at this pos
			my $de_num_check =1;
			my $de_freq_check =1;	

			# only check degenerate number and degenerate freq inside alignment range
			if ($tail_flag!=1) {
				($de_num_check,$de_freq_check,$subseq, $r_de_freq, $de_flag, $r_de_pri_ary,$r_bad_pri_ary)=$this->_check_de_count_freq(\@subseq_fr,$subseq,$pri_name, $desc,$de_flag, $r_de_pri_ary, $r_de_freq,$r_bad_pri_ary, $r_real_de_freq );
				# degenerate number check fails
				if ($de_num_check !=1 ) {
					last;
				}
				# degenerate frequence check fails
				if ($de_freq_check !=1 ) {
					if ($de_flag ==1) { next;}   # continue to search in the minority seq
					if ($de_flag ==0) { last;} 
				}
			}

			# check duplicate primer in primer hash array
			my $dupli;
			if (defined($r_pri_h_ary) ) {
				($dupli,$r_pri_h_ary, $r_de_pri_ary,$r_iupac_pri_ary, $r_bad_pri_ary) = _check_duplicate_primer($r_pri_h_ary,$subseq,$pri_name, $desc, $r_de_freq, $de_flag,$r_de_pri_ary,$r_iupac_pri_ary, $r_bad_pri_ary);
				# duplicate primer
				if ($dupli ==1) {
					if ($de_flag ==1) { next;}   # continue to search in the minority seq
					if ($de_flag ==0) { last;}   # no degenerate,stop searching in minority seq, go to next pos
				}
			}

			# check monomers
			my $mono=1;
			($mono,$r_de_pri_ary,$r_bad_pri_ary) = $this->_check_monomers($subseq_check,$subseq,$pri_name,$desc, $r_de_freq, $de_flag, $r_de_pri_ary,$r_bad_pri_ary);
			if ($mono !=1 ) {
				if ($de_flag ==1) { next;}   # continue to search in the minority seq
				if ($de_flag ==0) { last;} 
			}

			# check TM
			my $tm_check=1;
			my $tm;
			($tm_check,$tm, $r_de_pri_ary,$r_bad_pri_ary) = $this->_get_TM($subseq_check,$subseq,$pri_name,$desc, $r_de_freq, $de_flag, $r_de_pri_ary,$r_bad_pri_ary);

			if ($tm_check !=1 ) {
				if ($de_flag ==1) { next;}   # continue to search in the minority seq
				if ($de_flag ==0) { last;} 
			}

			# check GC
			my $gc_check=1;
			my $gc;
			($gc_check,$gc, $r_de_pri_ary,$r_bad_pri_ary) = $this->_get_GC_percent($subseq_check,$subseq,$pri_name,$desc, $r_de_freq, $de_flag, $r_de_pri_ary, $r_bad_pri_ary );

			if ($gc_check !=1 ) {		
				if ($de_flag ==1) { next;}   # continue to search in the minority seq
				if ($de_flag ==0) { last;} 
			}

			# check hairpin
			my $hairpin_check =1;
			($hairpin_check,$r_de_pri_ary,$r_bad_pri_ary) = $this->_check_hairpin($subseq_check,$subseq,$pri_name,$desc, $r_de_freq, $de_flag, $r_de_pri_ary,$r_bad_pri_ary);
			if ($hairpin_check !=1 ) {
				if ($de_flag ==1) { next;}   # continue to search in the minority seq
				if ($de_flag ==0) { last;} 
			}

			# check self-complement
			my $self_compl_check =1;
			($self_compl_check,$r_de_pri_ary,$r_bad_pri_ary) = $this->_check_self_complement($subseq_check,$subseq,$pri_name,$desc, $r_de_freq, $de_flag, $r_de_pri_ary,$r_bad_pri_ary);
			if ( $self_compl_check !=1 ) {
				if ($de_flag ==1) { next;}   # continue to search in the minority seq
				if ($de_flag ==0) { last;} 
			}
		
			# === Now, primer candidate passes all checks ======
			# store valid primer hash to array
			# my $r_primer_h =  &_create_primer_hash($desc, $pri_name,$start_p,$subseq,$gc,$tm, $size, $r_de_freq, $direction, $gap_flag);
			my $r_primer_h =  &_create_primer_hash($desc, $pri_name,$start_p,$subseq,$gc,$tm, $size, $r_de_freq, $direction, $gap_flag,$r_real_de_freq);
			push (@{$r_pri_h_ary}, $r_primer_h);

			# store valid iupac primer hash to array
			my $r_pri_iupac_h =  &_create_primer_iupac_hash($pri_name, $subseq_iupac);
			push (@{$r_iupac_pri_ary}, $r_pri_iupac_h);

			# put valid degenerate base primer to array
			my $valid = "y";
			my $fail_reason ="--";
			if ($de_flag == 1) {
				my $r_de_pri_h =  &_create_de_primer_hash($desc, $pri_name,$subseq,"n", $gc, $tm, "<".$this->{MONOMERS}, "n","<".$this->{DE_BASE_NUM}, ">".$this->{FREQ_THRESHOLD}, $r_de_freq, $valid,$fail_reason);
				push (@{$r_de_pri_ary}, $r_de_pri_h);
			}

			# not need find minority primer for non-degenerate primer or tail primer
			if ($de_flag ==0 || $tail_flag ==1 ) { last;}  

		} # foreach my $desc
    }
	
	return $r_pri_h_ary,$r_de_pri_ary,$r_iupac_pri_ary,$r_bad_pri_ary;
}

# =================================================
# Create a new array space
# Input: None
# Output:
#    a new array reference
# =================================================
sub create_new_ary{
	my @ary = ();
    return \@ary;
}

# =================================================
# To create hash to store the info of one primer
# Input:
#   gene_name, primer_name, start_position, sequence, gc_value, tm_value
#   primer_size, array reference of frequency for each degenerate, direction
# Output:
#   hash reference of the primer
# =================================================
sub _create_primer_hash {
    my ($gene_name, $pri_name, $start_pos, $seq, $gc, $tm, $size,$r_de_fr_pent,$direction, $gapFlag, $r_real_de_fr_pent) = @_;

    my $r_p_h;

    $r_p_h->{"gene_name"} = $gene_name;
    $r_p_h->{"primer_name"} = $pri_name;
    $r_p_h->{"start_position"} = $start_pos;
    $r_p_h->{"sequence"} = $seq;
    $r_p_h->{"GC_percent"} = $gc;
    $r_p_h->{"TM"} = $tm;
    $r_p_h->{"size"} = $size;
    $r_p_h->{"frequence"} = $r_de_fr_pent;
    $r_p_h->{"direction"} = $direction;
    $r_p_h->{"gap"} = $gapFlag;
    $r_p_h->{"real_frequence"} = $r_real_de_fr_pent;

    return $r_p_h;
}

# =================================================
# To create hash to store the invalid primer
# Input:
#   gene_name, primer_name, sequence, fail_reason
# Output:
#   hash reference of the invalid primer
# =================================================
sub _create_bad_primer_hash {
    my ($gene_name, $pri_name, $seq, $r_de_fr_pent, $fail_reason) = @_;

    my $r_p_h;

    $r_p_h->{"gene_name"} = $gene_name;
    $r_p_h->{"primer_name"} = $pri_name;
    $r_p_h->{"sequence"} = $seq;
    $r_p_h->{"frequence"} = $r_de_fr_pent;
    $r_p_h->{"fail_reason"} = $fail_reason;

    return $r_p_h;
}

# =================================================
# To create hash to store the info of one iupac primer
# Input:
#   primer_name, primer sequence in IUPAC code
# Output:
#   hash reference of the primer in IUPAC code
# =================================================
sub _create_primer_iupac_hash {
    my ($pri_name, $seq_iupac) = @_;

    my $r_p_h;

    $r_p_h->{"primer_name"} = $pri_name;
    $r_p_h->{"sequence"} = $seq_iupac;

    return $r_p_h;
  }


# =================================================
# To create hash to store the info of one primer with degenerate base
# Input:
#   gene_name, primer_name, primer sequence, duplicate_flag, gc_value
#   tm_value, monomer_value, degenerate_num, frequency_threshold
#   array reference of degenerate frequency, valid_flag
# Output:
#   hash reference of the primer with degenerate
# =================================================
sub _create_de_primer_hash {
    my ($gene_name, $pri_name, $seq, $dupli,$gc, $tm, $mono,$compl,$degeNum,$thresCheck, $r_de_fr_pent, $valid,$fail_reason) = @_;

    my $r_p_h;

    $r_p_h->{"gene_name"} = $gene_name;
    $r_p_h->{"primer_name"} = $pri_name;
    $r_p_h->{"sequence"} = $seq;
    $r_p_h->{"duplicate"} = $dupli;
    $r_p_h->{"GC_percent"} = $gc;
    $r_p_h->{"TM"} = $tm;
    $r_p_h->{"monomer"} = $mono;
    $r_p_h->{"complement"} = $compl;
    $r_p_h->{"degenerate"} = $degeNum;
    $r_p_h->{"threshold_check"} = $thresCheck; 
    $r_p_h->{"frequence"} = $r_de_fr_pent;
    $r_p_h->{"valid"} = $valid;
    $r_p_h->{"FailReason"} = $fail_reason;

    return $r_p_h;
}


# =================================================
# Check if primer candidate has duplication in valid primer set.
# If yes, remove duplicate primer from valid primer set.
# Input:
#   array reference of valid primer hash, primer_sequence, primer_name, 
#   gene_name, array reference of degenerate frequency, degenerate_flag,
#   array refernece of degenerate primer hash
#   array reference of valid primer hash in IUPAC code
#   array reference of invalid primer hash
# Output:
#   duplicate_flag (1 true, 0 false)
#   array reference of valid primer hash
#   array refernece of degenerate primer hash
#   array reference of valid primer hash in IUPAC code
#   array reference of invalid primer hash
# =================================================
sub _check_duplicate_primer {
    my ($r_h_ary,$subseq, $p_name, $desc,$r_de_freq,$de_flag,$r_de_p_h_ary,$r_iupac_pri_ary, $r_bad_p_h_ary) = @_;

    my $duplicate = 0;  # flag for duplicate
    my $valid  ="y";
    my $fail_reason = "--";

    for(my $p=0; $p<scalar(@{$r_h_ary}); $p++) {
      	my $tmp1 = $r_h_ary->[$p]->{"sequence"};
      	$tmp1 =~ tr/[a-z]/[A-Z]/;
      	my $tmp2 = $subseq;
      	$tmp2 =~ tr/[a-z]/[A-Z]/;
      	
      	# new primer seq is the same as one in the valid primer array
		if ($tmp1 eq $tmp2) {
			$duplicate = 1;
			$valid = "n";
		  
			# if new primer has no same primer name as the one in valid pri array
			# the real duplicate primer
			if ($r_h_ary->[$p]->{"primer_name"} ne $p_name) {

				# insert bad primer hash to bad_primer_hash array
				$fail_reason = "Duplicated primer";
				my $r_bad_r_h = &_create_bad_primer_hash($desc,$p_name,$subseq,$r_de_freq,$fail_reason);
				push (@{$r_bad_p_h_ary}, $r_bad_r_h);
				# also old primer is duplicate primer, go to bad_primer_hash array
				my $r_old_bad_r_h = &_create_bad_primer_hash($r_h_ary->[$p]->{"gene_name"},$r_h_ary->[$p]->{"primer_name"},$r_h_ary->[$p]->{"sequence"},$r_de_freq,$fail_reason);
				push (@{$r_bad_p_h_ary}, $r_old_bad_r_h);

				#remove duplicate primer from valid primer array
				splice(@{$r_h_ary}, $p, 1);
				splice(@{$r_iupac_pri_ary}, $p, 1);

				foreach my $a (@{$r_de_p_h_ary}) {
					my $tmp_seq= $a->{"sequence"};
					$tmp_seq =~ tr/[a-z]/[A-Z]/;

					if ($tmp_seq eq $tmp1) {
						$a->{"duplicate"}="y";
						$a->{"valid"} = "n";
					}
				}	     

				# insert invalid degenerate primer to array
				if ($de_flag ==1) {
					my $r_de_pri_h =  &_create_de_primer_hash($desc, $p_name,$subseq,"y", "--", "--", "--","--","--","--", $r_de_freq, $valid,$fail_reason);
					push (@{$r_de_p_h_ary}, $r_de_pri_h);
				}
			}  # if ne
			last;  # if one duplicate found, stop
		}
    }

    return $duplicate,$r_h_ary,$r_de_p_h_ary,$r_iupac_pri_ary, $r_bad_p_h_ary;
}

# =================================================
# To check if degenerate bases exist in IUPAC sequence
# Input: 
#   sequence, number of degenerate
# Output: 
#   1 - true
#   0 - false
# =================================================
sub _check_degenerate_exist {
    my ($a_seq, $deg_num) = @_;

    my @seq = split(//, $a_seq);

    my @deg_base = grep{/[RYMKWSBDHVN]/i} @seq;
    if (scalar(@deg_base) < 1) {
        return 0;
    }
    else {
		return 1;
    }

}


# =================================================
# To check whether GC% of a sequence is out of the specified range
# Store invalid primer to invalid primer array
# Put degenerate primer to degenerate primer array
# Input:
#   primer sequence, formated primer sequence, primer_name,
#   gene_name, array reference of degenerate frequency, degenerate_flag,
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# Output:
#   gc_flag (1 true, 0 false)
#   average GC%
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# =================================================
sub _get_GC_percent {
    my $this = shift;
    my ($a_seq,$subseq,$pri_name, $desc, $r_de_freq, $de_flag, $r_de_p_h_ary,$r_bad_p_h_ary) = @_;

    $a_seq =~ s/U/T/ig;  # IUPAC code "U" represents "T"

    my $gc_count_min=0;
    my $gc_count_max=0;
    my $valid = "y";
    my $fail_reason ="--";

    for (my $i=0; $i<length($a_seq); $i++) {
		my $base = substr($a_seq, $i, 1);
		$base =~ tr/[a-z]/[A-Z]/;
		if ($base eq "G" || $base eq "C") 
		{
			$gc_count_min++;
			$gc_count_max++;
		}
		# -- check IUPAC base code
		elsif ($base eq "R")    # R represents A or G
		{
			$gc_count_max++; #if R represents G
		}
		elsif ($base eq "Y")    # Y represents C or T
		{
			$gc_count_max++; #if Y represents C
		}
		elsif ($base eq "S")    # S represents G or C
		{
			$gc_count_min++;
			$gc_count_max++;
		}
		elsif ($base eq "W")    # W represents A or T
		{
		}
		elsif ($base eq "K")    # K represents G or T
		{
			$gc_count_max++;  #if K represents G
		}
		elsif ($base eq "M")    # M represents A or C
		{
			$gc_count_max++;  #if M represents C
		}
		elsif ($base eq "B")    # B represents C or G or T
		{
			$gc_count_max++;  #if B represents C or G
		}
		elsif ($base eq "D")    # D represents A or G or T
		{
			$gc_count_max++;  #if B represents G
		}
		elsif ($base eq "H")    # H represents A or C or T
		{
			$gc_count_max++;  #if H represents C
		}
		elsif ($base eq "V")    # V represents A or C or G
		{
			$gc_count_max++;  #if V represents C or G
		}
		elsif ($base eq "N")    # N represents any base
		{
			$gc_count_max++;  #if N represents C or G
		}

    }

    my $gc_percent_min = 100*($gc_count_min/length($a_seq));
    my $gc_percent_max = 100*($gc_count_max/length($a_seq));
    my $gc_percent_avg = sprintf("%.2f", ($gc_percent_min+$gc_percent_max)/2 );

    my $gc_check =1;
    if ( int($gc_percent_max)<int($this->{MIN_GC_PERCENT}) || int($gc_percent_min)>int($this->{MAX_GC_PERCENT}) ) {
		# insert bad primer hash to bad_primer_hash array
		if ($gc_percent_min==$gc_percent_max) {
			$fail_reason ="GC_MIN%=GC_MAX%=$gc_percent_min Out of range";
			my $r_bad_r_h = &_create_bad_primer_hash($desc,$pri_name,$subseq,$r_de_freq,$fail_reason);
			push (@{$r_bad_p_h_ary}, $r_bad_r_h);
		}
		else {
			$fail_reason ="GC_MIN%=$gc_percent_min,GC_MAX%=$gc_percent_max,GC_AVG%=$gc_percent_avg Out of range";
			my $r_bad_r_h = &_create_bad_primer_hash($desc,$pri_name,$subseq,$r_de_freq,$fail_reason);
			push (@{$r_bad_p_h_ary}, $r_bad_r_h); 
		}

		$valid = "n";
		# insert invalid degenerate primer to array
		if ($de_flag==1) {
			my $r_de_pri_h =  &_create_de_primer_hash($desc, $pri_name,$subseq,"n",$gc_percent_avg, "--", "--", "--","--", "--", $r_de_freq, $valid,$fail_reason);
			push (@{$r_de_p_h_ary}, $r_de_pri_h);
		}

		$gc_check = 0;
    }

    return $gc_check, $gc_percent_avg, $r_de_p_h_ary, $r_bad_p_h_ary;

}


# =================================================
# Check whether TM is out of specified range
# Store invalid primer to invalid primer array
# Put degenerate primer to degenerate primer array
# Input:
#   primer sequence, formated primer sequence, primer_name,
#   gene_name, array reference of degenerate frequency, degenerate_flag,
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# Output:
#   tm_flag (1 true, 0 false)
#   tm value
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# =================================================
sub _get_TM {
    my $this = shift;
    my ($a_seq,$subseq,$pri_name, $desc, $r_de_freq, $de_flag, $r_de_p_h_ary, $r_bad_p_h_ary) = @_;

    # substitute IUPAC code R, Y, S, W, K, M, B, D, H, V, N, ., - to "N" for tm calculation
    $a_seq =~ s/[\.|\-]/N/ig;

    my $tm = 0;
    if ($a_seq =~ /[R|Y|S|W|K|M|B|D|H|V]/ig )
    {
		$tm = &_get_tm_avg_only($a_seq, $this->{DNA_CONCENTRATION}, $this->{SALT_CONCENTRATION});
    }
    else
    {
		$tm = &_tm($a_seq, $this->{DNA_CONCENTRATION}, $this->{SALT_CONCENTRATION});
    }

    my @de_freq=();
    my $tm_check = 1;
    my $valid = "y";
    my $fail_reason ="--";

    if ( int($tm)<int($this->{MIN_TM}) || int($tm)>int($this->{MAX_TM}) )
    {
		# insert bad primer hash to bad_primer_hash array
		$fail_reason = "TM=$tm Out of range";
		my $r_bad_r_h = &_create_bad_primer_hash($desc,$pri_name,$subseq,$r_de_freq,$fail_reason);
		push (@{$r_bad_p_h_ary}, $r_bad_r_h); 

		$valid = "n";

		# insert invalid degenerate primer to array
		if ($de_flag==1)
		{

			my $r_de_pri_h =  &_create_de_primer_hash($desc, $pri_name,$subseq,"n", "--", $tm, "--", "--","--", "--",$r_de_freq, $valid,$fail_reason);
			push (@{$r_de_p_h_ary}, $r_de_pri_h);

		}

		$tm_check = 0;
    }

    return $tm_check, $tm, $r_de_p_h_ary, $r_bad_p_h_ary;

}


# =================================================
# Check whether monomers are out of range
# Store invalid primer to invalid primer array
# Put degenerate primer to degenerate primer array
# Input:
#   primer sequence, formated primer sequence, primer_name,
#   gene_name, array reference of degenerate frequency, degenerate_flag,
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# Output:
#   monomer_flag (1 true, 0 false)
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# =================================================
sub _check_monomers {
    my $this = shift;
    my ($a_seq, $subseq,$pri_name, $desc, $r_de_freq, $de_flag, $r_de_p_h_ary,$r_bad_p_h_ary)= @_;

    my @de_freq=();
    my $valid = "y";
    my $fail_reason ="--";

    my $polyA = "A";
    my $polyT = "T";
    my $polyG = "G";
    my $polyC = "C";
    for (my $i=1;$i<=$this->{MONOMERS}; $i++)
    {
		$polyA .="A";
		$polyT .="T";
		$polyG .="G";
		$polyC .="C";
    }

    my $mono_check = 1;
    if ($a_seq =~/$polyA/i || $a_seq =~/$polyT/i || $a_seq =~/$polyG/i || $a_seq =~/$polyC/i )
    {
		# insert bad primer hash to bad_primer_hash array
		$fail_reason ="monomers more than $this->{MONOMERS}";
		my $r_bad_r_h = &_create_bad_primer_hash($desc,$pri_name,$subseq,$r_de_freq,$fail_reason);
		push (@{$r_bad_p_h_ary}, $r_bad_r_h); 

		$valid = "n";
		# insert invalid degenerate primer to array
		if ($de_flag ==1)
		{

			my $tmp = ">".$this->{MONOMERS};
			my $r_de_pri_h =  &_create_de_primer_hash($desc, $pri_name,$subseq,"n", "--", "--", $tmp,"--", "--", "--",$r_de_freq, $valid,$fail_reason);
			push (@{$r_de_p_h_ary}, $r_de_pri_h);

		}
		$mono_check = 0;
	}
		
    return $mono_check, $r_de_p_h_ary, $r_bad_p_h_ary;
  }

# =================================================
# To check whether degenerate base's frequency is above threshold
# Store invalid primer to invalid primer array
# Put degenerate primer to degenerate primer array
# Input:
#   primer sequence, formated primer sequence, primer_name,
#   gene_name, degenerate_flag,array reference of degenerate frequency,
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# Output:
#   degenerate_number_flag (1 true, 0 false),
#   frequency_threshold_flag (1 true, 0 false),
#   formatted primer sequence,
#   array reference of degenerate frequency for this primer candidate,
#   degenerate_flag (1 true, 0 false),
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# =================================================
sub _check_de_count_freq {
    my $this = shift;
    my ($r_subseq_fr,$subseq,$pri_name, $desc, $de_flag, $r_de_p_h_ary,  $r_de_freq,$r_bad_p_h_ary, $r_real_de_freq)= @_;

	my $de_num_check = 1;
	my $de_freq_check =1; # check if frequency less than threshold
	my $freq = $this->{ALIGN_NUM}; #default as majority
	my $valid ="y";
	my $de_count =0;   # count the number of degenerate base in a primer candicate
	my $fail_reason ="--";


    for (my $j=0; $j<length($subseq); $j++)
    {
		# Record if degenerate base exists, some degenerate maybe not be treated as degenerate base as its frequency is over DE_THRESHOLD.
		if ( $r_subseq_fr->[$j] != $this->{ALIGN_NUM} )
		{
			$freq = $r_subseq_fr->[$j];
			push @{$r_real_de_freq}, $freq;
		}

		# for defined degenerate base
		if ( ($r_subseq_fr->[$j] != $this->{ALIGN_NUM}) && ($r_subseq_fr->[$j]/$this->{ALIGN_NUM}< $this->{DE_THRESHOLD}) )  # if freq>=DE_THRESHOLD, it is not degenerate
		{	
			$de_flag = 1;
			$de_count ++;

			$freq = $r_subseq_fr->[$j];
			push @{$r_de_freq}, $freq;

			# check the total number of degenerate bases
			if ($de_count >  $this->{DE_BASE_NUM} )
			{
				$de_num_check = 0;
				$valid = "n";  # not a valid primer

				# insert bad primer hash to bad_primer_hash array
				$fail_reason ="Degenerated base more than $this->{DE_BASE_NUM}";
				my $r_bad_r_h = &_create_bad_primer_hash($desc,$pri_name,$subseq,$r_de_freq,$fail_reason);
				push (@{$r_bad_p_h_ary}, $r_bad_r_h); 

				last;
			}

			# check degenerate frequency threshold
			# if frequency is too low, this de_primer can't be trusted
			if ( $freq/$this->{ALIGN_NUM} < $this->{FREQ_THRESHOLD} )
			{
				$de_freq_check = 0;
				$valid = "n";  # not a valid primer

				# insert bad primer hash to bad_primer_hash array
				$fail_reason ="Degenerated frequency less than $this->{FREQ_THRESHOLD}";
				my $r_bad_r_h = &_create_bad_primer_hash($desc,$pri_name,$subseq,$r_de_freq,$fail_reason);

				push (@{$r_bad_p_h_ary}, $r_bad_r_h); 
				next;
	        }
	    }
	}

    # insert invalid degenerate primer to array, valid ones will be inserted at the end of all checks
    if ($de_freq_check!=1 && $de_flag ==1)
    {
		# check if this de primer exists in de primer hash array
		my $de_exist_flag= 0; # value=1, degenerate hash array already has this primer 
		foreach my $r_h (@{$r_de_p_h_ary})
		{
			if ( ($r_h->{"primer_name"} eq $pri_name) && ($r_h->{"sequence"} eq $subseq)) 
			{
				$de_exist_flag= 1;
				last;
			}
		}

		# insert into de primer hash array
		if ($de_exist_flag==0)
		{
			my $r_de_pri_h =  &_create_de_primer_hash($desc, $pri_name,$subseq,"n", "--", "--", "--", "--", "--","<".$this->{FREQ_THRESHOLD},$r_de_freq, $valid,$fail_reason);
			push (@{$r_de_p_h_ary}, $r_de_pri_h);
		}
    }

    return $de_num_check,$de_freq_check,$subseq, $r_de_freq, $de_flag,$r_de_p_h_ary, $r_bad_p_h_ary;
}

# =================================================
# To calculate TM (melting temperature)
# Input:
#   primer sequence, dna_concentration, salt_concentration
# Output:
#   tm value
# =================================================
sub _tm{
    my ($seq,$dna_conc,$salt_conc) = @_;
    $seq =~ tr/[a-z]/[A-Z]/;

    my $t0=273.15;
    my $logdna = 1.987 * log($dna_conc/4000000000);
    my $logsalt = 16.6 * log($salt_conc/1000) / log(10);
    my $entha = &_enthalpy($seq);
    my $entro = &_entropy($seq);
    my $r = ($entha*1000) / ($entro + $logdna) - $t0 + $logsalt;

    return sprintf("%.2f",$r);	# rounding to 2 decimals

}

# =================================================
# To calculate entropy value
# Input:
#   primer sequence
# Output:
#   entropy value
# =================================================
sub _entropy {
    my $seq = $_[0];
    my $S = 108;
    foreach (0..length($seq)-2) {          # length -2 because 0-indexed and taking duplets
		$S += $ENT{substr($seq,$_,2)};
    }
    my $r = -0.1*$S;
    return $r;
}

# =================================================
# To calculate enthalpy value
# Input:
#   primer sequence
# Output:
#   enthalpy value
# =================================================
sub _enthalpy {
    my $seq = $_[0];
    my $H = 0;
    foreach (0..length($seq)-2) {          # length -2 because 0-indexed and taking duplets 
		$H += $ENTH{substr($seq,$_,2)};
    }
    my $r = -0.1*$H;
    return $r;
}


# =================================================
# To print input alignment fasta file
# =================================================
sub print_input_fasta {
    my $this = shift;
    
    if ($this->{LOADED} ==0)
    {
		$this->load();
    }

    my $r_seq_hash = $this->{CONTENTS};
    foreach my $desc (keys %{$r_seq_hash})
    {
		my $seq = $r_seq_hash->{$desc};
		print ">$desc\n$seq\n";
    }

}

# =================================================
# To write all primers's info to output file
# Input:
#     Array reference of primer hash, output file
# Output:
#     Write to output file
# =================================================
sub write_all_gene_primer_info {
    my $this = shift;
    my ($r_all_gene_pri_ary, $outfile) = @_;

    my $content = "Name\tPrimer_Seq\tGC%\tTM\tGAP\t";

    for (my $i=0; $i<$this->{DE_BASE_NUM}; $i++)
    {
    	$content .= "de_Freq_".($i+1)."\t";
    }

    $content .= "\n";

    foreach my $r_a_gene_pri_ary (@{$r_all_gene_pri_ary})
    {
		foreach my $r_p_h (@{$r_a_gene_pri_ary})
		{
			$content .= $r_p_h->{"primer_name"}."\t";
			$content .= $r_p_h->{"sequence"}."\t";
			$content .= $r_p_h->{"GC_percent"}."\t";
			$content .= $r_p_h->{"TM"}."\t";
			$content .= $r_p_h->{"gap"}."\t";

			if ( scalar(@{$r_p_h->{"frequence"}})==0 )
			{
				$content .="--\t--\t--\n";
			}
			else
			{ 
				for (my $i=0; $i<$this->{DE_BASE_NUM}; $i++)
				{
					if (defined($r_p_h->{"frequence"}->[$i]) )
					{
						$content .= $r_p_h->{"frequence"}->[$i]."/".$this->{ALIGN_NUM}."\t";
					}
					else
					{
						$content .= "--\t";
					}
				}
				$content .= "\n";
			}
		}
    }

	#print "\nWriting all primers info to $outfile\n\n";
    &string_to_file($content, $outfile);
}

# =================================================
# To write all primers from major alignment sequence to fasta file
# Input:
#     Array reference of primer hashfrom major alignment, output file
# Output:
#     Write to output file
# =================================================
sub write_all_gene_primer_fasta_major {
    my $this = shift;
    my ($r_all_gene_pri_ary, $outfile) = @_;

    my $content;
    foreach my $r_a_gene_pri_ary (@{$r_all_gene_pri_ary})
    {
		foreach my $r_p_h (@{$r_a_gene_pri_ary})
		{
			$content .= ">".$r_p_h->{"primer_name"}."\n";
			$content .= $r_p_h->{"sequence"}."\n";
		}
    }

	#print "\nWriting all primers to fasta file $outfile\n\n";
    &string_to_file($content, $outfile);
}

# =================================================
# To write all invalid primer info to file
# Input:
#     Array reference of invalid primer hash, output file
# Output:
#     Write to output file
# =================================================
sub write_all_invalid_primer {
    my $this = shift;
    my ($r_all_bad_pri_ary, $outfile) = @_;

    my $content = "Primer_Name\tConsensus_Source\tPrimer_Seq\tFail_Reason\t";

    for (my $i=0; $i<$this->{DE_BASE_NUM}; $i++)
    {
    	$content .= "de_Freq_".($i+1)."\t";
    }
    $content .= "\n";

    foreach my $r_a_bad_pri_ary (@{$r_all_bad_pri_ary})
    {

		foreach my $r_p_h (@{$r_a_bad_pri_ary})
		{
			$content .= $r_p_h->{"primer_name"}."\t";
			$content .= $r_p_h->{"gene_name"}."\t";
			$content .= $r_p_h->{"sequence"}."\t";
			$content .= $r_p_h->{"fail_reason"}."\t";

			if ( scalar(@{$r_p_h->{"frequence"}})==0 )
			{
				$content .="--\t--\t--\n";
			}
			else
			{ 
				for (my $i=0; $i<$this->{DE_BASE_NUM}; $i++)
				{
					if (defined($r_p_h->{"frequence"}->[$i]) )
					{
						$content .= $r_p_h->{"frequence"}->[$i]."/".$this->{ALIGN_NUM}."\t";
					}
				   else
					{
						$content .= "--\t";
					}
				}
				$content .= "\n";
			}
		}
	}

	&string_to_file($content, $outfile);
}

# =================================================
# To write iupac sequence of primers from major alignment to fasta file
# Input:
#     Array reference of primer hash with iupac code, output file
# Output:
#     Write to output file
# =================================================
sub write_all_gene_primer_fasta_iupac {
    my $this = shift;
    my ($r_all_gene_pri_iupac_ary, $outfile) = @_;

    my $content;
    my @non_dupli_pName=(); 
    foreach my $r_a_pri_iupac_ary (@{$r_all_gene_pri_iupac_ary})
    {
		foreach my $r_p_h (@{$r_a_pri_iupac_ary})
		{
			# make sure no duplicate primer name in iupac primer fasta
			if (scalar(@non_dupli_pName)>0 && elem_present_in_arr(\@non_dupli_pName, $r_p_h->{"primer_name"})==1 )  { next; }

			$content .= ">".$r_p_h->{"primer_name"}."\n";
			$content .= $r_p_h->{"sequence"}."\n";
			push (@non_dupli_pName,$r_p_h->{"primer_name"});
		}
    }

    &string_to_file($content, $outfile);
}


# =================================================
# To write all acceptalbe primers in IUPAC code to fasta file
# Acceptable primers:
#   Valid non-degenerate primer, or, 
#   if at least one degenerate primer at a certain position is valid, and if 
# other degenerate primer candidates at the same position are invalid only 
# because the degenerate base frequency is lower than freq_threshold, those
# other primer candidates are acceptable primers in IUPAC code.
# Input:
#     Array reference of valid primer hash, array reference of degenerate
# primer hash, output fasta file, output primer seq and conservation file.
# Output:
#     outfile1 - valid primer iupac fasta
#     outfile2 - valid primer iupac sequence and conservation score
# =================================================
sub write_all_valid_primer_fasta_iupac {
    my $this = shift;

    my ($r_all_gene_pri_ary, $r_de_pri_ary, $outfile1, $outfile2) = @_;

    my $content;
    my $content2;
    my @non_dupli_pName=(); 
    my @non_dupli_iupac_seq=();
    my %pri_conserv =();   # key: primerName, value: lowest conservation score for this primer

    foreach my $r_a_gene_pri_ary (@{$r_all_gene_pri_ary})
    {	
		foreach my $r_p_h (@{$r_a_gene_pri_ary})
		{
			if (!exists($pri_conserv{$r_p_h->{"primer_name"}}) )
			{
			  $pri_conserv{$r_p_h->{"primer_name"}} = 999;
			}

			if (scalar(@non_dupli_pName)>0 && elem_present_in_arr(\@non_dupli_pName, $r_p_h->{"primer_name"})==1 )  { next; }

			if ( scalar(@{$r_p_h->{"real_frequence"}})==0 )  # purely non-degenerate primer
			{
				if (elem_present_in_arr(\@non_dupli_iupac_seq, $r_p_h->{"sequence"})!=1)
				{
					$content .= ">".$r_p_h->{"primer_name"}."\n";
					$content .=$r_p_h->{"sequence"}."\n";
					$pri_conserv{$r_p_h->{"primer_name"}} = "1";   # no degenerate

					$content2 .= $r_p_h->{"primer_name"}."\t".$r_p_h->{"sequence"}."\t".$pri_conserv{$r_p_h->{"primer_name"}}."\n";
					push (@non_dupli_iupac_seq,$r_p_h->{"sequence"});
					push (@non_dupli_pName,$r_p_h->{"primer_name"});
				}
			}
			elsif ( scalar(@{$r_p_h->{"real_frequence"}})!=0 && scalar(@{$r_p_h->{"frequence"}})==0 ) # defined non-degenerate
			{
				if (elem_present_in_arr(\@non_dupli_iupac_seq, $r_p_h->{"sequence"})!=1)
				{
					$content .= ">".$r_p_h->{"primer_name"}."\n";
					$content .=$r_p_h->{"sequence"}."\n";

					$pri_conserv{$r_p_h->{"primer_name"}} = $this->{DE_THRESHOLD};   # defined non-degenerate has frequence over DE_THRESHOLD

					$content2 .= $r_p_h->{"primer_name"}."\t".$r_p_h->{"sequence"}."\t".$pri_conserv{$r_p_h->{"primer_name"}}."\n";
					push (@non_dupli_iupac_seq,$r_p_h->{"sequence"});
					push (@non_dupli_pName,$r_p_h->{"primer_name"});
				}
			}
			else  # degenerate primer
			{
				my @all_seq =();  # all degenerate primers with the same primer_name at this pos
				my @all_valid_seq=();#all vaild degenerate primers with the same primer name at this pos

				my @all_de_freq =();  # array of hash %de_freq, each item is %de_freq for one position	

				#get all degenerate primers with the same primer_name
				foreach my $r_a_de_pri_ary (@{$r_de_pri_ary})
				{
					foreach my $r_de_p_h (@{$r_a_de_pri_ary})
					{

						if ($r_de_p_h->{"primer_name"} eq $r_p_h->{"primer_name"})
						{
							if ($r_de_p_h->{"valid"} eq "y")
							{
								push @all_valid_seq, [split(//,$r_de_p_h->{"sequence"})];				
							}

							if ($r_de_p_h->{"valid"} eq "y" ||$r_de_p_h->{"FailReason"} =~ /Degenerated frequency less than/)   # use primer with fail reason as low degenerated freq in final iupac
							{
								push @all_seq, [split(//,$r_de_p_h->{"sequence"})];
								my $de_base_str=""; # all degenerate base string for this primer

								my @a_seq = split(//,$r_de_p_h->{"sequence"});
								for (my $i=0; $i<scalar(@a_seq); $i++)
								{
									if ($a_seq[$i] eq lc($a_seq[$i]))  # degenerate  base
									{
										$de_base_str .=$a_seq[$i];
									}
								}
				
								# get each de position's all de frequency
								for (my $k=0;$k<scalar(@{$r_de_p_h->{"frequence"}});$k++)
								{
									if (scalar(@all_de_freq)==0)
									{
										my %de_freq = (); # new hash, key: de base; value: de's freq
										push @all_de_freq, \%de_freq;
									}
									if (!exists($all_de_freq[$k]->{substr($de_base_str,$k,1)} ))
									{
										$all_de_freq[$k]->{substr($de_base_str,$k,1)} =$r_de_p_h->{"frequence"}->[$k];
									}
								}


							}

						} # if primer name eq
					} # foreach my $r_de_p_h
				} # foreach my $r_a_de_pri_ary

				# get lowest sum of frequency from all degenerate positions
				foreach my $a (@all_de_freq)
				{
					my $conser_sum=0;
					foreach my $de (keys %{$a})
					{
						$conser_sum = $conser_sum+$a->{$de};
					}
					if ( $pri_conserv{$r_p_h->{"primer_name"}}>$conser_sum) 
					{
						$pri_conserv{$r_p_h->{"primer_name"}} = $conser_sum;
					}
					
				}

				# get iupac for each position in the sequence
				my $pri_iupac_seq="";
				
				for (my $i=0; $i<length($r_p_h->{"sequence"}); $i++)
				{
					my @base_ary = ();  # array of bases of each seq at this pos
					my @valid_base_ary =();#array of bases of each vaild primer seq at this pos
					for (my $j=0; $j<scalar(@all_seq); $j++)
					{
						$all_seq[$j][$i] =~ tr/[a-z]/[A-Z]/;
						if (elem_present_in_arr(\@base_ary,$all_seq[$j][$i])!=1)
						{
							push @base_ary, $all_seq[$j][$i];
						}
					}

					for (my $m=0; $m<scalar(@all_valid_seq); $m++)
					{
						$all_valid_seq[$m][$i] =~ tr/[a-z]/[A-Z]/;
						if (elem_present_in_arr(\@valid_base_ary,$all_valid_seq[$m][$i])!=1)
						{
							push @valid_base_ary, $all_valid_seq[$m][$i];
						}
					}

					$pri_iupac_seq .= &assign_iupac(\@base_ary, \@valid_base_ary,$r_p_h->{"primer_name"});
				}

				$content .= ">".$r_p_h->{"primer_name"}."\n";
				$content .= $pri_iupac_seq."\n";
				$content2 .= $r_p_h->{"primer_name"}."\t".$pri_iupac_seq."\t".sprintf("%.2f",$pri_conserv{$r_p_h->{"primer_name"}}/$this->{ALIGN_NUM})."\n";

				push (@non_dupli_iupac_seq,$pri_iupac_seq);
				push (@non_dupli_pName,$r_p_h->{"primer_name"});

			} # else degenerate primer

		} # foreach my $r_p_h
    } # foreach my $r_a_gene_pri_ary 

    &string_to_file($content, $outfile1);
    &string_to_file($content2, $outfile2);
}

#===========================================================
# Convert multiple base representations at the same position to IUPAC code.
# Input: array reference of all base for a position in an invalid primer with
#           fail reason "Degenerated frequency less than",
#        array reference of all valid base representations for a position,
#        primer name
# Output: iupac code for this position
#===========================================================
sub assign_iupac {
    my ($r_base_ary, $r_valid_base_ary,$primer_name)=@_;

    my $iupac;
    my $item_num = scalar(@{$r_base_ary});
    if ($item_num ==1) 
    {
		$iupac = $r_base_ary->[0];
		return $iupac;
    }

    if ($item_num ==2 && elem_present_in_arr($r_base_ary,"A")==1 && elem_present_in_arr($r_base_ary,"G")==1 )
    {
		$iupac = "R";
    }
    elsif ($item_num ==2 && elem_present_in_arr($r_base_ary,"A")==1 && elem_present_in_arr($r_base_ary,"C")==1)
    {
		$iupac = "M";
    }
    elsif ($item_num ==2 && elem_present_in_arr($r_base_ary,"C")==1 && elem_present_in_arr($r_base_ary,"G")==1)
    {
		$iupac = "S";
    }
    elsif ($item_num ==2 && elem_present_in_arr($r_base_ary,"C")==1 && elem_present_in_arr($r_base_ary,"T")==1 )
    {
		$iupac = "Y";
    }
    elsif ($item_num ==2 && elem_present_in_arr($r_base_ary,"T")==1 && elem_present_in_arr($r_base_ary,"G")==1)
    {
		$iupac = "K";
    }
    elsif ($item_num ==2 && elem_present_in_arr($r_base_ary,"T")==1 && elem_present_in_arr($r_base_ary,"A")==1)
    {
		$iupac = "W";
    }
    elsif ($item_num ==3 && elem_present_in_arr($r_base_ary,"A")==1 && elem_present_in_arr($r_base_ary,"C")==1 && elem_present_in_arr($r_base_ary,"G")==1)
    {
		$iupac = "V";
    }
    elsif ($item_num ==3 && elem_present_in_arr($r_base_ary,"C")==1 && elem_present_in_arr($r_base_ary,"T")==1 && elem_present_in_arr($r_base_ary,"G")==1)
    {
		$iupac = "B";
    }
    elsif ($item_num ==3 && elem_present_in_arr($r_base_ary,"A")==1 && elem_present_in_arr($r_base_ary,"T")==1 && elem_present_in_arr($r_base_ary,"G")==1)
    {
		$iupac = "D";
    }
    elsif ($item_num ==3 && elem_present_in_arr($r_base_ary,"A")==1 && elem_present_in_arr($r_base_ary,"T")==1 && elem_present_in_arr($r_base_ary,"C")==1)
    {
		$iupac = "H";
    }
    elsif ($item_num ==4 && elem_present_in_arr($r_base_ary,"A")==1 && elem_present_in_arr($r_base_ary,"T")==1 && elem_present_in_arr($r_base_ary,"C")==1 && elem_present_in_arr($r_base_ary,"G")==1)
    {
		$iupac = "N";
		print "Warning: use N to represent unknown IUPAC in primer $primer_name\n";
    }
    elsif (elem_present_in_arr($r_base_ary,"N")==1)
    {
		$r_base_ary=&remove_elem_from_arr($r_base_ary, "N");
		$iupac = &assign_final_iupac_from_iupac($r_base_ary, $r_valid_base_ary,$primer_name);
    }
    else
    {
		# if primer seq has iupac, only use valid base in iupac base in final
		$iupac = &assign_final_iupac_from_iupac($r_base_ary, $r_valid_base_ary,$primer_name);
    }

    return $iupac;
}

#################################
sub assign_final_iupac_from_iupac {
    my ($r_base_ary,$r_valid_base_ary,$primer_name) =@_;
 
    my @final_base=();  # final output base set at this pos

    foreach my $a (@{$r_base_ary})
    {
		if (($a ne "R") && ($a ne "Y") && ($a ne "M") && ($a ne "K") && ($a ne "W") && ($a ne "S") && ($a ne "B") && ($a ne "D") && ($a ne "H") && ($a ne "V") )
		{
			if (scalar(@final_base)==0 || (scalar(@final_base)>0 && elem_present_in_arr(\@final_base,$a)!=1) )
			{
				push @final_base, $a;
			}
		}
		elsif ($a eq "R" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"A")==1 && elem_present_in_arr(\@final_base,"A")!=1)
			{
				push @final_base, "A";
			}
			if (elem_present_in_arr($r_valid_base_ary,"G")==1 && elem_present_in_arr(\@final_base,"G")!=1)
			{
				push @final_base, "G";
			}
		}
		elsif ($a eq "Y" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"C")==1 && elem_present_in_arr(\@final_base,"C")!=1)
			{
				push @final_base, "C";
			}
			if (elem_present_in_arr($r_valid_base_ary,"T")==1 && elem_present_in_arr(\@final_base,"T")!=1)
			{
				push @final_base, "T";
			}
		}
		elsif ($a eq "M" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"C")==1 && elem_present_in_arr(\@final_base,"C")!=1)
			{
				push @final_base, "C";
			}
			if (elem_present_in_arr($r_valid_base_ary,"A")==1 && elem_present_in_arr(\@final_base,"A")!=1)
			{
				push @final_base, "A";
			}
		}
		elsif ($a eq "K" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"T")==1 && elem_present_in_arr(\@final_base,"T")!=1)
			{
				push @final_base, "T";
			}
			if (elem_present_in_arr($r_valid_base_ary,"G")==1 && elem_present_in_arr(\@final_base,"G")!=1)
			{
				push @final_base, "G";
			}
		}
		elsif ($a eq "W" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"T")==1 && elem_present_in_arr(\@final_base,"T")!=1)
			{
				push @final_base, "T";
			}
			if (elem_present_in_arr($r_valid_base_ary,"A")==1 && elem_present_in_arr(\@final_base,"A")!=1)
			{
				push @final_base, "A";
			}
		}
		elsif ($a eq "S" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"C")==1 && elem_present_in_arr(\@final_base,"C")!=1)
			{
				push @final_base, "C";
			}
			if (elem_present_in_arr($r_valid_base_ary,"G")==1 && elem_present_in_arr(\@final_base,"G")!=1)
			{
				push @final_base, "G";
			}
		}
		elsif ($a eq "B" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"C")==1 && elem_present_in_arr(\@final_base,"C")!=1)
			{
				push @final_base, "C";
			}
			if (elem_present_in_arr($r_valid_base_ary,"T")==1 && elem_present_in_arr(\@final_base,"T")!=1)
			{
				push @final_base, "T";
			}
			if (elem_present_in_arr($r_valid_base_ary,"G")==1 && elem_present_in_arr(\@final_base,"G")!=1)
			{
				push @final_base, "G";
			}
		}
		elsif ($a eq "D" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"A")==1 && elem_present_in_arr(\@final_base,"A")!=1)
			{
				push @final_base, "A";
			}
			if (elem_present_in_arr($r_valid_base_ary,"T")==1 && elem_present_in_arr(\@final_base,"T")!=1)
			{
				push @final_base, "T";
			}
			if (elem_present_in_arr($r_valid_base_ary,"G")==1 && elem_present_in_arr(\@final_base,"G")!=1)
			{
				push @final_base, "G";
			}
		}
		elsif ($a eq "H" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"A")==1 && elem_present_in_arr(\@final_base,"A")!=1)
			{
				push @final_base, "A";
			}
			if (elem_present_in_arr($r_valid_base_ary,"T")==1 && elem_present_in_arr(\@final_base,"T")!=1)
			{
				push @final_base, "T";
			}
			if (elem_present_in_arr($r_valid_base_ary,"C")==1 && elem_present_in_arr(\@final_base,"C")!=1)
			{
				push @final_base, "C";
			}
		}
		elsif ($a eq "V" )
		{
			if (elem_present_in_arr($r_valid_base_ary,"A")==1 && elem_present_in_arr(\@final_base,"A")!=1)
			{
				push @final_base, "A";
			}
			if (elem_present_in_arr($r_valid_base_ary,"G")==1 && elem_present_in_arr(\@final_base,"G")!=1)
			{
				push @final_base, "G";
			}
			if (elem_present_in_arr($r_valid_base_ary,"C")==1 && elem_present_in_arr(\@final_base,"C")!=1)
			{
				push @final_base, "C";
			}
		}

    }

    # assign iupac to final base array
    my $iupac_final;
    my $item_num = scalar(@final_base);
    if ($item_num ==1) 
    {
		$iupac_final = $final_base[0];
		return $iupac_final;
    }

    if ($item_num ==2 && elem_present_in_arr(\@final_base,"A")==1 && elem_present_in_arr(\@final_base,"G")==1 )
    {
		$iupac_final = "R";
    }
    elsif ($item_num ==2 && elem_present_in_arr(\@final_base,"A")==1 && elem_present_in_arr(\@final_base,"C")==1)
    {
		$iupac_final = "M";
    }
    elsif ($item_num ==2 && elem_present_in_arr(\@final_base,"C")==1 && elem_present_in_arr(\@final_base,"G")==1)
    {
		$iupac_final = "S";
    }
    elsif ($item_num ==2 && elem_present_in_arr(\@final_base,"C")==1 && elem_present_in_arr(\@final_base,"T")==1 )
    {
		$iupac_final = "Y";
    }
    elsif ($item_num ==2 && elem_present_in_arr(\@final_base,"T")==1 && elem_present_in_arr(\@final_base,"G")==1)
    {
		$iupac_final = "K";
    }
    elsif ($item_num ==2 && elem_present_in_arr(\@final_base,"T")==1 && elem_present_in_arr(\@final_base,"A")==1)
    {
		$iupac_final = "W";
    }
    elsif ($item_num ==3 && elem_present_in_arr(\@final_base,"A")==1 && elem_present_in_arr(\@final_base,"C")==1 && elem_present_in_arr(\@final_base,"G")==1)
    {
		$iupac_final = "V";
    }
    elsif ($item_num ==3 && elem_present_in_arr(\@final_base,"C")==1 && elem_present_in_arr(\@final_base,"T")==1 && elem_present_in_arr(\@final_base,"G")==1)
    {
		$iupac_final = "B";
    }
    elsif ($item_num ==3 && elem_present_in_arr(\@final_base,"A")==1 && elem_present_in_arr(\@final_base,"T")==1 && elem_present_in_arr(\@final_base,"G")==1)
    {
		$iupac_final = "D";
    }
    elsif ($item_num ==3 && elem_present_in_arr(\@final_base,"A")==1 && elem_present_in_arr(\@final_base,"T")==1 && elem_present_in_arr(\@final_base,"C")==1)
    {
		$iupac_final = "H";
    }
    elsif ($item_num ==4 && elem_present_in_arr(\@final_base,"A")==1 && elem_present_in_arr(\@final_base,"T")==1 && elem_present_in_arr(\@final_base,"C")==1 && elem_present_in_arr(\@final_base,"G")==1)
    {
		$iupac_final = "N";
		print "Warning: use N to represent unknown IUPAC in primer $primer_name\n";
    }

    return $iupac_final;
	
}
# =================================================
# To write all primers to fasta file
# Input:
#     Array reference of degenerate primer hash, output file
# Output:
#     Write to output file
# =================================================
sub write_degenerate_primer_info {
    my $this = shift;
    my ($r_de_pri_ary, $outfile) = @_;

    my $content = "Name\tPrimer_Seq\tDuplicate\tGC%\tTM\tMono\tComplement\tDege_Num\tDege_Thres\t";

    for (my $i=0; $i<$this->{DE_BASE_NUM}; $i++)
    {	
        $content .= "de_Freq_".($i+1)."\t";
    }
    $content .= "Valid\tInvalid_Reason\n";
    
    foreach my $r_a_de_pri_ary (@{$r_de_pri_ary})
    {
		foreach my $r_p_h (@{$r_a_de_pri_ary})
		{
			$content .= $r_p_h->{"primer_name"}."\t";
			$content .= $r_p_h->{"sequence"}."\t";
			$content .= $r_p_h->{"duplicate"}."\t";
			$content .= $r_p_h->{"GC_percent"}."\t";
			$content .= $r_p_h->{"TM"}."\t";
			$content .= $r_p_h->{"monomer"}."\t";
			$content .= $r_p_h->{"complement"}."\t";
			$content .= $r_p_h->{"degenerate"}."\t";
			$content .= $r_p_h->{"threshold_check"}."\t";

			for (my $i=0; $i<$this->{DE_BASE_NUM}; $i++)
			{
				if (defined($r_p_h->{"frequence"}->[$i]) )
				{
					$content .= $r_p_h->{"frequence"}->[$i]."/".$this->{ALIGN_NUM}."\t";
				}
				else
				{
					$content .= "--\t";
				}

			}
			$content .= $r_p_h->{"valid"}."\t";
			$content .= $r_p_h->{"FailReason"}."\n";
		}

    }

    &string_to_file($content, $outfile);
}


# =================================================
# Convert fasta sequences to sequence strings
# Input:
#   fasta file
# Output:
#   sequence string
# =================================================
sub get_seq_ary {
    my ($inFasta) = @_;

    my @ary = ();
    my $seq="";
    open (IN_F, $inFasta) or die "Can not open $inFasta\n";
    while(<IN_F>)
    {
		next if /^\n/;
		if (/^>/)  #New entry
		{
			if ($seq ne "")
			{
				$seq =~ s/\s+$//;
				push @ary, $seq;
			}
			$seq="";
			next;
		}
	
		chomp;
		$seq .= $_;
	}
	
	$seq =~ s/\s+$//;
	push @ary, $seq; # push last seq entry
	close (IN_F);
	
}


# =================================================
# Check whether self_complement scroe is above threshold.
# Store invalid primer to invalid primer array
# Put degenerate primer to degenerate primer array
# Input:
#   primer sequence, formated primer sequence, primer_name,
#   gene_name, array reference of degenerate frequency, degenerate_flag,
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# Output:
#   self_complement_flag (1 true, 0 false)
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# =================================================
sub _check_self_complement {
    my $this = shift;
    my ($a_seq, $subseq,$pri_name, $desc, $r_de_freq, $de_flag, $r_de_p_h_ary,$r_bad_p_h_ary)= @_;

    my $valid = "y";
    my $fail_reason ="--";

    my $self_compl_check = 1;

    my $seq_from_3_end = reverse $subseq;

    my $self_compl_num = $this->{SELF_COMPL};
    my $len = length($a_seq);

    my @seq_from_5 = split('', $a_seq);
    my @seq_from_3 = split('', $seq_from_3_end);
    my $max_compl_count=0;

    for (my $step=0; $step<=$len-$self_compl_num; $step++)
    {
		my $compl_count=0;
		my $seq5="";
		my $seq3="";
		for (my $i=1; $i<=$self_compl_num+$step; $i++)
		{
			my $pos_from_5end = $len-$self_compl_num+$i-$step;

			my $tmp_base = $seq_from_5[$pos_from_5end-1];
			$tmp_base =~ tr/[A|T|C|G]/[T|A|G|C]/;
			if ($seq_from_3[$i-1] eq $tmp_base)
			{
				$compl_count++;
			}
		}

		if ($compl_count>$max_compl_count)
		{
			$max_compl_count = $compl_count;
		}

	}

    # self_complementary bases greater than self_compl threshold
    if ($max_compl_count>=$self_compl_num)
    {
		$fail_reason = "self complementary bases more than $this->{SELF_COMPL}";
		# insert bad primer hash to bad_primer_hash array
		my $r_bad_r_h = &_create_bad_primer_hash($desc,$pri_name,$subseq,$r_de_freq,$fail_reason);
		push (@{$r_bad_p_h_ary}, $r_bad_r_h); 

		$valid = "n";
		# insert invalid degenerate primer to array
		if ($de_flag ==1)
		{
			my $r_de_pri_h =  &_create_de_primer_hash($desc, $pri_name,$subseq,"n", "--", "--", "--","y", "--", "--",$r_de_freq, $valid,$fail_reason);
			push (@{$r_de_p_h_ary}, $r_de_pri_h);
		}
		$self_compl_check = 0;
    }
	
    return $self_compl_check, $r_de_p_h_ary, $r_bad_p_h_ary;

  }

# =================================================
# Check whether hairpin exist in primer.
# Number of matched bases =2
# 4 or 5 free(unmatched) bases in the loop allowed when hairpin exists.
# Store invalid primer to invalid primer array
# Put degenerate primer to degenerate primer array
# Input:
#   primer sequence, formated primer sequence, primer_name,
#   gene_name, array reference of degenerate frequency, degenerate_flag,
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# Output:
#   hairpin_flag (1 true, 0 false)
#   array refernece of degenerate primer hash
#   array reference of invalid primer hash
# =================================================
sub _check_hairpin {
    my $this = shift;
    my ($a_seq, $subseq,$pri_name, $desc, $r_de_freq, $de_flag, $r_de_p_h_ary,$r_bad_p_h_ary)= @_;

    my $seq_len = length($a_seq);
    my @seq_ary = split('', $a_seq);
    my $hairpin_flag = 0;    # if hairpin exists, hairpin_flag=1

    my $hairpin_check =1;
    my $valid = "y";
    my $fail_reason = "--";

    foreach my $num_in_loop (@HAIRPIN_LOOP_BASE_NUM)
    {
		my $hairpin = $HAIRPIN_STEM_MIN + $num_in_loop; # position distance between the second matched base pair away the hairpin loop, we only search the hairpin with one loop and $HAIRPIN_STEM_MIN matched base pairs

		for (my $pos_1=1; $pos_1<=$seq_len-$hairpin-1; $pos_1++)
		{
			my $pos_2 = $pos_1+1;
			my $match_pos_1 = $pos_1+$hairpin+1;  # match base pos with base in pos_1
			my $match_pos_2 = $match_pos_1-1;    # only consider 2 base pairs match in stem

			my $tmp_1 = $seq_ary[$match_pos_1-1];
			$tmp_1 =~ tr/[A|T|C|G]/[T|A|G|C]/;
			my $tmp_2 = $seq_ary[$match_pos_2-1];
			$tmp_2 =~ tr/[A|T|C|G]/[T|A|G|C]/;
			if ($seq_ary[$pos_1-1] eq $tmp_1 && $seq_ary[$pos_2-1] eq $tmp_2)
			{
				$hairpin_flag =1;
				last;
			}
		}

		if ($hairpin_flag ==1)
		{
			last;
		}
    }

    if ($hairpin_flag ==1)
    {
		$fail_reason = "hairpin exists";
		# insert bad primer hash to bad_primer_hash array
		my $r_bad_r_h = &_create_bad_primer_hash($desc,$pri_name,$subseq,$r_de_freq,$fail_reason);
		push (@{$r_bad_p_h_ary}, $r_bad_r_h); 

		$valid = "n";
		# insert invalid degenerate primer to array
		if ($de_flag ==1)
		{
			my $tmp = ">".$this->{MONOMERS};
			my $r_de_pri_h =  &_create_de_primer_hash($desc, $pri_name,$subseq,"n", "--", "--", "--", "y","--", "--",$r_de_freq, $valid,$fail_reason);
			push (@{$r_de_p_h_ary}, $r_de_pri_h);

		}
		$hairpin_check = 0;
    }
	
    return $hairpin_check, $r_de_p_h_ary, $r_bad_p_h_ary;
}


# =================================================
sub amplicon_selection {
    my $this = shift;

    my ($r_genome_seq_h_F, $r_pri_seq_h,$F_pri_file, $R_pri_file,$conserv_file,$outfile,$outfile_fasta) = @_;

    my $MAX_AMPL_LEN = $this->{MAX_AMPL_LEN};
    my $MIN_AMPL_LEN = $this->{MIN_AMPL_LEN};

    print "Selecting amplicons ...\n\n";

    my $r_F_p_ary = &file_to_array_of_hash($F_pri_file);
    my $r_R_p_ary = &file_to_array_of_hash($R_pri_file);   # reverse primer array


    my $lines = &get_lines_from_file($conserv_file);
    my %pri_name_conserv =(); # hash. key: primer name; value: lowest conservation score of de base

    foreach my $aLine ( @{$lines} )
    {
		my ($pri_name, $pri_seq, $pri_conserv) = split /\t/, $aLine;
		$pri_name_conserv{$pri_name} = $pri_conserv;
    }


    my $r_ampl_h; # amplicon hash. Key: Forward_pri name, Value: r_Reverse_good_pri_hash_ary

    foreach my $r_F_p_h (@{$r_F_p_ary})
    {
		my $f_start = $r_F_p_h->{"start_position"};
		my $r_candidates_F_hash;   # key: F primer name, Value: array of good R primer candidate
		my $r_R_good_pri_ary = &create_new_ary(); # reverse primers paired with this forward primer with good amplicon length

		my $ampl_len_flag =0;  # 1- R primer fitting amplicon len found. 0-not found 
		foreach my $r_R_p_h (@{$r_R_p_ary})
		{
			my $r_start = $r_R_p_h->{"start_position"};

			# --- find R primer within min-max amplicon length ------
			if (($r_start >= $f_start+$MIN_AMPL_LEN-$this->{PRIMER_SIZE}) && ($r_start <= $f_start+$MAX_AMPL_LEN-$this->{PRIMER_SIZE}))
			{
				$ampl_len_flag = 1;

				# ----- check complement between F primer and R primer
				my $check_compl = &_check_F_R_primer_complement($r_F_p_h->{"sequence"},$r_R_p_h->{"sequence"},$this->{PAIR_COMPL_THRESH});
				if ( $check_compl ==1 )
				{
					push (@{$r_R_good_pri_ary}, $r_R_p_h);
				}
			}

		}

		if ($ampl_len_flag ==0)
		{
			print "\t".$r_F_p_h->{"primer_name"}." no R primer fits amplicon length\n";
			next;
		}
		if (scalar(@{$r_R_good_pri_ary})<=0)
		{
			print "\t".$r_F_p_h->{"primer_name"}." F and R primer complement\n";
			next;
		}
		$r_ampl_h->{$r_F_p_h->{"primer_name"}} = $r_R_good_pri_ary;
    }

    # -------- check overlap between amplicons ---------
    my $r_final_ampl_hash = &check_ampl_overlap($r_ampl_h,$r_R_p_ary,$this->{OVERLAP_THRESH},$this->{PRIMER_SIZE});

    my $content="count\tampl_name\tavg_conservation\tpri_name_F\tsequence\tconservation\tGC\tTM\tgap\tpri_name_R\tsequence\tconservation\tGC\tTM\tgap\n";
    my $content_fasta = "";
    my $count =0;

    foreach my $p (sort {&_get_pos_from_name($a) <=> &_get_pos_from_name($b)} keys %{$r_final_ampl_hash})
    {
		$count++;

		foreach my $r_F_pri_h (@{$r_F_p_ary})
		{
			# find $p's sequence and print
			if ($r_F_pri_h->{"primer_name"} eq $p )
			{
				my $conserv_avg = sprintf("%.2f",($pri_name_conserv{$p}+$pri_name_conserv{$r_final_ampl_hash->{$p}->{"primer_name"}})/2);

				$content .= $count."\tHIV_AMPL_".$count."\t".$conserv_avg."\t".$p."\t".$r_F_pri_h->{"sequence"}."\t".$pri_name_conserv{$p}."\t".$r_F_pri_h->{"gc"}."\t".$r_F_pri_h->{"tm"}."\t".$r_F_pri_h->{"gap_flag"}."\t".$r_final_ampl_hash->{$p}->{"primer_name"}."\t".$r_final_ampl_hash->{$p}->{"sequence"}."\t".$pri_name_conserv{$r_final_ampl_hash->{$p}->{"primer_name"}}."\t".$r_final_ampl_hash->{$p}->{"gc"}."\t".$r_final_ampl_hash->{$p}->{"tm"}."\t".$r_final_ampl_hash->{$p}->{"gap_flag"}."\n";

				$content_fasta .= ">HIV_AMPL_".$count."_F\n".$r_F_pri_h->{"sequence"}."\n";
				$content_fasta .= ">HIV_AMPL_".$count."_R\n".$r_final_ampl_hash->{$p}->{"sequence"}."\n";
			}
		}
    }

    $content .= "\n\nStart and End position of good alignment region:\n";
    $content .= "algin_start = ".$this->{ALIGN_START}."\n";
    $content .= "algin_end = ".$this->{ALIGN_END}."\n";

    open FILE, ">$outfile" or print "\nCannot open $outfile for writing<br>\n\n";
    print FILE $content;
    close FILE;
    open FILE, ">$outfile_fasta" or print "\nCannot open $outfile_fasta for writing<br>\n\n";
    print FILE $content_fasta;
    close FILE;

}


# =================================================
# -- convert F and R primer blast against genome output file to array of hash
sub file_to_array_of_hash {
    my ($file) = @_;
    my $r_pri_ary;  # array of primer hash

    my $lines = &get_lines_from_file($file);
    shift(@{$lines});

    if (scalar(@{$lines})==0)
    {
		print "$file is empty! Exit now...\n\n";
		exit;
    }

    foreach my $aLine ( @{$lines} )
    {
		my ($pri_name, $start_pos, $pri_seq, $gc, $tm, $gap_flag, $ampl_used_flag) = split /\t/, $aLine;

		my $r_pri_h;  # primer hash
		$r_pri_h->{"primer_name"}=$pri_name;
		$r_pri_h->{"start_position"} = $start_pos;
		$r_pri_h->{"sequence"} = $pri_seq;
		$r_pri_h->{"gc"} = $gc;
		$r_pri_h->{"tm"} = $tm;
		$r_pri_h->{"gap_flag"} = $gap_flag;

		push @{$r_pri_ary}, $r_pri_h;
    }

    return $r_pri_ary;
}


# =================================================
sub check_ampl_overlap {
    my ($r_ampl_hash, $r_R_pri_ary, $OVERLAP_THRESH, $PRIMER_SIZE) =@_;

    # $OVERLAP_THRESH -- two amplicons overlap

    my $r_final_ampl_h;  # final selected amplicon hash, key- forward primer name, value-reverse primer hash
    my $r_R_pri_flag_h;  # all reverse primer used flag hash. Key- reverse primer name, value- used flag, 0-not used; 1-used;

    foreach my $r_R_p_h (@{$r_R_pri_ary})
    {
		$r_R_pri_flag_h->{$r_R_p_h->{"primer_name"}} =0;
    }

    my @sorted_F_pri_name =();

    # ---- sort the hash key (F primer name) ---
    foreach my $p ( sort {&_get_pos_from_name($a) <=> &_get_pos_from_name($b)} keys %{$r_ampl_hash})
    {
		push @sorted_F_pri_name, $p;
    }

    for (my $i=0; $i<scalar(@sorted_F_pri_name)-1; $i++)
    {
        my $next_F_pri = $sorted_F_pri_name[$i+1];
		my $next_F_pri_start = &_get_pos_from_name($next_F_pri);


		# -- get sorted Reverse primer array of hash for current Forward primer --
		my @sorted_cur_R_pri_ary =();
		foreach my $r_R_pri_h (sort { $a->{"start_position"}<=> $b->{"start_position"} }  @{$r_ampl_hash->{$sorted_F_pri_name[$i]}})
		{
			push @sorted_cur_R_pri_ary, $r_R_pri_h;
		}

		if (scalar(@sorted_cur_R_pri_ary)<=0)
		{
			print $sorted_F_pri_name[$i]." has no R primer set\n";
			next;
		}
		my $last_R_pri_start = $sorted_cur_R_pri_ary[scalar(@sorted_cur_R_pri_ary)-1]->{"start_position"};

		# -- report Gap between consecutive amplicons ----
        if ($last_R_pri_start+$PRIMER_SIZE < $next_F_pri_start)  # no overlap  
		{
			print "\tGap exists between " .$sorted_F_pri_name[$i]."\t".$next_F_pri."\n";
		}
		elsif ($last_R_pri_start+$PRIMER_SIZE - $next_F_pri_start < $OVERLAP_THRESH )
		{
			print "\tOverlap less $OVERLAP_THRESH between " .$sorted_F_pri_name[$i]."\t".$next_F_pri."\n";
		}
		else
		{
			for (my $j=0; $j<scalar(@sorted_cur_R_pri_ary); $j++)
			{
				if ($sorted_cur_R_pri_ary[$j]->{"start_position"}+$PRIMER_SIZE-$next_F_pri_start>=$OVERLAP_THRESH )
				{
					# if this R primer not yet used
					if ($r_R_pri_flag_h->{$sorted_cur_R_pri_ary[$j]->{"primer_name"}} ==0)
					{				
						# r_final_ampl_h's value is hash
						$r_final_ampl_h->{$sorted_F_pri_name[$i]} = $sorted_cur_R_pri_ary[$j];
						# mark the R primer
						$r_R_pri_flag_h->{$sorted_cur_R_pri_ary[$j]->{"primer_name"}} =1;

						last;
					}

				}
			}
		}
    }

    return $r_final_ampl_h;
}


# =================================================
# Get start position from primer name. 
# Primer name format: HIV_900_F
sub _get_pos_from_name {
    my ($pri_name)=@_;
    # pri_name format: HIV_1200_F
    my @tmp = split/\_/,$pri_name;

    return $tmp[1];
}



# =================================================
sub primer_blast_genome_to_file {
    my $this = shift;

    my ($r_genome_seq_h_F, $r_pri_seq_h,$r_all_gene_pri_ary, $F_pri_outfile, $R_pri_outfile) = @_;

    my $exe_dir = "./";

    my $r_all_F_pri_ary; # all forward primers pass hit threshold
    my $r_all_R_pri_ary; # all reverse primers pass hit threshold

    $r_pri_seq_h = _check_degenerate_combination($r_pri_seq_h,$this->{COMBIN_THRESH});

    my $align_num = keys( %{$r_genome_seq_h_F});

    print "\nPrimer blast against genome...\n";

    my $content_F ="Pri_Name\tStart_Pos\tPri_seq\tGC\tTM\tGap_flag\tampl_used_flag\n";
    my $content_R ="Pri_Name\tStart_Pos\tPri_seq\tGC\tTM\tGap_flag\tampl_used_flag\n";


    foreach my $p_desc (keys %{$r_pri_seq_h})
    {
		my $a_pri_seq = $r_pri_seq_h->{$p_desc};

		my $pri_gap_flag="";
		foreach my $r_a_gene_pri_ary (@{$r_all_gene_pri_ary})
		{
			foreach my $r_p_h (@{$r_a_gene_pri_ary})
			{
				if ($r_p_h->{"primer_name"} eq $p_desc)
				{
					$pri_gap_flag= $r_p_h->{"gap"};
					last;
				}
			}
		}

	    my $blast_hit_count =0;

	    my @tmp = split /\_/,$p_desc;
	    my $start_pos = $tmp[1];         # primer start position is in primer name

	    foreach my $a_desc (keys %{$r_genome_seq_h_F})
	    {
			my $a_genome_seq = $r_genome_seq_h_F->{$a_desc};


			my $genomeFasta = $exe_dir."genome.fasta";
			my $tmp = ">$a_desc\n$a_genome_seq";
			&string_to_file($tmp, $genomeFasta);

			# create a single primer fasta
			my $primerFasta = $exe_dir.$p_desc.".fasta";
			$tmp = ">$p_desc\n$a_pri_seq";
			&string_to_file($tmp, $primerFasta);

			# formatdb before blast
			my $cmd = "/prodinfo/prod3pty/blast/blast-2.2.14/bin/formatdb -i $genomeFasta -p F";
			&doSystem($cmd);

			my $blastout = $exe_dir.$p_desc.".blast";

			# blast primer seq against genome seq
			$cmd = "/prodinfo/prod3pty/blast/blast-2.2.14/bin/blastall -p blastn -F F -i $primerFasta -d $genomeFasta -o $blastout";
			&doSystem($cmd);

			if (-s $blastout)
			{
				# find the longest hit in blast outfile
				my $parseBlast = new bit::ParseBlast();
				my $match_start =-1;
				my $match_end =-1;
				my $db_start =-1;
				my $db_end =-1;
				($match_start, $match_end, $db_start, $db_end)=$parseBlast->get_first_hit_match_position($blastout);
				if ($match_start != -1)
				{
					$blast_hit_count ++;
				}
			}
			else
			{
				print "$blastout does not exist!<br>\n";
			}
			`rm -f $genomeFasta $primerFasta $blastout`;
	    }

	    if ($blast_hit_count/$align_num >= $this->{HIT_THRESHOLD})
	    {
			if ($p_desc =~/\_F$/)
			{
				my $gc_avg = &_get_gc_avg_only($a_pri_seq);

				my $tm_avg = &_get_tm_avg_only($a_pri_seq,$this->{DNA_CONCENTRATION}, $this->{SALT_CONCENTRATION});
				# assign ampl_used_flag=1
				$content_F .=$p_desc."\t".$start_pos."\t".$a_pri_seq."\t".$gc_avg."\t".$tm_avg."\t".$pri_gap_flag."\n";
			}
			elsif ($p_desc =~/\_R$/)
			{
				my $gc_avg = &_get_gc_avg_only($a_pri_seq);

				my $tm_avg = &_get_tm_avg_only($a_pri_seq, $this->{DNA_CONCENTRATION}, $this->{SALT_CONCENTRATION});
				# assign ampl_used_flag=0
				$content_R .=$p_desc."\t".$start_pos."\t".$a_pri_seq."\t".$gc_avg."\t".$tm_avg."\t".$pri_gap_flag."\n";
			}
	    }
	
	}

    &string_to_file($content_F, $F_pri_outfile);
    &string_to_file($content_R, $R_pri_outfile);
}


# =================================================
sub load_genome {
    my $this = shift;
    my ($genome_fasta_file) = @_;

    my ($desc, $seq);
    my $seq_hash;   # for forward sequence

    open FASTA, "<$genome_fasta_file" ||die "Cannot open $genome_fasta_file\n";

	while (<FASTA>) {
      	if (/^\>.+/) { # the description
	      	s/\s*$//; # remove all trailing whitespace
	      	s/^>//; # remove leading >
			if (defined $desc) {

				if ($seq !~ m/^[0-9\s\t]+$/){
				  $seq =~ s/\s+//g; # get rid of all white space
				}		
				$seq_hash->{$desc} = $seq;
			}
			$desc = $_;
			$seq = "";
	    }
	    else {
			chomp;
			$seq .= $_;
	    }
	}
	close FASTA;

	if (defined $desc) {
	    if ($seq !~ m/^[0-9\s\t]+$/){
			$seq =~ s/\s+//g; # get rid of all white space
	    }
	    $seq_hash->{$desc} = $seq;
	}
	
    return $seq_hash;
}

# =================================================
sub _check_degenerate_combination {
	my ($r_p_seq_h,$COMBIN_THRESH) = @_;

    print "before combination check, hash num=".keys(%{$r_p_seq_h})."\n";

    foreach my $p_desc (keys %{$r_p_seq_h})
    {
		my $pri_seq = $r_p_seq_h->{$p_desc};
		my $com_num = 1;   # number of degenerate combinations in primer

		for (my $i=0; $i<length($pri_seq); $i++)
		{
			my $base = substr($pri_seq, $i,1);

			# check if base is degenerate
			if ($base eq "R" ||$base eq "Y" ||$base eq "M"|| $base eq "K" ||$base eq "W" ||$base eq "S")
			{
				$com_num = $com_num*2;
			}
			elsif ($base eq "B" ||$base eq "D" ||$base eq "H"|| $base eq "V" )
			{
				$com_num = $com_num*3;
			}
			elsif ($base eq "N" )
			{
				$com_num = $com_num*4;
			}
		}

		if ($com_num > $COMBIN_THRESH)
		{
			#remove primer with combination number more than threshold from valid primer array
			delete $r_p_seq_h->{$p_desc};

		}

    }
    print "after combination check, hash num=".keys(%{$r_p_seq_h})."\n";

    return $r_p_seq_h;

}


# =================================================
sub _check_F_R_primer_complement {
    my ($pri_seq_f, $pri_seq_r, $PAIR_COMPL_THRESH)=@_;
   
    my $compl_check =1;  # 0- fail; 1- pass;

    my $compl_seq_r = $pri_seq_r;   # R-primer complement

    $compl_seq_r  =~ tr/[A|T|C|G]/[T|A|G|C]/;

    my $rev_compl_seq_r = get_rev_complement($pri_seq_r); # the reverse complement of R_primer
    my $compl_count=0;

    for (my $i=0; $i<length($pri_seq_r); $i++)
    {
		# ignore degenerate, as one of degenerate bases surely won't match
		if (substr($pri_seq_f,$i,1) eq substr($compl_seq_r,$i,1) || substr($pri_seq_f,$i,1) eq substr($rev_compl_seq_r,$i,1) )
		{
			$compl_count++;
		}
	}

    if ($compl_count > $PAIR_COMPL_THRESH)
	{
		$compl_check =0;   # fail the complement check
	}

    return $compl_check;
}

# =================================================
# Input sequence can be in IUPAC code
sub _get_gc_avg_only {
    my($a_seq)=@_;

    $a_seq =~ s/U/T/ig;  # IUPAC code "U" represents "T"
    my $gc_count_min=0;
    my $gc_count_max=0;
    my $valid = "y";
    my $fail_reason ="--";

    for (my $i=0; $i<length($a_seq); $i++)
    {
		my $base = substr($a_seq, $i, 1);
		$base =~ tr/[a-z]/[A-Z]/;

		if ($base eq "G" || $base eq "C")
		{
			$gc_count_min++;
			$gc_count_max++;
		}
		  # -- check IUPAC base code
		elsif ($base eq "R")    # R represents A or G
		{
			$gc_count_max++; #if R represents G
		}
		elsif ($base eq "Y")    # Y represents C or T
		{
			$gc_count_max++; #if Y represents C
		}
		elsif ($base eq "S")    # S represents G or C
		{
			$gc_count_min++;
			$gc_count_max++;
		}
		elsif ($base eq "W")    # W represents A or T
		{
		}
		elsif ($base eq "K")    # K represents G or T
		{
			$gc_count_max++;  #if K represents G
		}
		elsif ($base eq "M")    # M represents A or C
		{
			$gc_count_max++;  #if M represents C
		}
		elsif ($base eq "B")    # B represents C or G or T
		{
			$gc_count_max++;  #if B represents C or G
		}
		elsif ($base eq "D")    # D represents A or G or T
		{
			$gc_count_max++;  #if B represents G
		}
		elsif ($base eq "H")    # H represents A or C or T
		{
			$gc_count_max++;  #if H represents C
		}
		elsif ($base eq "V")    # V represents A or C or G
		{
			$gc_count_max++;  #if V represents C or G
		}
		elsif ($base eq "N")    # N represents any base
		{
			$gc_count_max++;  #if N represents C or G
		}

    }

    my $gc_percent_min = 100*($gc_count_min/length($a_seq));
    my $gc_percent_max = 100*($gc_count_max/length($a_seq));
    my $gc_percent_avg = sprintf("%.2f", ($gc_percent_min+$gc_percent_max)/2 );

    return $gc_percent_avg;
}


# =================================================
# Recursive function
# get all degenerate sequence combinations
#Input:  a_seq - contains degenerate bases
#        r_all - array reference for all sequence combinations
#Output: r_all
sub _get_all_de_combination {
    my($a_seq, $r_all)=@_;
	
    my $seq="";
    for (my $i=0; $i<length($a_seq); $i++)
    {

		my $base = substr($a_seq, $i, 1);
		$base =~ tr/[a-z]/[A-Z]/;

		if ($base eq "G" || $base eq "C"||$base eq "A" || $base eq "T")
		{

			if ($i==0)
			{
				$seq =$base;
			}
			else
			{
				$seq .=$base;
			}
			if (length($seq)==length($a_seq) )
			{
				if (elem_present_in_arr($r_all, $seq)==1)
				{ next;}
				push @{$r_all}, $seq;
			}

		}
		elsif ($base eq "R")
		{
			my $seq_1=substr($a_seq,0,$i)."A".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1, $r_all);

			my $seq_2=substr($a_seq,0,$i)."G".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);
		}
		elsif ($base eq "Y")
		{
			my $seq_1=substr($a_seq,0,$i)."C".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."T".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);
		}
		elsif ($base eq "M")
		{
			my $seq_1=substr($a_seq,0,$i)."C".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."A".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);
		}
		elsif ($base eq "K")
		{
			my $seq_1=substr($a_seq,0,$i)."T".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."G".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);
		}
		elsif ($base eq "W")
		{
			my $seq_1=substr($a_seq,0,$i)."T".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."A".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);
		}
		elsif ($base eq "S")
		{
			my $seq_1=substr($a_seq,0,$i)."C".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."G".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);
		}
		elsif ($base eq "B")
		{
			my $seq_1=substr($a_seq,0,$i)."C".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."T".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);

			my $seq_3=substr($a_seq,0,$i)."G".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_3,$r_all);
		}
		elsif ($base eq "D")
		{
			my $seq_1=substr($a_seq,0,$i)."A".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."T".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);

			my $seq_3=substr($a_seq,0,$i)."G".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_3,$r_all);
		}
		elsif ($base eq "H")
		{
			my $seq_1=substr($a_seq,0,$i)."A".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."T".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);

			my $seq_3=substr($a_seq,0,$i)."C".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_3,$r_all);
		}
		elsif ($base eq "V")
		{
			my $seq_1=substr($a_seq,0,$i)."A".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."C".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);

			my $seq_3=substr($a_seq,0,$i)."G".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_3,$r_all);
		}
		elsif ($base eq "N")
		{
			my $seq_1=substr($a_seq,0,$i)."A".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_1,$r_all);

			my $seq_2=substr($a_seq,0,$i)."C".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_2,$r_all);

			my $seq_3=substr($a_seq,0,$i)."G".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_3,$r_all);

			my $seq_4=substr($a_seq,0,$i)."T".substr($a_seq,$i+1,length($a_seq)-1-$i);
			$r_all=&_get_all_de_combination($seq_4,$r_all);
		}
    }
	
    return $r_all;
}

# =================================================
# Input sequence can be in IUPAC code
sub _get_tm_avg_only {

    my($a_seq, $dna_c, $salt_c)=@_;
    my $tm_avg=0;
    my $tm_all=0;

    $a_seq =~ s/U/T/ig;  # IUPAC code "U" represents "T"

    my @all=();  # array of all de combinations

    my $r_combination=&_get_all_de_combination($a_seq, \@all);

    foreach my $a (@{$r_combination})
    {
		my $tm = &_tm($a, $dna_c, $salt_c);
		$tm_all = $tm_all+$tm;
    }

    $tm_avg= sprintf("%.2f",$tm_all/scalar(@{$r_combination}) );

    return $tm_avg;
}

# =================================================
# for all F and R primers with different size, find amplicons
sub amplicon_selection_from_all_pri {
    my $this = shift;

    my ($r_genome_seq_h_F, $r_pri_seq_h,$F_pri_file, $R_pri_file,$conserv_file,$outfile,$outfile_fasta) = @_;

    my $MAX_AMPL_LEN = $this->{MAX_AMPL_LEN};
    my $MIN_AMPL_LEN = $this->{MIN_AMPL_LEN};

    print "Selecting amplicons ...\n\n";

    my $r_F_p_ary = &file_to_array_of_hash($F_pri_file);
    my $r_R_p_ary = &file_to_array_of_hash($R_pri_file);   # reverse primer array


    my $lines = &get_lines_from_file($conserv_file);
    my %pri_name_conserv =(); # hash. key: primer name; value: lowest conservation score of de base

    foreach my $aLine ( @{$lines} )
    {
		my ($pri_name, $pri_seq, $pri_conserv) = split /\t/, $aLine;
		$pri_name_conserv{$pri_name} = $pri_conserv;
    }

    my $r_ampl_h; # amplicon hash. Key: Forward_pri name, Value: r_Reverse_good_pri_hash_ary

    foreach my $r_F_p_h (@{$r_F_p_ary})
    {
		my $f_start = $r_F_p_h->{"start_position"};
		my $r_candidates_F_hash;   # key: F primer name, Value: array of good R primer candidate
		my $r_R_good_pri_ary = &create_new_ary(); # reverse primers paired with this forward primer with good amplicon length

		my $ampl_len_flag =0;  # 1- R primer fitting amplicon len found. 0-not found 
		foreach my $r_R_p_h (@{$r_R_p_ary})
		{
			my $r_start = $r_R_p_h->{"start_position"};

			# --- find R primer within min-max amplicon length ------
			if (($r_start >= $f_start+$MIN_AMPL_LEN-$this->{PRIMER_SIZE}) && ($r_start <= $f_start+$MAX_AMPL_LEN-$this->{PRIMER_SIZE}))
			{
				$ampl_len_flag = 1;

				# ----- check complement between F primer and R primer
				my $check_compl = &_check_F_R_primer_complement2($r_F_p_h->{"sequence"},$r_R_p_h->{"sequence"},$this->{PAIR_COMPL_THRESH});
				if ( $check_compl ==1 )
				{
					push (@{$r_R_good_pri_ary}, $r_R_p_h);
				}
			}

		}

		if ($ampl_len_flag ==0)
		{
			print "\t".$r_F_p_h->{"primer_name"}." no R primer fits amplicon length\n";
			next;
		}
		if (scalar(@{$r_R_good_pri_ary})<=0)
		{
			print "\t".$r_F_p_h->{"primer_name"}." F and R primer complement\n";
			next;
		}
			$r_ampl_h->{$r_F_p_h->{"primer_name"}} = $r_R_good_pri_ary;
	}

	# -------- check overlap between amplicons ---------
	my $r_final_ampl_hash = &check_ampl_overlap($r_ampl_h,$r_R_p_ary,$this->{OVERLAP_THRESH},$this->{PRIMER_SIZE});

	my $content="count\tampl_name\tavg_conservation\tpri_name_F\tsequence\tconservation\tGC\tTM\tgap\tpri_name_R\tsequence\tconservation\tGC\tTM\tgap\n";
	my $content_fasta = "";
	my $count =0;

    foreach my $p (sort {&_get_pos_from_name($a) <=> &_get_pos_from_name($b)} keys %{$r_final_ampl_hash})
    {
		$count++;

		foreach my $r_F_pri_h (@{$r_F_p_ary})
		{
			# find $p's sequence and print
			if ($r_F_pri_h->{"primer_name"} eq $p )
			{
				my $conserv_avg = sprintf("%.2f",($pri_name_conserv{$p}+$pri_name_conserv{$r_final_ampl_hash->{$p}->{"primer_name"}})/2);

				$content .= $count."\tHIV_AMPL_".$count."\t".$conserv_avg."\t".$p."\t".$r_F_pri_h->{"sequence"}."\t".$pri_name_conserv{$p}."\t".$r_F_pri_h->{"gc"}."\t".$r_F_pri_h->{"tm"}."\t".$r_F_pri_h->{"gap_flag"}."\t".$r_final_ampl_hash->{$p}->{"primer_name"}."\t".$r_final_ampl_hash->{$p}->{"sequence"}."\t".$pri_name_conserv{$r_final_ampl_hash->{$p}->{"primer_name"}}."\t".$r_final_ampl_hash->{$p}->{"gc"}."\t".$r_final_ampl_hash->{$p}->{"tm"}."\t".$r_final_ampl_hash->{$p}->{"gap_flag"}."\n";

				$content_fasta .= ">HIV_AMPL_".$count."_F\n".$r_F_pri_h->{"sequence"}."\n";
				$content_fasta .= ">HIV_AMPL_".$count."_R\n".$r_final_ampl_hash->{$p}->{"sequence"}."\n";
			}
		}
    }

    $content .= "\n\nStart and End position of good alignment region:\n";
    $content .= "algin_start = ".$this->{ALIGN_START}."\n";
    $content .= "algin_end = ".$this->{ALIGN_END}."\n";

    open FILE, ">$outfile" or print "\nCannot open $outfile for writing<br>\n\n";
    print FILE $content;
    close FILE;
    open FILE, ">$outfile_fasta" or print "\nCannot open $outfile_fasta for writing<br>\n\n";
    print FILE $content_fasta;
    close FILE;

}

# =================================================
# for all F and R primers, primers in differenct size
sub _check_F_R_primer_complement2 {
    my ($pri_seq_f, $pri_seq_r, $PAIR_COMPL_THRESH)=@_;
   
    my $compl_check =1;  # 0- fail; 1- pass;

    my $compl_seq_r = $pri_seq_r;   # R-primer complement

    $compl_seq_r  =~ tr/[A|T|C|G]/[T|A|G|C]/;

    my $rev_compl_seq_r = get_rev_complement($pri_seq_r); # the reverse complement of R_primer
    my $compl_count=0;

    my $min_length = 0;
    if (length($pri_seq_f)>length($pri_seq_r))
    {
		$min_length = length($pri_seq_r);
    }
    else
    {
		$min_length = length($pri_seq_f);
    }

   # for (my $i=0; $i<length($pri_seq_r); $i++)
    for (my $i=0; $i<$min_length; $i++)  
    {
		# ignore degenerate, as one of degenerate bases surely won't match
		if (substr($pri_seq_f,$i,1) eq substr($compl_seq_r,$i,1) || substr($pri_seq_f,$i,1) eq substr($rev_compl_seq_r,$i,1) )
		{
			$compl_count++;
		}
	}

    if ($compl_count > $PAIR_COMPL_THRESH)
	{
		$compl_check =0;   # fail the complement check
	}

    return $compl_check;
}

# =================================================
sub get_first_hit_match_position($) {
	my $self   = shift;
	my ($file) = @_;

	my $match_start=-1;
	my $match_end =-1;
	my $db_start =-1;  # match start pos on sequence database
	my $db_end =-1;    # match end pos on sequence database

	open(FILE, "<$file") or die "Can't open $file for reading: $!";
	my @LINES = <FILE>;
	close(FILE);
	my $match_start_flag=0;   # not yet find match start in blast file
	my $db_start_flag=0;   # not yet find database sequence start in blast file

	foreach(@LINES)
	{
		if (/Score\s=/ && $match_start_flag==1 && $db_start_flag==1)  # second hit starts
		{
			last;
		}
		elsif(/Query:\s+(\d+)\s+(.*)\s+(\d+)/ && $match_start_flag==0 )
		{
			$match_start = $1;
			$match_start_flag =1; 
		}
		elsif (/Sbjct:\s+(\d+)\s+(.*)\s+(\d+)/ && $db_start_flag==0 )
		{
			$db_start = $1;
			$db_start_flag =1;
		}
		elsif (/Query:\s+(\d+)\s+(.*)\s+(\d+)/ && $match_start_flag==1 ) 
		{
			$match_end = $3;
		}
		elsif (/Sbjct:\s+(\d+)\s+(.*)\s+(\d+)/ && $db_start_flag==1 ) 
		{
			$db_end = $3;
		}

	}

	return $match_start, $match_end, $db_start, $db_end;
}

################################################
sub elem_present_in_arr   # Input - array_ref, element to be probed. 
{                         # Output 0/1.
	my ($r_arr, $elem) = @_;
	my $present = 0;
	foreach (@$r_arr)
    {
		if ($_ eq $elem)
		{
			$present = 1;
			last;
		}
    }
	return $present;                  
}

##############################################
sub string_to_file
{
    my ($str, $f) = @_;
    open(OUT_F, ">$f") or die "Can't open $f for writing\n";
    print OUT_F $str;
    close OUT_F; 
}

###############################################
sub get_rev_complement 
{
	my $rev_seq = reverse $_[0];
	$rev_seq =~ tr/ATGCatgc/TACGtacg/;
	return  $rev_seq;
}

#################################################
sub remove_elem_from_arr  # Input - array_ref, element to be removed.
{                         # Output - array_ref to new array.
	my ($r_arr, $elem) = @_;
	my @res_arr = ();
  
	foreach (@{$r_arr})
    {
		next if ($_ eq $elem);
		push @res_arr, $_;
    }      
	return \@res_arr; 
}

################
sub doSystem 
{
    my ($string) = @_;
    
    my $rc = system($string);
    if ($rc) 
    {
		return "system call '$string' terminated with exit status ($rc)\n";
    }
    return undef;
}

##############################################
sub get_lines_from_file
{
   my ($in_file) = @_;
   my @lines = ();
   open(IN_F, "$in_file") or die "Can not open $in_file\n";
   while(<IN_F>)
   {
    chomp;
    push @lines, $_; 
   }
   close(IN_F);
  
   return \@lines; 
}
 

1;

