#!/usr/bin/perl -w

#####################################################################################################
#                                                                                                   #
#                                   PriSM License                                                   #
#                                                                                                   #
# PriSM is distributed under the following BSD-style license:                                       #
# Copyright  2011-2014 Broad Institute, Inc.  All rights reserved.                                 #
#                                                                                                   #
# Redistribution and use in source and binary forms, with or without modification, are              #
# permitted provided that the following conditions are met:                                         #
#                                                                                                   #
#  1. Redistributions of source code must retain the above copyright notice, this list              #
#     of conditions and the following disclaimer.                                                   #
#                                                                                                   #
#  2. Redistributions in binary form must reproduce the above copyright notice, this list           #
#     of conditions and the following disclaimer in the documentation and/or other materials        #
#     provided with the distribution.                                                               #
#                                                                                                   #
#  3. Neither the name of the Broad Institute, Inc. nor the names of its contributors may be        #
#     used to endorse or promote products derived from this software without specific prior         #
#     written permission.                                                                           #
#                                                                                                   #
# THIS SOFTWARE IS PROVIDED AS IS.  BROAD MAKES NO EXPRESS OR IMPLIED REPRESENTATIONS OR          #
# WARRANTIES OF ANY KIND REGARDING THE SOFTWARE AND COPYRIGHT, INCLUDING, BUT NOT LIMITED TO,       #
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, CONFORMITY WITH ANY              #
# DOCUMENTATION, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT         #
# DISCOVERABLE. IN NO EVENT SHALL BROAD, THE COPYRIGHT HOLDERS, OR CONTRIBUTORS BE LIABLE FOR       #
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,        #
# BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;    #
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,     #
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE       #
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF, HAVE REASON TO KNOW, OR IN FACT SHALL KNOW OF THE       #
# POSSIBILITY OF SUCH DAMAGE.                                                                       #
#                                                                                                   #
# If, by operation of law or otherwise, any of the aforementioned warranty disclaimers are          #
# determined inapplicable, your sole remedy, regardless of the form of action, including, but       #
# not limited to, negligence and strict liability, shall be replacement of the Software with        #
# an updated version if one exists.                                                                 #
#                                                                                                   #
# Development of PriSM has been funded in whole or in part with federal funds from the National     #
# Institute of Allergy and Infectious Diseases, National Institutes of Health, Department of        #
# Health and Human Services, under Contract No. HHSN266200400001C.                                  #
#                                                                                                   #
# In addition, PriSM is distributed, in part, under and subject to the provisions of licenses for:  #
# Perl, version 5.20.0,  1997-2010 Tom Christiansen, Nathan Torkington (all rights reserved).      #
#####################################################################################################

# ===================================================================================================
# Description:
#   To consensus sequence files 
#  
# Usage: 
#    make_all_consensus.pl aln_file multiprimer.conf
#   (aln_file is alignment file generated by CLUSTALW or MUSCLE, must have extension .aln or .muscle)
#   (You can edit multiprimer.conf to customize configure file settings)
#
# Creation Date: 2007-03-02
# Author: Qing Yu
# ===================================================================================================

use strict;


my $usage = <<USAGE;
Usage:
      $0 alnFile configureFile
  For example:
      $0 den9.aln multiprimer.conf
USAGE
;

my $GAP_THRESH;   # at certain position, 80% are "-"
my $DE_THRESH;  # threshold to define degenerate base. Even if there are several posible bases at certain position, if the most frequent base's frequency >DE_THRESH, we use the most frequent base as non-degenerate base and ignore others.

my $base_dir ="./";

my $alnfile = $ARGV[0];
my $conf_file = $ARGV[1];

chdir  $base_dir;
print "current make consensus dir\n";
`pwd`;
`cp -r $alnfile .`;
`cp -r $conf_file .`;

unless (open(A,$alnfile)) {
  warn "<br>Failed to open $alnfile<br>\n";
  exit(1);
}
my @tmp = split(/\//,$alnfile);
my $aln_name = pop @tmp;

#my $conf_file = $base_dir."multiprimer.conf";
my $conf_content="";

if (-e $conf_file)
  {
    my $r_params = &get_conf_params($conf_file);
    $GAP_THRESH = $r_params->{"GAP_THRESHOLD"};
    $DE_THRESH = $r_params->{"DE_THRESHOLD"};

    my $conf_lines = get_lines_from_file($conf_file);

    foreach my $aLine ( @{$conf_lines} )
      {
	if ($aLine !~ /ALIGN/)
	  {
	    $conf_content .=$aLine."\n"; 
	  }
      }
  }


my %seq=();      # key: geno_name, value: genome sequence
my $cons_major ="";
my $cons_iupac ="";
my $cons_gapFlag;  # string of gapFlag of each position. If gap, show 1.
my $tmp_gap_count =0;   # num of gaps after last gap and before current pos 
my @seqs=();     # array (genome) of array (each genome sequence) 

my $major_cons_file = $base_dir.$aln_name.".cons.major";
my $iupac_cons_file = $base_dir.$aln_name.".cons.iupac";
my $freq_cons_file = $base_dir.$aln_name.".cons.all";
my $freq_file = $base_dir.$aln_name.".cons.freq";
my $conserv_file = $base_dir.$aln_name.".cons.conserv"; #conservation file with frequency percentage for majority consensus
my $major_cons_gapFile = $base_dir.$aln_name.".cons.gap"; # show gap flag at each position


print "\n<br>Generate consensus files ......<br><br>\n\n";

if ($alnfile =~/\.aln/i)   # clustawl output
  {
    <A>; # kill the header
    while (<A>) {
      next if /\*/;

      tr/[a-z]/[A-Z]/;
      if (/(\S+)\s+(\S+)/) {
	$seq{$1} .= $2;     # key: geno_name, value: genome sequence
      }
    }
  }
elsif ($alnfile =~/\.muscle/i)   # muscle output
  {
    my ($desc, $sequence);

    open (FASTA, $alnfile) || die "Can't open $alnfile\n";

    while (<FASTA>) 
      {
	if (/^\>.+/) 
	  { # the description
	    s/\s*$//; # remove all trailing whitespace
	    s/^>//; # remove leading >
	    if (defined $desc) 
	      {

		if ($sequence !~ m/^[0-9\s\t]+$/){
		  $sequence =~ s/\s+//g; # get rid of all white space
		}		

		$seq{$desc} = $sequence;

	      }
	    $desc = $_;
	    $sequence = "";
	  }
	else 
	  {
	    chomp;
	    $sequence .= $_;
	  }
      }
    close FASTA;

    if (defined $desc) 
      {
	if ($sequence !~ m/^[0-9\s\t]+$/)
	  {
	    $sequence =~ s/\s+//g; # get rid of all white space
	  }

	$seq{$desc} = $sequence;

      }

  }
# convert genome sequence string to array
my $i=0;    # genome serial #
foreach my $genoName (keys %seq) 
  {
    $seq{$genoName} =~ s/\s+//;
    $seqs[$i] = [split(//,$seq{$genoName})];
    $i++;
  }

print "<br>number of seq = $i<br>\n";
my $r_cons_ary;
my $r_fr_ary;
my $r_fr_pent; # conservation frequency string for majority consensus


print "<br>whole seq length =".scalar(@{$seqs[0]})."<br>\n";


# get the start and end position of fully aligned genomes
my ($align_start, $align_end) = &get_align_range(\@seqs);
print "<br>align_start=$align_start  align_end=$align_end<br>\n";



my $pos_in_cons=0;    # pos in new consensus (no gap)

for (my $j = 0; $j <= $#{$seqs[0]}; $j++) 
  {

    my %base=();   # key - base, value - frequency number of this base at this pos

    # report align start pos and align end pos in consensus
    if ($j==$align_start)
      {
	print "<br>align start pos in consensus = $pos_in_cons<br>\n";
	$conf_content .= "ALIGN_START=$pos_in_cons=\n";
      }
    if ($j==$align_end)
      {
	print "<br>align end pos in consensus = $pos_in_cons<br>\n";
	$conf_content .= "ALIGN_END=$pos_in_cons=\n";

	# rewrite configure file with updated alignment start and end position
	open FILE, ">$conf_file" or print "\nCannot open $conf_file for writing<br>\n\n";
	print FILE $conf_content;
	close FILE;
      }


    for ($i = 0; $i <= $#seqs; $i++) 
      {

	if (!defined($seqs[$i][$j]))
	  {
	    print "<br>in main, pos=$j not defined, go next<br>\n";

	    next;
	  }

	if (!defined($seqs[$i][$j]))
	  {
	    print "<br>in main, pos=$j is empty, go next<br>\n";
	    next;
	  }

	if ($seqs[$i][$j] eq "N") 
	  { 
	    print "<br>Warning: Seq $i position $j is N<br>\n";
	  }

	# count frequency of each base at position j
	if (exists $base{$seqs[$i][$j]} )
	  {
	    $base{$seqs[$i][$j]}++;
	  }
	else
	  {
	    $base{$seqs[$i][$j]}=1;
	  }

      } # for $i

    my $align_seq_num = keys(%seq);

    my $major_base = "";
    $major_base = &get_major_base(\%base,$align_start, $align_end, $j,$align_seq_num);

    if ( $major_base eq "-")
      {
	$cons_gapFlag .= 1;
	$tmp_gap_count++;
      }
    else
      {	
	$cons_major .= $major_base;
	$cons_iupac .= &get_iupac(\%base, $align_start, $align_end, $j,$align_seq_num);

	if ($tmp_gap_count>0)
	  {
	    $tmp_gap_count--;  # when gap is major,major consensus skip that pos, but gapFlag string shows 1 at that pos, so here should reduce one "0" in gapFlag string when no gap as major base at this pos.
	  }
	else
	  {
	    $cons_gapFlag .= 0;
	  }


	# only when major_base is not gap, count the freq
	($r_cons_ary, $r_fr_ary, $r_fr_pent) = &get_freq_cons($r_cons_ary, $r_fr_ary,\%base, \@seqs, $align_start, $align_end, $j, $pos_in_cons);

	$pos_in_cons++;  # as gap inside alignment, pos_in_cons<=origin_pos($j)
      }
  }

my $cons_all_name = $aln_name.".cons";
my $cons_iupac_name = $aln_name.".cons.iupac";
my $cons_gap_name = $aln_name.".cons.gap";

print "<br>Write to $major_cons_file<br>\n";
&write_cons_file($cons_major, $major_cons_file, $cons_all_name);

print "<br>Write to $iupac_cons_file<br>\n";
&write_cons_file($cons_iupac, $iupac_cons_file, $cons_iupac_name);

print "<br>Write to $freq_cons_file<br>\n";
&write_all_cons_file($r_cons_ary, $freq_cons_file, $aln_name);

print "<br>Write to $freq_file<br>\n";
&write_all_freq_file($r_fr_ary, $freq_file, $aln_name);

print "<br>Write to $major_cons_gapFile<br>\n";
&write_cons_file($cons_gapFlag, $major_cons_gapFile, $cons_gap_name);

print "<br>Making consensus completes!<br>\n";
exit;


#===========================================================
# Find the majority base that has the highest frequency at
#  a certain position.
# Input: hash reference. Key: base, Value: frequency
# Output: the majority base at this position
#===========================================================
sub get_major_base
  {
  #  my ($r_base_h) = @_;
    my ($r_base_h, $align_s, $align_e, $pos,$align_seq_num) = @_;

    my $gap="-";

    # sort the hash value(base frequency) from max to min
    my @sorted = reverse sort { $r_base_h->{$a} <=> $r_base_h->{$b} } keys %{$r_base_h};


    if ($sorted[0] eq "-" )
      {

	if ($r_base_h->{$sorted[0]} == $align_seq_num)
	  {
	    return $gap;
	  }

	 # inside full alignment, if over 80% is gap, choose gap
	if ($pos<$align_s ||$pos>$align_e)
	  {
	    return $sorted[1]; # for non-alignment region,use non-gap base with highest frequency
	  }
	elsif($pos>=$align_s && $pos<=$align_e && $r_base_h->{$sorted[0]}/$align_seq_num >=$GAP_THRESH)
	   {
	     return $gap;
	   }
	 else
	   {
	     return $sorted[1];
	   }
      }
    else
      {
	return $sorted[0];
      }

  }

#===========================================================
# Convert a multialignment hash to IUPAC code sequence at
# a certain position
# Input: hash reference for a position. Key: base, Value: frequency;
#        position
# Output: iupac code for this position 
#===========================================================
sub get_iupac
{
    my ($r_base_h, $align_s, $align_e, $pos,$align_seq_num ) = @_;

    my $iupac;
    my @base_ary = ();  # array of bases at the position

    my $gap="-";

    # sort the hash value(base frequency) from max to min
    my @sorted = reverse sort { $r_base_h->{$a} <=> $r_base_h->{$b} } keys %{$r_base_h};


    if ($sorted[0] eq "-" )
    {
		if ($r_base_h->{$sorted[0]} == $align_seq_num)
		{
			return $gap;
		}	
		# inside full alignment, if over 80% is gap, choose gap
		elsif($pos>=$align_s && $pos<=$align_e && $r_base_h->{$sorted[0]}/$align_seq_num>=$GAP_THRESH)
		{		
			return $gap;
		}	
	}
    else
	{
		# treat as non-degenerate base
		if ($sorted[0] ne "N" && $r_base_h->{$sorted[0]}/$align_seq_num>= $DE_THRESH)
		{
			return $sorted[0];
		}
	}

    # push all possible base at a certain position in the array
    foreach my $a_base (keys %{$r_base_h}) 
	{
		if ($a_base ne "-")  
		{
			# IUPAC base may exist in original genome
			if ($a_base eq "R")
			{
				if ( elem_present_in_arr(\@base_ary,"A") !=1)
				{
					push(@base_ary,"A");
				}
				if ( elem_present_in_arr(\@base_ary,"G") !=1)
				{
					push(@base_ary,"G");
				}
			}
			elsif ($a_base eq "Y")
			{
				if ( elem_present_in_arr(\@base_ary,"C") !=1)
				{
					push(@base_ary,"C");
				}
				if ( elem_present_in_arr(\@base_ary,"T") !=1)
				{
					push(@base_ary,"T");
				}
			}
			elsif ($a_base eq "M")
			{
				if ( elem_present_in_arr(\@base_ary,"C") !=1)
				{
					push(@base_ary,"C");
				}
				if ( elem_present_in_arr(\@base_ary,"A") !=1)
				{
					push(@base_ary,"A");
				}
			}
			elsif ($a_base eq "K")
			{
				if ( elem_present_in_arr(\@base_ary,"T") !=1)
				{
					push(@base_ary,"T");
				}
				if ( elem_present_in_arr(\@base_ary,"G") !=1)
				{
					push(@base_ary,"G");
				}
			}
			elsif ($a_base eq "W")
			{
				if ( elem_present_in_arr(\@base_ary,"T") !=1)
				{
					push(@base_ary,"T");
				}
				if ( elem_present_in_arr(\@base_ary,"A") !=1)
				{
					push(@base_ary,"A");
				}
			}
			elsif ($a_base eq "S")
			{
				if ( elem_present_in_arr(\@base_ary,"C") !=1)
				{
					push(@base_ary,"C");
				}
				if ( elem_present_in_arr(\@base_ary,"G") !=1)
				{
					push(@base_ary,"G");
				}
			}
			elsif ($a_base eq "B")
			{
				if ( elem_present_in_arr(\@base_ary,"C") !=1)
				{
					push(@base_ary,"C");
				}
				if ( elem_present_in_arr(\@base_ary,"T") !=1)
				{
					push(@base_ary,"T");
				}
				if ( elem_present_in_arr(\@base_ary,"G") !=1)
				{
					push(@base_ary,"G");
				}
			}
			elsif ($a_base eq "D")
			{
				if ( elem_present_in_arr(\@base_ary,"A") !=1)
				{
					push(@base_ary,"A");
				}
				if ( elem_present_in_arr(\@base_ary,"T") !=1)
				{
					push(@base_ary,"T");
				}
				if ( elem_present_in_arr(\@base_ary,"G") !=1)
				{
					push(@base_ary,"G");
				}
			}
			elsif ($a_base eq "H")
			{
				if ( elem_present_in_arr(\@base_ary,"A") !=1)
				{
					push(@base_ary,"A");
				}
				if ( elem_present_in_arr(\@base_ary,"T") !=1)
				{
					push(@base_ary,"T");
				}
				if ( elem_present_in_arr(\@base_ary,"C") !=1)
				{
					push(@base_ary,"C");
				}
			}
			elsif ($a_base eq "V")
			{
				if ( elem_present_in_arr(\@base_ary,"A") !=1)
				{
					push(@base_ary,"A");
				}
				if ( elem_present_in_arr(\@base_ary,"C") !=1)
				{
					push(@base_ary,"C");
				}
				if ( elem_present_in_arr(\@base_ary,"G") !=1)
				{
					push(@base_ary,"G");
				}
			}
			else  # A,T,C,G, or N
			{
				if ( elem_present_in_arr(\@base_ary,$a_base) !=1)
				{
					push(@base_ary,$a_base);
				}
			}
		}
	}

    # decide the output IUPAC base at a certain position
    if (scalar(@base_ary)==1) 
	{
		$iupac = $base_ary[0];
		return $iupac;
	}

    if (scalar(@base_ary)==2 && elem_present_in_arr(\@base_ary,"A")==1 && elem_present_in_arr(\@base_ary,"G")==1 )
	{
		$iupac = "R";
	}
    elsif (scalar(@base_ary)==2 && elem_present_in_arr(\@base_ary,"A")==1 && elem_present_in_arr(\@base_ary,"C")==1)
	{
		$iupac = "M";
	}
    elsif (scalar(@base_ary)==2 && elem_present_in_arr(\@base_ary,"C")==1 && elem_present_in_arr(\@base_ary,"G")==1)
	{
		$iupac = "S";
	}
    elsif (scalar(@base_ary)==2 && elem_present_in_arr(\@base_ary,"C")==1 && elem_present_in_arr(\@base_ary,"T")==1 )
	{
		$iupac = "Y";
	}
    elsif (scalar(@base_ary)==2 && elem_present_in_arr(\@base_ary,"T")==1 && elem_present_in_arr(\@base_ary,"G")==1)
	{
		$iupac = "K";
	}
    elsif (scalar(@base_ary)==2 && elem_present_in_arr(\@base_ary,"T")==1 && elem_present_in_arr(\@base_ary,"A")==1)
	{
		$iupac = "W";
	}
    elsif (scalar(@base_ary)==3 && elem_present_in_arr(\@base_ary,"A")==1 && elem_present_in_arr(\@base_ary,"C")==1 && elem_present_in_arr(\@base_ary,"G")==1)
	{
		$iupac = "V";
	}
    elsif (scalar(@base_ary)==3 && elem_present_in_arr(\@base_ary,"C")==1 && elem_present_in_arr(\@base_ary,"T")==1 && elem_present_in_arr(\@base_ary,"G")==1)
	{
		$iupac = "B";
	}
    elsif (scalar(@base_ary)==3 && elem_present_in_arr(\@base_ary,"A")==1 && elem_present_in_arr(\@base_ary,"T")==1 && elem_present_in_arr(\@base_ary,"G")==1)
	{
		$iupac = "D";
	}
    elsif (scalar(@base_ary)==3 && elem_present_in_arr(\@base_ary,"A")==1 && elem_present_in_arr(\@base_ary,"T")==1 && elem_present_in_arr(\@base_ary,"C")==1)
	{
		$iupac = "H";
	}
    else
	{
		$iupac = "N";
		print "<br>Warning: use N to represent unknown IUPAC at position $pos<br><br>\n";
		foreach my $t (@base_ary)
		{
			print "$t\t";
		}
		print "\n";
	}

	return $iupac;
}


#===========================================================
# generate all degenerate consensus string and all degenerate  
#   frequency consensus strings
# Input:
#    consensus array - contains majority consensus and all
#       minority consensus 
#    frequency array - contains majority frequency consensus
#        and all minority consensus
#    hash reference - for a position. Key: base, Value: frequency
#    genome array - array (genome) of array (each genome sequence)
# Output: 
#    array of all consensus
#    array of all consensus of frequency
#    array of frequency percentage of majority consensus
#===========================================================
sub get_freq_cons
{
	my ($r_cons_ary, $r_fr_ary, $r_base_h, $r_seqs,$align_s, $align_e, $align_pos, $cons_pos) = @_;


    my @b_sorted = reverse sort { $r_base_h->{$a} <=> $r_base_h->{$b} } keys %{$r_base_h};
    my $r=0;   # No. of aligned sequences recorded for the whole function
    my $tmp_base="";

    my $align_num = scalar(@{$r_seqs});


	if (scalar(@b_sorted)==1)   # no degenerate base at this position
	{
	    # maximum possible frequency consensus number = number of sequence
	    # i - No. of aligned sequences
	    if ($b_sorted[0] ne "-")
		{
	       # if all sequence has "-" at this position, skip at this pos

			for (my $i=$r; $i<=$#{$r_seqs}; $i++)  
			{
				if ( $i==0 || defined($r_cons_ary->[$i]) )
				{
					$r_cons_ary->[$i] .=$b_sorted[0];
					$r_fr_ary->[$i]->[$cons_pos] = $r_base_h->{$b_sorted[0]};
				}
			}
		}
	}
	else # has degenerate base at this position
	{

	    if ($b_sorted[0] ne "-" || ($b_sorted[0] eq "-" && $r_base_h->{$b_sorted[0]}/$align_num <$GAP_THRESH && $align_pos>=$align_s && $align_pos<=$align_e) || ($b_sorted[0] eq "-" && $align_pos<$align_s ) || ($b_sorted[0] eq "-" && $align_pos>$align_e ))
		{
			foreach my $a_base (@b_sorted)
			{
				if ($a_base ne "-") #$b_sorted[0] is "-",not put freq of "-" in freq consensus
				{
					if ($r>0 && !defined($r_cons_ary->[$r]))
					{
		
						# there are no frequency bases in previous positions
						# use the same consensus bases in the last consensus in the array
						$r_cons_ary->[$r] = substr($r_cons_ary->[$r-1], 0, length($r_cons_ary->[$r-1])-1 );

						for (my $k=0; $k<=$cons_pos; $k++)
						{
							$r_fr_ary->[$r]->[$k] = $r_fr_ary->[$r-1][$k];
						}
					}

					$r_cons_ary->[$r] .=$a_base;
					$r_fr_ary->[$r]->[$cons_pos] = $r_base_h->{$a_base};
					$r++;

					$tmp_base = $a_base;

					# if this degenerate's freq >=DE_THRESH, treat as non-degenerate base, ignore other bases at this position
					if ($r_base_h->{$a_base}/$align_num >=$DE_THRESH)
					{
						last;
					}			 
				} #if $a_base ne "-"
			} # foreach my $a_base

			# num of degenerate bases of current pos less than max num of previous pos, fill freq seq with last base's info
			for (my $i=$r; $i<=$#{$r_seqs}; $i++)
			{
				if (defined($r_cons_ary->[$i]) )
				{
					if ($tmp_base ne "")
					{
						$r_cons_ary->[$i] .= $tmp_base;
						$r_fr_ary->[$i]->[$cons_pos] = $r_base_h->{$tmp_base};
					}
				}
			}

		} ## if $b_sorted[0] ne "-" ...

	}## if (scalar(@b_sorted)==1)

	#frequency percentage for majority consensus
    #$r_fr_ary->[0] is majority consensus frequency string
	my $r_fr_percent;
	for (my $j=0; $j<=$cons_pos; $j++)
    {	 
		if (exists($r_fr_ary->[0]->[$j]))
		{
			$r_fr_percent->[$j] = int(($r_fr_ary->[0]->[$j]/scalar(@{$r_seqs}))*100);
		}
		else
		{
			print "<br>pos=$j  not exist r_fr_ary->[0]->[j]<br><br>\n";
		}

    }

    return $r_cons_ary, $r_fr_ary, $r_fr_percent;
  }


#===========================================================
# Write all items in a consensus array to file
# Input:
#    consensus string 
#    output filename 
#    alignment name
# Output: 
#    a consensus file generated
#===========================================================
sub write_cons_file
{
	my ($string, $filename,$alnname) = @_;

    open(FILE,">$filename") or print "<br>Cannot open $filename<br>\n";;
    print FILE ">$alnname\n";
    my $l = length($string);
    for (my $i = 0; $i <= $l; $i+=60) 
	{
		print FILE substr($string,$i,60),"\n";
    }
    close(FILE);
  }


#===========================================================
# Write frequency percentage of majority consensus to file
# Input:
#    frequency percentage array 
#    output filename 
#    alignment name
# Output: 
#    a frequency percentage file generated
#===========================================================
sub write_fr_percent_file
{
    my ($r_pent, $filename,$alnname) = @_;

    open(FILE,">$filename");
    print FILE "$alnname\n";

    for (my $i=0; $i < scalar(@{$r_pent}); $i++)
    {
		print FILE $r_pent->[$i]."\n";
    }

    close(FILE);
}

#===========================================================
# Write majority consensus and all minority consensus to file
# Input:
#    array of all consensus
#    output filename 
#    alignment name
# Output: 
#    a multi-consensus file generated
#===========================================================
sub write_all_cons_file
{
    my ($r_ary, $filename,$alnname) = @_;

    open(FILE,">$filename");

    for (my $k=1; $k<=$#{$r_ary}+1; $k++)
	{

		print FILE ">$alnname.cons".".$k"."\n";
		my $l = length($r_ary->[$k-1]);
		for (my $i = 0; $i <= $l; $i+=60) 
		{
			print FILE substr($r_ary->[$k-1],$i,60),"\n";
		}

	}
    close(FILE);
}


#===========================================================
# Write majority consensus and all minority consensus to file
# Input:
#    array of all consensus of frequency
#    output filename 
#    alignment name
# Output: 
#    a multi-consensus of frequency file generated
#===========================================================
sub write_all_freq_file
{
    my ($r_fr_ary, $filename,$alnname) = @_;

    open(FILE,">$filename");

    for (my $k=1; $k<=$#{$r_fr_ary}+1; $k++)
	{
		print FILE ">$alnname.cons".".$k"."\n";

		for (my $i=0; $i < scalar(@{$r_fr_ary->[$k-1]}); $i++)
		{

			print FILE $r_fr_ary->[$k-1]->[$i]." ";
			if (($i+1)%60 == 0)
			{
				print FILE "\n";
			}
		}
		print FILE "\n";
	}

    close(FILE);
}


#===========================================================
# Find start and end position of fully aligned genomes 
#  (all genomes have no gap at alignment start and end pos)
# Input:
#    array (genomes) of array (each genome's sequence)
# Output: 
#    start position and end position of alignment
#===========================================================
sub get_align_range
{
    my ($r_seqs_ary) = @_;

    my $align_s;
    my $align_e;

    # forward search align_start pos along multi-alignment
    for (my $j = 0; $j < scalar(@{$r_seqs_ary->[0]}); $j++) 
    {
	
		my $align_flag_s =1;   # 1, aligned for each genome

		for (my $i = 0; $i < scalar(@{$r_seqs_ary}); $i++) 
		{
			if (!defined($r_seqs_ary->[$i]->[$j]))
			{
				print "\n<br>pos=$j in seqs not defined!<br><br>\n";
				next;
			}

			if ($r_seqs_ary->[$i]->[$j] eq "")
			{
				print "\n<br>pos=$j in seqs is empty!<br><br>\n";
				next;
			}

			if ($r_seqs_ary->[$i]->[$j] eq "-") 
			{
				$align_flag_s =0;
				last;
			}
		}

		if ($align_flag_s ==1)
		{
			$align_s = $j;
			last;
		}
	}

    # reverse search align_end pos along multi-alignment
    for (my $j = scalar(@{$r_seqs_ary->[0]})-1; $j >= 0; $j--) 
    {
	
		my $align_flag_e =0;   # 1, aligned for each genome

		for (my $i = 0; $i < scalar(@{$r_seqs_ary}); $i++) 
		{

			if (!defined($r_seqs_ary->[$i]->[$j]))
			{
				print "\n<br>align_end search, pos=$j in seqs not defined!<br><br>\n";
				last;
			}

			if ($r_seqs_ary->[$i]->[$j] eq "")
			{
				print "\n<br>align_end search, pos=$j in seqs is empty!<br><br>\n";
				last;
			}

			if ($r_seqs_ary->[$i]->[$j] eq "-") 
			{
				$align_flag_e =0;
				last;
			}

			$align_flag_e =1;
		}

		if ($align_flag_e ==1)
		{
			$align_e = $j;
			last;
		}
    }

    return $align_s, $align_e;
}

##############################################
sub get_lines_from_file
{
   my ($in_file) = @_;
   my @lines = ();
   open(IN_F, "$in_file") or die "Can not open $in_file\n";
   while(<IN_F>)
   {
    chomp;
    push @lines, $_; 
   }
   close(IN_F);
  
   return \@lines; 
}

###########################
# Parses conf file that has
# param/value pairs:
# CG%low=45=
# GC%hi=55=
############################
sub get_conf_params
{
	my ($in_f) = @_;
	my %params = ();
	my $lines = get_lines_from_file($in_f);
	foreach ( @{$lines} )
    {
		my ($param, $value) = split '=';
        if(exists $params{$param}) { die"get_conf_params(): Multiple values for $param\n"; }
        $params{$param} = $value;
    }
	return \%params;
}

################################################
sub elem_present_in_arr   # Input - array_ref, element to be probed. 
{                         # Output 0/1.
	my ($r_arr, $elem) = @_;
	my $present = 0;
	foreach (@$r_arr)
    {
		if ($_ eq $elem)
		{
			$present = 1;
			last;
		}
    }
	return $present;                  
}
