/* Copyright (C) 2003-2008 Dan Arlow
 * 
 * This file is part of motifADE.
 * 
 * motifADE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * motifADE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with motifADE.  If not, see <http://www.gnu.org/licenses/>.
 */

/* 
 *  kmer_encoder.cpp
 *
 *  Implementation of a class for converting k-mers into ints
 */


#include <vector>
#include <string>


#include "common.hpp"
#include "sequence.hpp"
#include "kmer_encoder.hpp"


// KmerEncoder constructor -- initializes this to encode k-mers of length kmersize
KmerEncoder::KmerEncoder( unsigned int kmersize )
	: kmerSize( kmersize ),
	  numKmers( times4n( 1, kmerSize ) ),
	  umerSize( kmersize - 1 ),
	  msbShift( twice( umerSize ) ),
	  masks( kmersize ),
	  successors( 4, vector< KmerValue >( 3 ) )
{
	if( kmersize == 0 )
		throw( MotifADEException( "KmerEncoder::KmerEncoder: k == 0." ) );
	if( kmersize >= sizeof( KmerValue ) * 4 )
		throw( MotifADEException( "KmerEncoder::KmerEncoder: k too large. in this implementation, k must be less than 4 times the word size of the system." ) );
	
	
	masks[ 0 ] = ~3;
	for( unsigned int i = 1; i < kmerSize; ++i )
		masks[ i ] = 3 | masks[ i - 1 ] << 2;
	
	for( unsigned int i = 0; i < 4; ++i  )
		for( unsigned int j = 0; j < 3; ++j )
			successors[ i ][ j ] = ( i + j + 1 ) & 3;	
}


// KmerEncoder destructor -- intentionally empty; it contains no statements
KmerEncoder::~KmerEncoder() {}


// KmerEncoder method -- maps a nucleotide character to its 2-bit code for k-mer encoding
//
// 'A' -> 0, 'C' -> 1, 'G' -> 2, 'T' -> 3.  Any other character yields INVALID_VALUE.
// That includes 'N' and the ' ', '-' and '#' characters that appear in Xiaohui's
// alignment files, as well as anything unrecognized (no exception is raised here).
KmerEncoder::KmerValue
KmerEncoder::ntToValue( char nt ) const
{
	if( nt == 'A' ) return 0;
	if( nt == 'C' ) return 1;
	if( nt == 'G' ) return 2;
	if( nt == 'T' ) return 3;

	// not one of the four upper-case nucleotides
	return INVALID_VALUE;
}


// KmerEncoder method -- maps a 2-bit nucleotide code back to its character
//
// 0 -> 'A', 1 -> 'C', 2 -> 'G', 3 -> 'T', INVALID_VALUE -> 'N'.
// Throws MotifADEException for any other value.
char
KmerEncoder::valueToNt( KmerValue value ) const
{
	if( value == 0 ) return 'A';
	if( value == 1 ) return 'C';
	if( value == 2 ) return 'G';
	if( value == 3 ) return 'T';
	if( value == INVALID_VALUE ) return 'N';

	throw( MotifADEException( "KmerEncoder::valueToNt: unrecognized nucleotide value!" ) );
}


// KmerEncoder method -- returns the given encoded k-mer with its nucleotide order reversed
//
// Only the lowest kmerSize two-bit positions of value are read; anything above them
// does not contribute to the result.
KmerEncoder::KmerValue
KmerEncoder::reverse( KmerValue value ) const
{
	KmerValue flipped = 0;
	for( unsigned int remaining = kmerSize; remaining != 0; --remaining ) {
		// peel the lowest nucleotide off the input and append it to the output
		const KmerValue lowNt = value & 3;
		flipped <<= 2;
		flipped |= lowNt;
		value >>= 2;
	}
	return flipped;
}


// KmerEncoder method -- returns the complement of the given encoded k-mer
//
// With the encoding A=0, C=1, G=2, T=3, flipping both bits of a nucleotide yields its
// complement (0 <-> 3, 1 <-> 2).  A plain bitwise NOT would also set every bit above
// the 2 * kmerSize bits actually in use, producing a value that is not a valid encoded
// k-mer, so the result is masked down to the used bits.
KmerEncoder::KmerValue
KmerEncoder::complement( KmerValue value ) const
{
	// The constructor rejects k-mer sizes of sizeof( KmerValue ) * 4 or more, so
	// 2 * kmerSize is smaller than the bit width of KmerValue and the shift is well defined.
	const KmerValue usedBits = ( static_cast< KmerValue >( 1 ) << ( 2 * kmerSize ) ) - 1;
	return ~value & usedBits;
}


// KmerEncoder method -- returns the reverse-complement of the given encoded k-mer,
// computed by complementing the value and then reversing the result
KmerEncoder::KmerValue
KmerEncoder::reverseComplement( KmerValue value ) const
{
	const KmerValue complemented = complement( value );
	return reverse( complemented );
}


// KmerEncoder method -- tells whether the given encoded k-mer is already in its
// "forward" form, i.e. whether getForwardVersion returns it unchanged -- FIXED 1/29/2007
bool
KmerEncoder::isForward( KmerValue value ) const
{
	const KmerValue forward = getForwardVersion( value );
	return forward == value;
}


// KmerEncoder method -- returns the "forward" version of the k-mer: whichever of the
// k-mer's encoded value and its reverse complement's encoded value fast_min selects
// (the smaller of the two).  -- FIXED 1/29/2007
KmerEncoder::KmerValue
KmerEncoder::getForwardVersion( KmerValue value ) const
{
	// kept as a named variable so that fast_min receives two plain variables
	KmerValue revComp = reverseComplement( value );
	return fast_min( value, revComp );
}


// KmerEncoder method -- returns the number of possible k-mers of length kmerSize,
// obtained from times4n( 1, kmerSize ) and converted to unsigned int
unsigned int
KmerEncoder::totalKmers() const
{
	const unsigned int count = times4n( 1, kmerSize );
	return count;
}


// KmerEncoder method -- extracts the 2-bit nucleotide code stored at position pos of
// the given encoded k-mer (position 0 occupies the two lowest bits)
//
// Returns INVALID_VALUE when value is not below numKmers or pos is not below kmerSize.
KmerEncoder::KmerValue
KmerEncoder::getNtValue( KmerValue value, unsigned int pos ) const
{
	if( value >= numKmers ) return INVALID_VALUE;
	if( pos >= kmerSize ) return INVALID_VALUE;

	// bring the requested position down to the low two bits, then isolate it
	const KmerValue shifted = value >> twice( pos );
	return shifted & 3;
}


// KmerEncoder method -- returns a copy of the encoded k-mer with the nucleotide at
// position pos replaced by newvalue
//
// value    -- the encoded k-mer to modify (passed by value; the caller's copy is untouched)
// newvalue -- the replacement nucleotide code; only its two lowest bits are used
// pos      -- the position to overwrite; must be below kmerSize
//
// Returns INVALID_VALUE when pos is out of range, matching getNtValue.  The masks
// table has exactly kmerSize entries, so indexing it with a larger pos would read
// outside the vector.
KmerEncoder::KmerValue
KmerEncoder::setNtValue( KmerValue value, KmerValue newvalue, unsigned int pos ) const
{
	if( pos >= kmerSize ) return INVALID_VALUE;
	
	// clear the two bits at pos, then drop the new nucleotide code into the gap
	return ( masks[ pos ] & value ) | ( ( newvalue & 3 ) << twice( pos ) );
}


// KmerEncoder method -- computes the value of the next k-mer from the value of the
// current k-mer and the nucleotide that follows it -- deprecated from public interface
//
// Returns INVALID_VALUE if currentValue is INVALID_VALUE or if nextNt does not map to
// a valid nucleotide code.
KmerEncoder::KmerValue
KmerEncoder::nextKmerValue( KmerValue currentValue, char nextNt ) const
{
	if( currentValue == INVALID_VALUE ) return INVALID_VALUE;
	
	const KmerValue incoming = ntToValue( nextNt );
	if( incoming == INVALID_VALUE ) return INVALID_VALUE;
	
	// drop the lowest nucleotide and place the new one at the position given by msbShift
	const KmerValue remainder = currentValue >> 2;
	const KmerValue appended = incoming << msbShift;
	return remainder | appended;
}


// KmerEncoder method -- returns a pointer to a vector of the successors of the given
// nucleotide (by its KmerValue)
//
// ntValue must be a valid index into the successors table, which the constructor
// builds with four rows (one per nucleotide code 0..3).  Any other value raises
// MotifADEException.  The returned pointer refers to a vector stored inside this
// object.
const vector< KmerEncoder::KmerValue >*
KmerEncoder::getSuccessors( KmerValue ntValue ) const
{
	// >= rather than >: an index equal to size() is already one past the last row
	if( ntValue >= successors.size() )
		throw( MotifADEException( "KmerEncoder::getSuccessors: unrecognized nucleotide value!" ) );
	
	return &successors[ ntValue ];
}


// KmerEncoder method -- encodes the kmerSize nucleotides starting at pos
//
// Each nucleotide's code is combined with its offset from pos via times4n and summed
// into the result.  Returns INVALID_VALUE as soon as a character that ntToValue
// rejects is encountered.  No end-of-sequence check is made here: the caller must
// make sure kmerSize characters are readable from pos.
KmerEncoder::KmerValue
KmerEncoder::computeKmerValue( NtSeqType::const_iterator pos ) const
{
	KmerValue encoded = 0;
	for( unsigned int offset = 0; offset < kmerSize; ++offset, ++pos ) {
		const KmerValue nt = ntToValue( *pos );
		if( nt == INVALID_VALUE ) return INVALID_VALUE;
		encoded += times4n( nt, offset );
	}
	return encoded;
}


// KmerEncoder method -- computes the value of the k-mer starting at pos, given the
// value of the immediately preceding k-mer
//
// Only the character umerSize places after pos is read from the sequence; the rest of
// the k-mer is carried over from currentValue by the character-based overload.
KmerEncoder::KmerValue
KmerEncoder::nextKmerValue( KmerValue currentValue, NtSeqType::const_iterator pos ) const
{
	NtSeqType::const_iterator newestNt = pos + umerSize;
	return nextKmerValue( currentValue, *newestNt );
}


// KmerEncoder method -- decodes a KmerValue into a string of kmerSize nucleotide characters
//
// s is resized to kmerSize and fully overwritten; s[ 0 ] comes from the two lowest
// bits of value, s[ 1 ] from the next two, and so on.  The value is not checked
// against INVALID_VALUE -- its low bits are decoded regardless.
void
KmerEncoder::kmerValueToSequence( KmerValue value, string& s ) const
{
	s.resize( kmerSize );
	KmerValue remaining = value;
	for( string::iterator out = s.begin(); out != s.end(); ++out ) {
		*out = valueToNt( remaining & 3 );
		remaining >>= 2;
	}
}


// KmerEncoder method -- returns a const_iterator to the first valid k-mer start
// position in seq, which is simply the beginning of the sequence
KmerEncoder::NtSeqType::const_iterator
KmerEncoder::getFirstPos( const NtSeqType& seq ) const
{
	NtSeqType::const_iterator first = seq.begin();
	return first;
}


// KmerEncoder method -- returns a const_iterator to the last valid k-mer start
// position in seq, i.e. kmerSize places before seq.end()
//
// NOTE(review): there is no length check, so seq must hold at least kmerSize elements;
// for a shorter sequence the subtraction steps to before seq.begin().
KmerEncoder::NtSeqType::const_iterator
KmerEncoder::getLastPos( const NtSeqType& seq ) const
{
	NtSeqType::const_iterator last = seq.end();
	last -= kmerSize;
	return last;
}
