/* Copyright (C) 2003-2008 Dan Arlow
 * 
 * This file is part of motifADE.
 * 
 * motifADE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * motifADE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with motifADE.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
  multi_dfa.cpp

  Implements a DFA-based search for multiple patterns in a string

  Written by Patrick Varilly, 16-17 Jul 2005
*/


#include "common.hpp"
#include "multi_dfa.hpp"
#include "iupac_alphabet.hpp"


#ifdef MOTIFADE_USE_HASH_MAP
	#ifdef MOTIFADE_HASH_MAP_NOT_IN_EXT
		#include <hash_map>
	#else
		#include <ext/hash_map>
	#endif
	#ifdef MOTIFADE_DUMB_GNU_HASH_MAP
		using namespace __gnu_cxx;
	#endif
#else
	#include <map>
#endif



// DFA-based search
// ================
//
// For every pattern, we first construct an NFA of the form:
//
//                   --C--
//        ^         /     \          *
//  ----> 0 --A--> 1 --A--> 2 --G--> 3
//                  \     /   
//                   --T--
//
// where 0 is the starting state (the ^ indicates that it loops back to
// itself on every input) and 3 is the matching state (marked with *).
//
// From this NFA, we construct the equivalent DFA via the subset
// construction (see the Dragon Book, Algorithm 3.2).  The code also
// handles \epsilon transitions, to allow for more complex NFAs later on.
// Basically, we traverse all possible paths of the NFA in parallel,
// and each DFA state represents a subset of possible NFA states.
//
// Finally, the DFA is pursued to a matching state.
//
// Sigma denotes the alphabet { A, C, G, T, eps } throughout

// Lookup table from a raw byte value (0..255) to its sigma_t symbol.
// It is filled in by initCharToSigmaTable().
sigma_t gCharToSigmaTable[256];

// Registers symbol s in gCharToSigmaTable for both the lower-case and the
// upper-case form of character c.
inline void initOneCharToSigma( char c, sigma_t s )
{
	// The <cctype> functions are only defined for arguments representable
	// as unsigned char (or EOF); a plain char may be negative, so convert
	// before calling them.
	const unsigned char uc = static_cast<unsigned char>( c );

	// Accept upper & lower-case
	gCharToSigmaTable[uint8_t(tolower(uc))] = s;
	gCharToSigmaTable[uint8_t(toupper(uc))] = s;
}

int initCharToSigmaTable( void )
{
	for( int i = 0; i < 256; i++ )
		gCharToSigmaTable[i] = kSigma_illegal;

	initOneCharToSigma( 'A', kSigma_A );
	initOneCharToSigma( 'C', kSigma_C );
	initOneCharToSigma( 'G', kSigma_G );
	initOneCharToSigma( 'T', kSigma_T );

	return 0;
}

// Initializing this variable runs initCharToSigmaTable() as part of the
// static initialization of this translation unit.
namespace StaticInitializers
{
	int dummyCharToSigma = initCharToSigmaTable();
}

// Looks up the sigma_t symbol for c.  The argument is converted to uint8_t
// first so that it can be used as an index into gCharToSigmaTable.
template< class T >
inline sigma_t toSigma( T c )
{
	const uint8_t index = uint8_t(c);
	return gCharToSigmaTable[index];
}

// Helpful debugging aids
// Prints a match as "(<motif_num>, len=<motif_len>)"
static ostream& operator<<( ostream& stream, const MatchInDFA &obj )
{
	return stream << "(" << obj.motif_num
				  << ", len=" << obj.motif_len
				  << ")";
}

static ostream& operator<<( ostream& stream, const NFAStateVector &moves )
{
	for( NFAStateVector::const_iterator iter = moves.begin();
		 iter != moves.end(); ++iter )
		{
			if( iter != moves.begin() )
				stream << ", ";
			stream << (*iter)->num;
		}

	return stream;
}

/*static ostream& operator<<( ostream& stream, const bit_vector &bv )
{
	stream << "{";

	bool first = true;
	for( unsigned int s = 0; s < bv.size(); ++s )
		{
			if( ! bv[s] ) continue;
			
			if( !first )
				stream << ", ";
			else
				first = false;
			
			stream << s;
		}

	stream << "}";

	return stream;
}*/

// Prints a set as "{a, b, c}" (elements in the set's iteration order),
// or "{}" when it is empty.
template<class T>
static ostream& operator<<( ostream& stream, const set<T> &s )
{
	stream << "{";
	// s is const, so begin()/end() yield const_iterators; naming that type
	// explicitly avoids relying on set<T>::iterator being convertible from it.
	for( typename set<T>::const_iterator iter = s.begin(); iter != s.end(); ++iter )
		{
			if( iter != s.begin() )
				stream << ", ";
			
			stream << *iter;
		}
	stream << "}";

	return stream;
}

// Prints a vector as "{a, b, c}" (elements in index order), or "{}" when
// it is empty.
template<class T>
static ostream& operator<<( ostream& stream, const vector<T> &s )
{
	stream << "{";

	typedef typename vector<T>::size_type index_t;
	for( index_t idx = 0; idx < s.size(); ++idx )
		{
			// Separator goes before every element except the first
			if( idx > 0 )
				stream << ", ";

			stream << s[idx];
		}

	stream << "}";
	return stream;
}

// NFA (Nondeterministic Finite Automaton)
// =======================================

// Builds an NFA for the given pattern strings.  Each pattern is paired
// with an ID equal to its 1-based position in 'strings' before being
// handed to build_nfa().
NFA::NFA( const vector<string>& strings )
{
	vector<PatternToID> patterns;

	for( vector<string>::size_type idx = 0; idx < strings.size(); ++idx )
		{
			// IDs start at 1
			const int id = static_cast<int>( idx ) + 1;
			patterns.push_back( PatternToID( strings[idx], id ) );
		}

	build_nfa( patterns );
}

void
NFA::build_nfa( const vector<PatternToID>& patterns )
{
	// Add up all patterns lengths
	string::size_type tot_len = 0;
	for( vector<PatternToID>::const_iterator iter = patterns.begin();
		 iter != patterns.end(); ++iter )
		tot_len += iter->pattern.size();
		
	// the NFA will have tot_len+1 states
	string::size_type j, k;
	num_states = tot_len+1;
	states.resize( num_states );

	// As we build it right now, it has no eps transitions!
	hasEpsTransitions = false;
		
	// The start state can loop back
	start = &states[0];
		
	states[0].num = 0;
	states[0].addMoveTo( kSigma_A, &states[0] );
	states[0].addMoveTo( kSigma_C, &states[0] );
	states[0].addMoveTo( kSigma_G, &states[0] );
	states[0].addMoveTo( kSigma_T, &states[0] );
		
	// Now encode the motif as state transitions
	k = 1;
	for( vector<PatternToID>::const_iterator iter = patterns.begin();
		 iter != patterns.end(); ++iter )
		{
			const string& motif = iter->pattern;
			string::size_type m = motif.size();
				
			for( j = 0; j < m; j++ )
				{
					char sym = motif[j];
					int src, target;
					if( j == 0 ) src = 0;
					else src = (j-1)+k;
					target = j+k;
						
					states[target].num = target;
						
					if( doesIUPACMatchChar( sym, 'A' ) )
						states[src].addMoveTo( kSigma_A, &states[target] );
					if( doesIUPACMatchChar( sym, 'C' ) )
						states[src].addMoveTo( kSigma_C, &states[target] );
					if( doesIUPACMatchChar( sym, 'G' ) )
						states[src].addMoveTo( kSigma_G, &states[target] );
					if( doesIUPACMatchChar( sym, 'T' ) )
						states[src].addMoveTo( kSigma_T, &states[target] );
				}
				
			states[(m-1)+k].match = iter->id;
			states[(m-1)+k].match_len = m;
				
			k += m;
		}
}

void
NFA::dump() const
{
	cout << "NFA Start State: " << start->num << endl;
	for( vector<NFAState>::const_iterator iter = states.begin();
		 iter != states.end(); ++iter )
		{
			cout << "State " << iter->num << ": ";
			if( iter->match != NFAState::NonMatching )
				cout << "[matches " << iter->match << "] ";
			cout << "A->" << iter->move[kSigma_A] << "; ";
			cout << "C->" << iter->move[kSigma_C] << "; ";
			cout << "G->" << iter->move[kSigma_G] << "; ";
			cout << "T->" << iter->move[kSigma_T] << "; ";
			cout << "eps->" << iter->move[kSigma_eps];
			cout << endl;
		}
}

// eps-closure(T): extends T in place with every state reachable from a
// member of T through eps transitions alone, then removes from T the
// states that have eps transitions but no transition on any symbol in
// [kSigma_first, kSigma_size).  Returns T.
//
// When the NFA has no eps transitions, T is returned untouched.
nfa_states_t&
NFA::epsClose( nfa_states_t &T ) const
{
	if( hasEpsTransitions )
		{
			// Seed the work list with every state currently in T
			stack< int > pending;
			for( int s = 0; s < T.max_states; s++ )
				{
					if( T.test( s ) )
						pending.push( s );
				}

			// Follow eps transitions until no new state turns up
			while( !pending.empty() )
				{
					const int cur = pending.top();
					pending.pop();

					const NFAStateVector &epsTargets = states[cur].move[kSigma_eps];
					NFAStateVector::const_iterator target = epsTargets.begin();
					for( ; target != epsTargets.end(); ++target )
						{
							const int num = (*target)->num;
							if( T.test( num ) )
								continue;	// already known

							pending.push( num );
							T.insert( num );
						}
				}

			cerr << "WARNING: HORRIBLE PERFORMANCE AHEAD!!!" << endl;
			cerr << "nfa_states_t.remove() acts on a vector<int>!!!" << endl;
			cerr << "It's probably better not to prune eps-only transitions in epsClose" << endl;

			// Prune out eps-out-only states
			for( int s = 0; s < T.max_states; s++ )
				{
					if( !T.test( s ) )
						continue;

					const NFAState& state = states[s];

					// States without eps transitions are always kept,
					// including completely empty ones (like the match!)
					if( state.move[kSigma_eps].empty() )
						continue;

					// Look for the first symbol with a transition; if the
					// scan runs off the end, the state is eps-only
					sigma_t c = kSigma_first;
					while( c < kSigma_size && state.move[c].empty() )
						++c;

					if( !( c < kSigma_size ) )
						T.remove( s );
				}
		}

	return T;
}

// move(T,c): clears outStates, then fills it with the number of every
// state that some state in T reaches through a transition on c.
// Returns outStates.
nfa_states_t&
NFA::move( nfa_states_t &outStates, const nfa_states_t &T, const sigma_t c ) const
{
	outStates.clear();

	nfa_states_t::const_iterator from;
	for( from = T.begin(); from != T.end(); ++from )
		{
			const int s = *from;
			const NFAStateVector &targets = states[s].move[c];

			NFAStateVector::const_iterator to = targets.begin();
			while( to != targets.end() )
				{
					outStates.insert( (*to)->num );
					++to;
				}
		}

	return outStates;
}

// Collects the (pattern id, pattern length) pairs of all matching NFA
// states contained in S.
//
// Returns a newly allocated match_set_t (sorted and free of duplicates)
// that the caller is responsible for, or NULL when no state in S is a
// matching state.
match_set_t*
NFA::build_match_set( const nfa_states_t &S ) const
{
	// Gather the matches in a local container first: most state sets match
	// nothing, so this avoids a heap allocation that would be thrown away
	// immediately, and nothing is leaked if an insertion throws.
	match_set_t found;

	// NOTE(review): this scans all max_states slots; iterating the members
	// of S directly (as NFA::move does) might be cheaper -- confirm that
	// both views of nfa_states_t always agree before changing it.
	for( int s = 0; s < S.max_states; s++ )
		{
			if( !S.test( s ) )
				continue;

			const NFAState& state = states[s];
			if( state.match == NFAState::NonMatching )
				continue;

			MatchInDFA toInsert( state.match, state.match_len );
#ifdef OLD_CODE
			found.insert( toInsert );
#else
			found.push_back( toInsert );
#endif
		}

	if( found.empty() )
		return NULL;

#ifndef OLD_CODE
	// Sort, then drop duplicates with the erase-unique idiom (unlike
	// resize(), erase() does not need a default-constructible element)
	sort( found.begin(), found.end() );
	found.erase( unique( found.begin(), found.end() ), found.end() );
#endif

	return new match_set_t( found );
}

// DFA (Deterministic Finite Automaton)
// ====================================

#ifdef MOTIFADE_USE_HASH_MAP

// Hash functor for the nfa_states_t* keys of the nfa_states->DFAState* map:
// the hash value is whatever the pointed-to state set reports through
// get_hash().
class nfa_state_hash_t
{
public:
	size_t operator()( nfa_states_t* const &key ) const { return key->get_hash(); }
};

// Hash-based flavour of the map
typedef hash_map< nfa_states_t*, DFAState*, nfa_state_hash_t, nfa_states_ptr_equal_t > nfa_to_dfa_map_t;

#else

// Ordered flavour of the map, used when hash_map support is not enabled
typedef map< nfa_states_t*, DFAState*, nfa_states_ptr_less_t > nfa_to_dfa_map_t;

#endif

// Build a DFA that recognizes the same language as the input NFA.
//
// Fills 'states' and 'start' with newly allocated DFAState objects (they
// are deleted again in ~DFA) and sets, for every state, its 'matches' and
// its 'move' entry for each symbol in [kSigma_first, kSigma_size).
void
DFA::build_from_nfa( const NFA &nfa )
{
	// Use the subset construction:
	// Every state in the DFA corresponds to a set of NFA
	// states.  We start with the {0} state in a stack statesToProcess,
	// and with that same set as the only entry of nfa_to_dfa_map.
	//
	// Then we iterate until statesToProcess is empty:
	// * Pop an NFA state set T, together with the DFA state D that
	//   was created for it, off of statesToProcess.
	// * For each character c in the alphabet, let S be the states moving
	//   from a state s in T under c (following all eps transitions fully).
	//   If S is in the nfa_to_dfa_map, then link D to the associated DFA
	//   state X under c.  Otherwise, create a new DFA state X, link to that,
	//   and push S into statesToProcess

	// DFA states are numbered in order of creation
	int next_state_num = 0;

	// Maps every NFA state set seen so far to the DFA state built for it.
	// NOTE(review): the keys are pointers; lookups presumably compare the
	// pointed-to sets (nfa_states_ptr_less_t / nfa_states_ptr_equal_t) --
	// confirm in the header.
	nfa_to_dfa_map_t nfa_to_dfa_map;
	nfa_states_t *S = new nfa_states_t( nfa.num_states );
	S->insert( 0 );
	nfa.epsClose( *S );
	start = new DFAState( next_state_num++, S );
	states.push_back( start );
	start->matches = nfa.build_match_set( *S );

	nfa_to_dfa_map[ S ] = start;

	// Scratch set that receives the result of each move().  When it turns
	// out to be a set not seen before, it is handed to the new DFA state
	// and replaced by a fresh scratch set (see below).
	nfa_states_t *temp = new nfa_states_t( nfa.num_states );

	// Jump start with the start set
	stack< pair< nfa_states_t*, DFAState* > > statesToProcess;
	statesToProcess.push( make_pair( S, start ) );

	while( !statesToProcess.empty() )
		{
			// Get next state
			pair< nfa_states_t*, DFAState* > top = statesToProcess.top();
			statesToProcess.pop();
			
			//if( states.size() % 100 == 0 )
			//{
			//	cout << "DFA now has " << states.size() << " states" << endl;
			//}

			nfa_states_t *T = top.first;
			DFAState* D = top.second;

			//cout << "Processing state " << T << endl;
			//T->dump();

			// For each character, see where it takes us
			for( sigma_t c = kSigma_first; c < kSigma_size; ++c )
				{
					// temp = eps-closure( move( T, c ) )
					nfa.epsClose( nfa.move( *temp, *T, c ) );

					DFAState* X;
					nfa_to_dfa_map_t::iterator XinMap = nfa_to_dfa_map.find( temp );
					if( XinMap != nfa_to_dfa_map.end() )
						// Known set: reuse its DFA state; temp is simply
						// overwritten by the next move()
						X = XinMap->second;
					else
						{
							// New state (appropriates nfa_states_t* temp)
							X = new DFAState( next_state_num++, temp );
							X->matches = nfa.build_match_set( *temp );
							states.push_back( X );

							// Add it to the map
							nfa_to_dfa_map[ temp ] = X;

							// Gotta process this one
							statesToProcess.push( make_pair( temp, X ) );

							// Prepare for next new state: the old temp now
							// belongs to X and the map, so it must not be reused
							temp = new nfa_states_t( nfa.num_states );
						}

					D->move[c] = X;
				}
		}

	// The scratch set left at this point was never handed to a DFA state
	delete temp;
}

// Destroys the DFA by deleting every DFAState stored in 'states'
DFA::~DFA()
{
	for( vector<DFAState*>::size_type idx = 0; idx < states.size(); ++idx )
		delete states[idx];
}

void
DFA::dump()
{
	cout << "DFA Start State: " << start->num << endl;
	for( vector<DFAState*>::iterator iter = states.begin();
		 iter != states.end(); ++iter )
		{
			cout << "State " << (*iter)->num << ": ";
			if( (*iter)->matches != NULL )
				cout << "[matches " << *(*iter)->matches << "] ";
			cout << "A->" << (*iter)->move[kSigma_A]->num << ", ";
			cout << "C->" << (*iter)->move[kSigma_C]->num << ", ";
			cout << "G->" << (*iter)->move[kSigma_G]->num << ", ";
			cout << "T->" << (*iter)->move[kSigma_T]->num;
			cout << endl;
		}
}

// The crown jewels!!!
string::size_type
DFA::findFirstMatch( const string& text,
		     const match_set_t*& matchIDs,
		     const string::size_type pos ) const
{
	// Check for trivial DFA
	if( start->matches != NULL ) {
		matchIDs = start->matches;
		return pos;
	}

	// Otherwise, run the string through the DFA
	string::size_type i;
	const DFAState* cur = start;
	for( i = pos; i < text.size(); i++ )
		{
			const sigma_t sig = toSigma( text[i] );
			
			if( sig == kSigma_illegal )
				// Reset search if we hit an illegal character
				cur = start;
			else
				{
					cur = cur->move[sig];
					if( cur->matches != NULL ) {
						matchIDs = cur->matches;
						return i;
					}
				}
		}

	return string::npos;
}

// The Holy Grail!!!
void
DFA::findAllMatches( const string& text,
		     vector<MatchInstance> &outMatches,
		     const string::size_type pos ) const
{
	string::size_type i = pos;

	// Check for trivial DFA
	outMatches.clear();
	if( start->matches != NULL ) {
		for( match_set_t::const_iterator iter = start->matches->begin();
			iter != start->matches->end(); ++iter )
				outMatches.push_back( MatchInstance( iter->motif_num,
								     i - iter->motif_len + 1 ) );
	}

	// Otherwise, run the string through the DFA
	const DFAState* cur = start;
	for( i = pos; i < text.size(); i++ )
	  {
	    const sigma_t sig = toSigma( text[i] );
	    
	    if( sig == kSigma_illegal )
				// Reset search if we hit an illegal character
	      cur = start;
	    else
	      {
		cur = cur->move[sig];
		if( cur->matches != NULL ) {
		  for( match_set_t::const_iterator iter = cur->matches->begin();
		       iter != cur->matches->end(); ++iter )
		    outMatches.push_back( MatchInstance( iter->motif_num,
							 i - iter->motif_len + 1 ) );
		}
	      }
	  }
}

// Testing
/*int main( int argc, char** argv )
{
	if( argc < 3 )
		{
			cout << "Usage: multi_dfa pattern1 pattern2 ... string_to_search" << endl;
			return -1;
		}

	// Collect patterns
	vector<string> strings;
	for( int i = 1; i < argc-1; i++ )
		strings.push_back( *(new string( argv[i] )) );

	// Make NFA, then DFA
	NFA nfa( strings );
	nfa.dump();
	DFA dfa( nfa );
	dfa.dump();

	// Go!
	set<int> matches;
	dfa.findAllMatches( *(new string( argv[argc-1] )), matches );
	if( matches.empty() )
		cout << "No Match!" << endl;
	else
		cout << "Matches: " << matches << endl;

	return 0;
}
*/
