/* Copyright (C) 2003-2008 Dan Arlow
 * 
 * This file is part of motifADE.
 * 
 * motifADE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * motifADE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with motifADE.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
  multi_dfa.hpp

  Implements a DFA-based search for multiple patterns in a string

  Written by Patrick Varilly, 16-17 Jul 2005
*/

#ifndef MULTI_DFA_HPP
#define MULTI_DFA_HPP

#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <stack>
#include <string>
#include <vector>

#include "common.hpp"

using namespace std;

//#define OLD_CODE

// A (motif number, motif length) pair.  Instances are ordered
// lexicographically: by motif_num first, then by motif_len.
struct MatchInDFA
{
  int motif_num;
  int motif_len;

  // Placeholder value (-1, -1).  The original author's warning stands:
  // DON'T USE ME.
  MatchInDFA() : motif_num( -1 ), motif_len( -1 ) {}

  MatchInDFA( int num, int len ) : motif_num( num ), motif_len( len ) {}

  // Orders by motif number; the motif length only breaks ties.
  inline bool operator<( const MatchInDFA &than ) const
  {
    if( motif_num != than.motif_num )
      return motif_num < than.motif_num;

    return motif_len < than.motif_len;
  }

  // Equal only when both fields agree.
  inline bool operator==( const MatchInDFA &than ) const
  {
    if( motif_num != than.motif_num )
      return false;

    return motif_len == than.motif_len;
  }
};

// Collection type for MatchInDFA entries.  The vector form is the one in
// use; defining OLD_CODE selects the earlier set-based form instead.
#ifndef OLD_CODE
typedef vector<MatchInDFA> match_set_t;
#else
typedef set<MatchInDFA> match_set_t;
#endif

// One reported match: a pattern number together with a text position.
struct MatchInstance {
	unsigned int patternNumber;
	unsigned int textPosition;

	// Both arguments arrive as int and are stored as unsigned int.
	MatchInstance( int pat, int pos )
		: patternNumber( static_cast<unsigned int>( pat ) ),
		  textPosition( static_cast<unsigned int>( pos ) )
	{}
};

// Associates a pattern string with the integer ID supplied alongside it.
struct PatternToID
{
	std::string pattern;
	int id;

	// Copies `pat` and records `inID` unchanged.
	PatternToID( const std::string &pat, int inID )
		: pattern( pat ),
		  id( inID )
	{}
};

// DFA-based search
// ================
//
// For every pattern, we first construct an NFA of the form:
//
//                   --C--
//        ^         /     \          *
//  ----> 0 --A--> 1 --A--> 2 --G--> 3
//                  \     /   
//                   --T--
//
// where 0 is the starting state (the ^ indicates that it loops back to
// itself on every input) and 3 is the matching state.
//
// From this NFA, we construct the equivalent DFA (this code
// handles \epsilon transitions, to allow for more complex NFA's later on),
// via the subset construction (see the Dragon Book, Algorithm 3.2).
// Basically, we traverse all possible paths of the NFA in parallel,
// and each DFA state represents a subset of possible NFA states.
//
// Finally, the DFA is pursued to a matching state.
//
// Sigma denotes the alphabet { A, C, G, T, eps } throughout

// Symbols of the alphabet Sigma.  The four nucleotides occupy 0..3, and
// epsilon takes the next slot (4).
typedef enum {
	kSigma_illegal = -1,                  // marks "not a symbol"

	// The real input symbols
	kSigma_A = 0,
	kSigma_C = 1,
	kSigma_G = 2,
	kSigma_T = 3,

	// The empty (epsilon) transition
	kSigma_eps = 4,

	// Bounds
	kSigma_first = kSigma_A,              // lowest real symbol
	kSigma_size = kSigma_T + 1,           // count of real symbols (4)
	kSigma_extendedSize = kSigma_eps + 1  // real symbols plus epsilon (5)
} sigma_t;

// Prefix increment for sigma_t: advances c to the next numeric value.
// Note that it returns nothing, so ++c can only be used as a statement.
inline void operator++( sigma_t& c )
{
	const int next = static_cast<int>( c ) + 1;
	c = static_cast<sigma_t>( next );
}

// NFA
// ===
// A non-deterministic finite automaton for matching a set of patterns

// An NFA state
class NFAState;
typedef std::vector< NFAState* > NFAStateVector;

// One state of the NFA.  Transitions are kept per input symbol as lists of
// target-state pointers; the pointers are stored, never deleted, by this
// class.
class NFAState
{
public:
	// Value of `match` for a state that is not a matching state.
	enum { NonMatching = -1 };

	int num;        // state number; -1 until one is assigned
	int match;      // NonMatching, or a match value (presumably the pattern ID -- TODO confirm in build_nfa)
	int match_len;  // -1 until assigned (presumably the matched pattern's length -- TODO confirm)

	// Outgoing transitions, indexed by sigma_t (epsilon included).
	NFAStateVector move[kSigma_extendedSize];

	// Builds a non-matching state with no outgoing transitions.
	// Every scalar member gets a definite value: a state may be copied
	// (for instance when held by value in a vector) before anything has
	// been assigned to it, and copying an indeterminate int is not allowed.
	NFAState() : num( -1 ), match( NonMatching ), match_len( -1 ) {}

	// Appends a transition on symbol `in` to `target`.
	// `in` must be a valid index into move[] (so not kSigma_illegal);
	// no check is made.
	inline void addMoveTo( sigma_t in, NFAState* target )
	{
		move[in].push_back( target );
	}
};

// This used to be set<int>, but that turned out to be the Achilles' heel of
// the implementation; hopefully the class below is a lot better.
//typedef set<int> nfa_states_t;
// A set of NFA state numbers.
//
// Membership is stored twice: as a bit array (`bits`, one bit per state) for
// fast test/insert and whole-set comparison, and as a list (`contents`, in
// insertion order) so that a sparse set can be iterated without scanning
// every bit.
//
// Preconditions: every state number passed to insert/remove/test must lie in
// [0, max_states).  Nothing is range-checked.
class nfa_states_t
{
public:
	// <-- The Snake Bites THE Unprepared Man:
	// The literal 1 is an int, so (1<<bit) goes wrong as soon as `bit`
	// reaches the width of int -- which happens whenever size_t is wider
	// than int.  Every shift therefore has to start from a one that is as
	// wide as size_t (32 or 64 bits as appropriate).

	#define ONE size_t(1)

	const int max_states;
	const int num_words;
	size_t* bits;       // owned array of num_words words
	size_t hash_val;    // XOR of the single-bit masks of all members

	// These sets are usually sparse & max_states is big
	std::vector<int> contents;

	// Access to contents' iterators
	typedef std::vector<int>::iterator iterator;
	typedef std::vector<int>::const_iterator const_iterator;

	// Creates an empty set with room for states 0 .. max_states-1.
	explicit nfa_states_t( size_t max_states )
		: max_states( max_states ),
		  num_words( (max_states + (sizeof(size_t)*8) - 1) / (sizeof(size_t)*8) ),
		  bits( new size_t[ num_words ] ),
		  hash_val( 0 )
	{
		clear();
	}

	// Deep copy.  The compiler-generated copy would leave both objects
	// pointing at the same `bits` array, which would then be freed twice.
	nfa_states_t( const nfa_states_t &other )
		: max_states( other.max_states ),
		  num_words( other.num_words ),
		  bits( new size_t[ other.num_words ] ),
		  hash_val( other.hash_val ),
		  contents( other.contents )
	{
		std::copy( other.bits, other.bits + num_words, bits );
	}

	~nfa_states_t()
	{
		// `bits` comes from new[], so it must go back through delete[]
		delete[] bits;
	}

	// Empties the set.
	inline void clear()
	{
		for( int i = 0; i < num_words; i++ )
			bits[i] = 0;

		hash_val = 0;
		contents.clear();
	}

	// Adds state s; does nothing if s is already a member.
	inline void insert( const int s )
	{
		size_t &word = bits[ wordIndex( s ) ];
		const size_t mask = bitMask( s );

		if( (word & mask) == 0 )
			{
				word |= mask;
				hash_val ^= mask;
				contents.push_back(s);
			}
	}

	// Removes state s; does nothing if s is not a member.
	inline void remove( const int s )
	{
		size_t &word = bits[ wordIndex( s ) ];
		const size_t mask = bitMask( s );

		if( (word & mask) != 0 )
			{
				word &= ~mask;
				hash_val ^= mask;

				// HYPER-EXPENSIVE WITH VECTOR<INT>
				// CONSIDER NEVER REMOVING STATES (e.g.,
				// epsClose)  Not an issue for now
				contents.erase( std::find( contents.begin(), contents.end(), s ) );
			}
	}

	// Returns true when state s is a member.
	inline bool test( const int s ) const
	{
		return ( (bits[ wordIndex( s ) ] & bitMask( s )) != 0 );
	}

	// Hash of the current membership.  Only the bit position inside its
	// word contributes, so states that are a whole word apart hash alike.
	inline size_t get_hash() const { return hash_val; }

	// Debugging: prints sizes, hash, raw words and members to cout.
	inline void dump()
	{
		std::cout << "max_states = " << max_states << std::endl;
		std::cout << "num_words = " << num_words << std::endl;
		std::cout << std::hex;
		std::cout << "hash_val = " << hash_val << std::endl;
		for( int i = 0; i < num_words; i++ )
			std::cout << "bits[" << i << "] = " << bits[i] << std::endl;
		std::cout << std::dec;
		std::cout << "states: ";
		for( iterator iter = begin(); iter != end(); iter++ )
			std::cout << *iter << ", ";
		std::cout << "done" << std::endl;
		std::cout << std::endl;
	}

	// Iteration over the members, in insertion order
	inline iterator begin() { return contents.begin(); }
	inline iterator end() { return contents.end(); }
	inline const_iterator begin() const { return contents.begin(); }
	inline const_iterator end() const { return contents.end(); }

private:
	// Assignment cannot work (the size members are const); declared but
	// never defined so that an accidental use is rejected.
	nfa_states_t& operator=( const nfa_states_t& );

	// Index of the word of `bits` that holds state s.
	static inline size_t wordIndex( const int s )
	{
		return s / (sizeof(size_t)*8);
	}

	// Single-bit mask selecting state s inside its word.
	static inline size_t bitMask( const int s )
	{
		return ONE << (s % (sizeof(size_t)*8));
	}
};

// Two state sets are equal when they were sized for the same number of
// states and their bit arrays agree word for word.  Neither hash_val nor the
// order of `contents` takes part in the comparison.
inline bool operator==( const nfa_states_t &one, const nfa_states_t &two )
{
	bool same = ( one.max_states == two.max_states );

	// Runs only while the sets still look equal, so differently sized
	// sets never have their words compared
	for( int w = 0; same && w < one.num_words; ++w )
		same = ( one.bits[w] == two.bits[w] );

	return same;
}

// Ordering for state sets: a set sized for fewer states sorts first; sets of
// the same size are compared word by word, with the first differing word
// deciding.  Equal sets are not less than each other.
inline bool operator<( const nfa_states_t &one, const nfa_states_t &two )
{
	if( one.max_states != two.max_states )
		return one.max_states < two.max_states;

	for( int w = 0; w < one.num_words; ++w )
		{
			if( one.bits[w] != two.bits[w] )
				return one.bits[w] < two.bits[w];
		}

	return false;
}

// Equality predicate over pointers to state sets: compares the sets pointed
// to, not the addresses.  Both pointers must be non-null.
class nfa_states_ptr_equal_t
{
public:
	// const-qualified so that the predicate can be called through a const
	// functor object, which is how standard containers invoke it.
	inline bool operator()( const nfa_states_t* one, const nfa_states_t* two ) const
	{
		return *one == *two;
	}
};

// Ordering predicate over pointers to state sets: orders by the sets pointed
// to, not by address.  Both pointers must be non-null.
class nfa_states_ptr_less_t
{
public:
	// const-qualified: ordered containers require a comparison object
	// that is callable as const, and some standard libraries refuse to
	// compile otherwise.
	inline bool operator()( const nfa_states_t* one, const nfa_states_t* two ) const
	{
		return *one < *two;
	}
};

// An NFA
// Non-deterministic automaton built from a list of patterns.
class NFA
{
public:
	int num_states;
	std::vector<NFAState> states;
	NFAState *start;
	bool hasEpsTransitions; // Skip eps-closures if not, 30% speedup

	// Construction from bare pattern strings (defined out of line).
	explicit NFA( const std::vector<std::string>& strings );

	// Construction from patterns that carry their own IDs.
	explicit NFA( const std::vector<PatternToID>& patterns )
	{
		build_nfa( patterns );
	}

	// Debugging
	void dump() const;

	// Operations
	// ==========

	// eps-closure(T) = T union {all states reachable from T via eps paths}
	//                  - states with only eps transitions
	// T is taken by non-const reference; presumably it is updated in
	// place and returned -- TODO confirm in the implementation.
	nfa_states_t& epsClose( nfa_states_t &T ) const;

	// move(T,c) = the set of states reachable from a state
	//             in T via a transition on c
	// Here T is `inStates`; the result presumably lands in `outStates`,
	// which is what gets returned -- TODO confirm in the implementation.
	nfa_states_t& move(
		nfa_states_t &outStates,
		const nfa_states_t &inStates,
		const sigma_t c ) const;

	// Builds a set with the union of all match values in states in S
	match_set_t* build_match_set( const nfa_states_t &S ) const;

private:
	// Shared construction routine behind the PatternToID constructor.
	void build_nfa( const std::vector<PatternToID>& patterns );
};

// DFA
// ===
// A deterministic finite automaton that recognizes the same language
// as an NFA

// A DFA state
class DFAState
{
public:
	int num;
	match_set_t *matches;
	nfa_states_t *nfa_states;
	DFAState* move[kSigma_size];

	inline DFAState( int num, nfa_states_t* nfa_states )
		: num(num), matches(NULL), nfa_states( nfa_states ) {}
	inline ~DFAState() { delete matches; delete nfa_states; }
};

class DFA
{
public:
	vector<DFAState*> states;
	DFAState* start;

	inline explicit DFA( const NFA &nfa ) { build_from_nfa( nfa ); }
	inline explicit DFA( const vector<string>& strings ) { NFA nfa( strings ); build_from_nfa( nfa ); }
	inline explicit DFA( const vector<PatternToID>& patterns ) { NFA nfa( patterns ); build_from_nfa( nfa ); }
	~DFA();

	// Debugging
	void dump();

	inline int getNumStates() { return states.size(); }

	// Matching
	inline bool contains( const string& text, const string::size_type pos = 0 ) const
	{
		const match_set_t *discardMatchIDs;
		return (findFirstMatch( text, discardMatchIDs, pos ) != string::npos);
	}
	
	string::size_type findFirstMatch( const string& text,
									  const match_set_t*& matchIDs,
									  const string::size_type pos = 0 ) const;

	void findAllMatches( const string& text,
						 vector<MatchInstance> &outMatches,
						 const string::size_type pos = 0 ) const;

	// Missing: finding all matches and creating a many-to-many map of
	// matchIDs to positions

private:
	void build_from_nfa( const NFA& nfa );
};

#endif /* MULTI_DFA_HPP */
