/* Copyright (C) 2003-2008 Dan Arlow
 * 
 * This file is part of motifADE.
 * 
 * motifADE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * motifADE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with motifADE.  If not, see <http://www.gnu.org/licenses/>.
 */

/* 
 *  sequence_set_reader.hpp
 *
 *  Interface to a class that reads a SequenceSet from a stream.
 */

#ifndef SEQUENCE_SET_READER_H
#define SEQUENCE_SET_READER_H

#include <set>
#include <iostream>
#include <cctype>


#include "common.hpp"


#include "buffered_reader.hpp"
#include "sequence.hpp"
#include "sequence_set.hpp"
#include "operation_stats.hpp"


// Nearly stateless helper that reads sequence records from a BufferedReader
// into a sequence set and reports statistics about what was read.  The only
// state held is a reference to the input stream and the duplicate-handling
// policy chosen at construction.
//
// Requirements visible from the code below:
//   SequenceSetType - add( SequenceType* ) returning a bool-convertible value
//                     (true when the ID was not already present),
//                     remove( id ) and size().
//   SequenceType    - default constructible, extractable from a BufferedReader
//                     with operator>>, and providing parseID(), getID(),
//                     totalSequenceLength() and a nested IDType.
template< typename SequenceSetType, typename SequenceType = typename SequenceSetType::IDObjectType >
class SequenceSetReaderBase {
public:
	typedef set< typename SequenceType::IDType > IDSet;

public:
	// Counters describing a single load() call.
	struct Stats : OperationStats {
		// During construction the call resolves to Stats::clear (virtual
		// dispatch does not reach derived classes from a constructor).
		Stats() { clear(); }
		
		// Resets every counter to zero, empties the repeated-ID set and
		// restores the default removeRepeatedIDs policy (true).
		virtual void clear()
		{
			repeatedIDs.clear();
			num_sequences_loaded = 0;
			num_nucleotides_loaded = 0;
			num_unique_ids = 0;
			num_repeated_ids = 0;
			num_id_repeats = 0;
			num_sequences_retained = 0;
			removeRepeatedIDs = true;
		}
		
		// Writes a four-line human-readable summary to os and returns os.
		virtual ostream& print( ostream& os ) const {
			os << num_sequences_loaded << " sequences loaded, totalling " << num_nucleotides_loaded << " nucleotides." << endl;
			os << num_unique_ids << " unique IDs; " << num_repeated_ids << " repeated IDs, repeated a total of " << num_id_repeats << " times." << endl;
			os << "sequences with repeated IDs were " << ( removeRepeatedIDs ? "" : "not " ) << "removed." << endl;
			os << num_sequences_retained << " sequences retained." << endl;
			return os;
		}
		
		IDSet			repeatedIDs;				// IDs that add() rejected at least once
		
		unsigned int	num_sequences_loaded,		// records read from the stream
						num_nucleotides_loaded,		// sum of totalSequenceLength() over all records read
						num_unique_ids,				// set size minus num_repeated_ids, taken before removal
						num_repeated_ids,			// number of distinct IDs in repeatedIDs
						num_id_repeats,				// number of records rejected by add()
						num_sequences_retained;		// set size when load() returns
		
		bool			removeRepeatedIDs;			// copy of the reader's policy, used by print()
	};
		
	// ist           - stream to read from; held by reference, so it must
	//                 outlive this reader.
	// removeRepeats - when true, load() removes from the set every ID that
	//                 was seen more than once.
	SequenceSetReaderBase( BufferedReader& ist, bool removeRepeats = true )
		: is( ist ), removeRepeatedIDs( removeRepeats ) { }
	
	// Reads records until the stream reports eof(), adding each one to
	// `sequences`, and overwrites `stats` with the results.
	//
	// Throws MotifADEException if the stream is already unusable on entry.
	// Anything thrown while a record is being read or parsed is propagated
	// after the partially built record has been freed.
	//
	// NOTE(review): num_unique_ids and num_sequences_retained are derived from
	// sequences.size(), so they include anything already in `sequences` before
	// the call - confirm callers pass an empty set if that matters.
	void	load( SequenceSetType& sequences, Stats& stats )
	{
		stats.clear();
		stats.removeRepeatedIDs = removeRepeatedIDs;
		
		if( !is ) throw( MotifADEException( "SequenceSetReaderBase::load: couldn't read from stream!" ) );
		
		while( !is.eof() ) {
			// Skip everything that is not printable (newlines, control
			// characters, ...) between records.  The cast matters: passing a
			// negative plain char to isprint() is undefined behaviour.
			char ch = is.peek();
			if( !isprint( static_cast< unsigned char >( ch ) ) ) {
				is.get();
				continue;
			}
			
			SequenceType* tmp = new SequenceType;
			try {
				is >> *tmp;
				tmp->parseID();
				
				++stats.num_sequences_loaded;
				stats.num_nucleotides_loaded += tmp->totalSequenceLength();
			}
			catch( ... ) {
				// Nothing else refers to the record yet, so free it rather
				// than leak it, then let the caller see the original error.
				delete tmp;
				throw;
			}
			
			// add() is kept outside the guard above: whether the set still
			// holds the pointer if add() itself throws is not visible here.
			// Presumably the set takes ownership when add() returns true;
			// when it returns false the record is still ours to free.
			bool unique = sequences.add( tmp );
			if( !unique ) {
				stats.repeatedIDs.insert( tmp->getID() );
				++stats.num_id_repeats;
				delete tmp;
			}
		}
		
		// Each repeated ID is still present once in the set at this point,
		// so the IDs seen exactly once are the remainder.
		stats.num_repeated_ids = static_cast< unsigned int >( stats.repeatedIDs.size() );
		stats.num_unique_ids = static_cast< unsigned int >( sequences.size() ) - stats.num_repeated_ids;
		
		if( removeRepeatedIDs )
			for( typename IDSet::iterator i = stats.repeatedIDs.begin(); i != stats.repeatedIDs.end(); ++i )
				sequences.remove( *i );
		
		stats.num_sequences_retained = static_cast< unsigned int >( sequences.size() );
	}

private:
	BufferedReader& is;					// input stream, not owned
	const bool		removeRepeatedIDs;	// duplicate-handling policy fixed at construction
};


// Convenience alias: the reader for the default SequenceSet container, with
// the element type taken from SequenceSet::IDObjectType.
typedef SequenceSetReaderBase< SequenceSet >   SequenceSetReader;


#endif // SEQUENCE_SET_READER_H
