/* Copyright (C) 2003-2008 Dan Arlow
 * 
 * This file is part of motifADE.
 * 
 * motifADE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * motifADE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with motifADE.  If not, see <http://www.gnu.org/licenses/>.
 */

/* 
 *  sequence_expression_map_reader.cpp
 *
 *  Simple class for reading a mapping between sequence IDs and expression IDs from a stream.
 */


#include <map>
#include <set>
#include <string>
#include <algorithm>

#include "common.hpp"
#include "buffered_reader.hpp"
#include "tokenizer.hpp"

#include "sequence.hpp"
#include "expression_vector.hpp"

#include "sequence_expression_map.hpp"
#include "column_map_reader.hpp"
#include "bijective_map_filter.hpp"
#include "map_inverter.hpp"
#include "operation_stats.hpp"
#include "sequence_expression_map_reader.hpp"



/*
 *  Inspect the header line of an annotation file, choose the field delimiter,
 *  and locate the two columns this reader cares about.
 *
 *  Side effect: sets the member `delimiter` to '\t' if the header contains a
 *  tab character, and to ',' otherwise.
 *
 *  Returns a ColumnPair where
 *    first  = index of the "Probe Set ID" field (the expression ID), and
 *    second = index of the "RefSeq Transcript ID" field, or, if that field is
 *             absent, of the "Full Length Ref. Sequences" field (the sequence ID).
 *
 *  Throws MotifADEException if either field cannot be found in the header.
 */
SequenceExpressionMapReader::ColumnPair
SequenceExpressionMapReader::getColumns( string& header )
{
	// Any tab in the header means tab-delimited; otherwise assume comma-delimited.
	delimiter = ( header.find( '\t' ) != string::npos ) ? '\t' : ',';
	
	// Split the header into its individual field names.
	svector fields;
	tokenizeCSV( back_inserter( fields ), header, delimiter );
	
	ColumnPair columns;
	
	// Expression ID column.
	svector::iterator exprField = find( fields.begin(), fields.end(), "Probe Set ID" );
	if( exprField == fields.end() ) {
		throw( MotifADEException( "SequenceExpressionMapReader::getColumns: \"Probe Set ID\" field not found in header!" ) );
	}
	columns.first = exprField - fields.begin();
	
	// Sequence ID column: prefer "RefSeq Transcript ID", fall back to the
	// alternative column name "Full Length Ref. Sequences".
	svector::iterator seqField = find( fields.begin(), fields.end(), "RefSeq Transcript ID" );
	if( seqField == fields.end() ) {
		seqField = find( fields.begin(), fields.end(), "Full Length Ref. Sequences" );
		if( seqField == fields.end() ) {
			throw( MotifADEException( "SequenceExpressionMapReader::getColumns: \"Full Length Ref. Sequences\" (or \"RefSeq Transcript ID\") field not found in annotation_file header!" ) );
		}
	}
	columns.second = seqField - fields.begin();
	
	return columns;
}

/*
 *  Read the whole annotation stream and fill `sMap` with the mappings that
 *  survive BijectiveMapFilter; fill `stats` with counters describing the load.
 *
 *  Both `sMap` and `stats` are cleared first.  The work proceeds in stages:
 *    1. read the header line and locate the two columns (getColumns);
 *    2. load the (expression ID column, sequence ID column) pairs into an
 *       "inverted" map via ColumnMapReader;
 *    3. reduce every mapped value to the ID located by Sequence::findID,
 *       discarding entries in which no ID is found;
 *    4. invert the map with MapInverter;
 *    5. pass the result through BijectiveMapFilter into `sMap`.
 *
 *  Throws MotifADEException if the stream is already in a failed state before
 *  the header is read; getColumns may throw if the header lacks a required field.
 */
void
SequenceExpressionMapReader::load( SequenceExpressionMap& sMap, SequenceExpressionMapReader::Stats& stats )
{
	typedef ColumnMapReader< InvertedMMapType >                   PairReader;
	typedef BijectiveMapFilter< MMapType, SequenceExpressionMap > Filter;
	
	stats.clear();
	sMap.clear();
	
	// --- Stage 1: header -------------------------------------------------
	if( !is ) throw( MotifADEException( "SequenceExpressionMapReader::load: couldn't read header from stream!" ) );
	string header;
	is.getline( header, '\n' );
	
	ColumnPair columns = getColumns( header );
	
	// --- Stage 2: raw pairs, stored in the inverted orientation ------------
	InvertedMMapType invMap;
	PairReader pairReader( is, columns.first, columns.second, delimiter );
	PairReader::Stats pairReaderStats;
	pairReader.load( invMap, pairReaderStats );
	
	// --- Stage 3: normalise the mapped values, dropping unusable entries ---
	InvertedMMapType::iterator cur = invMap.begin();
	while( cur != invMap.end() ) {
		Sequence::IDType& rawID = cur->second;
		Sequence::IDLoc idRange( Sequence::findID( rawID ) );
		
		if( idRange.first == rawID.end() ) {
			// No ID found in this value.  In the final (inverted) map this
			// value becomes the key, hence it is counted as a missing *key*.
			++stats.num_missing_keys;
			// Advance before erasing so that `cur` never refers to the erased entry.
			invMap.erase( cur++ );
			continue;
		}
		
		// Shrink the value in place to just the located ID; idRange points into rawID itself.
		rawID.assign( idRange.first, idRange.second );
		rawID.resize( idRange.second - idRange.first );
		++cur;
	}
	
	// --- Stage 4: invert ----------------------------------------------------
	MMapType mMap;
	MapInverter< InvertedMMapType, MMapType > inverter;
	inverter.invert( invMap, mMap );
	
	// --- Stage 5: filter into the output map --------------------------------
	Filter filter;
	Filter::Stats filterStats;
	filter.filter( mMap, sMap, filterStats );
	
//		cout << "repeated keys:" << endl;
//		typedef BijectiveMapFilter< MMapType, SequenceExpressionMap >::KeyCountMap KeyCountMap;
//		for( KeyCountMap::const_iterator j = mFilterStats.keyCounts.begin(); j != mFilterStats.keyCounts.end(); ++j ) {
//			if( j->second > 1 ) {
//				cout << j->first << ": " << j->second << endl;
//			}
//		}
	
	// Statistics from the column reader.  The reader filled the *inverted* map,
	// so its "values" are this function's keys and its "keys" are this
	// function's values -- note the swap below.
	stats.num_mappings_loaded = pairReaderStats.num_mappings_loaded;
	// += (not =): adds to the entries already counted as missing keys in stage 3.
	stats.num_missing_keys += pairReaderStats.num_missing_values;
	stats.num_missing_values = pairReaderStats.num_missing_keys;
	
	// Statistics from the bijective filter are copied across unchanged.
	stats.num_unique_keys       = filterStats.num_unique_keys;
	stats.num_repeated_keys     = filterStats.num_repeated_keys;
	stats.num_key_repeats       = filterStats.num_key_repeats;
	stats.num_unique_values     = filterStats.num_unique_values;
	stats.num_repeated_values   = filterStats.num_repeated_values;
	stats.num_value_repeats     = filterStats.num_value_repeats;
	stats.num_mappings_retained = filterStats.num_mappings_retained;
}
