00001 #ifdef HAVE_CONFIG_H
00002 #include "config.h"
00003 #endif
00004
00005 #ifndef __GreedyBreakpointElimination_h__
00006 #define __GreedyBreakpointElimination_h__
00007
00008 #include <libMems/AbstractMatch.h>
00009 #include <iostream>
00010 #include <boost/multi_array.hpp>
00011 #include <libMems/PhyloTree.h>
00012 #include <libMems/SubstitutionMatrix.h>
00013 #include <libMems/SeedOccurrenceList.h>
00014 #include <libMems/IntervalList.h>
00015 #include <libMems/LCB.h>
00016 #include <stack>
00017
00018 namespace mems {
00019
00020 extern bool penalize_repeats;
00021
00025 template <class MatchType>
00026 class LcbTrackingMatch
00027 {
00028 public:
00029 MatchType original_match;
00030 MatchType node_match;
00031 size_t match_id;
00032 };
00033 typedef LcbTrackingMatch< mems::AbstractMatch* > TrackingMatch;
00034
00038 template <class MatchType>
00039 class TrackingLCB
00040 {
00041 public:
00042 TrackingLCB(){}
00043 TrackingLCB( const TrackingLCB& l ){ *this = l; }
00045 TrackingLCB( const mems::LCB& l ){ *this = l; }
00046 TrackingLCB& operator=( const mems::LCB& l )
00047 {
00048 left_end[0] = l.left_end[0];
00049 left_end[1] = l.left_end[1];
00050 right_end[0] = l.right_end[0];
00051 right_end[1] = l.right_end[1];
00052 left_adjacency[0] = l.left_adjacency[0];
00053 left_adjacency[1] = l.left_adjacency[1];
00054 right_adjacency[0] = l.right_adjacency[0];
00055 right_adjacency[1] = l.right_adjacency[1];
00056 lcb_id = l.lcb_id;
00057 weight = l.weight;
00058 to_be_deleted = false;
00059 return *this;
00060 }
00061 int64 left_end[2];
00062 int64 right_end[2];
00063 uint left_adjacency[2];
00064 uint right_adjacency[2];
00065 double weight;
00066 std::vector< MatchType > matches;
00067 int lcb_id;
00068 bool to_be_deleted;
00069 };
00070
00072 const uint LCB_UNASSIGNED = (std::numeric_limits<uint>::max)();
00073
00074 typedef boost::multi_array< std::vector< TrackingLCB< TrackingMatch* > >, 2 > PairwiseLCBMatrix;
00075
00076
00080 template< class MatchVector >
00081 double GetPairwiseAnchorScore(
00082 MatchVector& lcb, std::vector< genome::gnSequence* >& seq_table,
00083 const mems::PairwiseScoringScheme& subst_scoring, mems::SeedOccurrenceList& sol_1,
00084 mems::SeedOccurrenceList& sol_2, bool penalize_gaps = false );
00085
00086 class MoveScoreHeapComparator
00087 {
00088 public:
00089 bool operator()( const std::pair< double, size_t >& a, const std::pair< double, size_t >& b ) const
00090 {
00091 return a.first < b.first;
00092 }
00093 };
00094
00098 void getPairwiseLCBs(
00099 uint nI,
00100 uint nJ,
00101 uint dI,
00102 uint dJ,
00103 std::vector< TrackingMatch* >& tracking_matches,
00104 std::vector< TrackingLCB<TrackingMatch*> >& t_lcbs,
00105 boost::multi_array< double, 3 >& tm_score_array,
00106 boost::multi_array< size_t, 3 >& tm_lcb_id_array );
00107
00109 void initTrackingMatchLCBTracking(
00110 const std::vector< mems::TrackingMatch >& tracking_matches,
00111 size_t n1_count,
00112 size_t n2_count,
00113 boost::multi_array< size_t, 3 >& tm_lcb_id_array );
00114
00115
00123 template< class LcbVector >
00124 uint RemoveLCBandCoalesce( size_t lcbI, uint seq_count,
00125 LcbVector& adjacencies,
00126 std::vector< double >& scores,
00127 std::vector< std::pair< uint, uint > >& id_remaps,
00128 std::vector< uint >& impact_list );
00129
00130
00131 void printMatch( mems::AbstractMatch* m, std::ostream& os );
00132
00133 inline
00134 void printMatch( mems::AbstractMatch* m, std::ostream& os )
00135 {
00136 for( size_t ii = 0; ii < m->SeqCount(); ++ii )
00137 {
00138 if( ii > 0 )
00139 os << '\t';
00140 os << "(" << m->Start(ii) << "," << m->RightEnd(ii) << ")";
00141 }
00142 }
00143
00144 void printProgress( uint prev_prog, uint cur_prog, std::ostream& os );
00145
00146
00147 template< typename PairType >
00148 class LabelSort
00149 {
00150 public:
00151 LabelSort( uint seqI ) : ssc( seqI ) {};
00152 bool operator()( const PairType& pt1, const PairType& pt2 )
00153 {
00154 return ssc( pt1.first, pt2.first );
00155 }
00156 private:
00157 LabelSort();
00158 mems::SSC<mems::AbstractMatch> ssc;
00159 };
00160
00161 template<class MatchVector>
00162 void IdentifyBreakpoints( MatchVector& mlist, std::vector<gnSeqI>& breakpoints )
00163 {
00164 if( mlist.size() == 0 )
00165 return;
00166 breakpoints = std::vector<gnSeqI>(1, mlist.size()-1);
00167
00168 mems::SSC<mems::AbstractMatch> ssc(0);
00169 std::sort( mlist.begin(), mlist.end(), ssc );
00170 typedef typename MatchVector::value_type value_type;
00171 typedef std::pair< value_type, size_t > LabelPairType;
00172 std::vector< LabelPairType > label_list;
00173 typename MatchVector::iterator cur = mlist.begin();
00174 typename MatchVector::iterator end = mlist.end();
00175 size_t i = 0;
00176 for( ;cur != end; ++cur )
00177 {
00178 label_list.push_back( std::make_pair( *cur, i ) );
00179 ++i;
00180 }
00181
00182 uint seq_count = mlist[0]->SeqCount();
00183
00184 for( uint seqI = 1; seqI < seq_count; seqI++ )
00185 {
00186 LabelSort< LabelPairType > ls(seqI);
00187 std::sort( label_list.begin(), label_list.end(), ls );
00188
00189 typename std::vector< LabelPairType >::const_iterator prev = label_list.begin();
00190 typename std::vector< std::pair< typename MatchVector::value_type, size_t > >::const_iterator iter = label_list.begin();
00191 typename std::vector< std::pair< typename MatchVector::value_type, size_t > >::const_iterator lab_end = label_list.end();
00192
00193 bool prev_orient = (*prev).first->Orientation(seqI) == (*prev).first->Orientation(0);
00194 if( !prev_orient )
00195 breakpoints.push_back(prev->second);
00196
00197 for( ++iter; iter != lab_end; ++iter )
00198 {
00199 bool cur_orient = (*iter).first->Orientation(seqI) == (*iter).first->Orientation(0);
00200 if( prev_orient == cur_orient &&
00201 ( ( prev_orient && (*prev).second + 1 == (*iter).second) ||
00202 ( !prev_orient && (*prev).second - 1 == (*iter).second)
00203 )
00204 )
00205 {
00206 prev_orient = cur_orient;
00207 ++prev;
00208 continue;
00209 }
00210
00211
00212 if( prev_orient )
00213 breakpoints.push_back( prev->second );
00214 if( !cur_orient )
00215 breakpoints.push_back( iter->second );
00216
00217 prev_orient = cur_orient;
00218 ++prev;
00219 }
00220 if( prev_orient )
00221 breakpoints.push_back( prev->second );
00222 }
00223 std::sort( breakpoints.begin(), breakpoints.end() );
00224 std::vector<gnSeqI>::iterator uni = std::unique( breakpoints.begin(), breakpoints.end() );
00225 breakpoints.erase( uni, breakpoints.end() );
00226 }
00227
00228
00229 template< class MatchVector >
00230 void ComputeLCBs_v2( const MatchVector& meml, const std::vector<gnSeqI>& breakpoints, std::vector< MatchVector >& lcb_list )
00231 {
00232
00233 if( breakpoints.size() < 1 )
00234 return;
00235
00236 lcb_list.clear();
00237
00238
00239 std::vector<gnSeqI>::const_iterator break_iter = breakpoints.begin();
00240 uint prev_break = 0;
00241 MatchVector lcb;
00242 for( ; break_iter != breakpoints.end(); ++break_iter ){
00243
00244 lcb_list.push_back( lcb );
00245 lcb_list.back().insert( lcb_list.back().end(), meml.begin() + prev_break, meml.begin() + *break_iter + 1 );
00246 prev_break = *break_iter + 1;
00247 }
00248 }
00249
00250
00251 template <class MatchVector>
00252 void computeLCBAdjacencies_v3( const std::vector< MatchVector >& lcb_list, std::vector< double >& weights, std::vector< mems::LCB >& adjacencies )
00253 {
00254 adjacencies.clear();
00255 if( lcb_list.size() == 0 )
00256 return;
00257
00258 uint seq_count = lcb_list.front().front()->SeqCount();
00259 uint seqI;
00260 uint lcbI;
00261 for( lcbI = 0; lcbI < lcb_list.size(); ++lcbI ){
00262 mems::LCB lcb;
00263 std::vector<gnSeqI> left_end;
00264 std::vector<gnSeqI> length;
00265 std::vector<bool> orientation;
00266 FindBoundaries( lcb_list[lcbI], left_end, length, orientation );
00267
00268 lcb.left_adjacency = std::vector<uint>( left_end.size(), -1 );
00269 lcb.right_adjacency = std::vector<uint>( left_end.size(), -1 );
00270 lcb.left_end = std::vector<int64>( left_end.size(), 0 );
00271 lcb.right_end = std::vector<int64>( left_end.size(), 0 );
00272
00273 for( seqI = 0; seqI < seq_count; seqI++ ){
00274
00275 if( left_end[seqI] == mems::NO_MATCH )
00276 continue;
00277 lcb.left_end[seqI] = left_end[seqI];
00278 lcb.right_end[seqI] = left_end[seqI] + length[seqI];
00279 if( !orientation[seqI] )
00280 {
00281 lcb.left_end[seqI] = -lcb.left_end[seqI];
00282 lcb.right_end[seqI] = -lcb.right_end[seqI];
00283 }
00284 }
00285 lcb.lcb_id = adjacencies.size();
00286 lcb.weight = weights[ lcbI ];
00287 adjacencies.push_back( lcb );
00288 }
00289
00290 for( seqI = 0; seqI < seq_count; seqI++ ){
00291 mems::LCBLeftComparator llc( seqI );
00292 std::sort( adjacencies.begin(), adjacencies.end(), llc );
00293 for( lcbI = 1; lcbI + 1 < lcb_list.size(); lcbI++ ){
00294 adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
00295 adjacencies[ lcbI ].right_adjacency[ seqI ] = adjacencies[ lcbI + 1 ].lcb_id;
00296 }
00297 if( lcbI == lcb_list.size() )
00298 lcbI--;
00299
00300
00301 adjacencies[ 0 ].left_adjacency[ seqI ] = (uint)-1;
00302 adjacencies[ lcbI ].right_adjacency[ seqI ] = (uint)-1;
00303 if( lcbI > 0 ){
00304 adjacencies[ 0 ].right_adjacency[ seqI ] = adjacencies[ 1 ].lcb_id;
00305 adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
00306 }
00307 }
00308 mems::LCBIDComparator lic;
00309 std::sort( adjacencies.begin(), adjacencies.end(), lic );
00310
00311 }
00312
00316 inline
00317 void computeLCBAdjacencies_v3( mems::IntervalList& iv_list, std::vector< double >& weights, std::vector< mems::LCB >& adjacencies ){
00318 std::vector< std::vector< mems::Interval* > > nivs;
00319 for( size_t ivI = 0; ivI < iv_list.size(); ivI++ )
00320 nivs.push_back( std::vector< mems::Interval* >( 1, &iv_list[ivI] ) );
00321 computeLCBAdjacencies_v3( nivs, weights, adjacencies );
00322 }
00323
00328 template< class MatchVector >
00329 void filterMatches_v2( std::vector< mems::LCB >& adjacencies, std::vector< MatchVector >& lcb_list, std::vector< double >& weights, MatchVector& deleted_matches ){
00330 if( lcb_list.size() < 1 )
00331 return;
00332 MatchVector lcb_tmp = lcb_list[ 0 ];
00333 lcb_tmp.clear();
00334 std::vector< MatchVector > filtered_lcbs( lcb_list.size(), lcb_tmp );
00335 uint lcbI;
00336 for( lcbI = 0; lcbI < adjacencies.size(); lcbI++ ){
00337 if( adjacencies[ lcbI ].lcb_id == lcbI ){
00338 filtered_lcbs[ lcbI ].insert( filtered_lcbs[ lcbI ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
00339 continue;
00340 }
00341 if( adjacencies[ lcbI ].lcb_id == -1 ){
00342 std::cerr << "weird";
00343 continue;
00344 }
00345 if( adjacencies[ lcbI ].lcb_id == -2 )
00346 {
00347 deleted_matches.insert( deleted_matches.end(), lcb_list[lcbI].begin(), lcb_list[lcbI].end() );
00348 continue;
00349 }
00350
00351
00352
00353 std::stack< uint > visited_lcbs;
00354 visited_lcbs.push( lcbI );
00355 uint cur_lcb = adjacencies[ lcbI ].lcb_id;
00356 while( adjacencies[ cur_lcb ].lcb_id != cur_lcb ){
00357 visited_lcbs.push( cur_lcb );
00358 cur_lcb = adjacencies[ cur_lcb ].lcb_id;
00359 if( cur_lcb == -1 || cur_lcb == -2 ){
00360
00361 break;
00362 }
00363 }
00364 while( visited_lcbs.size() > 0 ){
00365 adjacencies[ visited_lcbs.top() ].lcb_id = cur_lcb;
00366 visited_lcbs.pop();
00367 }
00368
00369 if( cur_lcb != -1 && cur_lcb != -2 )
00370 filtered_lcbs[ cur_lcb ].insert( filtered_lcbs[ cur_lcb ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
00371 else
00372 deleted_matches.insert( deleted_matches.end(), lcb_list[lcbI].begin(), lcb_list[lcbI].end() );
00373 }
00374
00375
00376 lcb_list.clear();
00377 std::vector< double > new_weights;
00378 for( lcbI = 0; lcbI < filtered_lcbs.size(); lcbI++ ){
00379 if( filtered_lcbs[ lcbI ].size() > 0 ){
00380 lcb_list.push_back( filtered_lcbs[ lcbI ] );
00381 new_weights.push_back( weights[lcbI] );
00382 }
00383 }
00384
00385
00386 mems::MatchStartComparator<mems::AbstractMatch> msc( 0 );
00387 for( lcbI = 0; lcbI < lcb_list.size(); lcbI++ ){
00388 std::sort( lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end(), msc );
00389 }
00390
00391
00392 weights = new_weights;
00393 computeLCBAdjacencies_v3( lcb_list, weights, adjacencies );
00394
00395 }
00396
00397
00398 const score_t INV_SCORE = (std::numeric_limits<score_t>::max)();
00399 void computeMatchScores( const std::string& seq1, const std::string& seq2, const PairwiseScoringScheme& scoring, std::vector<score_t>& scores );
00400 void computeGapScores( const std::string& seq1, const std::string& seq2, const PairwiseScoringScheme& scoring, std::vector<score_t>& scores );
00401
00402
00403 template< class MatchVector >
00404 double GetPairwiseAnchorScore( MatchVector& lcb,
00405 std::vector< genome::gnSequence* >& seq_table,
00406 const mems::PairwiseScoringScheme& subst_scoring,
00407 mems::SeedOccurrenceList& sol_1,
00408 mems::SeedOccurrenceList& sol_2,
00409 bool penalize_gaps )
00410 {
00411 double lcb_score = 0;
00412 typename MatchVector::iterator match_iter = lcb.begin();
00413 for( ; match_iter != lcb.end(); ++match_iter )
00414 {
00415 typedef typename MatchVector::value_type MatchPtrType;
00416 MatchPtrType m = *match_iter;
00417 std::vector< score_t > scores(m->AlignmentLength(), 0);
00418 std::vector< std::string > et;
00419 mems::GetAlignment(*m, seq_table, et);
00420
00421
00422 mems::computeMatchScores( et[0], et[1], subst_scoring, scores );
00423 if( penalize_gaps )
00424 mems::computeGapScores( et[0], et[1], subst_scoring, scores );
00425
00426
00427 size_t merI = 0;
00428 size_t merJ = 0;
00429 double uni_count = 0;
00430 double uni_score = 0;
00431 const size_t m_aln_length = m->AlignmentLength();
00432 const int64 m_leftend_0 = m->LeftEnd(0);
00433 const int64 m_leftend_1 = m->LeftEnd(1);
00434 for( size_t colI = 0; colI < m_aln_length; ++colI )
00435 {
00436 if(et[0][colI] != '-' && et[1][colI] != '-' )
00437 {
00438 mems::SeedOccurrenceList::frequency_type uni1 = sol_1.getFrequency(m_leftend_0 + merI - 1);
00439 mems::SeedOccurrenceList::frequency_type uni2 = sol_2.getFrequency(m_leftend_1 + merJ - 1);
00440 mems::SeedOccurrenceList::frequency_type uniprod = uni1*uni2;
00441 uniprod = uniprod == 0 ? 1 : uniprod;
00442
00443
00444 if( scores[colI] > 0 )
00445 {
00446 if(penalize_repeats)
00447 scores[colI] = (score_t)((double)scores[colI] * (2.0 / uniprod)) - scores[colI];
00448 else
00449 scores[colI] = (score_t)((mems::SeedOccurrenceList::frequency_type)scores[colI] / uniprod);
00450 }
00451 }
00452 if(et[0][colI] != '-')
00453 merI++;
00454 if(et[1][colI] != '-')
00455 merJ++;
00456 }
00457
00458
00459 double m_score = 0;
00460 for( size_t i = 0; i < scores.size(); ++i )
00461 if( scores[i] != INV_SCORE )
00462 m_score += scores[i];
00463
00464 if( !( m_score > -1000000000 && m_score < 1000000000 ) )
00465 {
00466 std::cerr << "scoring error\n";
00467 genome::breakHere();
00468 }
00469 lcb_score += m_score;
00470 }
00471
00472
00473 return lcb_score;
00474 }
00475
00476
00477
00478 class EvenFasterSumOfPairsBreakpointScorer
00479 {
00480 public:
00481 EvenFasterSumOfPairsBreakpointScorer(
00482 double breakpoint_penalty,
00483 double minimum_breakpoint_penalty,
00484 boost::multi_array<double,2> bp_weight_matrix,
00485 boost::multi_array<double,2> conservation_weight_matrix,
00486 std::vector< TrackingMatch* > tracking_match,
00487 mems::PairwiseLCBMatrix& pairwise_adjacency_matrix,
00488 std::vector<node_id_t>& n1_descendants,
00489 std::vector<node_id_t>& n2_descendants,
00490 boost::multi_array< double, 3 >& tm_score_array,
00491 boost::multi_array< size_t, 3 >& tm_lcb_id_array,
00492 size_t seqI_begin,
00493 size_t seqI_end,
00494 size_t seqJ_begin,
00495 size_t seqJ_end
00496 );
00497
00502 size_t getMoveCount();
00503
00505 double score();
00506
00508 double operator()( std::pair< double, size_t >& the_move );
00509
00511 bool isValid( std::pair< double, size_t >& the_move );
00512
00513 bool remove( std::pair< double, size_t >& the_move, std::vector< std::pair< double, size_t > >& new_move_list, size_t& new_move_count );
00514
00516 void applyScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count );
00517
00519 void undoScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count );
00520
00522 size_t getMaxNewMoveCount();
00523
00528 bool remove( std::pair< double, size_t >& the_move, bool really_remove,
00529 boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count,
00530 bool score_new_moves, std::vector< std::pair< double, size_t > >& new_move_list, size_t& new_move_count );
00531
00533 std::vector< mems::TrackingMatch* > getResults();
00534
00536 bool validate();
00537
00538 protected:
00539 double bp_penalty;
00540 boost::multi_array<double,2> bp_weights;
00541 boost::multi_array<double,2> conservation_weights;
00542 std::vector< mems::TrackingMatch* > tracking_matches;
00543 mems::PairwiseLCBMatrix pairwise_adjacencies;
00544 std::vector<node_id_t> n1_des;
00545 std::vector<node_id_t> n2_des;
00546
00547 boost::multi_array< size_t, 2 > pairwise_lcb_count;
00548 boost::multi_array< double, 2 > pairwise_lcb_score;
00549
00550 std::vector< TrackingMatch* > deleted_tracking_matches;
00551
00552 double min_breakpoint_penalty;
00553
00554 private:
00555
00556 const size_t seqI_count;
00557 const size_t seqJ_count;
00558
00559
00560 boost::multi_array< std::vector< std::pair< uint, uint > >, 2 > all_id_remaps;
00561 boost::multi_array< std::vector< uint >, 2 > full_impact_list;
00562 boost::multi_array< double, 2 > internal_lcb_score_diff[3];
00563 boost::multi_array< size_t, 2 > internal_lcb_removed_count[3];
00564 int using_lsd;
00565 std::vector< double > lsd_zeros;
00566 std::vector< size_t > lrc_zeros;
00567 std::vector< double > bogus_scores;
00568 std::vector< size_t > my_del_lcbs;
00569 std::vector< size_t > lcb_ids;
00570
00571 boost::multi_array< double, 3 >& tm_score_array;
00572 boost::multi_array< size_t, 3 >& tm_lcb_id_array;
00573
00574
00575 const size_t seqI_first;
00576 const size_t seqJ_first;
00577 const size_t seqI_last;
00578 const size_t seqJ_last;
00579
00580
00581 bool first_time;
00582 };
00583
00584
00585 template< class BreakpointScorerType >
00586 int64 greedyBreakpointElimination_v4( std::vector< mems::LCB >& adjacencies, std::vector< double >& scores, BreakpointScorerType& bp_scorer, std::ostream* status_out, size_t g1_tag = 0, size_t g2_tag = 0 );
00587
00588 template< class SearchScorer >
00589 double greedySearch( SearchScorer& spbs );
00590
00591
00596 class SimpleBreakpointScorer
00597 {
00598 public:
00599 SimpleBreakpointScorer( std::vector< LCB >& adjacencies, double breakpoint_penalty, bool collinear );
00600
00601 size_t getMoveCount();
00602
00603 double score();
00604
00605 bool isValid( size_t lcbI, double move_score );
00606
00608 double operator()( size_t lcbI );
00609
00611 void remove( uint lcbI, std::vector< std::pair< double, size_t > >& new_moves );
00612
00613 private:
00614 std::vector< mems::LCB > adjs;
00615 double bp_penalty;
00616 std::vector< double > scores;
00617 double total_weight;
00618 size_t bp_count;
00619 bool collinear;
00620 };
00621
00622
00623 class GreedyRemovalScorer
00624 {
00625 public:
00626 GreedyRemovalScorer( std::vector< LCB >& adjacencies, double minimum_weight );
00627
00628 size_t getMoveCount();
00629
00630 double score();
00631
00632 bool isValid( size_t lcbI, double move_score );
00633
00635 double operator()( size_t lcbI );
00636
00638 void remove( uint lcbI, std::vector< std::pair< double, size_t > >& new_moves );
00639
00640 private:
00641 std::vector< mems::LCB > adjs;
00642 double min_weight;
00643 std::vector< double > scores;
00644 double total_weight;
00645 };
00646
00647
00648
00649
00650 template< class BreakpointScorerType >
00651 int64 greedyBreakpointElimination_v4( std::vector< mems::LCB >& adjacencies,
00652 std::vector< double >& scores, BreakpointScorerType& bp_scorer, std::ostream* status_out,
00653 size_t g1_tag, size_t g2_tag )
00654 {
00655
00656 uint lcb_count = adjacencies.size();
00657 double total_initial_lcb_weight = 0;
00658 for( size_t wI = 0; wI < scores.size(); wI++ )
00659 total_initial_lcb_weight += scores[wI];
00660 double total_current_lcb_weight = total_initial_lcb_weight;
00661
00662 if( adjacencies.size() == 0 )
00663 return 0;
00664 uint seq_count = adjacencies[0].left_end.size();
00665
00666 double prev_score = bp_scorer.score();
00667 uint report_frequency = 10;
00668 uint moves_made = 0;
00669
00670 size_t move_count = bp_scorer.getMoveCount();
00671 std::vector< std::pair< double, size_t > > move_heap( move_count * 2 );
00672 size_t heap_end = move_count;
00673 for( size_t moveI = 0; moveI < move_count; ++moveI )
00674 {
00675 move_heap[moveI].first = bp_scorer(moveI);
00676 move_heap[moveI].second = moveI;
00677 }
00678
00679 #ifdef LCB_WEIGHT_LOSS_PLOT
00680 std::vector< double >::iterator min_iter = std::min_element(scores.begin(), scores.end());
00681 double mins = *min_iter;
00682 if( status_out != NULL )
00683 {
00684 (*status_out) << g1_tag << '\t' << g2_tag << '\t' << lcb_count << '\t' << 1 - (total_current_lcb_weight / total_initial_lcb_weight) << '\t' << mins << endl;
00685 }
00686 #endif
00687
00688
00689
00690
00691
00692
00693
00694 MoveScoreHeapComparator mshc;
00695 std::make_heap( move_heap.begin(), move_heap.end(), mshc );
00696 while( heap_end > 0 )
00697 {
00698 std::pop_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
00699 heap_end--;
00700 std::pair< double, size_t > best_move = move_heap[ heap_end ];
00701 #ifdef LCB_WEIGHT_LOSS_PLOT
00702 if( total_current_lcb_weight == scores[best_move.second] )
00703 break;
00704 #else
00705 if( (best_move.first < 0 ) ||
00706 total_current_lcb_weight == scores[best_move.second] )
00707 break;
00708 #endif
00709
00710 std::vector< std::pair< double, size_t > > new_moves;
00711 bool success = bp_scorer.isValid(best_move.second, best_move.first);
00712 if( !success )
00713 continue;
00714 bp_scorer.remove(best_move.second, new_moves);
00715
00716
00717 for( size_t newI = 0; newI < new_moves.size(); newI++ )
00718 {
00719 if( heap_end < move_heap.size() )
00720 {
00721 heap_end++;
00722 move_heap[heap_end-1] = new_moves[newI];
00723 std::push_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
00724 }else{
00725
00726 size_t prev_size = move_heap.size();
00727 move_heap.insert( move_heap.end(), new_moves.begin()+newI, new_moves.end() );
00728 for( size_t newdI = 0; newdI < new_moves.size()-newI; newdI++ )
00729 std::push_heap( move_heap.begin(), move_heap.begin()+prev_size+newdI+1, mshc );
00730 heap_end = move_heap.size();
00731 break;
00732 }
00733 }
00734
00735 total_current_lcb_weight -= scores[best_move.second];
00736 std::vector< std::pair< uint, uint > > id_remaps;
00737 std::vector< uint > impact_list;
00738 lcb_count -= RemoveLCBandCoalesce( best_move.second, adjacencies[0].left_end.size(), adjacencies, scores, id_remaps, impact_list );
00739 #ifdef LCB_WEIGHT_LOSS_PLOT
00740 mins = scores[best_move.second];
00741 if( status_out != NULL )
00742 {
00743 (*status_out) << g1_tag << '\t' << g2_tag << '\t' << lcb_count << '\t' << 1 - (total_current_lcb_weight / total_initial_lcb_weight) << '\t' << mins << endl;
00744 }
00745 #endif
00746 double cur_score = bp_scorer.score();
00747 prev_score = cur_score;
00748 moves_made++;
00749 #ifndef LCB_WEIGHT_LOSS_PLOT
00750 if( status_out != NULL && moves_made % report_frequency == 0 )
00751 (*status_out) << "move: " << moves_made << " alignment score " << cur_score << std::endl;
00752 #endif
00753 }
00754
00755 return 0;
00756 }
00757
00758 extern bool debug_aligner;
00759
00761 template< class SearchScorer >
00762 double greedySearch( SearchScorer& spbs )
00763 {
00764 double prev_score = spbs.score();
00765 uint report_frequency = 10;
00766 uint moves_made = 0;
00767 if( debug_aligner )
00768 spbs.validate();
00769 size_t move_count = spbs.getMoveCount();
00770 std::vector< double > current_moves( spbs.getMoveCount() );
00771
00772
00773 size_t heap_end = spbs.getMoveCount();
00774 std::vector< std::pair< double, size_t > > move_heap( spbs.getMoveCount() * 2 );
00775 std::vector< std::pair< double, size_t > > new_moves( spbs.getMaxNewMoveCount() + 10 );
00776 for( size_t moveI = 0; moveI < move_count; ++moveI )
00777 {
00778 std::pair< double, size_t > p( 0, moveI );
00779 double scorediff = spbs(p) - prev_score;
00780 p.first = scorediff;
00781 move_heap[moveI] = p;
00782 current_moves[moveI] = p.first;
00783 }
00784
00785 if( debug_aligner )
00786 spbs.validate();
00787
00788
00789
00790
00791
00792
00793 MoveScoreHeapComparator mshc;
00794 std::make_heap( move_heap.begin(), move_heap.begin() + heap_end, mshc );
00795 double successful = 0;
00796 double invalids = 0;
00797 int progress = 0;
00798 int prev_progress = -1;
00799 while( heap_end > 0 )
00800 {
00801 std::pop_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
00802 std::pair< double, size_t > best_move = move_heap[--heap_end];
00803 if( best_move.first < 0 )
00804 break;
00805
00806 if( best_move.first != current_moves[best_move.second] )
00807 continue;
00808
00809 if( !spbs.isValid(best_move) )
00810 {
00811 invalids++;
00812 continue;
00813 }
00814
00815 size_t new_move_count = 0;
00816 bool success = spbs.remove(best_move, new_moves, new_move_count);
00817 if( !success )
00818 {
00819 std::cerr << "numerical instability? need to investigate this...\n";
00820
00821 invalids++;
00822 continue;
00823 }
00824
00825 successful++;
00826 if( debug_aligner )
00827 spbs.validate();
00828
00829 current_moves[ best_move.second ] = -(std::numeric_limits<double>::max)();
00830 for( size_t newI = 0; newI < new_move_count; newI++ )
00831 current_moves[ new_moves[newI].second ] = new_moves[newI].first;
00832
00833 for( size_t newI = 0; newI < new_move_count; newI++ )
00834 {
00835 if( heap_end < move_heap.size() )
00836 {
00837 heap_end++;
00838 move_heap[heap_end-1] = new_moves[newI];
00839 std::push_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
00840 }else{
00841
00842 move_heap.resize( (std::min)((size_t)(heap_end * 1.6), heap_end + new_move_count) );
00843 std::copy( new_moves.begin() + newI, new_moves.begin() + new_move_count, move_heap.begin()+heap_end );
00844 for( size_t newdI = 0; newdI < new_move_count-newI; newdI++ )
00845 std::push_heap( move_heap.begin(), move_heap.begin()+heap_end+newdI+1, mshc );
00846 heap_end = move_heap.size();
00847 break;
00848 }
00849 }
00850
00851 moves_made++;
00852 prev_progress = progress;
00853 progress = (100 * moves_made) / move_count;
00854 printProgress( prev_progress, progress, std::cout );
00855
00856
00857 }
00858
00859 return spbs.score();
00860 }
00861
00862 struct AlnProgressTracker
00863 {
00864 gnSeqI total_len;
00865 gnSeqI cur_leftend;
00866 double prev_progress;
00867 };
00868
00869
00870 }
00871
00872 #endif // __greedyBreakpointElimination_h__
00873