libMems/GreedyBreakpointElimination.h

Go to the documentation of this file.
00001 #ifdef HAVE_CONFIG_H
00002 #include "config.h"
00003 #endif
00004 
00005 #ifndef __GreedyBreakpointElimination_h__
00006 #define __GreedyBreakpointElimination_h__
00007 
00008 #include <libMems/AbstractMatch.h>
00009 #include <iostream>
00010 #include <boost/multi_array.hpp>
00011 #include <libMems/PhyloTree.h>
00012 #include <libMems/SubstitutionMatrix.h>
00013 #include <libMems/SeedOccurrenceList.h>
00014 #include <libMems/IntervalList.h>
00015 #include <libMems/LCB.h>
00016 #include <stack>
00017 
00018 namespace mems {
00019 
00020 extern bool penalize_repeats;
00021 
00025 template <class MatchType>
00026 class LcbTrackingMatch
00027 { 
00028 public:
00029         MatchType original_match;
00030         MatchType node_match;
00031         size_t match_id;        // used to index into global arrays of lcb_id and score
00032 };
00033 typedef LcbTrackingMatch< mems::AbstractMatch* > TrackingMatch;
00034 
00038 template <class MatchType>
00039 class TrackingLCB
00040 {
00041 public:
00042         TrackingLCB(){}
00043         TrackingLCB( const TrackingLCB& l ){ *this = l; }
00045         TrackingLCB( const mems::LCB& l ){ *this = l; }
00046         TrackingLCB& operator=( const mems::LCB& l )
00047         {
00048                 left_end[0] = l.left_end[0];
00049                 left_end[1] = l.left_end[1];
00050                 right_end[0] = l.right_end[0];
00051                 right_end[1] = l.right_end[1];
00052                 left_adjacency[0] = l.left_adjacency[0];
00053                 left_adjacency[1] = l.left_adjacency[1];
00054                 right_adjacency[0] = l.right_adjacency[0];
00055                 right_adjacency[1] = l.right_adjacency[1];
00056                 lcb_id = l.lcb_id;
00057                 weight = l.weight;
00058                 to_be_deleted = false;
00059                 return *this;
00060         }
00061         int64 left_end[2];      
00062         int64 right_end[2];  
00063         uint left_adjacency[2]; 
00064         uint right_adjacency[2];        
00065         double weight;          
00066         std::vector< MatchType > matches;
00067         int lcb_id;                     
00068         bool to_be_deleted;
00069 };
00070 
00072 const uint LCB_UNASSIGNED = (std::numeric_limits<uint>::max)();
00073 
00074 typedef boost::multi_array< std::vector< TrackingLCB< TrackingMatch* > >, 2 > PairwiseLCBMatrix;
00075 
00076 
00080 template< class MatchVector >
00081 double GetPairwiseAnchorScore( 
00082                 MatchVector& lcb, std::vector< genome::gnSequence* >& seq_table, 
00083                 const mems::PairwiseScoringScheme& subst_scoring, mems::SeedOccurrenceList& sol_1, 
00084                 mems::SeedOccurrenceList& sol_2, bool penalize_gaps = false );
00085 
00086 class MoveScoreHeapComparator
00087 {
00088 public:
00089         bool operator()( const std::pair< double, size_t >& a, const std::pair< double, size_t >& b ) const
00090         {
00091                 return a.first < b.first;       // want to order by > instead of <
00092         }
00093 };
00094 
00098 void getPairwiseLCBs( 
00099         uint nI, 
00100         uint nJ, 
00101         uint dI, 
00102         uint dJ, 
00103         std::vector< TrackingMatch* >& tracking_matches, 
00104         std::vector< TrackingLCB<TrackingMatch*> >& t_lcbs,
00105         boost::multi_array< double, 3 >& tm_score_array,
00106         boost::multi_array< size_t, 3 >& tm_lcb_id_array );
00107 
00109 void initTrackingMatchLCBTracking( 
00110   const std::vector< mems::TrackingMatch >& tracking_matches, 
00111         size_t n1_count, 
00112         size_t n2_count, 
00113         boost::multi_array< size_t, 3 >& tm_lcb_id_array );
00114 
00115 
00123 template< class LcbVector >
00124 uint RemoveLCBandCoalesce( size_t lcbI, uint seq_count, 
00125                                                   LcbVector& adjacencies, 
00126                                                   std::vector< double >& scores, 
00127                                                   std::vector< std::pair< uint, uint > >& id_remaps, 
00128                                                   std::vector< uint >& impact_list );
00129 
00130 
00131 void printMatch( mems::AbstractMatch* m, std::ostream& os );
00132 
00133 inline
00134 void printMatch( mems::AbstractMatch* m, std::ostream& os )
00135 {
00136         for( size_t ii = 0; ii < m->SeqCount(); ++ii )
00137         {
00138                 if( ii > 0 )
00139                         os << '\t';
00140                 os << "(" << m->Start(ii) << "," << m->RightEnd(ii) << ")";
00141         }
00142 }
00143 
00144 void printProgress( uint prev_prog, uint cur_prog, std::ostream& os );
00145 
00146 
00147 template< typename PairType >
00148 class LabelSort 
00149 {
00150 public:
00151         LabelSort( uint seqI ) : ssc( seqI ) {};
00152         bool operator()( const PairType& pt1, const PairType& pt2 )
00153         {
00154                 return ssc( pt1.first, pt2.first );
00155         }
00156 private:
00157         LabelSort();
00158         mems::SSC<mems::AbstractMatch> ssc;
00159 };
00160 
00161 template<class MatchVector>
00162 void IdentifyBreakpoints( MatchVector& mlist, std::vector<gnSeqI>& breakpoints )
00163 {
00164         if( mlist.size() == 0 )
00165                 return;
00166         breakpoints = std::vector<gnSeqI>(1, mlist.size()-1);
00167 
00168         mems::SSC<mems::AbstractMatch> ssc(0);
00169         std::sort( mlist.begin(), mlist.end(), ssc );
00170         typedef typename MatchVector::value_type value_type;
00171         typedef std::pair< value_type, size_t > LabelPairType;
00172         std::vector< LabelPairType > label_list;
00173         typename MatchVector::iterator cur = mlist.begin();
00174         typename MatchVector::iterator end = mlist.end();
00175         size_t i = 0;
00176         for( ;cur != end; ++cur )
00177         {
00178                 label_list.push_back( std::make_pair( *cur, i ) );
00179                 ++i;
00180         }
00181 
00182         uint seq_count = mlist[0]->SeqCount();
00183         // check for breakpoints in each sequence
00184         for( uint seqI = 1; seqI < seq_count; seqI++ )
00185         {
00186                 LabelSort< LabelPairType > ls(seqI); 
00187                 std::sort( label_list.begin(), label_list.end(), ls );
00188 
00189                 typename std::vector< LabelPairType >::const_iterator prev = label_list.begin();
00190                 typename std::vector< std::pair< typename MatchVector::value_type, size_t > >::const_iterator iter = label_list.begin();
00191                 typename std::vector< std::pair< typename MatchVector::value_type, size_t > >::const_iterator lab_end = label_list.end();
00192 
00193                 bool prev_orient = (*prev).first->Orientation(seqI) == (*prev).first->Orientation(0);
00194                 if( !prev_orient )      // if we start in a different orientation than the ref seq there's a bp here
00195                         breakpoints.push_back(prev->second);
00196 
00197                 for( ++iter; iter != lab_end; ++iter )
00198                 {
00199                         bool cur_orient = (*iter).first->Orientation(seqI) == (*iter).first->Orientation(0);
00200                         if( prev_orient == cur_orient &&
00201                                 ( ( prev_orient && (*prev).second + 1 == (*iter).second) ||
00202                                   ( !prev_orient && (*prev).second - 1 == (*iter).second) 
00203                                 )
00204                           )
00205                         {
00206                                 prev_orient = cur_orient;
00207                                 ++prev;
00208                                 continue;       // no breakpoint here
00209                         }
00210 
00211                         // always add the last match in a new block (scanning from left to right in seq 0)
00212                         if( prev_orient )
00213                                 breakpoints.push_back( prev->second );
00214                         if( !cur_orient )
00215                                 breakpoints.push_back( iter->second );
00216 
00217                         prev_orient = cur_orient;
00218                         ++prev;
00219                 }
00220                 if( prev_orient )
00221                         breakpoints.push_back( prev->second );
00222         }
00223         std::sort( breakpoints.begin(), breakpoints.end() );
00224         std::vector<gnSeqI>::iterator uni = std::unique( breakpoints.begin(), breakpoints.end() );
00225         breakpoints.erase( uni, breakpoints.end() );
00226 }
00227 
00228 
00229 template< class MatchVector >
00230 void ComputeLCBs_v2( const MatchVector& meml, const std::vector<gnSeqI>& breakpoints, std::vector< MatchVector >& lcb_list )
00231 {
00232         // there must be at least one end of a block defined
00233         if( breakpoints.size() < 1 )
00234                 return;
00235                 
00236         lcb_list.clear();
00237         
00238         // organize the LCBs into different MatchVector instances
00239         std::vector<gnSeqI>::const_iterator break_iter = breakpoints.begin();
00240         uint prev_break = 0;    // prev_break is the first match in the current block
00241         MatchVector lcb;
00242         for( ; break_iter != breakpoints.end(); ++break_iter ){
00243                 // add the new MatchList to the set if it made the cut
00244                 lcb_list.push_back( lcb );
00245                 lcb_list.back().insert( lcb_list.back().end(), meml.begin() + prev_break, meml.begin() + *break_iter + 1 );
00246                 prev_break = *break_iter + 1;
00247         }
00248 }
00249 
00250 
00251 template <class MatchVector>
00252 void computeLCBAdjacencies_v3( const std::vector< MatchVector >& lcb_list, std::vector< double >& weights, std::vector< mems::LCB >& adjacencies )
00253 {
00254         adjacencies.clear(); // start with no LCB adjacencies
00255         if( lcb_list.size() == 0 )
00256                 return; // there aren't any LCBs so there aren't any adjacencies!
00257 
00258         uint seq_count = lcb_list.front().front()->SeqCount();
00259         uint seqI;
00260         uint lcbI;
00261         for( lcbI = 0; lcbI < lcb_list.size(); ++lcbI ){
00262                 mems::LCB lcb;
00263                 std::vector<gnSeqI> left_end;
00264                 std::vector<gnSeqI> length;
00265                 std::vector<bool> orientation;
00266                 FindBoundaries( lcb_list[lcbI], left_end, length, orientation );
00267 
00268                 lcb.left_adjacency = std::vector<uint>( left_end.size(), -1 );
00269                 lcb.right_adjacency = std::vector<uint>( left_end.size(), -1 );
00270                 lcb.left_end = std::vector<int64>( left_end.size(), 0 );
00271                 lcb.right_end = std::vector<int64>( left_end.size(), 0 );
00272 
00273                 for( seqI = 0; seqI < seq_count; seqI++ ){
00274                         // support "ragged edges" on the ends of LCBs
00275                         if( left_end[seqI] == mems::NO_MATCH )
00276                                 continue;
00277                         lcb.left_end[seqI] = left_end[seqI];
00278                         lcb.right_end[seqI] = left_end[seqI] + length[seqI];
00279                         if( !orientation[seqI] )
00280                         {
00281                                 lcb.left_end[seqI] = -lcb.left_end[seqI];
00282                                 lcb.right_end[seqI] = -lcb.right_end[seqI];
00283                         }
00284                 }
00285                 lcb.lcb_id = adjacencies.size();
00286                 lcb.weight = weights[ lcbI ];
00287                 adjacencies.push_back( lcb );
00288         }
00289 
00290         for( seqI = 0; seqI < seq_count; seqI++ ){
00291                 mems::LCBLeftComparator llc( seqI );
00292                 std::sort( adjacencies.begin(), adjacencies.end(), llc );
00293                 for( lcbI = 1; lcbI + 1 < lcb_list.size(); lcbI++ ){
00294                         adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
00295                         adjacencies[ lcbI ].right_adjacency[ seqI ] = adjacencies[ lcbI + 1 ].lcb_id;
00296                 }
00297                 if( lcbI == lcb_list.size() )
00298                         lcbI--; // need to decrement when there is only a single LCB
00299 
00300                 // set first and last lcb adjacencies to -1
00301                 adjacencies[ 0 ].left_adjacency[ seqI ] = (uint)-1;
00302                 adjacencies[ lcbI ].right_adjacency[ seqI ] = (uint)-1;
00303                 if( lcbI > 0 ){
00304                         adjacencies[ 0 ].right_adjacency[ seqI ] = adjacencies[ 1 ].lcb_id;
00305                         adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
00306                 }
00307         }
00308         mems::LCBIDComparator lic;
00309         std::sort( adjacencies.begin(), adjacencies.end(), lic );
00310 
00311 }
00312 
00316 inline
00317 void computeLCBAdjacencies_v3( mems::IntervalList& iv_list, std::vector< double >& weights, std::vector< mems::LCB >& adjacencies ){
00318         std::vector< std::vector< mems::Interval* > > nivs;
00319         for( size_t ivI = 0; ivI < iv_list.size(); ivI++ )
00320                 nivs.push_back( std::vector< mems::Interval* >( 1, &iv_list[ivI] ) );
00321         computeLCBAdjacencies_v3( nivs, weights, adjacencies );
00322 }
00323 
00328 template< class MatchVector >
00329 void filterMatches_v2( std::vector< mems::LCB >& adjacencies, std::vector< MatchVector >& lcb_list, std::vector< double >& weights, MatchVector& deleted_matches ){
00330         if( lcb_list.size() < 1 )
00331                 return;
00332         MatchVector lcb_tmp = lcb_list[ 0 ];
00333         lcb_tmp.clear();
00334         std::vector< MatchVector > filtered_lcbs( lcb_list.size(), lcb_tmp );
00335         uint lcbI;
00336         for( lcbI = 0; lcbI < adjacencies.size(); lcbI++ ){
00337                 if( adjacencies[ lcbI ].lcb_id == lcbI ){
00338                         filtered_lcbs[ lcbI ].insert( filtered_lcbs[ lcbI ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
00339                         continue;
00340                 }
00341                 if( adjacencies[ lcbI ].lcb_id == -1 ){
00342                         std::cerr << "weird";
00343                         continue;       // this one was removed
00344                 }
00345                 if( adjacencies[ lcbI ].lcb_id == -2 )
00346                 {
00347                         deleted_matches.insert( deleted_matches.end(), lcb_list[lcbI].begin(), lcb_list[lcbI].end() );
00348                         continue;       // this one was removed
00349                 }
00350 
00351                 // this one points elsewhere
00352                 // search and update the union/find structure for the target
00353                 std::stack< uint > visited_lcbs;
00354                 visited_lcbs.push( lcbI );
00355                 uint cur_lcb = adjacencies[ lcbI ].lcb_id;
00356                 while( adjacencies[ cur_lcb ].lcb_id != cur_lcb ){
00357                         visited_lcbs.push( cur_lcb );
00358                         cur_lcb = adjacencies[ cur_lcb ].lcb_id;
00359                         if( cur_lcb == -1 || cur_lcb == -2 ){
00360 //                              std::cerr << "improper hoodidge\n";
00361                                 break;  // this one points to an LCB that got deleted
00362                         }
00363                 }
00364                 while( visited_lcbs.size() > 0 ){
00365                         adjacencies[ visited_lcbs.top() ].lcb_id = cur_lcb;
00366                         visited_lcbs.pop();
00367                 }
00368                 // add this LCB's matches to the target LCB.
00369                 if( cur_lcb != -1 && cur_lcb != -2 )
00370                         filtered_lcbs[ cur_lcb ].insert( filtered_lcbs[ cur_lcb ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
00371                 else
00372                         deleted_matches.insert( deleted_matches.end(), lcb_list[lcbI].begin(), lcb_list[lcbI].end() );
00373         }
00374 
00375 
00376         lcb_list.clear();
00377         std::vector< double > new_weights;
00378         for( lcbI = 0; lcbI < filtered_lcbs.size(); lcbI++ ){
00379                 if( filtered_lcbs[ lcbI ].size() > 0 ){
00380                         lcb_list.push_back( filtered_lcbs[ lcbI ] );
00381                         new_weights.push_back( weights[lcbI] );
00382                 }
00383         }
00384 
00385         // sort the matches inside consolidated LCBs
00386         mems::MatchStartComparator<mems::AbstractMatch> msc( 0 );
00387         for( lcbI = 0; lcbI < lcb_list.size(); lcbI++ ){
00388                 std::sort( lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end(), msc );
00389         }
00390 
00391         // calculate the LCB adjacencies
00392         weights = new_weights;
00393         computeLCBAdjacencies_v3( lcb_list, weights, adjacencies );
00394 
00395 }
00396 
00397 // predeclared to avoid need to include Islands.h
00398 const score_t INV_SCORE = (std::numeric_limits<score_t>::max)();
00399 void computeMatchScores( const std::string& seq1, const std::string& seq2, const PairwiseScoringScheme& scoring, std::vector<score_t>& scores );
00400 void computeGapScores( const std::string& seq1, const std::string& seq2, const PairwiseScoringScheme& scoring, std::vector<score_t>& scores );
00401 
00402 
00403 template< class MatchVector >
00404 double GetPairwiseAnchorScore( MatchVector& lcb, 
00405                                                           std::vector< genome::gnSequence* >& seq_table, 
00406                                                           const mems::PairwiseScoringScheme& subst_scoring, 
00407                                                           mems::SeedOccurrenceList& sol_1, 
00408                                                           mems::SeedOccurrenceList& sol_2, 
00409                                                           bool penalize_gaps )
00410 {
00411         double lcb_score = 0;
00412         typename MatchVector::iterator match_iter = lcb.begin();
00413         for( ; match_iter != lcb.end(); ++match_iter )
00414         {
00415                 typedef typename MatchVector::value_type MatchPtrType;
00416                 MatchPtrType m = *match_iter;
00417                 std::vector< score_t > scores(m->AlignmentLength(), 0);
00418                 std::vector< std::string > et;
00419                 mems::GetAlignment(*m, seq_table, et);
00420 
00421                 // get substitution/gap score
00422                 mems::computeMatchScores( et[0], et[1], subst_scoring, scores );
00423                 if( penalize_gaps )
00424                         mems::computeGapScores( et[0], et[1], subst_scoring, scores );
00425 
00426                 // scale match scores by uniqueness
00427                 size_t merI = 0;
00428                 size_t merJ = 0;
00429                 double uni_count = 0;
00430                 double uni_score = 0;
00431                 const size_t m_aln_length = m->AlignmentLength();
00432                 const int64 m_leftend_0 = m->LeftEnd(0);
00433                 const int64 m_leftend_1 = m->LeftEnd(1);
00434                 for( size_t colI = 0; colI < m_aln_length; ++colI )
00435                 {
00436                         if(et[0][colI] != '-' && et[1][colI] != '-' )
00437                         {
00438                                 mems::SeedOccurrenceList::frequency_type uni1 = sol_1.getFrequency(m_leftend_0 + merI - 1);
00439                                 mems::SeedOccurrenceList::frequency_type uni2 = sol_2.getFrequency(m_leftend_1 + merJ - 1);
00440                                 mems::SeedOccurrenceList::frequency_type uniprod = uni1*uni2;
00441                                 uniprod = uniprod == 0 ? 1 : uniprod;
00442                                 // scale by the uniqueness product, which approximates the number of ways to match up non-unique k-mers
00443                                 // in the worst case of a very repetitive match, the score becomes the negative of the match score
00444                                 if( scores[colI] > 0 )
00445                                 {
00446                                         if(penalize_repeats)
00447                                                 scores[colI] = (score_t)((double)scores[colI] * (2.0 / uniprod)) - scores[colI];
00448                                         else
00449                                                 scores[colI] = (score_t)((mems::SeedOccurrenceList::frequency_type)scores[colI] / uniprod);
00450                                 }
00451                         }
00452                         if(et[0][colI] != '-')
00453                                 merI++;
00454                         if(et[1][colI] != '-')
00455                                 merJ++;
00456                 }
00457 
00458 
00459                 double m_score = 0;
00460                 for( size_t i = 0; i < scores.size(); ++i )
00461                         if( scores[i] != INV_SCORE )
00462                                 m_score += scores[i];
00463 
00464                 if( !( m_score > -1000000000 && m_score < 1000000000 ) )
00465                 {
00466                         std::cerr << "scoring error\n";
00467                         genome::breakHere();
00468                 }
00469                 lcb_score += m_score;
00470         }
00471         
00472 
00473         return lcb_score;
00474 }
00475 
00476 
00477 
00478 class EvenFasterSumOfPairsBreakpointScorer
00479 {
00480 public:
00481         EvenFasterSumOfPairsBreakpointScorer( 
00482                 double breakpoint_penalty,
00483                 double minimum_breakpoint_penalty,
00484                 boost::multi_array<double,2> bp_weight_matrix, 
00485                 boost::multi_array<double,2> conservation_weight_matrix,
00486                 std::vector< TrackingMatch* > tracking_match,
00487                 mems::PairwiseLCBMatrix& pairwise_adjacency_matrix,
00488                 std::vector<node_id_t>& n1_descendants,
00489                 std::vector<node_id_t>& n2_descendants,
00490                 boost::multi_array< double, 3 >& tm_score_array,
00491                 boost::multi_array< size_t, 3 >& tm_lcb_id_array,
00492                 size_t seqI_begin,
00493                 size_t seqI_end,
00494                 size_t seqJ_begin,
00495                 size_t seqJ_end
00496                 );
00497 
00502         size_t getMoveCount();
00503 
00505         double score();
00506 
00508         double operator()( std::pair< double, size_t >& the_move  );
00509 
00511         bool isValid( std::pair< double, size_t >& the_move );
00512 
00513         bool remove( std::pair< double, size_t >& the_move, std::vector< std::pair< double, size_t > >& new_move_list, size_t& new_move_count );
00514 
00516         void applyScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count );
00517 
00519         void undoScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count );
00520 
00522         size_t getMaxNewMoveCount();
00523 
00528         bool remove( std::pair< double, size_t >& the_move, bool really_remove, 
00529                 boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count, 
00530                 bool score_new_moves, std::vector< std::pair< double, size_t > >& new_move_list, size_t& new_move_count );
00531 
00533         std::vector< mems::TrackingMatch* > getResults();
00534 
00536         bool validate();
00537 
00538 protected:
00539         double bp_penalty;
00540         boost::multi_array<double,2> bp_weights;
00541         boost::multi_array<double,2> conservation_weights;
00542         std::vector< mems::TrackingMatch* > tracking_matches;
00543         mems::PairwiseLCBMatrix pairwise_adjacencies;
00544         std::vector<node_id_t> n1_des;
00545         std::vector<node_id_t> n2_des;
00546 
00547         boost::multi_array< size_t, 2 > pairwise_lcb_count;
00548         boost::multi_array< double, 2 > pairwise_lcb_score;
00549 
00550         std::vector< TrackingMatch* > deleted_tracking_matches;
00551 
00552         double min_breakpoint_penalty;
00553 
00554 private:
00555         // avoid continuous size lookup
00556         const size_t seqI_count;
00557         const size_t seqJ_count;
00558 
00559         // variables used during score computation
00560         boost::multi_array< std::vector< std::pair< uint, uint > >, 2 > all_id_remaps;
00561         boost::multi_array< std::vector< uint >, 2 > full_impact_list;
00562         boost::multi_array< double, 2 > internal_lcb_score_diff[3];
00563         boost::multi_array< size_t, 2 > internal_lcb_removed_count[3];
00564         int using_lsd;
00565         std::vector< double > lsd_zeros;
00566         std::vector< size_t > lrc_zeros;
00567         std::vector< double > bogus_scores;
00568         std::vector< size_t > my_del_lcbs;
00569         std::vector< size_t > lcb_ids;
00570 
00571         boost::multi_array< double, 3 >& tm_score_array;
00572         boost::multi_array< size_t, 3 >& tm_lcb_id_array;
00573 
00574         // limit to a range of sequences
00575         const size_t seqI_first;
00576         const size_t seqJ_first;
00577         const size_t seqI_last;
00578         const size_t seqJ_last;
00579 
00580         // for debugging
00581         bool first_time;
00582 };
00583 
00584 
00585 template< class BreakpointScorerType >
00586 int64 greedyBreakpointElimination_v4( std::vector< mems::LCB >& adjacencies, std::vector< double >& scores, BreakpointScorerType& bp_scorer, std::ostream* status_out, size_t g1_tag = 0, size_t g2_tag = 0 );
00587 
00588 template< class SearchScorer >
00589 double greedySearch( SearchScorer& spbs );
00590 
00591 
00596 class SimpleBreakpointScorer
00597 {
00598 public:
00599         SimpleBreakpointScorer( std::vector< LCB >& adjacencies, double breakpoint_penalty, bool collinear );
00600 
00601         size_t getMoveCount();
00602 
00603         double score();
00604 
00605         bool isValid( size_t lcbI, double move_score );
00606 
00608         double operator()( size_t lcbI );
00609 
00611         void remove( uint lcbI, std::vector< std::pair< double, size_t > >& new_moves );
00612 
00613 private:
00614         std::vector< mems::LCB > adjs;
00615         double bp_penalty;
00616         std::vector< double > scores;
00617         double total_weight;
00618         size_t bp_count;
00619         bool collinear;
00620 };
00621 
00622 
00623 class GreedyRemovalScorer
00624 {
00625 public:
00626         GreedyRemovalScorer( std::vector< LCB >& adjacencies, double minimum_weight );
00627 
00628         size_t getMoveCount();
00629 
00630         double score();
00631 
00632         bool isValid( size_t lcbI, double move_score );
00633 
00635         double operator()( size_t lcbI );
00636 
00638         void remove( uint lcbI, std::vector< std::pair< double, size_t > >& new_moves );
00639 
00640 private:
00641         std::vector< mems::LCB > adjs;
00642         double min_weight;
00643         std::vector< double > scores;
00644         double total_weight;
00645 };
00646 
00647 
00648 
00649 
00650 template< class BreakpointScorerType >
00651 int64 greedyBreakpointElimination_v4( std::vector< mems::LCB >& adjacencies, 
00652                         std::vector< double >& scores, BreakpointScorerType& bp_scorer, std::ostream* status_out, 
00653                         size_t g1_tag, size_t g2_tag )
00654 {
00655         // repeatedly remove the low weight LCBs until the minimum weight criteria is satisfied
00656         uint lcb_count = adjacencies.size();
00657         double total_initial_lcb_weight = 0;
00658         for( size_t wI = 0; wI < scores.size(); wI++ )
00659                 total_initial_lcb_weight += scores[wI];
00660         double total_current_lcb_weight = total_initial_lcb_weight;
00661 
00662         if( adjacencies.size() == 0 )
00663                 return 0;       // nothing can be done
00664         uint seq_count = adjacencies[0].left_end.size();
00665         
00666         double prev_score = bp_scorer.score();
00667         uint report_frequency = 10;
00668         uint moves_made = 0;
00669 
00670         size_t move_count = bp_scorer.getMoveCount();
00671         std::vector< std::pair< double, size_t > > move_heap( move_count * 2 );
00672         size_t heap_end = move_count;
00673         for( size_t moveI = 0; moveI < move_count; ++moveI )
00674         {
00675                 move_heap[moveI].first = bp_scorer(moveI);
00676                 move_heap[moveI].second = moveI;
00677         }
00678 
00679 #ifdef LCB_WEIGHT_LOSS_PLOT
00680         std::vector< double >::iterator min_iter = std::min_element(scores.begin(), scores.end());
00681         double mins = *min_iter;
00682         if( status_out != NULL )
00683         {
00684                 (*status_out) << g1_tag << '\t' << g2_tag << '\t' << lcb_count << '\t' << 1 - (total_current_lcb_weight / total_initial_lcb_weight) << '\t' << mins << endl;
00685         }
00686 #endif
00687 
00688         // make a heap of moves ordered by score
00689         // repeatedly:
00690         // 1) pop the highest scoring move off the heap
00691         // 2) attempt to apply the move
00692         // 3) add any new moves to the heap
00693         // 4) stop when the highest scoring move no longer increases the score
00694         MoveScoreHeapComparator mshc;
00695         std::make_heap( move_heap.begin(), move_heap.end(), mshc );
00696         while( heap_end > 0 )
00697         {
00698                 std::pop_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
00699                 heap_end--;
00700                 std::pair< double, size_t > best_move = move_heap[ heap_end ];
00701 #ifdef LCB_WEIGHT_LOSS_PLOT
00702                 if( total_current_lcb_weight == scores[best_move.second] )
00703                         break;  // don't remove the last LCB
00704 #else
00705                 if( (best_move.first < 0 ) ||
00706                         total_current_lcb_weight == scores[best_move.second] )
00707                         break;  // can't improve score
00708 #endif
00709 
00710                 std::vector< std::pair< double, size_t > > new_moves;
00711                 bool success = bp_scorer.isValid(best_move.second, best_move.first);
00712                 if( !success )
00713                         continue;
00714                 bp_scorer.remove(best_move.second, new_moves);
00715 
00716                 
00717                 for( size_t newI = 0; newI < new_moves.size(); newI++ )
00718                 {
00719                         if( heap_end < move_heap.size() )
00720                         {
00721                                 heap_end++;
00722                                 move_heap[heap_end-1] = new_moves[newI];
00723                                 std::push_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
00724                         }else{
00725                                 // just push the rest on all at once
00726                                 size_t prev_size = move_heap.size();
00727                                 move_heap.insert( move_heap.end(), new_moves.begin()+newI, new_moves.end() );
00728                                 for( size_t newdI = 0; newdI < new_moves.size()-newI; newdI++ )
00729                                         std::push_heap( move_heap.begin(), move_heap.begin()+prev_size+newdI+1, mshc );
00730                                 heap_end = move_heap.size();
00731                                 break;
00732                         }
00733                 }
00734 
00735                 total_current_lcb_weight -= scores[best_move.second];
00736                 std::vector< std::pair< uint, uint > > id_remaps;
00737                 std::vector< uint > impact_list;
00738                 lcb_count -= RemoveLCBandCoalesce( best_move.second, adjacencies[0].left_end.size(), adjacencies, scores, id_remaps, impact_list );
00739 #ifdef LCB_WEIGHT_LOSS_PLOT
00740                 mins = scores[best_move.second];
00741                 if( status_out != NULL )
00742                 {
00743                         (*status_out) << g1_tag << '\t' << g2_tag << '\t' << lcb_count << '\t' << 1 - (total_current_lcb_weight / total_initial_lcb_weight) << '\t' << mins << endl;
00744                 }
00745 #endif
00746                 double cur_score = bp_scorer.score();
00747                 prev_score = cur_score;
00748                 moves_made++;
00749 #ifndef LCB_WEIGHT_LOSS_PLOT
00750                 if( status_out != NULL && moves_made % report_frequency == 0 )
00751                         (*status_out) << "move: " << moves_made << " alignment score " << cur_score << std::endl;
00752 #endif
00753         }
00754 
00755         return 0;
00756 }
00757 
00758 extern bool debug_aligner;
00759 
00761 template< class SearchScorer >
00762 double greedySearch( SearchScorer& spbs )
00763 {
00764         double prev_score = spbs.score();
00765         uint report_frequency = 10;
00766         uint moves_made = 0;
00767         if( debug_aligner )
00768                 spbs.validate();
00769         size_t move_count = spbs.getMoveCount();
00770         std::vector< double > current_moves( spbs.getMoveCount() );
00771         // use double the size for the move heap to avoid an almost instant reallocation
00772         // when a new move gets pushed onto the heap
00773         size_t heap_end = spbs.getMoveCount();
00774         std::vector< std::pair< double, size_t > > move_heap( spbs.getMoveCount() * 2 );
00775         std::vector< std::pair< double, size_t > > new_moves( spbs.getMaxNewMoveCount() + 10 );
00776         for( size_t moveI = 0; moveI < move_count; ++moveI )
00777         {
00778                 std::pair< double, size_t > p( 0, moveI );
00779                 double scorediff = spbs(p) - prev_score;
00780                 p.first = scorediff;
00781                 move_heap[moveI] = p;
00782                 current_moves[moveI] = p.first;
00783         }
00784 
00785         if( debug_aligner )
00786                 spbs.validate();
00787         // make a heap of moves ordered by score
00788         // repeatedly:
00789         // 1) pop the highest scoring move off the heap
00790         // 2) attempt to apply the move
00791         // 3) add any new moves to the heap
00792         // 4) stop when the highest scoring move no longer increases the score
00793         MoveScoreHeapComparator mshc;
00794         std::make_heap( move_heap.begin(), move_heap.begin() + heap_end, mshc );
00795         double successful = 0;
00796         double invalids = 0;
00797         int progress = 0;
00798         int prev_progress = -1;
00799         while( heap_end > 0 )
00800         {
00801                 std::pop_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
00802                 std::pair< double, size_t > best_move = move_heap[--heap_end];
00803                 if( best_move.first < 0 )
00804                         break;  // can't improve score
00805 
00806                 if( best_move.first != current_moves[best_move.second] )
00807                         continue;
00808 
00809                 if( !spbs.isValid(best_move) )
00810                 {
00811                         invalids++;
00812                         continue;
00813                 }
00814 
00815                 size_t new_move_count = 0;
00816                 bool success = spbs.remove(best_move, new_moves, new_move_count);
00817                 if( !success )
00818                 {
00819                         std::cerr << "numerical instability?  need to investigate this...\n";
00820 //                      genome::breakHere();
00821                         invalids++;
00822                         continue;
00823                 }
00824 
00825                 successful++;
00826                 if( debug_aligner )
00827                         spbs.validate();
00828 
00829                 current_moves[ best_move.second ] = -(std::numeric_limits<double>::max)();
00830                 for( size_t newI = 0; newI < new_move_count; newI++ )
00831                         current_moves[ new_moves[newI].second ] = new_moves[newI].first;
00832 
00833                 for( size_t newI = 0; newI < new_move_count; newI++ )
00834                 {
00835                         if( heap_end < move_heap.size() )
00836                         {
00837                                 heap_end++;
00838                                 move_heap[heap_end-1] = new_moves[newI];
00839                                 std::push_heap( move_heap.begin(), move_heap.begin()+heap_end, mshc );
00840                         }else{
00841                                 // just push the rest on all at once
00842                                 move_heap.resize( (std::min)((size_t)(heap_end * 1.6), heap_end + new_move_count) );
00843                                 std::copy( new_moves.begin() + newI, new_moves.begin() + new_move_count, move_heap.begin()+heap_end );
00844                                 for( size_t newdI = 0; newdI < new_move_count-newI; newdI++ )
00845                                         std::push_heap( move_heap.begin(), move_heap.begin()+heap_end+newdI+1, mshc );
00846                                 heap_end = move_heap.size();
00847                                 break;
00848                         }
00849                 }
00850 
00851                 moves_made++;
00852                 prev_progress = progress;
00853                 progress = (100 * moves_made) / move_count;
00854                 printProgress( prev_progress, progress, std::cout );
00855 //              if( moves_made % report_frequency == 0 )
00856 //                      cout << "move: " << moves_made << " alignment score " << cur_score << " success ratio " << successful / invalids << endl;
00857         }
00858 
00859         return spbs.score();
00860 }
00861 
00862 struct AlnProgressTracker
00863 {
00864         gnSeqI total_len;
00865         gnSeqI cur_leftend;
00866         double prev_progress;
00867 };
00868 
00869 
00870 }       // namespace mems
00871 
00872 #endif // __greedyBreakpointElimination_h__
00873 

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6