00001 #ifdef HAVE_CONFIG_H
00002 #include "config.h"
00003 #endif
00004
00005
00006 #include "libMems/GreedyBreakpointElimination.h"
00007 #include "libMems/ProgressiveAligner.h"
00008 #include "libMems/Aligner.h"
00009 #include "libMems/Islands.h"
00010 #include "libMems/DNAFileSML.h"
00011 #include "libMems/MuscleInterface.h"
00012 #include "libGenome/gnRAWSource.h"
00013 #include "libMems/gnAlignedSequences.h"
00014 #include "libMems/CompactGappedAlignment.h"
00015 #include "libMems/MatchProjectionAdapter.h"
00016 #include "libMems/PairwiseMatchFinder.h"
00017 #include "libMems/TreeUtilities.h"
00018 #include "libMems/PairwiseMatchAdapter.h"
00019
00020 #include <boost/dynamic_bitset.hpp>
00021 #include <boost/tuple/tuple.hpp>
00022
00023 #include <map>
00024 #include <fstream>
00025 #include <sstream>
00026 #include <stack>
00027 #include <algorithm>
00028 #include <limits>
00029 #include <iomanip>
00030
00031 using namespace std;
00032 using namespace genome;
00033
00034 namespace mems {
00035
00036
00037 bool penalize_repeats = false;
00038
00039 void printProgress( uint prev_prog, uint cur_prog, ostream& os )
00040 {
00041 if( prev_prog != cur_prog )
00042 {
00043 if( cur_prog / 10 != prev_prog / 10 )
00044 os << endl;
00045 os << cur_prog << "%..";
00046 os.flush();
00047 }
00048 }
00049
00050
00051
00052
00053 void getPairwiseLCBs(
00054 uint nI,
00055 uint nJ,
00056 uint dI,
00057 uint dJ,
00058 vector< TrackingMatch* >& tracking_matches,
00059 vector< TrackingLCB<TrackingMatch*> >& t_lcbs,
00060 boost::multi_array< double, 3 >& tm_score_array,
00061 boost::multi_array< size_t, 3 >& tm_lcb_id_array )
00062 {
00063
00064 vector< AbstractMatch* > pair_matches;
00065 for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
00066 {
00067 if( tracking_matches[mI]->node_match->LeftEnd(nI) == NO_MATCH ||
00068 tracking_matches[mI]->node_match->LeftEnd(nJ) == NO_MATCH )
00069 continue;
00070 PairwiseMatchAdapter pma(tracking_matches[mI]->node_match, nI, nJ );
00071 pma.tm = tracking_matches[mI];
00072 if( pma.Orientation(0) == AbstractMatch::reverse )
00073 pma.Invert();
00074 pair_matches.push_back(pma.Copy());
00075 }
00076
00077 vector< gnSeqI > breakpoints;
00078 IdentifyBreakpoints( pair_matches, breakpoints );
00079
00080 vector< vector< AbstractMatch* > > LCB_list;
00081 ComputeLCBs_v2( pair_matches, breakpoints, LCB_list );
00082
00083
00084
00085
00086 vector< double > lcb_scores(LCB_list.size());
00087 for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
00088 {
00089 double lcb_score = 0;
00090 for( size_t mI = 0; mI < LCB_list[lcbI].size(); ++mI )
00091 {
00092 PairwiseMatchAdapter* pma = (PairwiseMatchAdapter*)LCB_list[lcbI][mI];
00093 lcb_score += tm_score_array[pma->tm->match_id][dI][dJ];
00094 }
00095 lcb_scores[lcbI] = lcb_score;
00096 }
00097
00098
00099 vector< LCB > adjacencies;
00100 computeLCBAdjacencies_v3( LCB_list, lcb_scores, adjacencies );
00101
00102 t_lcbs.resize(adjacencies.size());
00103 for( size_t lcbI = 0; lcbI < adjacencies.size(); ++lcbI )
00104 {
00105 t_lcbs[lcbI] = adjacencies[lcbI];
00106 t_lcbs[lcbI].matches.resize(LCB_list[lcbI].size());
00107 for( size_t mI = 0; mI < LCB_list[lcbI].size(); ++mI )
00108 t_lcbs[lcbI].matches[mI] = ((PairwiseMatchAdapter*)LCB_list[lcbI][mI])->tm;
00109
00110 sort( t_lcbs[lcbI].matches.begin(), t_lcbs[lcbI].matches.end() );
00111
00112
00113 for( size_t mI = 0; mI < t_lcbs[lcbI].matches.size(); ++mI )
00114 tm_lcb_id_array[t_lcbs[lcbI].matches[mI]->match_id][dI][dJ] = lcbI;
00115 }
00116
00117
00118 for( size_t mI = 0; mI < pair_matches.size(); ++mI )
00119 pair_matches[mI]->Free();
00120 }
00121
00123 void initTrackingMatchLCBTracking(
00124 const std::vector< TrackingMatch >& tracking_matches,
00125 size_t n1_count,
00126 size_t n2_count,
00127 boost::multi_array< size_t, 3 >& tm_lcb_id_array )
00128 {
00129 tm_lcb_id_array.resize( boost::extents[tracking_matches.size()][n1_count][n2_count] );
00130 for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
00131 {
00132 for( size_t nI = 0; nI < n1_count; ++nI )
00133 for( size_t nJ = 0; nJ < n2_count; ++nJ )
00134 tm_lcb_id_array[mI][nI][nJ] = LCB_UNASSIGNED;
00135 }
00136 }
00137
00138
00146 template< class LcbVector >
00147 uint RemoveLCBandCoalesce( size_t lcbI, uint seq_count, LcbVector& adjacencies, std::vector< double >& scores, std::vector< std::pair< uint, uint > >& id_remaps, std::vector< uint >& impact_list )
00148 {
00149 uint removed_count = 0;
00150 vector< uint > imp_tmp(seq_count * (2 + seq_count * 4), LCB_UNASSIGNED);
00151 swap(impact_list, imp_tmp);
00152 size_t impactI = 0;
00153 id_remaps.clear();
00154
00155 adjacencies[ lcbI ].lcb_id = -2;
00156
00157
00158 uint seqI;
00159 uint left_adj;
00160 uint right_adj;
00161 for( seqI = 0; seqI < seq_count; seqI++ ){
00162 left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
00163 right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
00164 if( left_adj != -1 )
00165 adjacencies[ left_adj ].right_adjacency[ seqI ] = right_adj;
00166 if( right_adj != -1 && right_adj != adjacencies.size() )
00167 adjacencies[ right_adj ].left_adjacency[ seqI ] = left_adj;
00168 }
00169
00170
00171 for( seqI = 0; seqI < seq_count; seqI++ ){
00172 left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
00173 right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
00174 impact_list[impactI++] = left_adj;
00175 impact_list[impactI++] = right_adj;
00176 for( uint seqJ = 0; seqJ < seq_count; seqJ++ ){
00177 if( left_adj != -1 )
00178 {
00179 impact_list[impactI++] = adjacencies[ left_adj ].left_adjacency[ seqJ ];
00180 impact_list[impactI++] = adjacencies[ left_adj ].right_adjacency[ seqJ ];
00181 }
00182 if( right_adj != -1 )
00183 {
00184 impact_list[impactI++] = adjacencies[ right_adj ].left_adjacency[ seqJ ];
00185 impact_list[impactI++] = adjacencies[ right_adj ].right_adjacency[ seqJ ];
00186 }
00187 }
00188 }
00189
00190
00191 id_remaps.push_back( make_pair( lcbI, -1 ) );
00192 removed_count++;
00193
00194
00195 for( seqI = 0; seqI < seq_count; seqI++ ){
00196 left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
00197 right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
00198
00199 while( left_adj != -1 && adjacencies[ left_adj ].lcb_id != left_adj )
00200 left_adj = adjacencies[ left_adj ].left_adjacency[ seqI ];
00201 while( right_adj != -1 && adjacencies[ right_adj ].lcb_id != right_adj )
00202 right_adj = adjacencies[ right_adj ].right_adjacency[ seqI ];
00203 if( left_adj == -1 || right_adj == -1 )
00204 continue;
00205 if( adjacencies[ left_adj ].lcb_id != left_adj ||
00206 adjacencies[ right_adj ].lcb_id != right_adj )
00207 if( seqI > 0 )
00208 continue;
00209 else
00210 cerr << "trouble on down street\n";
00211
00212
00213 boolean orientation = adjacencies[ left_adj ].left_end[ seqI ] > 0 ? true : false;
00214 uint seqJ;
00215 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00216 boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
00217 if( j_orientation == orientation &&
00218 adjacencies[ left_adj ].right_adjacency[ seqJ ] != right_adj )
00219 break;
00220 if( j_orientation != orientation &&
00221 adjacencies[ left_adj ].left_adjacency[ seqJ ] != right_adj )
00222 break;
00223
00224 if( adjacencies[ right_adj ].left_end[ seqJ ] > 0 != j_orientation )
00225 break;
00226 }
00227
00228 if( seqJ != seq_count ||
00229 adjacencies[ left_adj ].to_be_deleted ||
00230 adjacencies[ right_adj ].to_be_deleted )
00231 continue;
00232
00233
00234
00235
00236 id_remaps.push_back( make_pair( adjacencies[ right_adj ].lcb_id, left_adj ) );
00237 adjacencies[ right_adj ].lcb_id = left_adj;
00238 scores[ left_adj ] += scores[ right_adj ];
00239 adjacencies[ left_adj ].weight += adjacencies[ right_adj ].weight;
00240
00241
00242
00243 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00244 boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
00245 uint rr_adj = adjacencies[ right_adj ].right_adjacency[ seqJ ];
00246 uint rl_adj = adjacencies[ right_adj ].left_adjacency[ seqJ ];
00247 if( j_orientation == orientation ){
00248 adjacencies[ left_adj ].right_end[ seqJ ] = adjacencies[ right_adj ].right_end[ seqJ ];
00249 adjacencies[ left_adj ].right_adjacency[ seqJ ] = rr_adj;
00250 if( rr_adj != -1 )
00251 adjacencies[ rr_adj ].left_adjacency[ seqJ ] = left_adj;
00252 }else{
00253 adjacencies[ left_adj ].left_end[ seqJ ] = adjacencies[ right_adj ].left_end[ seqJ ];
00254 adjacencies[ left_adj ].left_adjacency[ seqJ ] = rl_adj;
00255 if( rl_adj != -1 )
00256 adjacencies[ rl_adj ].right_adjacency[ seqJ ] = left_adj;
00257 }
00258 }
00259
00260 removed_count++;
00261 }
00262
00263 std::sort( impact_list.begin(), impact_list.end() );
00264 vector< uint >::iterator imp_end = std::unique( impact_list.begin(), impact_list.end() );
00265 vector< uint >::iterator imp_preend = std::lower_bound( impact_list.begin(), imp_end, LCB_UNASSIGNED );
00266 impact_list.erase( imp_preend, impact_list.end() );
00267
00268 return removed_count;
00269 }
00270
00271
00272 template< class LcbVector >
00273 void undoLcbRemoval( uint seq_count, LcbVector& adjs, std::vector< std::pair< uint, uint > >& id_remaps )
00274 {
00275 for( size_t rI = id_remaps.size(); rI > 0; --rI )
00276 {
00277 if( id_remaps[rI-1].second == -1 )
00278 {
00279
00280
00281 uint lcbI = id_remaps[rI-1].first;
00282 for( uint seqI = 0; seqI < seq_count; seqI++ )
00283 {
00284 uint left_adj = adjs[ lcbI ].left_adjacency[ seqI ];
00285 uint right_adj = adjs[ lcbI ].right_adjacency[ seqI ];
00286 if( left_adj != -1 )
00287 adjs[ left_adj ].right_adjacency[ seqI ] = lcbI;
00288 if( right_adj != -1 && right_adj != adjs.size() )
00289 adjs[ right_adj ].left_adjacency[ seqI ] = lcbI;
00290 }
00291 adjs[lcbI].lcb_id = lcbI;
00292 adjs[lcbI].to_be_deleted = false;
00293 }else{
00294
00295
00296 uint lcbI = id_remaps[rI-1].first;
00297 uint lcbJ = id_remaps[rI-1].second;
00298 adjs[lcbI].lcb_id = lcbI;
00299 adjs[lcbJ].weight -= adjs[lcbI].weight;
00300
00301
00302 for( uint seqI = 0; seqI < seq_count; ++seqI )
00303 {
00304 uint ladj = adjs[lcbI].left_adjacency[seqI];
00305 uint radj = adjs[lcbI].right_adjacency[seqI];
00306 if( ladj == lcbJ )
00307 {
00308 adjs[lcbJ].right_adjacency[seqI] = lcbI;
00309 if( radj != -1 && radj != adjs.size())
00310 adjs[radj].left_adjacency[seqI] = lcbI;
00311 }else
00312 if( radj == lcbJ )
00313 {
00314 adjs[lcbJ].left_adjacency[seqI] = lcbI;
00315 if( ladj != -1 && ladj != adjs.size())
00316 adjs[ladj].right_adjacency[seqI] = lcbI;
00317 }
00318 }
00319 }
00320 }
00321 }
00322
00323 EvenFasterSumOfPairsBreakpointScorer::EvenFasterSumOfPairsBreakpointScorer(
00324 double breakpoint_penalty,
00325 double minimum_breakpoint_penalty,
00326 boost::multi_array<double,2> bp_weight_matrix,
00327 boost::multi_array<double,2> conservation_weight_matrix,
00328 vector< TrackingMatch* > tracking_match,
00329 PairwiseLCBMatrix& pairwise_adjacency_matrix,
00330 vector<node_id_t>& n1_descendants,
00331 vector<node_id_t>& n2_descendants,
00332 boost::multi_array< double, 3 >& tm_score_array,
00333 boost::multi_array< size_t, 3 >& tm_lcb_id_array,
00334 size_t seqI_begin,
00335 size_t seqI_end,
00336 size_t seqJ_begin,
00337 size_t seqJ_end
00338 ) :
00339 bp_penalty( breakpoint_penalty ),
00340 min_breakpoint_penalty( minimum_breakpoint_penalty ),
00341 bp_weights( bp_weight_matrix ),
00342 conservation_weights( conservation_weight_matrix ),
00343 tracking_matches( tracking_match ),
00344 pairwise_adjacencies( pairwise_adjacency_matrix ),
00345 n1_des(n1_descendants),
00346 n2_des(n2_descendants),
00347 tm_score_array(tm_score_array),
00348 tm_lcb_id_array(tm_lcb_id_array),
00349 seqI_count(pairwise_adjacencies.shape()[0]),
00350 seqJ_count(pairwise_adjacencies.shape()[1]),
00351 seqI_first(seqI_begin),
00352 seqI_last(seqI_end),
00353 seqJ_first(seqJ_begin),
00354 seqJ_last(seqJ_end),
00355 first_time(true)
00356 {
00357 std::sort(tracking_matches.begin(), tracking_matches.end());
00358 pairwise_lcb_count.resize( boost::extents[pairwise_adjacencies.shape()[0]][pairwise_adjacencies.shape()[1]] );
00359 pairwise_lcb_score.resize( boost::extents[pairwise_adjacencies.shape()[0]][pairwise_adjacencies.shape()[1]] );;
00360 all_id_remaps.resize( boost::extents[pairwise_lcb_count.shape()[0]][pairwise_lcb_count.shape()[1]] );
00361 full_impact_list.resize( boost::extents[pairwise_lcb_count.shape()[0]][pairwise_lcb_count.shape()[1]] );
00362 my_del_lcbs.resize(100);
00363 for( size_t i = 0; i < 3; ++i )
00364 {
00365 internal_lcb_score_diff[i].resize( boost::extents[pairwise_adjacencies.shape()[0]][pairwise_adjacencies.shape()[1]] );
00366 internal_lcb_removed_count[i].resize( boost::extents[pairwise_adjacencies.shape()[0]][pairwise_adjacencies.shape()[1]] );
00367 }
00368 lsd_zeros.resize( internal_lcb_score_diff[0].num_elements(), 0 );
00369 lrc_zeros.resize( internal_lcb_removed_count[0].num_elements(), 0 );
00370 using_lsd = -1;
00371 size_t max_pair_adj_size = 0;
00372 for( size_t i = 0; i < seqI_count; ++i )
00373 {
00374 for( size_t j = 0; j < seqJ_count; ++j )
00375 {
00376 pairwise_lcb_count[i][j] = pairwise_adjacencies[i][j].size();
00377 pairwise_lcb_score[i][j] = 0;
00378 max_pair_adj_size = (std::max)(max_pair_adj_size, pairwise_adjacencies[i][j].size());
00379 for( size_t lcbI = 0; lcbI < pairwise_adjacencies[i][j].size(); ++lcbI )
00380 pairwise_lcb_score[i][j] += pairwise_adjacencies[i][j][lcbI].weight;
00381 }
00382 }
00383 bogus_scores.resize(max_pair_adj_size+10);
00384 };
00385
00386
00391 size_t EvenFasterSumOfPairsBreakpointScorer::getMoveCount()
00392 {
00393 size_t move_count = 0;
00394 for( size_t i = seqI_first; i < seqI_last; ++i )
00395 for( size_t j = seqJ_first; j < seqJ_last; ++j )
00396 move_count += pairwise_adjacencies[i][j].size();
00397 return move_count;
00398 }
00399
00401 double EvenFasterSumOfPairsBreakpointScorer::score()
00402 {
00403
00404
00405 double score = 0;
00406 for( size_t seqI = seqI_first; seqI < seqI_last; ++seqI )
00407 {
00408 for( size_t seqJ = seqJ_first; seqJ < seqJ_last; ++seqJ )
00409 {
00410 const double pw_lcb_score = pairwise_lcb_score[seqI][seqJ];
00411
00412 score += pairwise_lcb_score[seqI][seqJ];
00413
00414
00415 double cweights = 1 - conservation_weights[seqI][seqJ];
00416 double bweights = 1 - bp_weights[seqI][seqJ];
00417 double penalty = max( bp_penalty * cweights * cweights * cweights * cweights * bweights * bweights, min_breakpoint_penalty );
00418 if(first_time)
00419 cout << "Scoring with scaled breakpoint penalty: " << penalty << endl;
00420 first_time = false;
00421 score -= ( penalty * (pairwise_lcb_count[seqI][seqJ]-1));
00422 if( !(score > -1e200 && score < 1e200) )
00423 {
00424 genome::breakHere();
00425 cerr << "bp_weights[seqI][seqJ] " << bp_weights[seqI][seqJ] << endl;
00426 cerr << "conservation_weights[seqI][seqJ] " << conservation_weights[seqI][seqJ] << endl;
00427 cerr << "pairwise_lcb_count[seqI][seqJ] " << pairwise_lcb_count[seqI][seqJ] << endl;
00428 cerr << "pairwise_lcb_score[seqI][seqJ] " << pw_lcb_score << endl;
00429 cerr << "Invalid score!!\n";
00430 }
00431 }
00432 }
00433 return score;
00434 }
00435
00437 double EvenFasterSumOfPairsBreakpointScorer::operator()( pair< double, size_t >& the_move )
00438 {
00439 size_t new_move_count;
00440 vector< pair< double, size_t > > new_move_list;
00441 using_lsd++;
00442 std::copy(lsd_zeros.begin(),lsd_zeros.end(),internal_lcb_score_diff[using_lsd].data());
00443 std::copy(lrc_zeros.begin(),lrc_zeros.end(),internal_lcb_removed_count[using_lsd].data());
00444 remove( the_move, false, internal_lcb_score_diff[using_lsd], internal_lcb_removed_count[using_lsd], false, new_move_list, new_move_count );
00445 applyScoreDifference( internal_lcb_score_diff[using_lsd], internal_lcb_removed_count[using_lsd] );
00446 double m_score = score();
00447 undoScoreDifference( internal_lcb_score_diff[using_lsd], internal_lcb_removed_count[using_lsd] );
00448 using_lsd--;
00449 return m_score;
00450 }
00451
00452 bool EvenFasterSumOfPairsBreakpointScorer::isValid( pair< double, size_t >& the_move )
00453 {
00454 using_lsd++;
00455 std::copy(lsd_zeros.begin(),lsd_zeros.end(),internal_lcb_score_diff[using_lsd].data());
00456 std::copy(lrc_zeros.begin(),lrc_zeros.end(),internal_lcb_removed_count[using_lsd].data());
00457 vector< pair< double, size_t > > new_move_list;
00458 size_t new_move_count;
00459 bool success = remove( the_move, false, internal_lcb_score_diff[using_lsd], internal_lcb_removed_count[using_lsd], false, new_move_list, new_move_count );
00460 using_lsd--;
00461 return success;
00462 }
00463
00464 bool EvenFasterSumOfPairsBreakpointScorer::remove( pair< double, size_t >& the_move, vector< pair< double, size_t > >& new_move_list, size_t& new_move_count )
00465 {
00466 using_lsd++;
00467 std::copy(lsd_zeros.begin(),lsd_zeros.end(),internal_lcb_score_diff[using_lsd].data());
00468 std::copy(lrc_zeros.begin(),lrc_zeros.end(),internal_lcb_removed_count[using_lsd].data());
00469 bool success = remove( the_move, true, internal_lcb_score_diff[using_lsd], internal_lcb_removed_count[using_lsd], true, new_move_list, new_move_count );
00470 if( success )
00471 applyScoreDifference( internal_lcb_score_diff[using_lsd], internal_lcb_removed_count[using_lsd] );
00472 using_lsd--;
00473 return success;
00474 }
00475
00476 void EvenFasterSumOfPairsBreakpointScorer::applyScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count )
00477 {
00478 size_t nelems = pairwise_lcb_count.num_elements();
00479 for( size_t elemI = 0; elemI < nelems; elemI++ )
00480 {
00481 if( !(lcb_score_diff.data()[elemI] > -1e200 && lcb_score_diff.data()[elemI] < 1e200) )
00482 {
00483 genome::breakHere();
00484 cerr << "Invalid score!!\n";
00485 }
00486 pairwise_lcb_count.data()[elemI] -= lcb_removed_count.data()[elemI];
00487 pairwise_lcb_score.data()[elemI] -= lcb_score_diff.data()[elemI];
00488 if( !(pairwise_lcb_score.data()[elemI] > -1e200 && pairwise_lcb_score.data()[elemI] < 1e200) )
00489 {
00490 genome::breakHere();
00491 cerr << "Invalid score!!\n";
00492 }
00493 }
00494 }
00495
00496 void EvenFasterSumOfPairsBreakpointScorer::undoScoreDifference( boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count )
00497 {
00498 size_t nelems = pairwise_lcb_count.num_elements();
00499 for( size_t elemI = 0; elemI < nelems; elemI++ )
00500 {
00501 if( !(lcb_score_diff.data()[elemI] > -1e200 && lcb_score_diff.data()[elemI] < 1e200) )
00502 {
00503 genome::breakHere();
00504 cerr << "Invalid score!!\n";
00505 }
00506 pairwise_lcb_count.data()[elemI] += lcb_removed_count.data()[elemI];
00507 pairwise_lcb_score.data()[elemI] += lcb_score_diff.data()[elemI];
00508 if( !(pairwise_lcb_score.data()[elemI] > -1e200 && pairwise_lcb_score.data()[elemI] < 1e200) )
00509 {
00510 genome::breakHere();
00511 cerr << "Invalid score!!\n";
00512 }
00513 }
00514 }
00515
00516 size_t EvenFasterSumOfPairsBreakpointScorer::getMaxNewMoveCount()
00517 {
00518 return 20 * seqI_count * seqJ_count;
00519 }
00520
00524 bool EvenFasterSumOfPairsBreakpointScorer::remove( pair< double, size_t >& the_move, bool really_remove, boost::multi_array< double, 2 >& lcb_score_diff, boost::multi_array< size_t, 2 >& lcb_removed_count, bool score_new_moves, vector< pair< double, size_t > >& new_move_list, size_t& new_move_count )
00525 {
00526 if( score_new_moves && !really_remove )
00527 {
00528 cerr << "Error: Incompatible options in the breakpoint scorer!!!\n";
00529 throw "oh shit!";
00530 }
00531 new_move_count = 0;
00532
00533 size_t moveI = the_move.second;
00534 size_t move_count = 0;
00535 size_t move_base = 0;
00536 size_t seqI = 0;
00537 size_t seqJ = 0;
00538 for( seqI = seqI_first; seqI < seqI_last; ++seqI )
00539 {
00540 for( seqJ = seqJ_first; seqJ < seqJ_last; ++seqJ )
00541 {
00542 all_id_remaps[seqI][seqJ].clear();
00543 full_impact_list[seqI][seqJ].clear();
00544 }
00545 }
00546
00547 for( seqI = seqI_first; seqI < seqI_last; ++seqI )
00548 {
00549 for( seqJ = seqJ_first; seqJ < seqJ_last; ++seqJ )
00550 {
00551 move_count += pairwise_adjacencies[seqI][seqJ].size();
00552 if( move_count > moveI )
00553 break;
00554 move_base = move_count;
00555 }
00556 if( move_count > moveI )
00557 break;
00558 }
00559
00560 size_t del_lcb = moveI - move_base;
00561 if( pairwise_adjacencies[seqI][seqJ][del_lcb].lcb_id != del_lcb && really_remove )
00562 {
00563 if( pairwise_adjacencies[seqI][seqJ][del_lcb].lcb_id == LCB_UNASSIGNED )
00564 cerr << "bad movement, dirty dancing\n";
00565 return false;
00566 }
00567 if( pairwise_adjacencies[seqI][seqJ][del_lcb].lcb_id != del_lcb )
00568 {
00569 return false;
00570 }
00571
00572 vector< TrackingMatch* > matches(pairwise_adjacencies[seqI][seqJ][del_lcb].matches);
00573 double cur_score = score();
00574
00575 if( really_remove )
00576 {
00577 deleted_tracking_matches.insert( deleted_tracking_matches.end(), matches.begin(), matches.end() );
00578 }
00579
00580 for( size_t i = seqI_first; i < seqI_last; ++i )
00581 {
00582 for( size_t j = seqJ_first; j < seqJ_last; ++j )
00583 {
00584 lcb_score_diff[i][j] = 0;
00585 vector< TrackingLCB< TrackingMatch* > >& adjs = pairwise_adjacencies[i][j];
00586
00587
00588 if( lcb_ids.size() < matches.size() )
00589 lcb_ids.resize( matches.size() + 100 );
00590 for( size_t mI = 0; mI < matches.size(); ++mI )
00591 lcb_ids[mI] = tm_lcb_id_array[matches[mI]->match_id][i][j];
00592 size_t lcb_id_count = matches.size();
00593 std::sort(lcb_ids.begin(), lcb_ids.begin()+lcb_id_count);
00594 vector< size_t >::iterator last = std::unique(lcb_ids.begin(), lcb_ids.begin()+lcb_id_count);
00595 lcb_id_count = last - lcb_ids.begin();
00596
00597 if( lcb_ids[lcb_id_count-1] == LCB_UNASSIGNED )
00598 lcb_id_count--;
00599
00600 vector< pair< size_t, vector< TrackingMatch* > > > aff_lcbs(lcb_id_count);
00601 for( size_t lI = 0; lI < lcb_id_count; ++lI )
00602 aff_lcbs[lI].first = lcb_ids[lI];
00603
00604
00605 for( size_t mI = 0; mI < matches.size(); ++mI )
00606 {
00607 size_t id = tm_lcb_id_array[matches[mI]->match_id][i][j];
00608 if( id == LCB_UNASSIGNED )
00609 continue;
00610 vector< pair< size_t, vector< TrackingMatch* > > >::iterator iter = std::lower_bound( aff_lcbs.begin(), aff_lcbs.end(), make_pair(id,vector< TrackingMatch* >() ) );
00611 iter->second.push_back( matches[mI] );
00612 }
00613
00614
00615 size_t my_del_count = 0;
00616 for( size_t lI = 0; lI < aff_lcbs.size(); ++lI )
00617 {
00618 vector< TrackingMatch* >& cur_matches = adjs[lcb_ids[lI]].matches;
00619 size_t diff = cur_matches.size() - aff_lcbs[lI].second.size();
00620 if( diff == 0 )
00621 {
00622 if( my_del_count + 1 >= my_del_lcbs.size() )
00623 my_del_lcbs.resize(2*my_del_lcbs.size());
00624 my_del_lcbs[my_del_count++] = lcb_ids[lI];
00625 adjs[lcb_ids[lI]].to_be_deleted = true;
00626 lcb_score_diff[i][j] += adjs[lcb_ids[lI]].weight;
00627 if( really_remove )
00628 {
00629 adjs[lcb_ids[lI]].weight = 0;
00630 cur_matches.clear();
00631 }
00632 continue;
00633 }
00634
00635
00636 double del_score_sum = 0;
00637 for( size_t mI = 0; mI < aff_lcbs[lI].second.size(); ++mI )
00638 del_score_sum += tm_score_array[aff_lcbs[lI].second[mI]->match_id][i][j];
00639 lcb_score_diff[i][j] += del_score_sum;
00640 full_impact_list[i][j].push_back( aff_lcbs[lI].first );
00641
00642 if( really_remove )
00643 {
00644 adjs[lcb_ids[lI]].weight -= del_score_sum;
00645
00646
00647 vector< TrackingMatch* > dest( diff );
00648 std::set_difference( cur_matches.begin(), cur_matches.end(),
00649 aff_lcbs[lI].second.begin(), aff_lcbs[lI].second.end(), dest.begin() );
00650 swap( dest, cur_matches );
00651 }
00652 }
00653
00654 lcb_removed_count[i][j] = 0;
00655
00656
00657 std::vector< std::pair< uint, uint > >& fid_remaps = all_id_remaps[i][j];
00658 std::vector< uint >& fimp_list = full_impact_list[i][j];
00659 for( size_t delI = 0; delI < my_del_count; ++delI )
00660 {
00661 if( adjs[my_del_lcbs[delI]].lcb_id != my_del_lcbs[delI] )
00662 continue;
00663
00664 std::vector< std::pair< uint, uint > > id_remaps;
00665 std::vector< uint > impact_list;
00666 uint removed_count = RemoveLCBandCoalesce( my_del_lcbs[delI], 2, adjs, bogus_scores, id_remaps, impact_list );
00667 fid_remaps.insert( fid_remaps.end(), id_remaps.begin(), id_remaps.end() );
00668 fimp_list.insert( fimp_list.end(), impact_list.begin(), impact_list.end() );
00669
00670 lcb_removed_count[i][j] += removed_count;
00671
00672 if( really_remove )
00673 {
00674
00675 for( size_t rI = 0; rI < id_remaps.size(); ++rI )
00676 {
00677 if( id_remaps[rI].second == -1 )
00678 continue;
00679 vector< TrackingMatch* >& src_matches = adjs[id_remaps[rI].first].matches;
00680 vector< TrackingMatch* >& dest_matches = adjs[id_remaps[rI].second].matches;
00681 for( size_t mI = 0; mI < src_matches.size(); ++mI )
00682 tm_lcb_id_array[src_matches[mI]->match_id][i][j] = id_remaps[rI].second;
00683 dest_matches.insert( dest_matches.end(), src_matches.begin(), src_matches.end() );
00684 std::sort( dest_matches.begin(), dest_matches.end() );
00685 src_matches.clear();
00686 }
00687 }
00688 }
00689 }
00690 }
00691
00692
00693 applyScoreDifference( lcb_score_diff, lcb_removed_count );
00694 double new_score = score();
00695
00696 if( score_new_moves )
00697 {
00698 size_t mbase = 0;
00699 for( size_t i = seqI_first; i < seqI_last; ++i )
00700 {
00701 for( size_t j = seqJ_first; j < seqJ_last; ++j )
00702 {
00703 vector< TrackingLCB< TrackingMatch* > >& adjs = pairwise_adjacencies[i][j];
00704 std::vector< uint >& fimp_list = full_impact_list[i][j];
00705 sort( fimp_list.begin(), fimp_list.end() );
00706 vector< uint >::iterator iter = std::unique( fimp_list.begin(), fimp_list.end() );
00707 fimp_list.erase( iter, fimp_list.end() );
00708 for( size_t fI = 0; fI < fimp_list.size(); fI++ )
00709 {
00710 if( adjs[fimp_list[fI]].lcb_id != fimp_list[fI] )
00711 {
00712 new_move_list[new_move_count++] = make_pair( -(std::numeric_limits<double>::max)(), mbase + fimp_list[fI] );
00713 continue;
00714 }
00715
00716 pair< double, size_t > p( 0, mbase + fimp_list[fI] );
00717 double scorediff = (*this)(p) - new_score;
00718 p.first = scorediff;
00719 new_move_list[new_move_count++] = p;
00720 }
00721 mbase += adjs.size();
00722 }
00723 }
00724 }
00725
00726
00727
00728 if( !really_remove )
00729 for( size_t i = seqI_first; i < seqI_last; ++i )
00730 for( size_t j = seqJ_first; j < seqJ_last; ++j )
00731 undoLcbRemoval( 2, pairwise_adjacencies[i][j], all_id_remaps[i][j] );
00732
00733 undoScoreDifference( lcb_score_diff, lcb_removed_count );
00734
00735
00736
00737 bool valid = true;
00738 if( new_score - cur_score < the_move.first - 0.00001 ||
00739 new_score - cur_score > the_move.first + 0.00001 )
00740 valid = false;
00741
00742 return valid;
00743 }
00744
00745 vector< TrackingMatch* > EvenFasterSumOfPairsBreakpointScorer::getResults()
00746 {
00747 std::sort(deleted_tracking_matches.begin(), deleted_tracking_matches.end());
00748 vector< TrackingMatch* > result_matches(tracking_matches.size()-deleted_tracking_matches.size());
00749 std::set_difference( tracking_matches.begin(), tracking_matches.end(), deleted_tracking_matches.begin(), deleted_tracking_matches.end(), result_matches.begin() );
00750 return result_matches;
00751 }
00752
00753 bool EvenFasterSumOfPairsBreakpointScorer::validate()
00754 {
00755 vector< TrackingMatch* > trams = getResults();
00756 bool success = true;
00757
00758 for( size_t tmI = 0; tmI < trams.size(); tmI++ )
00759 {
00760 TrackingMatch* tm = trams[tmI];
00761 for( size_t i = 0; i < tm_lcb_id_array.shape()[1]; ++i )
00762 for( size_t j = 0; j < tm_lcb_id_array.shape()[2]; ++j )
00763 {
00764
00765 if( tm->node_match->LeftEnd(n1_des[i]) == NO_MATCH ||
00766 tm->node_match->LeftEnd(n2_des[j]) == NO_MATCH ||
00767 tm_lcb_id_array[tm->match_id][i][j] == LCB_UNASSIGNED)
00768 continue;
00769
00770 size_t id = tm_lcb_id_array[tm->match_id][i][j];
00771 vector< TrackingMatch* >& matches = pairwise_adjacencies[i][j][id].matches;
00772 vector< TrackingMatch* >::iterator iter = std::lower_bound( matches.begin(), matches.end(), tm );
00773 if( iter == matches.end() || *iter != tm )
00774 {
00775 cerr << "Missing match!!\n";
00776 cerr << "lcb_id: " << id << endl;
00777 cerr << "match: " << tm << endl;
00778 genome::breakHere();
00779 success = false;
00780 }
00781 }
00782 }
00783
00784 for( size_t i = 0; i < pairwise_adjacencies.shape()[0]; ++i )
00785 for( size_t j = 0; j < pairwise_adjacencies.shape()[1]; ++j )
00786 {
00787 vector< TrackingLCB< TrackingMatch* > >& adjs = pairwise_adjacencies[i][j];
00788 for( size_t lcbI = 0; lcbI < adjs.size(); lcbI++ )
00789 {
00790 for( size_t mI = 0; mI < adjs[lcbI].matches.size(); ++mI )
00791 {
00792 vector< TrackingMatch* >::iterator iter = std::lower_bound( trams.begin(), trams.end(), adjs[lcbI].matches[mI] );
00793 if( *iter != adjs[lcbI].matches[mI] )
00794 {
00795 cerr << "Missing match: in adjacencies but not tracking_matches!!\n";
00796 cerr << "lcb_id: " << tm_lcb_id_array[adjs[lcbI].matches[mI]->match_id][i][j] << endl;
00797 genome::breakHere();
00798 success = false;
00799 }
00800 }
00801 }
00802 }
00803
00804
00805 vector< TrackingMatch* > final = trams;
00806
00807 vector< AbstractMatch* > new_matches(final.size());
00808 for( size_t mI = 0; mI < final.size(); ++mI )
00809 new_matches[mI] = final[mI]->original_match;
00810
00811 vector< gnSeqI > breakpoints;
00812 IdentifyBreakpoints( new_matches, breakpoints );
00813 vector< vector< AbstractMatch* > > LCB_list;
00814 IdentifyBreakpoints( new_matches, breakpoints );
00815 ComputeLCBs_v2( new_matches, breakpoints, LCB_list );
00816 cout << "breakpoints.size(): " << breakpoints.size() << "\tpairwise_lcb_count[0][0]: " << pairwise_lcb_count[0][0] << endl;
00817 if( breakpoints.size() != pairwise_lcb_count[0][0] )
00818 success = false;
00819 size_t adjI = 0;
00820 vector< TrackingLCB< TrackingMatch* > >& adjs = pairwise_adjacencies[0][0];
00821 for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
00822 {
00823
00824 while( adjI != -1 && adjI != adjs[adjI].lcb_id )
00825 adjI++;
00826
00827
00828 vector< AbstractMatch* > ms(adjs[adjI].matches.size()+LCB_list[lcbI].size(), (AbstractMatch*)NULL);
00829 std::sort( LCB_list[lcbI].begin(), LCB_list[lcbI].end() );
00830 vector< AbstractMatch* > asdf(adjs[adjI].matches.size());
00831 for( size_t mI = 0; mI < adjs[adjI].matches.size(); ++mI )
00832 asdf[mI] = adjs[adjI].matches[mI]->original_match;
00833 std::sort( asdf.begin(), asdf.end() );
00834 std::set_symmetric_difference( LCB_list[lcbI].begin(), LCB_list[lcbI].end(), asdf.begin(), asdf.end(), ms.begin() );
00835
00836 if( ms[0] != NULL )
00837 {
00838 cerr << "In adjacencies:\n";
00839 for( size_t asdfI = 0; asdfI < asdf.size(); asdfI++ )
00840 {
00841 printMatch(asdf[asdfI], cerr);
00842 cerr << endl;
00843 }
00844 cerr << "\nIn LCB_list:\n";
00845 for( size_t mI = 0; mI < LCB_list[lcbI].size(); mI++ )
00846 {
00847 printMatch(LCB_list[lcbI][mI], cerr);
00848 cerr << endl;
00849 }
00850 cerr << "\nAll matches ssc1\n";
00851 SingleStartComparator<AbstractMatch> ssc1(1);
00852 std::sort(new_matches.begin(), new_matches.end(), ssc1);
00853 for( size_t mI = 0; mI < new_matches.size(); mI++ )
00854 {
00855 printMatch(new_matches[mI], cerr);
00856 cerr << endl;
00857 }
00858
00859 cerr << "\nAll matches ssc0\n";
00860 SingleStartComparator<AbstractMatch> ssc0(0);
00861 std::sort(new_matches.begin(), new_matches.end(), ssc0);
00862 for( size_t mI = 0; mI < new_matches.size(); mI++ )
00863 {
00864 printMatch(new_matches[mI], cerr);
00865 cerr << endl;
00866 }
00867 genome::breakHere();
00868 }
00869 adjI++;
00870 }
00871
00872 return success;
00873 }
00874
00875
00876
00877 SimpleBreakpointScorer::SimpleBreakpointScorer( std::vector< LCB >& adjacencies, double breakpoint_penalty, bool collinear ) :
00878 adjs( adjacencies ),
00879 bp_penalty( breakpoint_penalty ),
00880 collinear( collinear )
00881 {
00882 scores = std::vector< double >(adjs.size(), 0);
00883 total_weight = 0;
00884 bp_count = adjs.size();
00885 for( size_t lcbI = 0; lcbI < adjs.size(); lcbI++ )
00886 total_weight += adjs[lcbI].weight;
00887 }
00888
00889 size_t SimpleBreakpointScorer::getMoveCount()
00890 {
00891 return adjs.size();
00892 }
00893
00894 double SimpleBreakpointScorer::score()
00895 {
00896 double bp_score = (double)bp_count * bp_penalty;
00897 return total_weight - bp_score;
00898 }
00899
00900 bool SimpleBreakpointScorer::isValid( size_t lcbI, double move_score )
00901 {
00902 if( adjs[lcbI].lcb_id != lcbI )
00903 return false;
00904 return (*this)(lcbI) == move_score;
00905 }
00906
00908 double SimpleBreakpointScorer::operator()( size_t lcbI )
00909 {
00910 double cur_score = score();
00911 std::vector< std::pair< uint, uint > > id_remaps;
00912 std::vector< uint > impact_list;
00913 uint bp_removed = RemoveLCBandCoalesce( lcbI, adjs[0].left_adjacency.size(), adjs, scores, id_remaps, impact_list );
00914 undoLcbRemoval( adjs[0].left_adjacency.size(), adjs, id_remaps );
00915 double bp_score = (double)(bp_count - bp_removed) * bp_penalty;
00916 double move_score = total_weight - adjs[lcbI].weight - bp_score;
00917 double score_diff = move_score - cur_score;
00918 if( collinear && bp_count - bp_removed > 0 && score_diff < 0 )
00919 return 1/(-score_diff);
00920 return move_score - cur_score;
00921 }
00922
00924 void SimpleBreakpointScorer::remove( uint lcbI, vector< pair< double, size_t > >& new_moves )
00925 {
00926 std::vector< std::pair< uint, uint > > id_remaps;
00927 std::vector< uint > impact_list;
00928 uint bp_removed = RemoveLCBandCoalesce( lcbI, adjs[0].left_adjacency.size(), adjs, scores, id_remaps, impact_list );
00929 total_weight -= adjs[lcbI].weight;
00930 bp_count -= bp_removed;
00931 for( size_t impI = 0; impI < impact_list.size(); impI++ )
00932 {
00933 if( adjs[impact_list[impI]].lcb_id != impact_list[impI] )
00934 continue;
00935 double scorediff = (*this)(impact_list[impI]);
00936 new_moves.push_back(make_pair(scorediff, impact_list[impI]));
00937 }
00938 }
00939
00940
00941 GreedyRemovalScorer::GreedyRemovalScorer( std::vector< LCB >& adjacencies, double minimum_weight ) :
00942 adjs( adjacencies ),
00943 min_weight( minimum_weight )
00944 {
00945 scores = std::vector< double >(adjs.size(), 0);
00946 total_weight = 0;
00947 for( size_t lcbI = 0; lcbI < adjs.size(); lcbI++ )
00948 total_weight += adjs[lcbI].weight - min_weight;
00949 }
00950
00951 size_t GreedyRemovalScorer::getMoveCount()
00952 {
00953 return adjs.size();
00954 }
00955
00956 double GreedyRemovalScorer::score()
00957 {
00958 return total_weight;
00959 }
00960
00961 bool GreedyRemovalScorer::isValid( size_t lcbI, double move_score )
00962 {
00963 if( adjs[lcbI].lcb_id != lcbI )
00964 return false;
00965 return (*this)(lcbI) == move_score;
00966 }
00967
00969 double GreedyRemovalScorer::operator()( size_t lcbI )
00970 {
00971 return -(adjs[lcbI].weight-min_weight);
00972 }
00973
00975 void GreedyRemovalScorer::remove( uint lcbI, vector< pair< double, size_t > >& new_moves )
00976 {
00977 std::vector< std::pair< uint, uint > > id_remaps;
00978 std::vector< uint > impact_list;
00979 uint bp_removed = RemoveLCBandCoalesce( lcbI, adjs[0].left_adjacency.size(), adjs, scores, id_remaps, impact_list );
00980 total_weight -= (adjs[lcbI].weight-min_weight);
00981 for( size_t impI = 0; impI < impact_list.size(); impI++ )
00982 {
00983 if( adjs[impact_list[impI]].lcb_id != impact_list[impI] )
00984 continue;
00985 double scorediff = (*this)(impact_list[impI]);
00986 new_moves.push_back(make_pair(scorediff, impact_list[impI]));
00987 }
00988 }
00989
00990
00991
00992
00993 }
00994