00001
00002
00003
00004
00005
00006
00007
00008
00009 #include "libMems/Aligner.h"
00010 #include "libMems/Islands.h"
00011 #include "libMems/DNAFileSML.h"
00012 #include "libMems/MuscleInterface.h"
00013 #include "libGenome/gnRAWSource.h"
00014 #include "libMems/DistanceMatrix.h"
00015 #include "libMems/Files.h"
00016
00017 #include <map>
00018 #include <fstream>
00019 #include <sstream>
00020 #include <stack>
00021 #include <algorithm>
00022 #include <limits>
00023
00024 using namespace std;
00025 using namespace genome;
00026 namespace mems {
00027
00028
00029 boolean validateLCB( MatchList& lcb );
00030 void validateRangeIntersections( vector< MatchList >& lcb_list );
00031 bool debug_shite = false;
00032
00036 boolean validateLCB( MatchList& lcb ){
00037 vector< Match* >::iterator lcb_iter = lcb.begin();
00038 if( lcb.size() == 0 )
00039 return true;
00040 uint seq_count = (*lcb_iter)->SeqCount();
00041 uint seqI = 0;
00042 boolean complain = false;
00043 for(; seqI < seq_count; seqI++ ){
00044 lcb_iter = lcb.begin();
00045 int64 prev_coord = 0;
00046 for(; lcb_iter != lcb.end(); lcb_iter++ ){
00047 if( (*lcb_iter)->Start( seqI ) == NO_MATCH )
00048 continue;
00049 else if( prev_coord != 0 && (*lcb_iter)->Start( seqI ) < prev_coord ){
00050 complain = true;
00051 }
00052 prev_coord = (*lcb_iter)->Start( seqI );
00053 }
00054 }
00055 return !complain;
00056 }
00057
00062 void EliminateOverlaps( MatchList& ml ){
00063 if( ml.size() < 2 )
00064 return;
00065 vector< Match* > result_matches;
00066 uint seq_count = ml[0]->SeqCount();
00067 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00068 SingleStartComparator<AbstractMatch> msc( seqI );
00069 sort( ml.begin(), ml.end(), msc );
00070 int64 matchI = 0;
00071 int64 nextI = 0;
00072 int64 deleted_count = 0;
00073 vector< Match* > new_matches;
00074
00075
00076 for(; matchI != ml.size(); matchI++ )
00077 if( ml[ matchI ]->Start( seqI ) != NO_MATCH )
00078 break;
00079
00080 for(; matchI < ml.size(); matchI++ ){
00081 if( ml[ matchI ] == NULL )
00082 continue;
00083
00084 for( nextI = matchI + 1; nextI < ml.size(); nextI++ ){
00085 if( ml[ nextI ] == NULL )
00086 continue;
00087
00088 boolean deleted_matchI = false;
00089
00090 int64 startI = ml[ matchI ]->Start( seqI );
00091 int64 lenI = ml[ matchI ]->Length();
00092 int64 startJ = ml[ nextI ]->Start( seqI );
00093
00094 int64 diff = absolut( startJ ) - absolut( startI ) - lenI;
00095
00096 if( diff < 0 ){
00097 diff = -diff;
00098 Match* new_match;
00099
00100
00101
00102 if( ( ml[ nextI ]->Multiplicity() > ml[ matchI ]->Multiplicity() ) ||
00103 ( ml[ nextI ]->Multiplicity() == ml[ matchI ]->Multiplicity() && ml[ nextI ]->Length() > ml[ matchI ]->Length() ) ){
00104
00105 new_match = ml[matchI]->Copy();
00106
00107 if( diff >= lenI ){
00108
00109 ml[ matchI ]->Free();
00110 ml[ matchI ] = NULL;
00111 matchI--;
00112 deleted_matchI = true;
00113 deleted_count++;
00114 }else{
00115 if( startI > 0 ){
00116 ml[ matchI ]->CropEnd( diff );
00117 new_match->CropStart( new_match->Length() - diff );
00118 }else{
00119 ml[ matchI ]->CropStart( diff );
00120 new_match->CropEnd( new_match->Length() - diff );
00121 }
00122 }
00123 }else{
00124
00125 new_match = ml[nextI]->Copy();
00126
00127 if( diff >= ml[ nextI ]->Length() ){
00128
00129 ml[ nextI ]->Free();
00130 ml[ nextI ] = NULL;
00131 deleted_count++;
00132 }else{
00133 if( startJ > 0 ){
00134 ml[ nextI ]->CropStart( diff );
00135 new_match->CropEnd( new_match->Length() - diff );
00136 }else{
00137 ml[ nextI ]->CropEnd( diff );
00138 new_match->CropStart( new_match->Length() - diff );
00139 }
00140 }
00141
00142 }
00143 new_match->SetStart( seqI, 0 );
00144 if( new_match->Multiplicity() > 1 && new_match->Length() > 0 )
00145 new_matches.push_back( new_match );
00146 else
00147 {
00148 new_match->Free();
00149 new_match = NULL;
00150 }
00151 if( deleted_matchI )
00152 break;
00153 }else
00154 break;
00155 }
00156
00157
00158
00159
00160 }
00161
00162 if( deleted_count > 0 ){
00163 result_matches.reserve( ml.size() - deleted_count );
00164 for( int64 copyI = 0; copyI < ml.size(); copyI++ ){
00165 if( ml[ copyI ] != NULL )
00166 result_matches.push_back( ml[ copyI ] );
00167 }
00168 ml.clear();
00169 ml.insert( ml.end(), result_matches.begin(), result_matches.end() );
00170 }
00171 ml.insert( ml.end(), new_matches.begin(), new_matches.end() );
00172 result_matches.clear();
00173 new_matches.clear();
00174 }
00175
00176 }
00177
00178
00179 const gnSeqI default_min_r_gap_size = 200;
00180 Aligner::Aligner( uint seq_count ) :
00181 debug(false),
00182 seq_count(seq_count),
00183 min_recursive_gap_length(default_min_r_gap_size),
00184 collinear_genomes(false),
00185 gal(&(MuscleInterface::getMuscleInterface())),
00186 permutation_weight(-1),
00187 cur_min_coverage(-1),
00188 max_extension_iters(4)
00189 {}
00190
00191 Aligner::Aligner( const Aligner& al ) :
00192
00193 nway_mh( al.nway_mh ),
00194 seq_count( al.seq_count ),
00195 debug( al.debug),
00196 LCB_minimum_density( al.LCB_minimum_density),
00197 LCB_minimum_range( al.LCB_minimum_range ),
00198 cur_min_coverage( al.cur_min_coverage),
00199 min_recursive_gap_length( al.min_recursive_gap_length ),
00200 collinear_genomes( al.collinear_genomes ),
00201 gal( al.gal ),
00202 permutation_weight( al.permutation_weight ),
00203 permutation_filename( al.permutation_filename ),
00204 max_extension_iters( al.max_extension_iters )
00205 {}
00206
00207 Aligner& Aligner::operator=( const Aligner& al )
00208 {
00209 gap_mh = al.gap_mh;
00210 nway_mh = al.nway_mh;
00211 seq_count = al.seq_count;
00212 debug = al.debug;
00213
00214 LCB_minimum_density = al.LCB_minimum_density;
00215 LCB_minimum_range = al.LCB_minimum_range;
00216
00217 cur_min_coverage = al.cur_min_coverage;
00218 min_recursive_gap_length = al.min_recursive_gap_length;
00219 collinear_genomes = al.collinear_genomes;
00220
00221 gal = al.gal;
00222
00223 permutation_weight = al.permutation_weight;
00224 permutation_filename = al.permutation_filename;
00225
00226 max_extension_iters = al.max_extension_iters;
00227
00228 return *this;
00229 }
00230
00231 void Aligner::SetMinRecursionGapLength( gnSeqI min_r_gap ) {
00232 min_recursive_gap_length = min_r_gap;
00233 }
00234
00235 void Aligner::SetGappedAligner( GappedAligner& gal ){
00236 this->gal = &(gal);
00237 }
00238
00239 void Aligner::SetMaxGappedAlignmentLength( gnSeqI len ){
00240 gal->SetMaxAlignmentLength( len );
00241 }
00242
00243
00244
00245 void scanLabels( set< uint >& no_match_labels, uint& start_label, boolean forward ){
00246 uint labelI;
00247
00248 if( forward ){
00249 for( labelI = start_label + 1; ; labelI++){
00250 set< uint >::iterator label_iter = no_match_labels.find( labelI );
00251 if( label_iter == no_match_labels.end() ){
00252 start_label = labelI - 1;
00253 break;
00254 }
00255 }
00256 }else{
00257 for( labelI = start_label; labelI > 0; labelI--){
00258 set< uint >::iterator label_iter = no_match_labels.find( labelI - 1 );
00259 if( label_iter == no_match_labels.end() ){
00260 start_label = labelI;
00261 break;
00262 }
00263 }
00264 }
00265 }
00266
00267 boolean checkCollinearity( Match* m1, Match* m2 ){
00268 for( uint seqI = 0; seqI < m1->SeqCount(); seqI++ ){
00269 if( m1->Start( seqI ) == NO_MATCH ||
00270 m2->Start( seqI ) == NO_MATCH )
00271 continue;
00272 if((( m1->Start( seqI ) > 0 &&
00273 m2->Start( seqI ) > 0 ) ||
00274 (m1->Start( seqI ) < 0 &&
00275 m2->Start( seqI ) < 0 )) &&
00276 m1->Start( seqI ) <= m2->Start( seqI ) )
00277 continue;
00278 return false;
00279 }
00280 return true;
00281 }
00282
00283 void scanFit( list< LabeledMem >& pair_list, list< LabeledMem >::iterator& list_iter, Match* new_match, uint sort_seq ){
00284
00285 list< LabeledMem >::iterator cur_iter = list_iter;
00286 list< LabeledMem >::iterator last_iter = list_iter;
00287
00288 int64 initial_start = absolut( list_iter->mem->Start( sort_seq ) );
00289
00290 uint match_count = 0;
00291 for(; last_iter != pair_list.end(); ++last_iter ){
00292 if( last_iter->mem->Start( sort_seq ) == NO_MATCH ){
00293 ++match_count;
00294 continue;
00295 }
00296
00297
00298 if( absolut( last_iter->mem->Start( sort_seq ) ) < initial_start ||
00299 absolut( last_iter->mem->Start( sort_seq ) ) > new_match->Start( sort_seq ) )
00300 break;
00301 ++match_count;
00302 }
00303 vector< vector< int > > score_vector;
00304 score_vector.reserve( new_match->SeqCount() - sort_seq - 1 );
00305 for( uint seqI = sort_seq + 1; seqI < new_match->SeqCount(); ++seqI ){
00306 vector< int > sv;
00307 score_vector.push_back( sv );
00308 score_vector[ score_vector.size() - 1 ].reserve( match_count );
00309 }
00310 uint matchI = 0;
00311 for(; cur_iter != last_iter; ++cur_iter ){
00312
00313 for( uint seqI = sort_seq + 1; seqI < new_match->SeqCount(); ++seqI ){
00314 int64 p_start = cur_iter->mem->Start( seqI );
00315 int64 m_start = new_match->Start( seqI );
00316 p_start = p_start < 0 ? -p_start : p_start;
00317 m_start = m_start < 0 ? -m_start : m_start;
00318 if( m_start == NO_MATCH ){
00319 score_vector[ seqI - sort_seq - 1 ].push_back( 0 );
00320 }else if( p_start == NO_MATCH ){
00321 score_vector[ seqI - sort_seq - 1 ].push_back( 0 );
00322 }else if( p_start < m_start ){
00323 score_vector[ seqI - sort_seq - 1 ].push_back( 1 );
00324 }else
00325 score_vector[ seqI - sort_seq - 1 ].push_back( -1 );
00326 }
00327 }
00328 vector< int > scores;
00329 scores.reserve( match_count );
00330 for( matchI = match_count; matchI > 0; matchI-- )
00331 scores.push_back( 0 );
00332 for( uint seqI = 0; seqI < new_match->SeqCount() - sort_seq - 1; ++seqI ){
00333 boolean redefined = false;
00334 for( matchI = match_count; matchI > 0; matchI-- ){
00335 if( !redefined ){
00336 if( score_vector[ seqI ][ matchI - 1 ] >= 0 ){
00337 if( score_vector[ seqI ][ matchI - 1 ] == 1 )
00338 redefined = true;
00339 ++scores[ matchI - 1 ];
00340 }
00341 }else{
00342 if( score_vector[ seqI ][ matchI - 1 ] == -1 )
00343 redefined = false;
00344 }
00345 }
00346 }
00347
00348 cur_iter = list_iter;
00349 int max_score = 0;
00350 for( matchI = 0; matchI < match_count; ++matchI ){
00351 if( scores[ matchI ] > max_score ){
00352 max_score = scores[ matchI ];
00353 list_iter = cur_iter;
00354 }
00355 ++cur_iter;
00356 }
00357 }
00358
00362 void AaronsLCB( MatchList& mlist, set<uint>& breakpoints ){
00363 breakpoints.clear();
00364 if( mlist.size() == 0 )
00365 return;
00366
00367 if( mlist.size() == 1 ){
00368 breakpoints.insert( 0 );
00369 return;
00370 }
00371 uint seq_count = mlist[0]->SeqCount();
00372
00373 SingleStartComparator<AbstractMatch> msc( 0 );
00374 sort( mlist.begin(), mlist.end(), msc );
00375 vector<Match*>::iterator mem_iter = mlist.begin();
00376 list<LabeledMem> pair_list;
00377
00378 map<uint, Match*> debug_label_map;
00379 boolean debugging = false;
00380
00381
00382 list< PlacementMatch > placement_list;
00383
00384 for(; mem_iter != mlist.end(); ++mem_iter ){
00385 if( (*mem_iter)->Start( 0 ) != NO_MATCH ){
00386
00387 LabeledMem lm;
00388 lm.mem = *mem_iter;
00389 lm.label = 0;
00390 pair_list.push_back( lm );
00391 }else{
00392 PlacementMatch pm;
00393 pm.mem = *mem_iter;
00394 pm.iter = pair_list.end();
00395 placement_list.push_back( pm );
00396 }
00397 }
00398 LabeledMemComparator lmc( 0 );
00399 pair_list.sort( lmc );
00400 list< LabeledMem >::iterator pair_iter = pair_list.begin();
00401 for(; pair_iter != pair_list.end(); ++pair_iter ){
00402 PlacementMatch pm;
00403 pm.mem = pair_iter->mem;
00404 pm.iter = pair_iter;
00405 placement_list.push_back( pm );
00406 }
00407
00408
00409 for( uint seqI = 1; seqI < seq_count; ++seqI ){
00410 PlacementMatchComparator pmc( seqI );
00411 placement_list.sort( pmc );
00412 list< PlacementMatch >::iterator placement_prev;
00413 list< PlacementMatch >::iterator placement_iter = placement_list.begin();
00414 if( placement_iter->iter == pair_list.end() &&
00415 placement_iter->mem->Start( seqI ) != NO_MATCH ){
00416 LabeledMem lm;
00417 lm.mem = placement_iter->mem;
00418 lm.label = 0;
00419 pair_list.insert( pair_list.begin(), lm );
00420 placement_iter->iter = pair_list.begin();
00421 }
00422
00423 for( ++placement_iter; placement_iter != placement_list.end(); ++placement_iter ){
00424 placement_prev = placement_iter;
00425 placement_prev--;
00426
00427 if( placement_iter->iter != pair_list.end() )
00428 continue;
00429
00430 if( placement_iter->mem->Start( seqI ) == NO_MATCH )
00431 continue;
00432
00433 list< LabeledMem >::iterator insert_iter = placement_prev->iter;
00434 if( insert_iter == pair_list.end() || placement_prev->mem->Start( seqI ) == NO_MATCH )
00435 insert_iter = pair_list.begin();
00436 else{
00437 if( insert_iter->mem->Start( seqI ) < 0 ){
00438
00439 if( placement_iter->mem->Start( seqI ) > 0 )
00440 placement_iter->mem->Invert();
00441 if( !checkCollinearity( placement_iter->mem, insert_iter->mem ) ){
00442 placement_iter->mem->Invert();
00443 scanFit( pair_list, insert_iter, placement_iter->mem, seqI );
00444 ++insert_iter;
00445 }
00446 }else{
00447
00448 scanFit( pair_list, insert_iter, placement_iter->mem, seqI );
00449 ++insert_iter;
00450 }
00451 }
00452
00453 LabeledMem lm;
00454 lm.mem = placement_iter->mem;
00455 lm.label = 0;
00456 pair_list.insert( insert_iter, lm );
00457 placement_iter->iter = insert_iter;
00458 placement_iter->iter--;
00459 }
00460 }
00461 boolean debug_labels = false;
00462 ofstream debug_label_file;
00463 if( debug_labels )
00464 debug_label_file.open( "label_debug.txt" );
00465
00466 uint cur_label = 0;
00467 mlist.clear();
00468 vector< LabeledMem > pair_vec;
00469 pair_vec.reserve( pair_list.size() );
00470 mlist.reserve( pair_list.size() );
00471 for( pair_iter = pair_list.begin(); pair_iter != pair_list.end(); ++pair_iter ){
00472 pair_iter->label = cur_label++;
00473 mlist.push_back( pair_iter->mem );
00474 pair_vec.push_back( *pair_iter );
00475 if( debug_labels ){
00476 debug_label_map.insert( map<uint, Match*>::value_type( pair_iter->label, pair_iter->mem ) );
00477 debug_label_file << pair_iter->label << '\t' << (*pair_iter->mem) << endl;
00478 }
00479 }
00480 if( debug_labels )
00481 debug_label_file.close();
00482
00483 breakpoints.clear();
00484 pair_list.clear();
00485 vector< LabeledMem >::iterator pair_vec_iter;
00486 for( uint seqI = 1; seqI < seq_count; seqI++ ){
00487
00488 LabeledMemComparator lmc( seqI );
00489 sort( pair_vec.begin(), pair_vec.end(), lmc );
00490 set< uint > no_match_labels;
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502 pair_vec_iter = pair_vec.begin();
00503 uint block_start = pair_vec_iter->label;
00504 uint break_label = 0;
00505 for( ++pair_vec_iter; pair_vec_iter != pair_vec.end(); ++pair_vec_iter ){
00506 vector<LabeledMem>::iterator pair_prev = pair_vec_iter;
00507 pair_prev--;
00508 break_label = 0;
00509 uint scan_label = 0;
00510 if( pair_prev->mem->Start( seqI ) == NO_MATCH ){
00511 no_match_labels.insert( set< uint >::value_type( pair_prev->label ) );
00512
00513 if( pair_vec_iter->mem->Start( seqI ) < 0 ){
00514 block_start = pair_vec_iter->label;
00515 scanLabels( no_match_labels, block_start, true );
00516 }else if( pair_vec_iter->mem->Start( seqI ) > 0 ){
00517 block_start = pair_vec_iter->label;
00518 scanLabels( no_match_labels, block_start, false );
00519 }
00520
00521 continue;
00522 }
00523
00524 if( pair_prev->mem->Start( seqI ) < 0 ){
00525
00526 break_label = block_start;
00527 }else{
00528
00529 break_label = pair_prev->label;
00530 scanLabels( no_match_labels, break_label, true );
00531 }
00532 if( pair_vec_iter->mem->Start( seqI ) < 0 ){
00533
00534 scan_label = pair_vec_iter->label;
00535 scanLabels( no_match_labels, scan_label, true );
00536 }else{
00537
00538 scan_label = pair_vec_iter->label;
00539 scanLabels( no_match_labels, scan_label, false );
00540 }
00541
00542 if( pair_vec_iter->mem->Start( seqI ) < 0 &&
00543 pair_prev->mem->Start( seqI ) < 0 ){
00544 if( scan_label + 1 == pair_prev->label )
00545 continue;
00546 if( debugging ){
00547 map< uint, Match* >::const_iterator debug_iter = debug_label_map.find( pair_vec_iter->label );
00548 while( debug_iter->first <= pair_prev->label ){
00549 cout << debug_iter->first << '\t' << *(debug_iter->second) << endl;
00550 ++debug_iter;
00551 }
00552 }
00553 }else
00554 if( pair_vec_iter->mem->Start( seqI ) > 0 &&
00555 pair_prev->mem->Start( seqI ) > 0 ){
00556
00557 if( scan_label - 1 == pair_prev->label )
00558 continue;
00559 if( debugging ){
00560 map< uint, Match* >::const_iterator debug_iter = debug_label_map.find( pair_prev->label );
00561 while( debug_iter->first <= pair_vec_iter->label ){
00562 cout << debug_iter->first << '\t' << *(debug_iter->second) << endl;
00563 ++debug_iter;
00564 }
00565 }
00566 }
00567
00568
00569
00570
00571
00572 breakpoints.insert( break_label );
00573 block_start = scan_label;
00574 }
00575
00576
00577 if( pair_vec_iter != pair_vec.begin() ){
00578 pair_vec_iter--;
00579
00580 if( pair_vec_iter->mem->Start( seqI ) < 0 ){
00581 break_label = block_start;
00582 }else{
00583 break_label = pair_vec_iter->label;
00584 scanLabels( no_match_labels, break_label, true );
00585 }
00586 breakpoints.insert( break_label );
00587 }
00588 }
00589 }
00590
00592 void Aligner::SetPermutationOutput( std::string& permutation_filename, int64 permutation_weight )
00593 {
00594 this->permutation_filename = permutation_filename;
00595 this->permutation_weight = permutation_weight;
00596 }
00597
00598
00599 void GetLCBCoverage( MatchList& lcb, uint64& coverage ){
00600 vector< Match* >::iterator match_iter = lcb.begin();
00601 coverage = 0;
00602 bool debug = true;
00603 for( ; match_iter != lcb.end(); ++match_iter ){
00604 coverage += (*match_iter)->Length() * (*match_iter)->Multiplicity();
00605
00606
00607
00608 if( lcb.seq_table.size() > 0 )
00609 {
00610 for( uint seqI = 0; seqI < (*match_iter)->SeqCount(); ++seqI )
00611 {
00612 gnSeqI lend = absolut((*match_iter)->Start(seqI));
00613 gnSeqI length = (*match_iter)->Length();
00614 if( lend == 0 )
00615 continue;
00616 string match_seq = lcb.seq_table[seqI]->ToString(length, lend);
00617 for( size_t s = 0; s < match_seq.size(); ++s )
00618 if( match_seq[s] == 'n' || match_seq[s] == 'N' )
00619 if( (*match_iter)->Start(seqI) > 0 )
00620 coverage--;
00621 }
00622 }
00623 }
00624 }
00625
00626
00627 void computeLCBAdjacencies_v2( vector<MatchList>& lcb_list, vector< int64 >& weights, vector< LCB >& adjacencies ){
00628 IntervalList iv_list;
00629 for( uint lcbI = 0; lcbI < lcb_list.size(); ++lcbI ){
00630 vector<AbstractMatch*> asdf;
00631 asdf.push_back( lcb_list[ lcbI ].front() );
00632 if( lcb_list[lcbI].size() > 1 )
00633 asdf.push_back( lcb_list[ lcbI ].back() );
00634 Interval iv( asdf.begin(), asdf.end() );
00635 iv_list.push_back( iv );
00636 }
00637 computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
00638 }
00639
00640 const uint NO_ADJACENCY = (std::numeric_limits<uint>::max)();
00641
00645 void computeLCBAdjacencies_v2( IntervalList& iv_list, vector< int64 >& weights, vector< LCB >& adjacencies ){
00646 adjacencies.clear();
00647 if( iv_list.size() == 0 )
00648 return;
00649
00650 uint seq_count = iv_list[0].SeqCount();
00651 uint seqI;
00652 uint lcbI;
00653 adjacencies.resize(iv_list.size());
00654 for( lcbI = 0; lcbI < iv_list.size(); ++lcbI ){
00655 LCB& lcb = adjacencies[lcbI];
00656 lcb.left_end.resize(seq_count);
00657 lcb.right_end.resize(seq_count);
00658 lcb.left_adjacency.resize(seq_count);
00659 lcb.right_adjacency.resize(seq_count);
00660 for( seqI = 0; seqI < seq_count; seqI++ ){
00661
00662 int64 leftI = iv_list[lcbI].LeftEnd(seqI);
00663 int64 rightI = NO_MATCH;
00664 if( leftI != NO_MATCH )
00665 {
00666 leftI = iv_list[lcbI].Orientation(seqI) == AbstractMatch::forward ? leftI : -leftI;
00667 rightI = iv_list[lcbI].RightEnd(seqI)+1;
00668 rightI = iv_list[lcbI].Orientation(seqI) == AbstractMatch::forward ? rightI : -rightI;
00669 }
00670
00671 lcb.left_end[seqI] = leftI;
00672 lcb.right_end[seqI] = rightI;
00673 lcb.left_adjacency[seqI] = NO_ADJACENCY;
00674 lcb.right_adjacency[seqI] = NO_ADJACENCY;
00675 }
00676 lcb.lcb_id = lcbI;
00677 lcb.weight = weights[ lcbI ];
00678 lcb.to_be_deleted = false;
00679 }
00680
00681 for( seqI = 0; seqI < seq_count; seqI++ ){
00682 LCBLeftComparator llc( seqI );
00683 sort( adjacencies.begin(), adjacencies.end(), llc );
00684 for( lcbI = 1; lcbI + 1 < iv_list.size(); lcbI++ ){
00685 adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
00686 adjacencies[ lcbI ].right_adjacency[ seqI ] = adjacencies[ lcbI + 1 ].lcb_id;
00687 }
00688 if( lcbI == iv_list.size() )
00689 lcbI--;
00690
00691
00692 adjacencies[ 0 ].left_adjacency[ seqI ] = NO_ADJACENCY;
00693 adjacencies[ lcbI ].right_adjacency[ seqI ] = NO_ADJACENCY;
00694 if( lcbI > 0 ){
00695 adjacencies[ 0 ].right_adjacency[ seqI ] = adjacencies[ 1 ].lcb_id;
00696 adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
00697 }
00698 }
00699 LCBIDComparator lic;
00700 sort( adjacencies.begin(), adjacencies.end(), lic );
00701
00702 }
00703
00704
00705 void scanLeft( int& left_recurseI, vector< LCB >& adjacencies, int min_weight, int seqI ){
00706 while( left_recurseI != -1 && adjacencies[ left_recurseI ].weight < min_weight )
00707 left_recurseI = adjacencies[ left_recurseI ].left_adjacency[ seqI ];
00708 }
00709 void scanRight( int& right_recurseI, vector< LCB >& adjacencies, int min_weight, int seqI ){
00710 while( right_recurseI != -1 && adjacencies[ right_recurseI ].weight < min_weight )
00711 right_recurseI = adjacencies[ right_recurseI ].right_adjacency[ seqI ];
00712 }
00713
00714
00715
00720 void CreateGapSearchList( vector< LCB >& adjacencies, const vector< gnSequence* >& seq_table, vector< vector< int64 > >& iv_regions, boolean entire_genome )
00721 {
00722 iv_regions.clear();
00723 if( adjacencies.size() == 0 )
00724 return;
00725 if( adjacencies.size() == 1 && !entire_genome )
00726 return;
00727 boolean debug_lcb_extension = false;
00728 const uint seq_count = seq_table.size();
00729
00730 uint seqI = 0;
00731 int lcbI = 0;
00732 iv_regions = vector< vector< int64 > >( seq_count );
00733
00734
00735 for( seqI = 0; seqI < seq_count; seqI++ ){
00736
00737
00738 for( lcbI = 0; lcbI < adjacencies.size(); lcbI++ ){
00739 if( adjacencies[ lcbI ].left_adjacency[ seqI ] == -1 )
00740 break;
00741 }
00742
00743
00744 int right_recurseI = lcbI;
00745 lcbI = -1;
00746 if( !entire_genome && right_recurseI != -1 ){
00747 lcbI = right_recurseI;
00748 right_recurseI = adjacencies[ lcbI ].right_adjacency[ seqI ];
00749 }
00750 gnSeqI seq_len = 0;
00751 while( (lcbI != -1 || right_recurseI != -1 ) && right_recurseI < (int)adjacencies.size() ){
00752 int64 l_end = lcbI == -1 ? 1 : adjacencies[ lcbI ].right_end[ seqI ];
00753 int64 r_end = right_recurseI == -1 ? seq_table[ seqI ]->length() : adjacencies[ right_recurseI ].left_end[ seqI ];
00754
00755
00756 if( !entire_genome && right_recurseI == -1 )
00757 break;
00758
00759 l_end = absolut( l_end );
00760 r_end = absolut( r_end );
00761
00762 if( l_end > r_end && !( r_end + 1 == l_end && right_recurseI == -1 ) ){
00763 std::cerr << "Overlapping LCBs. lcbI " << lcbI << " right_recurseI " << right_recurseI << endl;
00764 std::cerr << "lend: " << l_end << " rend: " << r_end << endl;
00765 l_end = r_end;
00766
00767 }
00768
00769 lcbI = right_recurseI;
00770 if( right_recurseI != -1 )
00771 right_recurseI = adjacencies[ right_recurseI ].right_adjacency[ seqI ];
00772 if( r_end + 1 == l_end && right_recurseI == -1 )
00773 continue;
00774 seq_len += r_end - l_end;
00775 iv_regions[ seqI ].push_back( l_end );
00776 iv_regions[ seqI ].push_back( r_end );
00777 }
00778 if( debug_lcb_extension )
00779 std::cerr << "seqI " << seqI << " seq_len: " << seq_len << endl;
00780 }
00781
00782 }
00783
00784 void SearchLCBGaps( MatchList& new_matches, const std::vector< std::vector< int64 > >& iv_regions, MaskedMemHash& nway_mh ) {
00785 if( iv_regions.size() == 0 )
00786 return;
00787 size_t sI = 0;
00788 for( ; sI < iv_regions.size(); sI++ )
00789 if( iv_regions[sI].size() > 0 )
00790 break;
00791 if( sI == iv_regions.size() )
00792 return;
00793
00794 boolean debug_lcb_extension = false;
00796 const uint seq_count = new_matches.seq_table.size();
00797 uint seqI = 0;
00798 int lcbI = 0;
00799 MatchList gap_list;
00800 gap_list.seq_table = vector< gnSequence* >( seq_count );
00801 gap_list.sml_table = vector< SortedMerList* >( seq_count );
00802
00803
00804 for( seqI = 0; seqI < seq_count; seqI++ ){
00805 gap_list.seq_table[ seqI ] = new gnSequence();
00806 gap_list.sml_table[ seqI ] = new DNAMemorySML();
00807 gnSeqI seq_len = 0;
00808 for( size_t ivI = 0; ivI < iv_regions[seqI].size(); ivI += 2 )
00809 {
00810 int64 l_end = iv_regions[seqI][ivI];
00811 int64 r_end = iv_regions[seqI][ivI+1];
00812 try{
00813 if( debug_lcb_extension )
00814 cerr << "Adding " << seqI << "\t" << l_end << "\t" << r_end << "\t(" << r_end - l_end << " bp)" << endl;
00815 gap_list.seq_table[ seqI ]->append( new_matches.seq_table[ seqI ]->ToString(r_end - l_end, l_end ) );
00816
00817 }catch(...){
00818 cout << "";
00819 }
00820 seq_len += r_end - l_end;
00821 }
00822 if( debug_lcb_extension )
00823 cerr << "seqI " << seqI << " seq_len: " << seq_len << endl;
00824 }
00825
00826
00827
00828
00829
00830 gnSeqI total_iv_length = 0;
00831 for( seqI = 0; seqI < seq_count; seqI++ ){
00832 total_iv_length += gap_list.seq_table[ seqI ]->length();
00833
00834
00835
00836 }
00837 total_iv_length /= seq_count;
00838
00839 uint search_mer_size = getDefaultSeedWeight( total_iv_length );
00840 if( search_mer_size < MIN_DNA_SEED_WEIGHT )
00841 return;
00842 uint64 default_seed = getSeed( search_mer_size );
00843
00844
00845 vector< boost::filesystem::path > delete_files;
00846 boolean create_succeeded = true;
00847 for( seqI = 0; seqI < seq_count; seqI++ ){
00848 gap_list.sml_table[ seqI ]->Clear();
00849 try{
00850 if( debug_lcb_extension )
00851 cerr << "Creating memory SML for seqI " << seqI << endl;
00852 gap_list.sml_table[ seqI ]->Create( *(gap_list.seq_table[ seqI ]), default_seed );
00853 }catch(...){
00854 create_succeeded = false;
00855 break;
00856 }
00857 }
00858 if( !create_succeeded ){
00859
00860 for( seqI = 0; seqI < seq_count; seqI++ ){
00861 gap_list.sml_table[ seqI ]->Clear();
00862 delete gap_list.sml_table[ seqI ];
00863 }
00864
00865 for( seqI = 0; seqI < seq_count; seqI++ ){
00866 cerr << "Creating dmSML for seqI " << seqI << endl;
00867
00868
00869 string concat_file = CreateTempFileName("seqconcat");
00870
00871 concat_file += ".raw";
00872 gnRAWSource::Write( *(gap_list.seq_table[ seqI ]), concat_file.c_str() );
00873 delete_files.push_back( concat_file );
00874 delete gap_list.seq_table[ seqI ];
00875 cerr << "Wrote raw sequence for seqI " << seqI << endl;
00876 gap_list.seq_table[ seqI ] = new gnSequence();
00877 gap_list.seq_table[ seqI ]->LoadSource( concat_file.c_str() );
00878 cerr << "Loaded sequence " << seqI << gap_list.seq_table[ seqI ]->length() << "b.p.\n";
00879 string sml_file = CreateTempFileName("dmsml");
00880 DNAFileSML* sml = new DNAFileSML( sml_file.c_str() );
00881 gap_list.sml_table[ seqI ] = sml;
00882 sml->dmCreate( *(gap_list.seq_table[ seqI ]), default_seed );
00883 delete_files.push_back( sml_file );
00884 delete_files.push_back( sml_file + ".coords" );
00885 }
00886 }
00887
00888
00889 nway_mh.Clear();
00890 nway_mh.FindMatches( gap_list );
00891 gap_list.MultiplicityFilter( seq_count );
00892
00893
00894
00895 for( seqI = 0; seqI < seq_count; seqI++ ){
00896 gap_list.sml_table[ seqI ]->Clear();
00897 delete gap_list.sml_table[ seqI ];
00898 }
00899
00900 if( debug_lcb_extension ){
00901 ofstream debug_extension_out( "new_extension_matches.txt" );
00902 WriteList( gap_list, debug_extension_out );
00903 debug_extension_out.close();
00904 }
00905
00906
00907
00908
00909
00910 if( !create_succeeded ){
00911 for( seqI = 0; seqI < seq_count; seqI++ )
00912 transposeMatches( gap_list, seqI, ((FileSML*)gap_list.sml_table[ seqI ])->getUsedCoordinates() );
00913 }
00914
00915
00916
00917 for( seqI = 0; seqI < seq_count; seqI++ )
00918 transposeMatches( gap_list, seqI, iv_regions[ seqI ] );
00919
00920 EliminateOverlaps( gap_list );
00921 gap_list.MultiplicityFilter( seq_count );
00922
00923 gap_list.LengthFilter( MIN_ANCHOR_LENGTH );
00924
00925
00926 for( seqI = 0; seqI < seq_count; seqI++ )
00927 delete gap_list.seq_table[ seqI ];
00928
00929 for( int delI = 0; delI < delete_files.size(); delI++ )
00930 boost::filesystem::remove( delete_files[delI] );
00931
00932 new_matches.insert( new_matches.end(), gap_list.begin(), gap_list.end() );
00933 }
00934
00935
00936
00937 class MatchLeftEndComparator {
00938 public:
00939 MatchLeftEndComparator( unsigned seq = 0 ){
00940 m_seq = seq;
00941 }
00942 MatchLeftEndComparator( MatchLeftEndComparator& msc ){
00943 m_seq = msc.m_seq;
00944 }
00945
00946 boolean operator()(const AbstractMatch* a, const AbstractMatch* b) const{
00947 int32 start_diff = max( a->FirstStart(), m_seq ) - max( b->FirstStart(), m_seq );
00948 if(start_diff == 0){
00949 uint32 m_count = a->SeqCount();
00950 m_count = m_count <= b->SeqCount() ? m_count : b->SeqCount();
00951 for(uint32 seqI = m_seq; seqI < m_count; seqI++){
00952 int64 a_start = absolut( a->Start( seqI ) ), b_start = absolut( b->Start( seqI ) );
00953 int64 diff = a_start - b_start;
00954 if(a_start == (int64)NO_MATCH || b_start == (int64)NO_MATCH)
00955 continue;
00956 else if(diff == 0)
00957 continue;
00958 else
00959 return diff < 0;
00960 }
00961 }
00962 return start_diff < 0;
00963 }
00964 private:
00965 unsigned m_seq;
00966 };
00967
00973 void transposeMatches( MatchList& mlist, uint seqI, const vector< int64 >& seq_regions ){
00974 if( seq_regions.size() < 2 )
00975 return;
00976
00977 uint matchI = 0;
00978 MatchLeftEndComparator msc( seqI );
00979 sort( mlist.begin(), mlist.end(), msc );
00980 uint regionI = 0;
00981 gnSeqI region_sum = seq_regions[ 1 ] - seq_regions[ 0 ];
00982 gnSeqI region_start_sum = 0;
00983 MatchList new_matches;
00984
00985 for( ; matchI < mlist.size(); matchI++ ){
00986
00987 int64 trans_start = mlist[ matchI ]->Start( seqI );
00988 int64 iv_orig_start = trans_start;
00989 if( trans_start == 0 )
00990 continue;
00991 while( region_sum < absolut( trans_start ) && regionI + 2 < seq_regions.size() ){
00992 regionI += 2;
00993 region_start_sum = region_sum;
00994 region_sum += seq_regions[ regionI + 1 ] - seq_regions[ regionI ];
00995 }
00996
00997 if( trans_start < 0 )
00998 trans_start = -seq_regions[ regionI ] - ( -trans_start - region_start_sum ) + 1;
00999 else if( trans_start > 0 )
01000 trans_start = seq_regions[ regionI ] + ( trans_start - region_start_sum ) - 1;
01001
01002 int64 trans_end = mlist[ matchI ]->Start( seqI );
01003 trans_end += trans_end > 0 ? mlist[ matchI ]->Length() - 1: -mlist[ matchI ]->Length() + 1;
01004
01005 mlist[ matchI ]->SetStart( seqI, trans_start );
01006
01007
01008 gnSeqI end_region_sum = region_sum;
01009 gnSeqI end_prev_sum = region_start_sum;
01010 uint end_regionI = regionI;
01011 Match* cur_match = mlist[ matchI ];
01012 while( end_region_sum < absolut( trans_end ) && end_regionI + 2 < seq_regions.size() ){
01013 end_regionI += 2;
01014
01015 Match* left_match = new Match( *cur_match );
01016
01017 if( left_match->Start( seqI ) < 0 ){
01018 cur_match->CropStart( absolut( iv_orig_start ) + left_match->Length() - end_region_sum - 1);
01019 left_match->CropEnd( cur_match->Length() );
01020 }else{
01021 cur_match->CropEnd( absolut( iv_orig_start ) + left_match->Length() - end_region_sum - 1);
01022 left_match->CropStart( cur_match->Length() );
01023 }
01024
01025 iv_orig_start += iv_orig_start > 0 ? cur_match->Length(): -cur_match->Length();
01026
01027 if( trans_start < 0 )
01028 trans_start = -seq_regions[ end_regionI ] - ( -iv_orig_start - end_region_sum ) + 1;
01029 else if( trans_start > 0 )
01030 trans_start = seq_regions[ end_regionI ] + ( iv_orig_start - end_region_sum ) - 1;
01031
01032 left_match->SetStart( seqI, trans_start );
01033
01034 cur_match = left_match;
01035 new_matches.push_back( left_match );
01036
01037 end_prev_sum = end_region_sum;
01038 end_region_sum += seq_regions[ end_regionI + 1 ] - seq_regions[ end_regionI ];
01039
01040 }
01041
01042
01043 }
01044
01045
01046 mlist.insert( mlist.end(), new_matches.begin(), new_matches.end() );
01047 }
01048
01049 void ComputeLCBs( MatchList& meml, set<uint>& breakpoints, vector<MatchList>& lcb_list, vector<int64>& weights ){
01050
01051
01052 if( breakpoints.size() < 1 )
01053 return;
01054
01055 lcb_list.clear();
01056 weights.clear();
01057
01058
01059
01060 set<uint>::iterator break_iter = breakpoints.begin();
01061 uint prev_break = 0;
01062 MatchList lcb = meml;
01063 for( ; break_iter != breakpoints.end(); break_iter++ ){
01064 lcb.clear();
01065 lcb.insert( lcb.begin(), meml.begin() + prev_break, meml.begin() + *break_iter + 1 );
01066 prev_break = *break_iter + 1;
01067
01068
01069 uint64 coverage;
01070 GetLCBCoverage( lcb, coverage );
01071 weights.push_back( coverage );
01072
01073
01074 lcb_list.push_back( lcb );
01075 }
01076 }
01077
01078 void Aligner::Recursion( MatchList& r_list, Match* r_begin, Match* r_end, boolean nway_only ){
01079 try{
01080 gnSeqI gap_size = 0;
01081 uint seqI = 0;
01082
01083 boolean create_ok = true;
01084
01085
01086 MatchList gap_list;
01087
01088 gap_list.seq_table.reserve( seq_count );
01089 gap_list.sml_table.reserve( seq_count );
01090 vector< int64 > starts;
01091 uint below_cutoff_count = 0;
01092
01093
01094
01095 for( seqI = 0; seqI < seq_count; seqI++ ){
01096 int64 gap_end = 0;
01097 int64 gap_start = 0;
01098 getInterveningCoordinates( r_list.seq_table, r_begin, r_end, seqI, gap_start, gap_end );
01099 if( (r_end && r_end->Start( seqI ) == NO_MATCH) ||
01100 (r_begin && r_begin->Start( seqI ) == NO_MATCH )){
01101 below_cutoff_count++;
01102 cerr << "It's screwed up\n";
01103 gap_list.seq_table.push_back( new gnSequence() );
01104 gap_list.sml_table.push_back( new DNAMemorySML() );
01105 continue;
01106 }
01107 if( gap_end < 0 && gap_start > 0 ){
01108 create_ok = false;
01109 cerr << "It's screwed up 2\n";
01110 break;
01111 }else if( gap_end < 0 && gap_start > 0 ){
01112 cerr << "It's screwed up 3\n";
01113 create_ok = false;
01114 break;
01115 }
01116 int64 diff = gap_end - gap_start;
01117 diff = 0 < diff ? diff : 0;
01118 gap_size = diff < gap_size ? gap_size : diff;
01119
01120 if( gap_start == 0 )
01121 cerr << "scheiss\n";
01122
01123 if( debug )
01124 cout << r_list.seq_table[ seqI ]->length() << endl;
01125
01126 if( diff < min_recursive_gap_length )
01127 below_cutoff_count++;
01128 starts.push_back( gap_start );
01129 gnSequence* new_seq = new gnSequence( r_list.seq_table[ seqI ]->subseq( gap_start, diff ) );
01130 gap_list.seq_table.push_back( new_seq );
01131 gap_list.sml_table.push_back( new DNAMemorySML() );
01132 }
01133
01134
01135
01136 if( below_cutoff_count + 1 < seq_count ){
01137 if( nway_only )
01138 nway_mh.Clear();
01139 else
01140 gap_mh.get().Clear();
01141
01142 multimap< uint, uint > mer_sizes;
01143
01144 for( seqI = 0; seqI < seq_count; seqI++ ){
01145 uint search_mer_size = getDefaultSeedWeight( gap_list.seq_table[ seqI ]->length() );
01146 mer_sizes.insert( multimap< uint, uint >::value_type( search_mer_size, seqI ) );
01147 }
01148 multimap< uint, uint >::iterator mer_iter = mer_sizes.end();
01149 mer_iter--;
01150 vector< uint > search_seqs;
01151 while( mer_iter != mer_sizes.end() ){
01152 uint prev_mer = mer_iter->first;
01153 uint new_seqs = 0;
01154 while( true ){
01155 if( mer_iter->first < MIN_DNA_SEED_WEIGHT )
01156 break;
01157 if( mer_iter->first == prev_mer || search_seqs.size() < 2 ){
01158 search_seqs.push_back( mer_iter->second );
01159 new_seqs++;
01160 if( mer_iter == mer_sizes.begin() ){
01161 mer_iter = mer_sizes.end();
01162 break;
01163 }
01164 prev_mer = mer_iter->first;
01165 mer_iter--;
01166 }else
01167 break;
01168 }
01169
01170 if( search_seqs.size() < 2 )
01171 break;
01172
01173
01174
01175
01176
01177
01178 uint64 default_seed = getSeed( prev_mer );
01179 if( prev_mer < MIN_DNA_SEED_WEIGHT )
01180 break;
01181 for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ ){
01182 gap_list.sml_table[ seqI ]->Clear();
01183 gap_list.sml_table[ seqI ]->Create( *(gap_list.seq_table[ seqI ]), default_seed );
01184 }
01185
01186
01187
01188 MatchList cur_mems = gap_list;
01189 cur_mems.clear();
01190 if( nway_only ){
01191
01192 if( search_seqs.size() < seq_count )
01193 continue;
01194 nway_mh.ClearSequences();
01195 nway_mh.FindMatches( cur_mems );
01196 }else{
01197 gap_mh.get().ClearSequences();
01198 gap_mh.get().FindMatches( cur_mems );
01199 }
01200 for( size_t mI = 0; mI < cur_mems.size(); ++mI )
01201 cur_mems[mI]->Free();
01202 cur_mems.clear();
01203 }
01204 if( nway_only )
01205 nway_mh.GetMatchList( gap_list );
01206 else
01207 gap_mh.get().GetMatchList( gap_list );
01208
01209
01210
01211 EliminateOverlaps( gap_list );
01212
01213 if( nway_only )
01214 gap_list.MultiplicityFilter( seq_count );
01215
01216
01217
01218 gap_list.LengthFilter( MIN_ANCHOR_LENGTH );
01219
01220
01221
01222 if( gap_list.size() > 0 && create_ok ){
01223
01224
01225
01226
01227
01228
01229
01230
01231
01232
01233
01234
01235
01236
01237 vector< Match* >::iterator mum_iter = gap_list.begin();
01238 for( ; mum_iter != gap_list.end(); ){
01239 boolean add_ok = true;
01240 for( uint seqI = 0; seqI < (*mum_iter)->SeqCount(); ++seqI ){
01241 int64 gap_start;
01242 if( (*mum_iter)->Start( seqI ) == NO_MATCH )
01243 continue;
01244 else if( (*mum_iter)->Start( seqI ) < 0 ){
01245 gap_start = r_begin != NULL ? -r_begin->End( seqI ) : 0;
01246 if( gap_start > 0 )
01247
01248 gap_start = r_end != NULL ? r_end->Start( seqI ) - r_end->Length() + 1 : 0;
01249 else if( r_begin )
01250 add_ok = false;
01251 (*mum_iter)->SetStart( seqI, (*mum_iter)->Start( seqI ) + gap_start );
01252 }else{
01253
01254 gap_start = r_begin != NULL ? r_begin->End( seqI ) : 0;
01255 if( gap_start < 0 ){
01256 gap_start = r_end != NULL ? r_end->Start( seqI ) - r_end->Length() + 1 : 0;
01257 add_ok = false;
01258 }
01259 (*mum_iter)->SetStart( seqI, (*mum_iter)->Start( seqI ) + gap_start );
01260 }
01261 }
01262 if( add_ok )
01263 r_list.push_back( *mum_iter );
01264 else{
01265 (*mum_iter)->Free();
01266 (*mum_iter) = NULL;
01267 }
01268 ++mum_iter;
01269 }
01270
01271
01272 }
01273 }
01274
01275 for( uint seqI = 0; seqI < gap_list.seq_table.size(); ++seqI )
01276 delete gap_list.seq_table[ seqI ];
01277 for( uint seqI = 0; seqI < gap_list.sml_table.size(); ++seqI )
01278 delete gap_list.sml_table[ seqI ];
01279
01280 gap_list.seq_table.clear();
01281 gap_list.sml_table.clear();
01282
01283 }catch( gnException& gne ){
01284 cerr << gne << endl;
01285 }catch( exception& e ){
01286 cerr << e.what() << endl;
01287 }catch(...){
01288 cerr << "When I say 'ohhh' you say 'shit'!\n";
01289 }
01290 }
01291
01292
01293 void AlignLCBInParallel( bool collinear_genomes, mems::GappedAligner* gal, MatchList& mlist, Interval& iv, AlnProgressTracker& apt )
01294 {
01295
01296 if( !collinear_genomes && mlist.size() < 2 ){
01297 iv.SetMatches( mlist );
01298 return;
01299 }
01300 size_t galI = 0;
01301 vector<GappedAlignment*> gapped_alns(mlist.size()+1, NULL);
01302 vector<int> success(gapped_alns.size(), 0);
01303 gnSeqI progress_base = apt.cur_leftend;
01304
01305 for( int mI = 0; mI < mlist.size()-1; mI++ )
01306 {
01307
01308 GappedAlignment ga(mlist.seq_table.size(),0);
01309 gapped_alns[mI] = ga.Copy();
01310
01311 bool align_success = gal->Align( *(gapped_alns[mI]), mlist[mI], mlist[mI+1], mlist.seq_table );
01312 if(align_success)
01313 success[mI] = 1;
01314 if(mI % 50 == 0 && mI > 0)
01315 {
01316
01317 int done = 0;
01318 for( int i = 0; i < gapped_alns.size(); i++ )
01319 if(gapped_alns[i] != NULL)
01320 done++;
01321
01322 {
01323 double cur_progress = ((double)(progress_base+done) / (double)apt.total_len)*100.0;
01324 printProgress((uint)apt.prev_progress, (uint)cur_progress, cout);
01325 apt.prev_progress = cur_progress;
01326 }
01327 }
01328 }
01329 apt.cur_leftend += mlist.size()-1;
01330
01331
01332 vector<AbstractMatch*> merged(mlist.size()*2 + 1);
01333 size_t mlistI = 0;
01334 size_t gappedI = 0;
01335 bool turn = true;
01336 size_t mJ = 0;
01337
01338
01339 if(collinear_genomes)
01340 {
01341 GappedAlignment ga_tmp(mlist.seq_table.size(),0);
01342 GappedAlignment* ga = ga_tmp.Copy();
01343 bool align_success = gal->Align( *ga, NULL, mlist[0], mlist.seq_table );
01344 if(align_success)
01345 merged[mJ++] = ga;
01346 gapped_alns[mlist.size()] = ga_tmp.Copy();
01347 align_success = gal->Align( *(gapped_alns[mlist.size()]), mlist.back(), NULL, mlist.seq_table );
01348 if(align_success)
01349 success[mlist.size()] = 1;
01350 }
01351 for( ; mJ < merged.size() && mlistI < mlist.size(); )
01352 {
01353 if(turn)
01354 merged[mJ++] = mlist[mlistI++];
01355 else if(success[gappedI])
01356 merged[mJ++] = gapped_alns[gappedI++];
01357 else
01358 gappedI++;
01359 turn = !turn;
01360 }
01361
01362 if( success[mlist.size()]==1 )
01363 merged[mJ++] = gapped_alns.back();
01364 merged.resize(mJ);
01365
01366 iv.SetMatches(merged);
01367 }
01368
01369
01370 void Aligner::AlignLCB( MatchList& mlist, Interval& iv ){
01371
01372 if( !collinear_genomes && mlist.size() < 2 ){
01373 iv.SetMatches( mlist );
01374 return;
01375 }
01376
01377 vector< AbstractMatch* > iv_matches;
01378 boolean debug_recurse = false;
01379 int64 config_value = 138500;
01380 int print_interval = 50;
01381 try{
01382 list< Match* > match_list;
01383 match_list.insert( match_list.end(), mlist.begin(), mlist.end() );
01384 mlist.clear();
01385 MatchList r_list = mlist;
01386
01387 list< Match* >::iterator recurse_iter = match_list.begin();
01388 list< Match* >::iterator recurse_prev = match_list.begin();
01389
01390 while( recurse_prev != match_list.end() && (*recurse_prev)->Multiplicity() != seq_count )
01391 ++recurse_prev;
01392
01393 recurse_iter = recurse_prev;
01394 if( !collinear_genomes ){
01395 if( recurse_iter != match_list.end() )
01396 ++recurse_iter;
01397 while( recurse_iter != match_list.end() && (*recurse_iter)->Multiplicity() != seq_count )
01398 ++recurse_iter;
01399 }else
01400 cout << "Assuming collinear genomes...\n";
01401
01402 uint memI = 0;
01403 uint matchI = 0;
01404 while( true ){
01405 if( memI >= print_interval && memI % print_interval == 0 || debug)
01406 cout << "Number: " << memI << " match " << **recurse_prev << endl;
01407 ++memI;
01408 if( debug_recurse ){
01409 cout << "Recursing on " << endl;
01410 if( recurse_prev != match_list.end() )
01411 cout << **recurse_prev << " and " << endl;
01412 if( recurse_iter != match_list.end() )
01413 cout << **recurse_iter << endl;
01414 }
01415
01416 if( recurse_prev != match_list.end() && (*recurse_prev)->Start( 0 ) == config_value )
01417 cout << "";
01418
01419
01420
01421
01422 r_list.clear();
01423 GappedAlignment* cr = NULL;
01424 boolean align_success = false;
01425
01426 Match* r_lend = NULL;
01427 Match* r_rend = NULL;
01428 if( recurse_iter != recurse_prev )
01429 r_lend = *recurse_prev;
01430 if( recurse_iter != match_list.end() )
01431 r_rend = *recurse_iter;
01432
01433
01434 cr = new GappedAlignment();
01435 align_success = gal->Align( *cr, r_lend, r_rend, r_list.seq_table );
01436
01437
01438 if( r_lend != NULL )
01439 iv_matches.push_back( r_lend );
01440 if( align_success )
01441 iv_matches.push_back( cr );
01442
01443
01444 recurse_prev = recurse_iter;
01445 if( recurse_iter != match_list.end() )
01446 ++recurse_iter;
01447 while( recurse_iter != match_list.end() && (*recurse_iter)->Multiplicity() != seq_count )
01448 ++recurse_iter;
01449
01450 if( ( recurse_iter == match_list.end() && !collinear_genomes ) ||
01451 ( recurse_prev == match_list.end() && collinear_genomes ) )
01452 break;
01453 }
01454
01455 list< Match* >::iterator iter = recurse_prev;
01456 for( ; iter != recurse_iter; ++iter )
01457 iv_matches.push_back(*iter);
01458
01459 mlist.insert( mlist.end(), match_list.begin(), match_list.end() );
01460 iv.SetMatches(iv_matches);
01461
01462 }catch( gnException& gne ){
01463 cerr << gne << endl;
01464 }catch(exception& e){
01465 cerr << e.what();
01466 }catch(...){
01467 cerr << "matrix exception?\n";
01468 }
01469 }
01470
01471
01472 void Aligner::SearchWithinLCB( MatchList& mlist, std::vector< search_cache_t >& new_cache, bool leftmost, bool rightmost){
01473
01474 if( !(leftmost || rightmost) && mlist.size() < 2 )
01475 return;
01476
01477 boolean debug_recurse = false;
01478 int64 config_value = 138500;
01479 int print_interval = 50;
01480
01481 try{
01482 list< Match* > match_list;
01483 match_list.insert( match_list.end(), mlist.begin(), mlist.end() );
01484 mlist.clear();
01485 MatchList r_list = mlist;
01486
01487 list< Match* >::iterator recurse_iter = match_list.begin();
01488 list< Match* >::iterator recurse_prev = match_list.begin();
01489 if( !leftmost && recurse_iter != match_list.end() )
01490 ++recurse_iter;
01491
01492 uint memI = 0;
01493 uint matchI = 0;
01494 while( recurse_prev != match_list.end() ){
01495 if( memI >= print_interval && memI % print_interval == 0 || debug)
01496 cout << "Number: " << memI << " match " << **recurse_prev << endl;
01497 ++memI;
01498 if( debug_recurse ){
01499 cout << "Recursing on " << endl;
01500 if( recurse_prev != match_list.end() )
01501 cout << **recurse_prev << " and " << endl;
01502 if( recurse_iter != match_list.end() )
01503 cout << **recurse_iter << endl;
01504 }
01505
01506
01507
01508
01509
01510 r_list.clear();
01511 Match* r_left = NULL;
01512 Match* r_right = NULL;
01513 if( recurse_iter == match_list.begin() && leftmost ){
01514 r_left = NULL;
01515 r_right = *recurse_iter;
01516 }else if( recurse_iter == match_list.end() && rightmost ){
01517 r_left = *recurse_prev;
01518 r_right = NULL;
01519 }else{
01520 r_left = *recurse_prev;
01521 r_right = *recurse_iter;
01522 }
01523
01524
01525 search_cache_t cacheval = make_pair( r_left, r_right );
01526 if( cacheval.first != NULL )
01527 cacheval.first = cacheval.first->Copy();
01528 if( cacheval.second != NULL )
01529 cacheval.second = cacheval.second->Copy();
01530 std::vector< search_cache_t >::iterator cache_entry = std::upper_bound( search_cache.begin(), search_cache.end(), cacheval, cache_comparator );
01531 if( cache_entry == search_cache.end() ||
01532 (cache_comparator( cacheval, *cache_entry ) || cache_comparator( *cache_entry, cacheval )) )
01533 {
01534
01535 Recursion( r_list, r_left, r_right, true );
01536 }
01537 new_cache.push_back( cacheval );
01538
01539 if( debug_recurse ){
01540 vector< Match* >::iterator r_iter = r_list.begin();
01541 cout << "Found matches " << endl;
01542 for(; r_iter != r_list.end(); ++r_iter )
01543 cout << **r_iter << endl;
01544 }
01545
01546
01547 for( matchI = 0; matchI < r_list.size(); ++matchI ){
01548 if( r_list[ matchI ]->Multiplicity() == seq_count ){
01549 match_list.insert( recurse_iter, r_list[ matchI ] );
01550 }else
01551 {
01552 r_list[matchI]->Free();
01553 r_list[matchI] = NULL;
01554 }
01555 }
01556
01557
01558 recurse_prev = recurse_iter;
01559 if( recurse_iter != match_list.end() )
01560 ++recurse_iter;
01561
01562
01563 if( !rightmost && recurse_iter == match_list.end() )
01564 break;
01565
01566 }
01567
01568 mlist.insert( mlist.begin(), match_list.begin(), match_list.end() );
01569
01570 }catch( gnException& gne ){
01571 cerr << gne << endl;
01572 }catch(exception& e){
01573 cerr << e.what();
01574 }catch(...){
01575 cerr << "matrix exception?\n";
01576 }
01577
01578
01579 mlist.MultiplicityFilter( seq_count );
01580 EliminateOverlaps( mlist );
01581
01582 mlist.MultiplicityFilter( seq_count );
01583 }
01584
01585 void Aligner::consistencyCheck( uint lcb_count, vector< LCB >& adjacencies, vector< MatchList >& lcb_list, vector< int64 >& weights ){
01586 vector< LCB > tmp_adj = adjacencies;
01587 vector< MatchList > tmp_lcbs = lcb_list;
01588 vector< int64 > tmp_weights = weights;
01589 filterMatches( tmp_adj, tmp_lcbs, tmp_weights );
01590 MatchList emmlist;
01591 for( uint lcbI = 0; lcbI < tmp_lcbs.size(); lcbI++ )
01592 emmlist.insert( emmlist.end(), tmp_lcbs[ lcbI ].begin(), tmp_lcbs[ lcbI ].end() );
01593 set< uint > breakpoints;
01594 AaronsLCB( emmlist, breakpoints );
01595
01596
01597 if( lcb_count != tmp_lcbs.size() ){
01598 cerr << "lcb_count: " << lcb_count << "\ttmp_lcbs.size(): " << tmp_lcbs.size() << endl;
01599 }
01600 if( lcb_count != breakpoints.size() ){
01601 cerr << "lcb_count: " << lcb_count << "\tbreakpoints.size(): " << breakpoints.size() << endl;
01602 }
01603 if( tmp_lcbs.size() != breakpoints.size() ){
01604 cerr << "tmp_lcbs.size(): " << tmp_lcbs.size() << "\tbreakpoints.size(): " << breakpoints.size() << endl;
01605 }
01606 }
01607
01608
01615 int64 greedyBreakpointElimination( gnSeqI minimum_weight, vector< LCB >& adjacencies, vector< int64 >& weights, ostream* status_out ){
01616
01617 uint lcbI = 0;
01618 vector< uint > low_weight;
01619 bool have_weight = false;
01620 gnSeqI min_weight = 0;
01621 gnSeqI prev_min_weight = 0;
01622 uint min_lcb = 0;
01623 uint lcb_count = adjacencies.size();
01624 boolean debug_bp_elimination = false;
01625 uint current_lcbI = 0;
01627 if( adjacencies.size() == 0 )
01628 return 0;
01629 uint seq_count = adjacencies[0].left_end.size();
01630
01631 while( min_weight < minimum_weight ){
01632 if( lcb_count == 1 )
01633 break;
01634
01635 while(true){
01636 have_weight = false;
01637 min_weight = 0;
01638 current_lcbI = 0;
01639
01640
01641 for( lcbI = current_lcbI; lcbI < weights.size(); lcbI++ ){
01642 if( adjacencies[ lcbI ].lcb_id != lcbI ){
01643
01644 continue;
01645 }
01646 if( weights[ lcbI ] < min_weight || !have_weight ){
01647 min_weight = weights[ lcbI ];
01648 min_lcb = lcbI;
01649 have_weight = true;
01650 if( min_weight == prev_min_weight && current_lcbI > 0 )
01651 break;
01652
01653 }
01654 }
01655 lcbI = min_lcb;
01656 have_weight = false;
01657
01658 if( prev_min_weight != min_weight ){
01659 if( status_out != NULL )
01660 *status_out << "There are " << lcb_count << " LCBs with minimum weight " << min_weight << endl;
01661
01662 current_lcbI = 0;
01663 prev_min_weight = min_weight;
01664 continue;
01665 }
01666
01667
01668 current_lcbI = min_lcb;
01669 break;
01670 }
01671
01672
01673 if( min_weight >= minimum_weight )
01674 break;
01675
01676
01677
01678
01679
01680 adjacencies[ lcbI ].lcb_id = -2;
01681
01682
01683 uint seqI;
01684 uint left_adj;
01685 uint right_adj;
01686 for( seqI = 0; seqI < seq_count; seqI++ ){
01687 left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
01688 right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
01689 if( debug_bp_elimination ){
01690 if( left_adj == -2 || right_adj == -2 ){
01691 cerr << "improper linking\n";
01692 }
01693
01694 if( left_adj != -1 && adjacencies[ left_adj ].right_adjacency[ seqI ] != lcbI )
01695 cerr << "Mutiny on the bounty!\n";
01696
01697 if( right_adj == adjacencies.size() )
01698 cerr << "Horrible Error -399a\n";
01699 if( right_adj != -1 && adjacencies[ right_adj ].left_adjacency[ seqI ] != lcbI )
01700 cerr << "Mutiny on the bounty!\n";
01701 }
01702 if( left_adj != -1 )
01703 adjacencies[ left_adj ].right_adjacency[ seqI ] = right_adj;
01704 if( right_adj != -1 && right_adj != adjacencies.size() )
01705 adjacencies[ right_adj ].left_adjacency[ seqI ] = left_adj;
01706
01707 }
01708
01709 lcb_count--;
01710
01711
01712 for( seqI = 0; seqI < seq_count; seqI++ ){
01713 left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
01714 right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
01715 if( left_adj == -1 || right_adj == -1 )
01716 continue;
01717
01718 if( debug_bp_elimination ){
01719 if( right_adj == adjacencies.size() )
01720 cerr << "Horrible Error -399a\n";
01721
01722 if( left_adj != adjacencies[ left_adj ].lcb_id ||
01723 right_adj != adjacencies[ right_adj ].lcb_id ){
01724
01725
01726
01727 cerr << "improper linking\n";
01728 continue;
01729 }
01730 if( left_adj == -2 || right_adj == -2 ){
01731 cerr << "improper linking\n";
01732 }
01733 }
01734
01735
01736 boolean orientation = adjacencies[ left_adj ].left_end[ seqI ] > 0 ? true : false;
01737 uint seqJ;
01738 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
01739 boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
01740 if( j_orientation == orientation &&
01741 adjacencies[ left_adj ].right_adjacency[ seqJ ] != right_adj )
01742 break;
01743 if( j_orientation != orientation &&
01744 adjacencies[ left_adj ].left_adjacency[ seqJ ] != right_adj )
01745 break;
01746
01747 if( adjacencies[ right_adj ].left_end[ seqJ ] > 0 != j_orientation )
01748 break;
01749 }
01750
01751 if( seqJ != seq_count )
01752 continue;
01753
01754
01755
01756
01757 adjacencies[ right_adj ].lcb_id = left_adj;
01758 if( adjacencies[ right_adj ].lcb_id == -1 ||
01759 adjacencies[ right_adj ].lcb_id == -2 )
01760 cerr << "Trouble in the eleventh circle\n";
01761 weights[ left_adj ] += weights[ right_adj ];
01762
01763
01764 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
01765 boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
01766 uint rr_adj = adjacencies[ right_adj ].right_adjacency[ seqJ ];
01767 uint rl_adj = adjacencies[ right_adj ].left_adjacency[ seqJ ];
01768 if( j_orientation == orientation ){
01769 adjacencies[ left_adj ].right_end[ seqJ ] = adjacencies[ right_adj ].right_end[ seqJ ];
01770 adjacencies[ left_adj ].right_adjacency[ seqJ ] = rr_adj;
01771 if( rr_adj == adjacencies.size() )
01772 cerr << "Horrible Error -399a\n";
01773 if( rr_adj != -1 )
01774 adjacencies[ rr_adj ].left_adjacency[ seqJ ] = left_adj;
01775 }else{
01776 adjacencies[ left_adj ].left_end[ seqJ ] = adjacencies[ right_adj ].left_end[ seqJ ];
01777 adjacencies[ left_adj ].left_adjacency[ seqJ ] = rl_adj;
01778 if( rl_adj == adjacencies.size() )
01779 cerr << "Horrible Error -399a\n";
01780 if( rl_adj != -1 )
01781 adjacencies[ rl_adj ].right_adjacency[ seqJ ] = left_adj;
01782 }
01783
01784 if( adjacencies[ lcbI ].left_adjacency[ seqJ ] == right_adj )
01785 adjacencies[ lcbI ].left_adjacency[ seqJ ] = left_adj;
01786 if( adjacencies[ lcbI ].right_adjacency[ seqJ ] == right_adj )
01787 adjacencies[ lcbI ].right_adjacency[ seqJ ] = left_adj;
01788
01789
01790 }
01791
01792 lcb_count--;
01793 }
01794 }
01795 return min_weight;
01796 }
01797
01798 class LCBLeftEndComp
01799 {
01800 public:
01801 LCBLeftEndComp() : ssc(0) {};
01802 bool operator()( const MatchList& a, const MatchList& b )
01803 {
01804 return ssc(a.front(), b.front());
01805 }
01806 protected:
01807 SingleStartComparator<AbstractMatch> ssc;
01808 };
01809
01814 void filterMatches( vector< LCB >& adjacencies, vector< MatchList >& lcb_list, vector< int64 >& weights ){
01815 if( lcb_list.size() < 1 )
01816 return;
01817 MatchList lcb_tmp = lcb_list[ 0 ];
01818 lcb_tmp.clear();
01819 vector< MatchList > filtered_lcbs = vector< MatchList >( lcb_list.size(), lcb_tmp );
01820 uint lcbI;
01821 for( lcbI = 0; lcbI < adjacencies.size(); lcbI++ ){
01822 if( adjacencies[ lcbI ].lcb_id == lcbI ){
01823 filtered_lcbs[ lcbI ].insert( filtered_lcbs[ lcbI ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
01824 continue;
01825 }
01826 if( adjacencies[ lcbI ].lcb_id == -1 ){
01827 cerr << "weird";
01828 continue;
01829 }
01830 if( adjacencies[ lcbI ].lcb_id == -2 )
01831 continue;
01832
01833
01834
01835 stack< uint > visited_lcbs;
01836 visited_lcbs.push( lcbI );
01837 uint cur_lcb = adjacencies[ lcbI ].lcb_id;
01838 while( adjacencies[ cur_lcb ].lcb_id != cur_lcb ){
01839 visited_lcbs.push( cur_lcb );
01840 cur_lcb = adjacencies[ cur_lcb ].lcb_id;
01841 if( cur_lcb == -1 || cur_lcb == -2 ){
01842
01843 break;
01844 }
01845 }
01846 while( visited_lcbs.size() > 0 ){
01847 adjacencies[ visited_lcbs.top() ].lcb_id = cur_lcb;
01848 visited_lcbs.pop();
01849 }
01850
01851 if( cur_lcb != -1 && cur_lcb != -2 )
01852 filtered_lcbs[ cur_lcb ].insert( filtered_lcbs[ cur_lcb ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
01853 }
01854
01855
01856 lcb_list.clear();
01857 vector< int64 > new_weights;
01858 for( lcbI = 0; lcbI < filtered_lcbs.size(); lcbI++ ){
01859 if( filtered_lcbs[ lcbI ].size() > 0 ){
01860 lcb_list.push_back( filtered_lcbs[ lcbI ] );
01861 uint64 wt = 0;
01862 GetLCBCoverage( filtered_lcbs[lcbI], wt );
01863 new_weights.push_back( wt );
01864
01865
01866
01867 }
01868 }
01869
01870
01871 MatchStartComparator<AbstractMatch> msc( 0 );
01872 for( lcbI = 0; lcbI < lcb_list.size(); lcbI++ ){
01873 sort( lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end(), msc );
01874 }
01875
01876
01877 LCBLeftEndComp llec;
01878 std::sort( lcb_list.begin(), lcb_list.end(), llec );
01879
01880
01881 weights = new_weights;
01882 computeLCBAdjacencies_v2( lcb_list, weights, adjacencies );
01883
01884 }
01885
01886 void Aligner::WritePermutation( vector< LCB >& adjacencies, std::string out_filename )
01887 {
01888 ofstream permutation_out( out_filename.c_str() );
01889 if( !permutation_out.is_open() )
01890 {
01891 cerr << "Error opening " << out_filename << endl;
01892 return;
01893 }
01894 for( int seqI = 0; seqI < seq_count; seqI++ )
01895 {
01896
01897 int left_lcb = 0;
01898 for( ; left_lcb < adjacencies.size(); left_lcb++ )
01899 {
01900 uint left_adj = adjacencies[left_lcb].left_adjacency[seqI];
01901 if( left_adj == -1 )
01902 break;
01903 }
01904
01905 for( uint lcbI = left_lcb; lcbI < adjacencies.size(); )
01906 {
01907 if( lcbI != left_lcb )
01908 permutation_out << '\t';
01909 if( adjacencies[lcbI].left_end[seqI] < 0 )
01910 permutation_out << "-";
01911 permutation_out << adjacencies[lcbI].lcb_id;
01912 lcbI = adjacencies[lcbI].right_adjacency[seqI];
01913 }
01914 permutation_out << endl;
01915 }
01916 }
01917
01918 void WritePermutationCoordinates( IntervalList& perm_iv_list, std::string out_filename )
01919 {
01920 ofstream perm_out( out_filename.c_str() );
01921 if( !perm_out.is_open() )
01922 {
01923 cerr << "Error opening \"" << out_filename << "\"\n";
01924 return;
01925 }
01926 perm_out << "#";
01927 for( size_t seqI = 0; seqI < perm_iv_list.seq_table.size(); ++seqI )
01928 {
01929 if( seqI > 0 )
01930 perm_out << '\t';
01931 perm_out << "seq" << seqI << "_leftend\tseq" << seqI << "_rightend";
01932 }
01933 perm_out << endl;
01934 for( size_t ivI = 0; ivI < perm_iv_list.size(); ++ivI )
01935 {
01936 for( size_t seqI = 0; seqI < perm_iv_list.seq_table.size(); ++seqI )
01937 {
01938 if( seqI > 0 )
01939 perm_out << '\t';
01940 if( perm_iv_list[ivI].Orientation(seqI) == AbstractMatch::reverse )
01941 perm_out << '-';
01942 perm_out << perm_iv_list[ivI].LeftEnd(seqI) << '\t';
01943 if( perm_iv_list[ivI].Orientation(seqI) == AbstractMatch::reverse )
01944 perm_out << '-';
01945 perm_out << perm_iv_list[ivI].RightEnd(seqI);
01946 }
01947 perm_out << endl;
01948 }
01949 }
01950
01951 void Aligner::RecursiveAnchorSearch( MatchList& mlist, gnSeqI minimum_weight, vector< MatchList >& LCB_list, boolean entire_genome, ostream* status_out ){
01952
01953
01954
01955
01956 uint lcbI;
01957 set<uint> breakpoints;
01958 vector< int64 > weights;
01959 vector< LCB > adjacencies;
01960 MatchList new_matches;
01961 new_matches.seq_table = mlist.seq_table;
01962 new_matches.seq_filename = mlist.seq_filename;
01963
01964 if( mlist.size() == 0 )
01965 return;
01966
01967 AaronsLCB( mlist, breakpoints );
01968 if( status_out )
01969 *status_out << "The " << mlist.size() << " matches constitute " << breakpoints.size() << " breakpoints\n";
01970
01971 ComputeLCBs( mlist, breakpoints, LCB_list, weights );
01972 uint weightI;
01973 for( weightI = 0; weightI < weights.size(); weightI++ )
01974 if( weights[weightI] < cur_min_coverage || cur_min_coverage == -1 )
01975 cur_min_coverage = weights[weightI];
01976
01977 computeLCBAdjacencies_v2( LCB_list, weights, adjacencies );
01978
01979 int cur_extension_round = 0;
01980 int64 total_weight = 0;
01981 int64 prev_total_weight = 0;
01982 weightI = 0;
01983 vector< vector< int64 > > prev_iv_regions;
01984 do {
01985
01986
01987
01988
01989 int64 extension_weight = total_weight;
01990 int64 prev_extension_weight = total_weight;
01991
01992
01993 if( entire_genome && extend_lcbs && total_weight != 0 &&
01994 cur_extension_round < this->max_extension_iters )
01995 {
01996 cur_extension_round++;
01997 if( status_out )
01998 *status_out << "Performing LCB extension\n";
01999 vector< vector< int64 > > cur_iv_regions;
02000 CreateGapSearchList( adjacencies, new_matches.seq_table, cur_iv_regions, entire_genome );
02001
02002 if( prev_iv_regions != cur_iv_regions )
02003 {
02004 int local_round = 0;
02005 do {
02006 local_round++;
02007
02008 new_matches.clear();
02009 vector< vector< int64 > > new_iv_regions;
02010 CreateGapSearchList( adjacencies, new_matches.seq_table, new_iv_regions, entire_genome );
02011 SearchLCBGaps( new_matches, new_iv_regions, nway_mh );
02012 mlist.insert( mlist.end(), new_matches.begin(), new_matches.end() );
02013
02014 AaronsLCB( mlist, breakpoints );
02015 ComputeLCBs( mlist, breakpoints, LCB_list, weights );
02016 cur_min_coverage = *(std::min_element(weights.begin(), weights.end()));
02017 computeLCBAdjacencies_v2( LCB_list, weights, adjacencies );
02018
02019
02020 prev_extension_weight = extension_weight;
02021 extension_weight = 0;
02022 for( weightI = 0; weightI < weights.size(); weightI++ )
02023 extension_weight += weights[ weightI ];
02024 if( status_out )
02025 *status_out << "Previous weight: " << prev_extension_weight << " new weight: " << extension_weight << endl;
02026 if( prev_extension_weight > extension_weight ){
02027 cerr << "Error! Previous weight: " << prev_extension_weight << " new weight: " << extension_weight << endl;
02028 }
02029 }while( extension_weight > prev_extension_weight && local_round < this->max_extension_iters);
02030 }
02031 swap( prev_iv_regions, cur_iv_regions );
02032 }
02033
02034
02035 if( currently_recursing && total_weight != 0 ){
02036 vector< search_cache_t > new_cache;
02037 for( lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
02038
02039
02040 int prev_size = LCB_list[ lcbI ].size();
02041 bool leftmost = true;
02042 for( int i = 0; leftmost && i < adjacencies[lcbI].left_adjacency.size(); i++ )
02043 if(adjacencies[lcbI].left_adjacency[i] != NO_ADJACENCY)
02044 leftmost = false;
02045 bool rightmost = true;
02046 for( int i = 0; rightmost && i < adjacencies[lcbI].right_adjacency.size(); i++ )
02047 if(adjacencies[lcbI].right_adjacency[i] != NO_ADJACENCY)
02048 rightmost = false;
02049 SearchWithinLCB( LCB_list[ lcbI ], new_cache, leftmost, rightmost );
02050
02051
02052
02053 }
02054
02055
02056 swap( search_cache, new_cache );
02057 for( size_t mI = 0; mI < new_cache.size(); mI++ )
02058 {
02059 if( new_cache[mI].first != NULL )
02060 new_cache[mI].first->Free();
02061 if( new_cache[mI].second != NULL )
02062 new_cache[mI].second->Free();
02063 }
02064 new_cache.clear();
02065 std::sort( search_cache.begin(), search_cache.end(), cache_comparator );
02066 }
02067
02068 mlist.clear();
02069 for( lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
02070 mlist.insert( mlist.end(), LCB_list[ lcbI ].begin(), LCB_list[ lcbI ].end() );
02071 }
02072
02073 if( currently_recursing && total_weight != 0 ){
02074
02075 AaronsLCB( mlist, breakpoints );
02076 ComputeLCBs( mlist, breakpoints, LCB_list, weights );
02077 computeLCBAdjacencies_v2( LCB_list, weights, adjacencies );
02078 cur_min_coverage = *(std::min_element(weights.begin(), weights.end()));
02079 }
02080
02081
02082
02083
02084
02085
02086
02087
02088
02089
02090 int64 cur_perm_weight = permutation_weight != -1 ? permutation_weight : minimum_weight;
02091 do{
02092 vector<double> m_weights(weights.size());
02093 std::copy( weights.begin(), weights.end(), m_weights.begin());
02094 SimpleBreakpointScorer sbs(adjacencies, cur_perm_weight, this->collinear_genomes);
02095 if( status_out )
02096 (*status_out) << "Performing greedy breakpoint elimination (this may take some time)\n";
02097
02098 greedyBreakpointElimination_v4(adjacencies, m_weights, sbs, NULL, false);
02099
02100
02101 filterMatches( adjacencies, LCB_list, weights );
02102 cur_min_coverage = *(std::min_element(weights.begin(), weights.end()));
02103
02104 mlist.clear();
02105 for( lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
02106 mlist.insert( mlist.end(), LCB_list[ lcbI ].begin(), LCB_list[ lcbI ].end() );
02107 }
02108 if( status_out )
02109 *status_out << "Greedy breakpoint elimination leaves " << mlist.size() << " matches constituting " << LCB_list.size() << " LCBs covering at least " << cur_min_coverage << "b.p.\n";
02110
02111 if( permutation_weight != -1 ){
02112
02113 stringstream cur_perm_filename;
02114 cur_perm_filename << permutation_filename << "." << cur_perm_weight / seq_count;
02115
02116 WritePermutation( adjacencies, cur_perm_filename.str() );
02117
02118
02119 cur_perm_filename << ".lcbs";
02120 IntervalList perm_iv_list;
02121 perm_iv_list.seq_filename = mlist.seq_filename;
02122 perm_iv_list.seq_table = mlist.seq_table;
02123 for( int permI = 0; permI < LCB_list.size(); permI++ ){
02124 vector< AbstractMatch* > perm_vector;
02125 perm_vector.push_back( LCB_list[permI].front() );
02126 if( LCB_list[permI].size() > 1 )
02127 perm_vector.push_back( LCB_list[permI].back() );
02128 Interval perm_iv(perm_vector.begin(), perm_vector.end());
02129 perm_iv_list.push_back(perm_iv);
02130 }
02131 WritePermutationCoordinates( perm_iv_list, cur_perm_filename.str() );
02132
02133
02134 vector< int64 >::iterator min_w = std::min_element( weights.begin(), weights.end() );
02135
02136 cur_perm_weight = *min_w + seq_count;
02137 }
02138 }while( cur_perm_weight < minimum_weight );
02139
02140
02141 if( recursive && entire_genome ){
02142 currently_recursing = true;
02143 }
02144
02145
02146 prev_total_weight = total_weight;
02147 total_weight = 0;
02148 for( weightI = 0; weightI < weights.size(); weightI++ )
02149 total_weight += weights[ weightI ];
02150 if( status_out )
02151 *status_out << "Previous weight: " << prev_total_weight << " new weight: " << total_weight << endl;
02152
02153
02154
02155
02156
02157
02158
02159
02160
02161 }while( total_weight != prev_total_weight );
02162
02163
02164 for( size_t mI = 0; mI < search_cache.size(); mI++ )
02165 {
02166 if( search_cache[mI].first != NULL )
02167 search_cache[mI].first->Free();
02168 if( search_cache[mI].second != NULL )
02169 search_cache[mI].second->Free();
02170 }
02171 }
02172
02192 void Aligner::align( MatchList& mlist, IntervalList& interval_list, double LCB_minimum_density, double LCB_minimum_range, boolean recursive, boolean extend_lcbs, boolean gapped_alignment, string tree_filename ){
02193 seq_count = mlist.seq_table.size();
02194 this->LCB_minimum_density = LCB_minimum_density;
02195 this->LCB_minimum_range = LCB_minimum_range;
02196 this->recursive = recursive;
02197 this->currently_recursing = false;
02198 this->extend_lcbs = extend_lcbs;
02199 this->gapped_alignment = gapped_alignment;
02200
02201
02202
02203 this->collinear_genomes = LCB_minimum_range == -1;
02204 if( collinear_genomes )
02205 cout << "\nAssuming collinear genomes...\n";
02206
02207
02208 uint64 nway_mask = 1;
02209 nway_mask <<= seq_count;
02210 nway_mask--;
02211 nway_mh.SetMask( nway_mask );
02212
02213 cout << "Starting with " << mlist.size() << " MUMs\n";
02214
02215
02216
02217
02218
02219 EliminateOverlaps( mlist );
02220 cout << "Eliminating overlaps yields " << mlist.size() << " MUMs\n";
02221
02222
02223
02224
02225
02226 bool guide_tree_loaded = false;
02227 MuscleInterface& mi = MuscleInterface::getMuscleInterface();
02228
02229 if( !guide_tree_loaded && (recursive || tree_filename != "") ){
02230
02231 interval_list.seq_table = mlist.seq_table;
02232 interval_list.seq_filename = mlist.seq_filename;
02233
02234 NumericMatrix< double > distance;
02235 DistanceMatrix( mlist, distance );
02236 if( tree_filename == "" )
02237 tree_filename = CreateTempFileName("guide_tree");
02238 mi.CreateTree( distance, tree_filename );
02239 }
02240
02241
02242
02243
02244
02245 mlist.MultiplicityFilter( seq_count );
02246 cout << "Multiplicity filter gives " << mlist.size() << " MUMs\n";
02247
02248 if( mlist.size() == 0 )
02249 return;
02250
02251
02252
02253
02254 vector< MatchList > LCB_list;
02255 RecursiveAnchorSearch( mlist, (gnSeqI)LCB_minimum_range, LCB_list, true, &cout );
02256
02257
02258
02259
02260
02261 if( gapped_alignment && recursive )
02262 cout << "\nMaking final gapped alignment...\n";
02263 interval_list.clear();
02264 AlnProgressTracker apt;
02265 apt.cur_leftend = 0;
02266 apt.prev_progress = 0;
02267 apt.total_len = 0;
02268 for( uint lcbI = 0; lcbI < LCB_list.size(); lcbI++ )
02269 apt.total_len += LCB_list[lcbI].size()-1;
02270 for( uint lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
02271 Interval new_iv;
02272 interval_list.push_back( new_iv );
02273 Interval& iv = interval_list.back();
02274 if( !gapped_alignment || !recursive ){
02275 iv.SetMatches( LCB_list[lcbI] );
02276 }else{
02277
02278 AlignLCBInParallel( collinear_genomes || (LCB_list.size()==1), gal, LCB_list[ lcbI ], iv, apt );
02279 }
02280 }
02281
02282
02283 if( gapped_alignment )
02284 addUnalignedIntervals( interval_list );
02285 }
02286
02287 }
02288