libMems/Aligner.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: Aligner.cpp,v 1.47 2004/04/19 23:10:30 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * Please see the file called COPYING for licensing, copying, and modification
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #include "libMems/Aligner.h"
00010 #include "libMems/Islands.h"
00011 #include "libMems/DNAFileSML.h"
00012 #include "libMems/MuscleInterface.h"    // it's the default gapped aligner
00013 #include "libGenome/gnRAWSource.h"
00014 #include "libMems/DistanceMatrix.h"
00015 #include "libMems/Files.h"
00016 
00017 #include <map>
00018 #include <fstream>      // for debugging
00019 #include <sstream>
00020 #include <stack>
00021 #include <algorithm>
00022 #include <limits>
00023 
00024 using namespace std;
00025 using namespace genome;
00026 namespace mems {
00027 
00028 
00029 boolean validateLCB( MatchList& lcb );
00030 void validateRangeIntersections( vector< MatchList >& lcb_list  );
00031 bool debug_shite = false;
00032 
00036 boolean validateLCB( MatchList& lcb ){
00037         vector< Match* >::iterator lcb_iter = lcb.begin();
00038         if( lcb.size() == 0 )
00039                 return true;
00040         uint seq_count = (*lcb_iter)->SeqCount();
00041         uint seqI = 0;
00042         boolean complain = false;
00043         for(; seqI < seq_count; seqI++ ){
00044                 lcb_iter = lcb.begin();
00045                 int64 prev_coord = 0;
00046                 for(; lcb_iter != lcb.end(); lcb_iter++ ){
00047                         if( (*lcb_iter)->Start( seqI ) == NO_MATCH )
00048                                 continue;
00049                         else if( prev_coord != 0 && (*lcb_iter)->Start( seqI ) < prev_coord ){
00050                                 complain = true;
00051                         }
00052                         prev_coord = (*lcb_iter)->Start( seqI );
00053                 }
00054         }
00055         return !complain;
00056 }
00057 
00062 void EliminateOverlaps( MatchList& ml ){
00063         if( ml.size() < 2 )
00064                 return;
00065         vector< Match* > result_matches;
00066         uint seq_count = ml[0]->SeqCount();
00067         for( uint seqI = 0; seqI < seq_count; seqI++ ){
00068                 SingleStartComparator<AbstractMatch> msc( seqI );
00069                 sort( ml.begin(), ml.end(), msc );
00070                 int64 matchI = 0;
00071                 int64 nextI = 0;
00072                 int64 deleted_count = 0;
00073                 vector< Match* > new_matches;
00074 
00075                 // scan forward to first defined match
00076                 for(; matchI != ml.size(); matchI++ )
00077                         if( ml[ matchI ]->Start( seqI ) != NO_MATCH )
00078                                 break;
00079 
00080                 for(; matchI < ml.size(); matchI++ ){
00081                         if( ml[ matchI ] == NULL )
00082                                 continue;
00083                         
00084                         for( nextI = matchI + 1; nextI < ml.size(); nextI++ ){
00085                                 if( ml[ nextI ] == NULL )
00086                                         continue;
00087 
00088                                 boolean deleted_matchI = false;
00089                                 // check for overlaps
00090                                 int64 startI = ml[ matchI ]->Start( seqI );
00091                                 int64 lenI = ml[ matchI ]->Length();
00092                                 int64 startJ = ml[ nextI ]->Start( seqI );
00093 //                              int64 diff =  absolut( startJ ) - absolut( startI ) - lenI;
00094                                 int64 diff =  absolut( startJ ) - absolut( startI ) - lenI;
00095 
00096                                 if( diff < 0 ){
00097                                         diff = -diff;
00098                                         Match* new_match;
00099                                         // delete bases from the smaller match
00100 //                                      if( ml[ nextI ]->Length() * ml[ nextI ]->Multiplicity() >= 
00101 //                                              lenI * ml[ matchI ]->Multiplicity() ){
00102                                         if( ( ml[ nextI ]->Multiplicity() > ml[ matchI ]->Multiplicity() ) ||
00103                                                 ( ml[ nextI ]->Multiplicity() == ml[ matchI ]->Multiplicity() && ml[ nextI ]->Length() > ml[ matchI ]->Length() ) ){
00104                                                 // mem_iter is smaller
00105                                                 new_match = ml[matchI]->Copy();
00106                                                 // erase base pairs from new_match
00107                                                 if( diff >= lenI ){
00108 //                                                      cerr << "Deleting " << **mem_iter << " at the hands of\n" << **next_iter << endl;
00109                                                         ml[ matchI ]->Free();
00110                                                         ml[ matchI ] = NULL;
00111                                                         matchI--;
00112                                                         deleted_matchI = true;
00113                                                         deleted_count++;
00114                                                 }else{
00115                                                         if( startI > 0 ){
00116                                                                 ml[ matchI ]->CropEnd( diff );
00117                                                                 new_match->CropStart( new_match->Length() - diff );
00118                                                         }else{
00119                                                                 ml[ matchI ]->CropStart( diff );
00120                                                                 new_match->CropEnd( new_match->Length() - diff );
00121                                                         }
00122                                                 }
00123                                         }else{
00124                                                 // match_iter is smaller
00125                                                 new_match = ml[nextI]->Copy();
00126                                                 // erase base pairs from new_match
00127                                                 if( diff >= ml[ nextI ]->Length() ){
00128 //                                                      cerr << "Deleting " << **next_iter << " at the hands of\n" << **mem_iter << endl;
00129                                                         ml[ nextI ]->Free();
00130                                                         ml[ nextI ] = NULL;
00131                                                         deleted_count++;
00132                                                 }else{
00133                                                         if( startJ > 0 ){
00134                                                                 ml[ nextI ]->CropStart( diff );
00135                                                                 new_match->CropEnd( new_match->Length() - diff );
00136                                                         }else{
00137                                                                 ml[ nextI ]->CropEnd( diff );
00138                                                                 new_match->CropStart( new_match->Length() - diff );
00139                                                         }
00140                                                 }
00141 
00142                                         }
00143                                         new_match->SetStart( seqI, 0 );
00144                                         if( new_match->Multiplicity() > 1 && new_match->Length() > 0 )
00145                                                 new_matches.push_back( new_match );
00146                                         else
00147                                         {
00148                                                 new_match->Free();
00149                                                 new_match = NULL;
00150                                         }
00151                                         if( deleted_matchI )
00152                                                 break;
00153                                 }else
00154                                         break;  // there are no more overlaps
00155                         }
00156 //                      if( nextI > 1 )
00157 //                              cerr << "There were " << nextI << " overlaps\n";
00158 //                      if( nextI > config_value_2 )
00159 //                              __asm(nop);
00160                 }
00161 
00162                 if( deleted_count > 0 ){
00163                         result_matches.reserve( ml.size() - deleted_count );
00164                         for( int64 copyI = 0; copyI < ml.size(); copyI++ ){
00165                                 if( ml[ copyI ] != NULL )
00166                                         result_matches.push_back( ml[ copyI ] );
00167                         }
00168                         ml.clear();
00169                         ml.insert( ml.end(), result_matches.begin(), result_matches.end() );
00170                 }
00171                 ml.insert( ml.end(), new_matches.begin(), new_matches.end() );
00172                 result_matches.clear();
00173                 new_matches.clear();
00174         }
00175                 
00176 }
00177 
00178 
00179 const gnSeqI default_min_r_gap_size = 200;
00180 Aligner::Aligner( uint seq_count ) :
00181 debug(false),
00182 seq_count(seq_count),
00183 min_recursive_gap_length(default_min_r_gap_size),
00184 collinear_genomes(false),
00185 gal(&(MuscleInterface::getMuscleInterface())),
00186 permutation_weight(-1),
00187 cur_min_coverage(-1),
00188 max_extension_iters(4)
00189 {}
00190 
00191 Aligner::Aligner( const Aligner& al ) :
00192 //gap_mh( al.gap_mh ),
00193 nway_mh( al.nway_mh ),
00194 seq_count( al.seq_count ),
00195 debug( al.debug),
00196 LCB_minimum_density( al.LCB_minimum_density),
00197 LCB_minimum_range( al.LCB_minimum_range ),
00198 cur_min_coverage( al.cur_min_coverage),
00199 min_recursive_gap_length( al.min_recursive_gap_length ),
00200 collinear_genomes( al.collinear_genomes ),
00201 gal( al.gal ),
00202 permutation_weight( al.permutation_weight ),
00203 permutation_filename( al.permutation_filename ),
00204 max_extension_iters( al.max_extension_iters )
00205 {}
00206 
00207 Aligner& Aligner::operator=( const Aligner& al )
00208 {
00209         gap_mh = al.gap_mh;
00210         nway_mh = al.nway_mh;
00211         seq_count = al.seq_count;
00212         debug = al.debug;
00213         
00214         LCB_minimum_density = al.LCB_minimum_density;
00215         LCB_minimum_range = al.LCB_minimum_range;
00216         
00217         cur_min_coverage = al.cur_min_coverage;
00218         min_recursive_gap_length = al.min_recursive_gap_length;
00219         collinear_genomes = al.collinear_genomes;
00220 
00221         gal = al.gal;
00222 
00223         permutation_weight = al.permutation_weight;
00224         permutation_filename = al.permutation_filename;
00225 
00226         max_extension_iters = al.max_extension_iters;
00227 
00228         return *this;
00229 }
00230 
00231 void Aligner::SetMinRecursionGapLength( gnSeqI min_r_gap ) {
00232         min_recursive_gap_length = min_r_gap;
00233 }
00234 
00235 void Aligner::SetGappedAligner( GappedAligner& gal ){
00236         this->gal = &(gal);
00237 }
00238 
00239 void Aligner::SetMaxGappedAlignmentLength( gnSeqI len ){
00240         gal->SetMaxAlignmentLength( len );
00241 }
00242 
00243 
00244 /* returns true if all labels between start_label and end_label are contained in the no_match_labels set */
00245 void scanLabels( set< uint >& no_match_labels, uint& start_label, boolean forward ){
00246         uint labelI;
00247         // scan no_match_labels for consecutive labels starting at start_label until one is missing
00248         if( forward ){
00249                 for( labelI = start_label + 1; ; labelI++){
00250                         set< uint >::iterator  label_iter = no_match_labels.find( labelI );
00251                         if( label_iter == no_match_labels.end() ){
00252                                 start_label = labelI - 1;
00253                                 break;
00254                         }
00255                 }
00256         }else{
00257                 for( labelI = start_label; labelI > 0; labelI--){
00258                         set< uint >::iterator  label_iter = no_match_labels.find( labelI - 1 );
00259                         if( label_iter == no_match_labels.end() ){
00260                                 start_label = labelI;
00261                                 break;
00262                         }
00263                 }
00264         }
00265 }
00266 
00267 boolean checkCollinearity( Match* m1, Match* m2 ){
00268         for( uint seqI = 0; seqI < m1->SeqCount(); seqI++ ){
00269                 if( m1->Start( seqI ) == NO_MATCH ||
00270                         m2->Start( seqI ) == NO_MATCH )
00271                         continue;
00272                 if((( m1->Start( seqI ) > 0 &&
00273                         m2->Start( seqI ) > 0 ) ||
00274                         (m1->Start( seqI ) < 0 &&
00275                         m2->Start( seqI ) < 0 )) &&
00276                         m1->Start( seqI ) <= m2->Start( seqI ) )
00277                         continue;
00278                 return false;
00279         }
00280         return true;
00281 }
00282 
00283 void scanFit( list< LabeledMem >& pair_list, list< LabeledMem >::iterator& list_iter, Match* new_match, uint sort_seq ){
00284 
00285         list< LabeledMem >::iterator cur_iter = list_iter;
00286         list< LabeledMem >::iterator last_iter = list_iter;
00287 //      int64 initial_start = absolut( list_iter->mem->Start( sort_seq ) );
00288         int64 initial_start = absolut( list_iter->mem->Start( sort_seq ) );
00289 
00290         uint match_count = 0;
00291         for(; last_iter != pair_list.end(); ++last_iter ){
00292                 if( last_iter->mem->Start( sort_seq ) == NO_MATCH ){
00293                         ++match_count;
00294                         continue;
00295                 }
00296 //              if( absolut( last_iter->mem->Start( sort_seq ) ) < initial_start ||
00297 //                      absolut( last_iter->mem->Start( sort_seq ) ) > new_match->Start( sort_seq ) )
00298                 if( absolut( last_iter->mem->Start( sort_seq ) ) < initial_start ||
00299                         absolut( last_iter->mem->Start( sort_seq ) ) > new_match->Start( sort_seq ) )
00300                         break;
00301                 ++match_count;
00302         }
00303         vector< vector< int > > score_vector;
00304         score_vector.reserve( new_match->SeqCount() - sort_seq - 1 );
00305         for( uint seqI = sort_seq + 1; seqI < new_match->SeqCount(); ++seqI ){
00306                 vector< int > sv;
00307                 score_vector.push_back( sv );
00308                 score_vector[ score_vector.size() - 1 ].reserve( match_count );
00309         }
00310         uint matchI = 0;
00311         for(; cur_iter != last_iter; ++cur_iter ){
00312                 
00313                 for( uint seqI = sort_seq + 1; seqI < new_match->SeqCount(); ++seqI ){
00314                         int64 p_start = cur_iter->mem->Start( seqI );
00315                         int64 m_start = new_match->Start( seqI );
00316                         p_start = p_start < 0 ? -p_start : p_start;
00317                         m_start = m_start < 0 ? -m_start : m_start;
00318                         if( m_start == NO_MATCH ){
00319                                 score_vector[ seqI - sort_seq - 1 ].push_back( 0 );
00320                         }else if( p_start == NO_MATCH ){
00321                                 score_vector[ seqI - sort_seq - 1 ].push_back( 0 );
00322                         }else if( p_start < m_start ){
00323                                 score_vector[ seqI - sort_seq - 1 ].push_back( 1 );
00324                         }else
00325                                 score_vector[ seqI - sort_seq - 1 ].push_back( -1 );
00326                 }
00327         }
00328         vector< int > scores;
00329         scores.reserve( match_count );
00330         for( matchI = match_count; matchI > 0; matchI-- )
00331                 scores.push_back( 0 );
00332         for( uint seqI = 0; seqI < new_match->SeqCount() - sort_seq - 1; ++seqI ){
00333                 boolean redefined = false;
00334                 for( matchI = match_count; matchI > 0; matchI-- ){
00335                         if( !redefined ){
00336                                 if( score_vector[ seqI ][ matchI - 1 ] >= 0 ){
00337                                         if( score_vector[ seqI ][ matchI - 1 ] == 1 )
00338                                                 redefined = true;
00339                                         ++scores[ matchI - 1 ];
00340                                 }
00341                         }else{
00342                                 if( score_vector[ seqI ][ matchI - 1 ] == -1 )
00343                                         redefined = false;
00344                         }
00345                 }
00346         }
00347         // find the first highest scoring match
00348         cur_iter = list_iter;
00349         int max_score = 0;
00350         for( matchI = 0; matchI < match_count; ++matchI ){
00351                 if( scores[ matchI ] > max_score ){
00352                         max_score = scores[ matchI ];
00353                         list_iter = cur_iter;
00354                 }
00355                 ++cur_iter;
00356         }
00357 }
00358 
00362 void AaronsLCB( MatchList& mlist, set<uint>& breakpoints ){
00363         breakpoints.clear(); // make sure this is empty
00364         if( mlist.size() == 0 )
00365                 return;
00366         // can only look for breakpoints if there is more than one match!!
00367         if( mlist.size() == 1 ){
00368                 breakpoints.insert( 0 );
00369                 return;
00370         }
00371         uint seq_count = mlist[0]->SeqCount();
00372 
00373         SingleStartComparator<AbstractMatch> msc( 0 );
00374         sort( mlist.begin(), mlist.end(), msc );
00375         vector<Match*>::iterator mem_iter = mlist.begin();
00376         list<LabeledMem> pair_list;
00377         
00378         map<uint, Match*> debug_label_map;
00379         boolean debugging = false;
00380         
00381         
00382         list< PlacementMatch > placement_list;
00383         
00384         for(; mem_iter != mlist.end(); ++mem_iter ){
00385                 if( (*mem_iter)->Start( 0 ) != NO_MATCH ){              
00386                         // add this one to the list.
00387                         LabeledMem lm;
00388                         lm.mem = *mem_iter;
00389                         lm.label = 0;
00390                         pair_list.push_back( lm );
00391                 }else{
00392                         PlacementMatch pm;
00393                         pm.mem = *mem_iter;
00394                         pm.iter = pair_list.end();
00395                         placement_list.push_back( pm );
00396                 }
00397         }
00398         LabeledMemComparator lmc( 0 );
00399         pair_list.sort( lmc );
00400         list< LabeledMem >::iterator pair_iter = pair_list.begin();
00401         for(; pair_iter != pair_list.end(); ++pair_iter ){
00402                 PlacementMatch pm;
00403                 pm.mem = pair_iter->mem;
00404                 pm.iter = pair_iter;
00405                 placement_list.push_back( pm );
00406         }
00407         
00408         // place all the subset matches from each sequence in the correct place in pair_list.
00409         for( uint seqI = 1; seqI < seq_count; ++seqI ){
00410                 PlacementMatchComparator pmc( seqI );
00411                 placement_list.sort( pmc );
00412                 list< PlacementMatch >::iterator placement_prev;
00413                 list< PlacementMatch >::iterator placement_iter = placement_list.begin();
00414                 if( placement_iter->iter == pair_list.end() &&
00415                         placement_iter->mem->Start( seqI ) != NO_MATCH ){
00416                         LabeledMem lm;
00417                         lm.mem = placement_iter->mem;
00418                         lm.label = 0;
00419                         pair_list.insert( pair_list.begin(), lm );
00420                         placement_iter->iter = pair_list.begin();
00421                 }
00422 
00423                 for( ++placement_iter; placement_iter != placement_list.end(); ++placement_iter ){
00424                         placement_prev = placement_iter;
00425                         placement_prev--;
00426                         
00427                         if( placement_iter->iter != pair_list.end() )
00428                                 continue;
00429                         
00430                         if( placement_iter->mem->Start( seqI ) == NO_MATCH )
00431                                 continue;
00432                         
00433                         list< LabeledMem >::iterator insert_iter = placement_prev->iter;
00434                         if( insert_iter == pair_list.end() || placement_prev->mem->Start( seqI ) == NO_MATCH )
00435                                 insert_iter = pair_list.begin();
00436                         else{
00437                                 if( insert_iter->mem->Start( seqI ) < 0 ){
00438                                         // invert if necessary and insert before
00439                                         if( placement_iter->mem->Start( seqI ) > 0 )
00440                                                 placement_iter->mem->Invert();
00441                                         if( !checkCollinearity( placement_iter->mem, insert_iter->mem ) ){
00442                                                 placement_iter->mem->Invert();
00443                                                 scanFit( pair_list, insert_iter, placement_iter->mem, seqI );
00444                                                 ++insert_iter;
00445                                         }
00446                                 }else{
00447                                         // insert in the earliest place this match fits with surrounding matches
00448                                         scanFit( pair_list, insert_iter, placement_iter->mem, seqI );
00449                                         ++insert_iter;
00450                                 }
00451                         }
00452                         
00453                         LabeledMem lm;
00454                         lm.mem = placement_iter->mem;
00455                         lm.label = 0;
00456                         pair_list.insert( insert_iter, lm );
00457                         placement_iter->iter = insert_iter;
00458                         placement_iter->iter--;
00459                 }
00460         }
00461         boolean debug_labels = false;
00462         ofstream debug_label_file;
00463         if( debug_labels )
00464                 debug_label_file.open( "label_debug.txt" );
00465         // number the LabeledMems in the pair_list
00466         uint cur_label = 0;
00467         mlist.clear();
00468         vector< LabeledMem > pair_vec;
00469         pair_vec.reserve( pair_list.size() );
00470         mlist.reserve( pair_list.size() );
00471         for( pair_iter = pair_list.begin(); pair_iter != pair_list.end(); ++pair_iter ){
00472                 pair_iter->label = cur_label++;
00473                 mlist.push_back( pair_iter->mem );
00474                 pair_vec.push_back( *pair_iter );
00475                 if( debug_labels ){
00476                         debug_label_map.insert( map<uint, Match*>::value_type( pair_iter->label, pair_iter->mem ) );
00477                         debug_label_file << pair_iter->label << '\t' << (*pair_iter->mem) << endl;
00478                 }
00479         }
00480         if( debug_labels )
00481                 debug_label_file.close();
00482         
00483         breakpoints.clear();
00484         pair_list.clear();
00485         vector< LabeledMem >::iterator pair_vec_iter;
00486         for( uint seqI = 1; seqI < seq_count; seqI++ ){
00487                 // sort the list on the current genome
00488                 LabeledMemComparator lmc( seqI );
00489                 sort( pair_vec.begin(), pair_vec.end(), lmc );
00490                 set< uint > no_match_labels;
00491 
00492                 // debugging code
00493 /*              stringstream debug_filename;
00494                 debug_filename << "label_sort_" << seqI << ".txt";
00495                 ofstream debug_file( debug_filename.str().c_str() );
00496                 for( uint pairI = 0; pairI < pair_vec.size(); pairI++ ){
00497                         debug_file << pair_vec[ pairI ].label << *pair_vec[ pairI ].mem << endl;
00498                 }
00499                 debug_file.close();
00500 */              // end debugging code
00501                 
00502                 pair_vec_iter = pair_vec.begin();
00503                 uint block_start = pair_vec_iter->label;
00504                 uint break_label = 0;
00505                 for( ++pair_vec_iter; pair_vec_iter != pair_vec.end(); ++pair_vec_iter ){
00506                         vector<LabeledMem>::iterator pair_prev = pair_vec_iter;
00507                         pair_prev--;
00508                         break_label = 0;
00509                         uint scan_label = 0;
00510                         if( pair_prev->mem->Start( seqI ) == NO_MATCH ){
00511                                 no_match_labels.insert( set< uint >::value_type( pair_prev->label ) );
00512                                 // get the correct block start
00513                                 if( pair_vec_iter->mem->Start( seqI ) < 0 ){
00514                                         block_start = pair_vec_iter->label;
00515                                         scanLabels( no_match_labels, block_start, true );
00516                                 }else if( pair_vec_iter->mem->Start( seqI ) > 0 ){
00517                                         block_start = pair_vec_iter->label;
00518                                         scanLabels( no_match_labels, block_start, false );
00519                                 }
00520                                 
00521                                 continue;
00522                         }
00523 
00524                         if( pair_prev->mem->Start( seqI ) < 0 ){
00525                                 // this block would break at its start
00526                                 break_label = block_start;
00527                         }else{
00528                                 // this block would break at its end
00529                                 break_label = pair_prev->label;
00530                                 scanLabels( no_match_labels, break_label, true );
00531                         }
00532                         if( pair_vec_iter->mem->Start( seqI ) < 0 ){
00533                                 // scan forward to the beginning of new block
00534                                 scan_label = pair_vec_iter->label;
00535                                 scanLabels( no_match_labels, scan_label, true );
00536                         }else{
00537                                 // scan back to the beginning of new block
00538                                 scan_label = pair_vec_iter->label;
00539                                 scanLabels( no_match_labels, scan_label, false );
00540                         }
00541 
00542                         if( pair_vec_iter->mem->Start( seqI ) < 0 &&
00543                                 pair_prev->mem->Start( seqI ) < 0 ){
00544                                 if( scan_label + 1 == pair_prev->label )
00545                                         continue;
00546                                 if( debugging ){
00547                                         map< uint, Match* >::const_iterator debug_iter = debug_label_map.find( pair_vec_iter->label );
00548                                         while( debug_iter->first <= pair_prev->label ){
00549                                                 cout << debug_iter->first << '\t' << *(debug_iter->second) << endl;
00550                                                 ++debug_iter;
00551                                         }
00552                                 }
00553                         }else
00554                         if( pair_vec_iter->mem->Start( seqI ) > 0 &&
00555                                 pair_prev->mem->Start( seqI ) > 0 ){
00556                                 
00557                                 if( scan_label - 1 == pair_prev->label )
00558                                         continue;
00559                                 if( debugging ){
00560                                         map< uint, Match* >::const_iterator debug_iter = debug_label_map.find( pair_prev->label );
00561                                         while( debug_iter->first <= pair_vec_iter->label ){
00562                                                 cout << debug_iter->first << '\t' << *(debug_iter->second) << endl;
00563                                                 ++debug_iter;
00564                                         }
00565                                 }
00566                         }
00567                         // check if the missing matches are in the set of non-matches
00568 
00569                         // since it didn't meet any of the above
00570                         // criteria it's a breakpoint.  insert the label of the end of the current block
00571                         // note that if it's a reverse complement block, the end label is really the start label
00572                         breakpoints.insert( break_label );
00573                         block_start = scan_label;
00574                 }
00575 
00576                 // insert the correct block ending
00577                 if( pair_vec_iter != pair_vec.begin() ){
00578                         pair_vec_iter--;
00579                         
00580                         if( pair_vec_iter->mem->Start( seqI ) < 0 ){
00581                                 break_label = block_start;
00582                         }else{
00583                                 break_label = pair_vec_iter->label;
00584                                 scanLabels( no_match_labels, break_label, true );
00585                         }
00586                         breakpoints.insert( break_label );
00587                 }
00588         }
00589 }
00590 
00592 void Aligner::SetPermutationOutput( std::string& permutation_filename, int64 permutation_weight )
00593 {
00594         this->permutation_filename = permutation_filename;
00595         this->permutation_weight = permutation_weight;
00596 }
00597 
00598 
00599 void GetLCBCoverage( MatchList& lcb, uint64& coverage ){
00600         vector< Match* >::iterator match_iter = lcb.begin();
00601         coverage = 0;
00602         bool debug = true;
00603         for( ; match_iter != lcb.end(); ++match_iter ){
00604                 coverage += (*match_iter)->Length() * (*match_iter)->Multiplicity();
00605 
00606                 // if we have sequence information then
00607                 // subtract the coverage for any position that contains an N
00608                 if( lcb.seq_table.size() > 0 )
00609                 {
00610                         for( uint seqI = 0; seqI < (*match_iter)->SeqCount(); ++seqI )
00611                         {
00612                                 gnSeqI lend = absolut((*match_iter)->Start(seqI));
00613                                 gnSeqI length = (*match_iter)->Length();
00614                                 if( lend == 0 )
00615                                         continue;
00616                                 string match_seq = lcb.seq_table[seqI]->ToString(length, lend);
00617                                 for( size_t s = 0; s < match_seq.size(); ++s )
00618                                         if( match_seq[s] == 'n' || match_seq[s] == 'N' )
00619                                                 if( (*match_iter)->Start(seqI) > 0 )
00620                                                         coverage--;
00621                         }
00622                 }
00623         }
00624 }
00625 
00626 
00627 void computeLCBAdjacencies_v2( vector<MatchList>& lcb_list, vector< int64 >& weights, vector< LCB >& adjacencies ){
00628         IntervalList iv_list;
00629         for( uint lcbI = 0; lcbI < lcb_list.size(); ++lcbI ){
00630                 vector<AbstractMatch*> asdf;
00631                 asdf.push_back( lcb_list[ lcbI ].front() );
00632                 if( lcb_list[lcbI].size() > 1 )
00633                         asdf.push_back( lcb_list[ lcbI ].back() );
00634                 Interval iv( asdf.begin(), asdf.end() );
00635                 iv_list.push_back( iv );
00636         }
00637         computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
00638 }
00639 
00640 const uint NO_ADJACENCY = (std::numeric_limits<uint>::max)();
00641 
00645 void computeLCBAdjacencies_v2( IntervalList& iv_list, vector< int64 >& weights, vector< LCB >& adjacencies ){
00646         adjacencies.clear(); // start with no LCB adjacencies
00647         if( iv_list.size() == 0 )
00648                 return; // there aren't any LCBs so there aren't any adjacencies!
00649 
00650         uint seq_count = iv_list[0].SeqCount();
00651         uint seqI;
00652         uint lcbI;
00653         adjacencies.resize(iv_list.size());
00654         for( lcbI = 0; lcbI < iv_list.size(); ++lcbI ){
00655                 LCB& lcb = adjacencies[lcbI];
00656                 lcb.left_end.resize(seq_count);
00657                 lcb.right_end.resize(seq_count);
00658                 lcb.left_adjacency.resize(seq_count);
00659                 lcb.right_adjacency.resize(seq_count);
00660                 for( seqI = 0; seqI < seq_count; seqI++ ){
00661                         // support "ragged edges" on the ends of LCBs
00662                         int64 leftI = iv_list[lcbI].LeftEnd(seqI);
00663                         int64 rightI = NO_MATCH;
00664                         if( leftI != NO_MATCH )
00665                         {
00666                                 leftI = iv_list[lcbI].Orientation(seqI) == AbstractMatch::forward ? leftI : -leftI;
00667                                 rightI = iv_list[lcbI].RightEnd(seqI)+1;
00668                                 rightI = iv_list[lcbI].Orientation(seqI) == AbstractMatch::forward ? rightI : -rightI;
00669                         }
00670 
00671                         lcb.left_end[seqI] = leftI;
00672                         lcb.right_end[seqI] = rightI;
00673                         lcb.left_adjacency[seqI] = NO_ADJACENCY;
00674                         lcb.right_adjacency[seqI] = NO_ADJACENCY;
00675                 }
00676                 lcb.lcb_id = lcbI;
00677                 lcb.weight = weights[ lcbI ];
00678                 lcb.to_be_deleted = false;
00679         }
00680 
00681         for( seqI = 0; seqI < seq_count; seqI++ ){
00682                 LCBLeftComparator llc( seqI );
00683                 sort( adjacencies.begin(), adjacencies.end(), llc );
00684                 for( lcbI = 1; lcbI + 1 < iv_list.size(); lcbI++ ){
00685                         adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
00686                         adjacencies[ lcbI ].right_adjacency[ seqI ] = adjacencies[ lcbI + 1 ].lcb_id;
00687                 }
00688                 if( lcbI == iv_list.size() )
00689                         lcbI--; // need to decrement when there is only a single LCB
00690 
00691                 // set first and last lcb adjacencies to -1
00692                 adjacencies[ 0 ].left_adjacency[ seqI ] = NO_ADJACENCY;
00693                 adjacencies[ lcbI ].right_adjacency[ seqI ] = NO_ADJACENCY;
00694                 if( lcbI > 0 ){
00695                         adjacencies[ 0 ].right_adjacency[ seqI ] = adjacencies[ 1 ].lcb_id;
00696                         adjacencies[ lcbI ].left_adjacency[ seqI ] = adjacencies[ lcbI - 1 ].lcb_id;
00697                 }
00698         }
00699         LCBIDComparator lic;
00700         sort( adjacencies.begin(), adjacencies.end(), lic );
00701         
00702 }
00703 
00704 
00705 void scanLeft( int& left_recurseI, vector< LCB >& adjacencies, int min_weight, int seqI ){
00706         while( left_recurseI != -1 && adjacencies[ left_recurseI ].weight < min_weight )
00707                 left_recurseI = adjacencies[ left_recurseI ].left_adjacency[ seqI ];
00708 }
00709 void scanRight( int& right_recurseI, vector< LCB >& adjacencies, int min_weight, int seqI ){
00710         while( right_recurseI != -1 && adjacencies[ right_recurseI ].weight < min_weight )
00711                 right_recurseI = adjacencies[ right_recurseI ].right_adjacency[ seqI ];
00712 }
00713 
00714 
00715 
00720 void CreateGapSearchList( vector< LCB >& adjacencies, const vector< gnSequence* >& seq_table, vector< vector< int64 > >& iv_regions, boolean entire_genome ) 
00721 {
00722         iv_regions.clear();
00723         if( adjacencies.size() == 0 )
00724                 return;         // there aren't any intervening LCB regions!
00725         if( adjacencies.size() == 1 && !entire_genome )
00726                 return;         // there aren't any interveniing LCB regions in the local area
00727         boolean debug_lcb_extension = false;    
00728         const uint seq_count = seq_table.size();
00729 
00730         uint seqI = 0;
00731         int lcbI = 0;
00732         iv_regions = vector< vector< int64 > >( seq_count );
00733 
00734         // extract a gnSequence containing only the intervening regions
00735         for( seqI = 0; seqI < seq_count; seqI++ ){
00736 
00737                 // find the first LCB in this sequence
00738                 for( lcbI = 0; lcbI < adjacencies.size(); lcbI++ ){
00739                         if( adjacencies[ lcbI ].left_adjacency[ seqI ] == -1 )
00740                                 break;
00741                 }
00742                 // start concatenating the intervening regions
00743                 // scan right
00744                 int right_recurseI = lcbI;
00745                 lcbI = -1;
00746                 if( !entire_genome && right_recurseI != -1 ){
00747                         lcbI = right_recurseI;
00748                         right_recurseI = adjacencies[ lcbI ].right_adjacency[ seqI ];
00749                 }
00750                 gnSeqI seq_len = 0;
00751                 while( (lcbI != -1 || right_recurseI != -1 ) && right_recurseI < (int)adjacencies.size() ){
00752                         int64 l_end = lcbI == -1 ? 1 : adjacencies[ lcbI ].right_end[ seqI ];
00753                         int64 r_end = right_recurseI == -1 ? seq_table[ seqI ]->length() : adjacencies[ right_recurseI ].left_end[ seqI ];
00754 
00755                         // break out if outside the last LCB and not searching the entire genome
00756                         if( !entire_genome && right_recurseI == -1 )
00757                                 break;
00758 
00759                         l_end = absolut( l_end );
00760                         r_end = absolut( r_end );
00761                         
00762                         if( l_end > r_end && !( r_end + 1 == l_end && right_recurseI == -1 ) ){
00763                                 std::cerr << "Overlapping LCBs.  lcbI " << lcbI << " right_recurseI " << right_recurseI << endl;
00764                                 std::cerr << "lend: " << l_end << " rend: " << r_end << endl;
00765                                 l_end = r_end;
00766                                 
00767                         }
00768                         
00769                         lcbI = right_recurseI;
00770                         if( right_recurseI != -1 )
00771                                 right_recurseI = adjacencies[ right_recurseI ].right_adjacency[ seqI ];
00772                         if( r_end + 1 == l_end && right_recurseI == -1 )
00773                                 continue;       // we're at the right end and there's nothing to add
00774                         seq_len += r_end - l_end;
00775                         iv_regions[ seqI ].push_back( l_end );
00776                         iv_regions[ seqI ].push_back( r_end );
00777                 }
00778                 if( debug_lcb_extension )
00779                         std::cerr << "seqI " << seqI << " seq_len: " << seq_len << endl;
00780         }
00781 
00782 }
00783 
00784 void SearchLCBGaps( MatchList& new_matches, const std::vector< std::vector< int64 > >& iv_regions, MaskedMemHash& nway_mh ) {
00785         if( iv_regions.size() == 0 )
00786                 return;         // there aren't any intervening LCB regions!
00787         size_t sI = 0;
00788         for( ; sI < iv_regions.size(); sI++ )
00789                 if( iv_regions[sI].size() > 0 )
00790                         break;
00791         if( sI == iv_regions.size() )
00792                 return;         // there aren't any intervening LCB regions!
00793 
00794         boolean debug_lcb_extension = false;    
00796         const uint seq_count = new_matches.seq_table.size();
00797         uint seqI = 0;
00798         int lcbI = 0;
00799         MatchList gap_list;
00800         gap_list.seq_table = vector< gnSequence* >( seq_count );        
00801         gap_list.sml_table = vector< SortedMerList* >( seq_count );
00802 
00803         // extract a gnSequence containing only the intervening regions
00804         for( seqI = 0; seqI < seq_count; seqI++ ){
00805                 gap_list.seq_table[ seqI ] = new gnSequence();
00806                 gap_list.sml_table[ seqI ] = new DNAMemorySML();
00807                 gnSeqI seq_len = 0;
00808                 for( size_t ivI = 0; ivI < iv_regions[seqI].size(); ivI += 2 )
00809                 {
00810                         int64 l_end = iv_regions[seqI][ivI];
00811                         int64 r_end = iv_regions[seqI][ivI+1];
00812                         try{
00813                         if( debug_lcb_extension )
00814                                 cerr << "Adding " << seqI << "\t" << l_end << "\t" << r_end << "\t(" << r_end - l_end << " bp)" << endl;
00815                         gap_list.seq_table[ seqI ]->append( new_matches.seq_table[ seqI ]->ToString(r_end - l_end, l_end ) );
00816 //                      gap_list.seq_table[ seqI ]->append( new_matches.seq_table[ seqI ]->subseq( l_end, r_end - l_end ) );
00817                         }catch(...){
00818                                 cout << "";
00819                         }
00820                         seq_len += r_end - l_end;
00821                 }
00822                 if( debug_lcb_extension )
00823                         cerr << "seqI " << seqI << " seq_len: " << seq_len << endl;
00824         }
00825         //
00826         // search for MUMs in the intervening sequence regions
00827         //
00828 
00829         // calculate potential mer sizes for searches
00830         gnSeqI total_iv_length = 0;
00831         for( seqI = 0; seqI < seq_count; seqI++ ){
00832                 total_iv_length += gap_list.seq_table[ seqI ]->length();
00833 /*              cerr << "seqI: " << seqI << " length: " << gap_list.seq_table[ seqI ]->length();
00834                 cerr << "\n";
00835 */
00836         }
00837         total_iv_length /= seq_count;
00838 
00839         uint search_mer_size = getDefaultSeedWeight( total_iv_length );
00840         if( search_mer_size < MIN_DNA_SEED_WEIGHT )
00841                 return;         // The seed size is too small to be significant
00842         uint64 default_seed = getSeed( search_mer_size );
00843         
00844         //      Create sorted mer lists for the intervening gap region
00845         vector< boost::filesystem::path > delete_files;
00846         boolean create_succeeded = true;
00847         for( seqI = 0; seqI < seq_count; seqI++ ){
00848                 gap_list.sml_table[ seqI ]->Clear();
00849                 try{
00850                         if( debug_lcb_extension )
00851                                 cerr << "Creating memory SML for seqI " << seqI << endl;
00852                         gap_list.sml_table[ seqI ]->Create( *(gap_list.seq_table[ seqI ]), default_seed );
00853                 }catch(...){
00854                         create_succeeded = false;
00855                         break;
00856                 }
00857         }
00858         if( !create_succeeded ){        
00859                 // free memory consumed by any SMLs     
00860                 for( seqI = 0; seqI < seq_count; seqI++ ){
00861                         gap_list.sml_table[ seqI ]->Clear();
00862                         delete gap_list.sml_table[ seqI ];
00863                 }
00864 
00865                 for( seqI = 0; seqI < seq_count; seqI++ ){
00866                         cerr << "Creating dmSML for seqI " << seqI << endl;
00867                         // presumably we ran out of memory and couldn't use a MemorySML.        
00868                         // try using a FileSML with external sort
00869                         string concat_file = CreateTempFileName("seqconcat");
00870 
00871                         concat_file += ".raw";  // need .raw extension to tell stupid libGenome it's a raw file
00872                         gnRAWSource::Write( *(gap_list.seq_table[ seqI ]), concat_file.c_str() );
00873                         delete_files.push_back( concat_file );
00874                         delete gap_list.seq_table[ seqI ];      // make sure memory gets freed!
00875                         cerr << "Wrote raw sequence for seqI " << seqI << endl;
00876                         gap_list.seq_table[ seqI ] = new gnSequence();
00877                         gap_list.seq_table[ seqI ]->LoadSource( concat_file.c_str() );
00878                         cerr << "Loaded sequence " << seqI << gap_list.seq_table[ seqI ]->length() << "b.p.\n";
00879                         string sml_file = CreateTempFileName("dmsml");  
00880                         DNAFileSML* sml = new DNAFileSML( sml_file.c_str() );   
00881                         gap_list.sml_table[ seqI ] = sml;       
00882                         sml->dmCreate( *(gap_list.seq_table[ seqI ]), default_seed );   
00883                         delete_files.push_back( sml_file );
00884                         delete_files.push_back( sml_file + ".coords" );
00885                 }
00886         }
00887 
00888         //      Find all exact matches in the gap region
00889         nway_mh.Clear();
00890         nway_mh.FindMatches( gap_list );
00891         gap_list.MultiplicityFilter( seq_count );
00892 //      nway_mh.GetMatchList( gap_list );
00893 
00894         // free memory used by SMLs!
00895         for( seqI = 0; seqI < seq_count; seqI++ ){
00896                 gap_list.sml_table[ seqI ]->Clear();
00897                 delete gap_list.sml_table[ seqI ];
00898         }
00899         
00900         if( debug_lcb_extension ){
00901                 ofstream debug_extension_out( "new_extension_matches.txt" );
00902                 WriteList( gap_list, debug_extension_out );
00903                 debug_extension_out.close();
00904         }
00905 
00906         //      
00907         // If an N mask was used, transpose MUMs back into the previous 
00908         // sequence coordinates 
00909         //      
00910         if( !create_succeeded ){
00911                 for( seqI = 0; seqI < seq_count; seqI++ )       
00912                         transposeMatches( gap_list, seqI, ((FileSML*)gap_list.sml_table[ seqI ])->getUsedCoordinates() );
00913         }
00914         //
00915         // Transpose MUMs back into their original sequence coordinates
00916         //
00917         for( seqI = 0; seqI < seq_count; seqI++ )
00918                 transposeMatches( gap_list, seqI, iv_regions[ seqI ] );
00919 
00920         EliminateOverlaps( gap_list );
00921         gap_list.MultiplicityFilter( seq_count );
00922         // filter out matches that are too short
00923         gap_list.LengthFilter( MIN_ANCHOR_LENGTH );
00924 
00925         // free memory used by sequences!
00926         for( seqI = 0; seqI < seq_count; seqI++ )
00927                 delete gap_list.seq_table[ seqI ];
00928 
00929         for( int delI = 0; delI < delete_files.size(); delI++ ) 
00930                 boost::filesystem::remove( delete_files[delI] );
00931 
00932         new_matches.insert( new_matches.end(), gap_list.begin(), gap_list.end() );
00933 }
00934 
00935 
00936 
00937 class MatchLeftEndComparator {
00938 public:
00939         MatchLeftEndComparator( unsigned seq = 0 ){
00940                 m_seq = seq;
00941         }
00942         MatchLeftEndComparator( MatchLeftEndComparator& msc ){
00943                 m_seq = msc.m_seq;
00944         }
00945         // TODO??  make this do a wraparound comparison if all is equal?
00946         boolean operator()(const AbstractMatch* a, const AbstractMatch* b) const{
00947                 int32 start_diff = max( a->FirstStart(), m_seq ) - max( b->FirstStart(), m_seq );
00948                 if(start_diff == 0){
00949                         uint32 m_count = a->SeqCount();
00950                         m_count = m_count <= b->SeqCount() ? m_count : b->SeqCount();
00951                         for(uint32 seqI = m_seq; seqI < m_count; seqI++){
00952                                 int64 a_start = absolut( a->Start( seqI ) ), b_start = absolut( b->Start( seqI ) );
00953                                 int64 diff = a_start - b_start;
00954                                 if(a_start == (int64)NO_MATCH || b_start == (int64)NO_MATCH)
00955                                         continue;
00956                                 else if(diff == 0)
00957                                         continue;
00958                                 else
00959                                         return diff < 0;
00960                         }
00961                 }
00962                 return start_diff < 0;
00963         }
00964 private:
00965         unsigned m_seq;
00966 };
00967 
00973 void transposeMatches( MatchList& mlist, uint seqI, const vector< int64 >& seq_regions ){
00974         if( seq_regions.size() < 2 )    
00975                 return; // no work to be done here...
00976 
00977         uint matchI = 0;
00978         MatchLeftEndComparator msc( seqI );
00979         sort( mlist.begin(), mlist.end(), msc );
00980         uint regionI = 0;
00981         gnSeqI region_sum = seq_regions[ 1 ] - seq_regions[ 0 ];
00982         gnSeqI region_start_sum = 0;
00983         MatchList new_matches;
00984 
00985         for( ; matchI < mlist.size(); matchI++ ){
00986                 // find the translated start coordinate for this match
00987                 int64 trans_start = mlist[ matchI ]->Start( seqI );
00988                 int64 iv_orig_start = trans_start;
00989                 if( trans_start == 0 )
00990                         continue;
00991                 while( region_sum < absolut( trans_start ) && regionI + 2 < seq_regions.size() ){
00992                         regionI += 2;
00993                         region_start_sum = region_sum;
00994                         region_sum += seq_regions[ regionI + 1 ] - seq_regions[ regionI ];
00995                 }
00996 
00997                 if( trans_start < 0 )
00998                         trans_start = -seq_regions[ regionI ] - ( -trans_start - region_start_sum ) + 1;
00999                 else if( trans_start > 0 )
01000                         trans_start = seq_regions[ regionI ] + ( trans_start - region_start_sum ) - 1;
01001 
01002                 int64 trans_end = mlist[ matchI ]->Start( seqI );
01003                 trans_end += trans_end > 0 ? mlist[ matchI ]->Length() - 1: -mlist[ matchI ]->Length() + 1;
01004                 
01005                 mlist[ matchI ]->SetStart( seqI, trans_start );
01006                 
01007                 // this bad boy may need to be split
01008                 gnSeqI end_region_sum = region_sum;
01009                 gnSeqI end_prev_sum = region_start_sum;
01010                 uint end_regionI = regionI;
01011                 Match* cur_match = mlist[ matchI ];
01012                 while( end_region_sum < absolut( trans_end ) && end_regionI + 2 < seq_regions.size() ){
01013                         end_regionI += 2;
01014 
01015                         Match* left_match = new Match( *cur_match );
01016                         // clip off the part going to the other match
01017                         if( left_match->Start( seqI ) < 0 ){
01018                                 cur_match->CropStart( absolut( iv_orig_start ) + left_match->Length() - end_region_sum - 1);
01019                                 left_match->CropEnd( cur_match->Length() );
01020                         }else{
01021                                 cur_match->CropEnd( absolut( iv_orig_start ) + left_match->Length() - end_region_sum - 1);
01022                                 left_match->CropStart( cur_match->Length() );
01023                         }
01024 
01025                         iv_orig_start += iv_orig_start > 0 ? cur_match->Length(): -cur_match->Length();
01026 
01027                         if( trans_start < 0 )
01028                                 trans_start = -seq_regions[ end_regionI ] - ( -iv_orig_start - end_region_sum ) + 1;
01029                         else if( trans_start > 0 )
01030                                 trans_start = seq_regions[ end_regionI ] + ( iv_orig_start - end_region_sum ) - 1;
01031                         
01032                         left_match->SetStart( seqI, trans_start );
01033 
01034                         cur_match = left_match;
01035                         new_matches.push_back( left_match );
01036 
01037                         end_prev_sum = end_region_sum;
01038                         end_region_sum += seq_regions[ end_regionI + 1 ] - seq_regions[ end_regionI ];
01039 
01040                 }
01041 //              if( end_region_sum == absolut( trans_end ) )
01042 //                      cerr << "Beware of a possible bug in transposeMatches()\n";
01043         }
01044         
01045         // voila... coordinates are translated
01046         mlist.insert( mlist.end(), new_matches.begin(), new_matches.end() );
01047 }
01048 
01049 void ComputeLCBs( MatchList& meml, set<uint>& breakpoints, vector<MatchList>& lcb_list, vector<int64>& weights ){
01050 
01051         // there must be at least one end of a block defined
01052         if( breakpoints.size() < 1 )
01053                 return;
01054                 
01055         lcb_list.clear();
01056         weights.clear();
01057         
01058         // organize the LCBs into different MatchList instances
01059 
01060         set<uint>::iterator break_iter = breakpoints.begin();
01061         uint prev_break = 0;    // prev_break is the first match in the current block
01062         MatchList lcb = meml;
01063         for( ; break_iter != breakpoints.end(); break_iter++ ){
01064                 lcb.clear();
01065                 lcb.insert( lcb.begin(), meml.begin() + prev_break, meml.begin() + *break_iter + 1 );
01066                 prev_break = *break_iter + 1;
01067                 
01068                 // code to filter LCBs based on their coverage
01069                 uint64 coverage;
01070                 GetLCBCoverage( lcb, coverage );
01071                 weights.push_back( coverage );
01072 
01073                 // add the new MatchList to the set if it made the cut
01074                 lcb_list.push_back( lcb );
01075         }
01076 }
01077 
01078 void Aligner::Recursion( MatchList& r_list, Match* r_begin, Match* r_end, boolean nway_only ){
01079         try{
01080         gnSeqI gap_size = 0;
01081         uint seqI = 0;
01082 //      gnSeqI min_gap_size = 0;
01083         boolean create_ok = true;
01084         // create gnSequences for each intervening region
01085         // create a MatchList for the intervening region
01086         MatchList gap_list;
01087         
01088         gap_list.seq_table.reserve( seq_count );
01089         gap_list.sml_table.reserve( seq_count );
01090         vector< int64 > starts;
01091         uint below_cutoff_count = 0;
01092 // 
01093 //      Get the sequence in the intervening gaps between these two matches
01094 //
01095         for( seqI = 0; seqI < seq_count; seqI++ ){
01096                 int64 gap_end = 0;
01097                 int64 gap_start = 0;
01098                 getInterveningCoordinates( r_list.seq_table, r_begin, r_end, seqI, gap_start, gap_end );
01099                 if( (r_end && r_end->Start( seqI ) == NO_MATCH) ||
01100                         (r_begin && r_begin->Start( seqI ) == NO_MATCH )){
01101                         below_cutoff_count++;
01102                         cerr << "It's screwed up\n";
01103                         gap_list.seq_table.push_back( new gnSequence() );
01104                         gap_list.sml_table.push_back( new DNAMemorySML() );
01105                         continue;
01106                 }
01107                 if( gap_end < 0 && gap_start > 0 ){
01108                         create_ok = false;
01109                         cerr << "It's screwed up 2\n";
01110                         break; // bail out on directional inconsistency
01111                 }else if( gap_end < 0 && gap_start > 0 ){
01112                         cerr << "It's screwed up 3\n";
01113                         create_ok = false;
01114                         break;  // bail out on directional inconsistency
01115                 }
01116                 int64 diff = gap_end - gap_start;
01117                 diff = 0 < diff ? diff : 0;
01118                 gap_size = diff < gap_size ? gap_size : diff;
01119 
01120                 if( gap_start == 0 )
01121                         cerr << "scheiss\n";
01122 
01123                 if( debug )
01124                         cout << r_list.seq_table[ seqI ]->length() << endl;
01125 
01126                 if( diff < min_recursive_gap_length )
01127                         below_cutoff_count++;
01128                 starts.push_back( gap_start );
01129                 gnSequence* new_seq = new gnSequence( r_list.seq_table[ seqI ]->subseq( gap_start, diff ) );
01130                 gap_list.seq_table.push_back( new_seq );
01131                 gap_list.sml_table.push_back( new DNAMemorySML() );
01132         }
01133         
01134         // only perform recursive anchoring if the gapped regions are long enough
01135         // otherwise just let ClustalW do the work
01136         if( below_cutoff_count + 1 < seq_count ){
01137                 if( nway_only )
01138                         nway_mh.Clear();
01139                 else
01140                         gap_mh.get().Clear();
01141 
01142                 multimap< uint, uint > mer_sizes;
01143                 // calculate potential mer sizes for searches
01144                 for( seqI = 0; seqI < seq_count; seqI++ ){
01145                         uint search_mer_size = getDefaultSeedWeight( gap_list.seq_table[ seqI ]->length() );
01146                         mer_sizes.insert( multimap< uint, uint >::value_type( search_mer_size, seqI ) );
01147                 }
01148                 multimap< uint, uint >::iterator mer_iter = mer_sizes.end();
01149                 mer_iter--;
01150                 vector< uint > search_seqs;
01151                 while( mer_iter != mer_sizes.end() ){
01152                         uint prev_mer = mer_iter->first;
01153                         uint new_seqs = 0;
01154                         while( true ){
01155                                 if( mer_iter->first < MIN_DNA_SEED_WEIGHT )
01156                                         break;
01157                                 if( mer_iter->first == prev_mer || search_seqs.size() < 2 ){
01158                                         search_seqs.push_back( mer_iter->second );
01159                                         new_seqs++;
01160                                         if( mer_iter == mer_sizes.begin() ){
01161                                                 mer_iter = mer_sizes.end();     // signify that the scan is complete
01162                                                 break;
01163                                         }
01164                                         prev_mer = mer_iter->first;
01165                                         mer_iter--;
01166                                 }else
01167                                         break;
01168                         }
01169 
01170                         if( search_seqs.size() < 2 )
01171                                 break;
01172                         // look for MUMs
01173                         
01174                         //
01175                         //      Create sorted mer lists for the intervening gap region
01176                         //
01177 
01178                         uint64 default_seed = getSeed( prev_mer );
01179                         if( prev_mer < MIN_DNA_SEED_WEIGHT )
01180                                 break;
01181                         for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ ){
01182                                 gap_list.sml_table[ seqI ]->Clear();
01183                                 gap_list.sml_table[ seqI ]->Create( *(gap_list.seq_table[ seqI ]), default_seed );
01184                         }
01185                         //
01186                         //      Find all exact matches in the gap region
01187                         //
01188                         MatchList cur_mems = gap_list;
01189                         cur_mems.clear();
01190                         if( nway_only ){
01191                                 // no sense in searching for matches in subsets!!
01192                                 if( search_seqs.size() < seq_count )
01193                                         continue;
01194                                 nway_mh.ClearSequences();
01195                                 nway_mh.FindMatches( cur_mems );
01196                         }else{
01197                                 gap_mh.get().ClearSequences();
01198                                 gap_mh.get().FindMatches( cur_mems );
01199                         }
01200                         for( size_t mI = 0; mI < cur_mems.size(); ++mI )
01201                                 cur_mems[mI]->Free();
01202                         cur_mems.clear();
01203                 }
01204                 if( nway_only )
01205                         nway_mh.GetMatchList( gap_list );
01206                 else
01207                         gap_mh.get().GetMatchList( gap_list );
01208                 
01209 
01210                 // delete overlaps/inclusions           
01211                 EliminateOverlaps( gap_list );
01212                 // mult. filter after EliminateOverlaps because e.o. may generate some subset matches
01213                 if( nway_only )
01214                         gap_list.MultiplicityFilter( seq_count );
01215                 
01216                 // for anchor accuracy, throw out any anchors that are shorter than the minimum
01217                 // anchor length after EliminateOverlaps()
01218                 gap_list.LengthFilter( MIN_ANCHOR_LENGTH );
01219 
01220         //      if( min_gap_size < search_mer_size )
01221         //              create_ok = false;
01222                 if( gap_list.size() > 0 && create_ok ){
01223 
01224         /*              if( debug ){
01225                                 cout << "Starting mem: " << *r_begin << endl;
01226                                 cout << "Next mem: " << *r_end << endl;
01227                                 list<Match*>::iterator gappy_iter = gap_list.begin();
01228                                 while( gappy_iter != gap_list.end() ){
01229                                         cout << **gappy_iter;
01230                                         cout << endl;
01231                                         gappy_iter++;
01232                                 }
01233                         }
01234         */
01235 
01236                         // move all the matches that were found
01237                         vector< Match* >::iterator mum_iter = gap_list.begin();
01238                         for( ; mum_iter != gap_list.end(); ){
01239                                 boolean add_ok = true;
01240                                 for( uint seqI = 0; seqI < (*mum_iter)->SeqCount(); ++seqI ){
01241                                         int64 gap_start;
01242                                         if( (*mum_iter)->Start( seqI ) == NO_MATCH )
01243                                                 continue;
01244                                         else if( (*mum_iter)->Start( seqI ) < 0 ){
01245                                                 gap_start = r_begin != NULL ? -r_begin->End( seqI ) : 0;
01246                                                 if( gap_start > 0 )
01247         //                                              gap_start = -r_end->Start( seqI ) + r_end->Length() - 1;
01248                                                         gap_start = r_end != NULL ? r_end->Start( seqI ) - r_end->Length() + 1 : 0;
01249                                                 else if( r_begin )
01250                                                         add_ok = false;
01251                                                 (*mum_iter)->SetStart( seqI, (*mum_iter)->Start( seqI ) + gap_start );
01252                                         }else{
01253                                                 // insert them all before mem_iter
01254                                                 gap_start = r_begin != NULL ? r_begin->End( seqI ) : 0;
01255                                                 if( gap_start < 0 ){
01256                                                         gap_start = r_end != NULL ? r_end->Start( seqI ) - r_end->Length() + 1 : 0;
01257                                                         add_ok = false;
01258                                                 }
01259                                                 (*mum_iter)->SetStart( seqI, (*mum_iter)->Start( seqI ) + gap_start );
01260                                         }
01261                                 }
01262                                 if( add_ok )
01263                                         r_list.push_back( *mum_iter );
01264                                 else{
01265                                         (*mum_iter)->Free();
01266                                         (*mum_iter) = NULL;
01267                                 }
01268                                 ++mum_iter;
01269                         }
01270         //              for( ; mum_iter != gap_list.end(); )
01271         //                      match_allocator.Free( *mum_iter );
01272                 }
01273         }
01274         // delete sequences and smls
01275         for( uint seqI = 0; seqI < gap_list.seq_table.size(); ++seqI )
01276                 delete gap_list.seq_table[ seqI ];
01277         for( uint seqI = 0; seqI < gap_list.sml_table.size(); ++seqI )
01278                 delete gap_list.sml_table[ seqI ];
01279                 
01280         gap_list.seq_table.clear();
01281         gap_list.sml_table.clear();
01282         
01283         }catch( gnException& gne ){
01284                 cerr << gne << endl;
01285         }catch( exception& e ){
01286                 cerr << e.what() << endl;
01287         }catch(...){
01288                 cerr << "When I say 'ohhh' you say 'shit'!\n";
01289         }
01290 }
01291 
01292 // compute the gapped alignments between anchors in an LCB
01293 void AlignLCBInParallel( bool collinear_genomes, mems::GappedAligner* gal, MatchList& mlist, Interval& iv, AlnProgressTracker& apt )
01294 {
01295         // check whether this function can do anything useful...
01296         if( !collinear_genomes && mlist.size() < 2 ){
01297                 iv.SetMatches( mlist );
01298                 return;
01299         }
01300         size_t galI = 0;
01301         vector<GappedAlignment*> gapped_alns(mlist.size()+1, NULL);
01302         vector<int> success(gapped_alns.size(), 0);
01303         gnSeqI progress_base = apt.cur_leftend;
01304 //#pragma omp parallel for
01305         for( int mI = 0; mI < mlist.size()-1; mI++ )
01306         {
01307                 // align the region between mI and mI+1
01308                 GappedAlignment ga(mlist.seq_table.size(),0);
01309                 gapped_alns[mI] = ga.Copy();
01310 
01311                 bool align_success = gal->Align( *(gapped_alns[mI]), mlist[mI], mlist[mI+1], mlist.seq_table );
01312                 if(align_success)
01313                         success[mI] = 1;
01314                 if(mI % 50 == 0 && mI > 0)
01315                 {
01316                         // update and print progress
01317                         int done = 0;
01318                         for( int i = 0; i < gapped_alns.size(); i++ )
01319                                 if(gapped_alns[i] != NULL)
01320                                         done++;
01321 //#pragma omp critical
01322 {
01323                         double cur_progress = ((double)(progress_base+done) / (double)apt.total_len)*100.0;
01324                         printProgress((uint)apt.prev_progress, (uint)cur_progress, cout);
01325                         apt.prev_progress = cur_progress;
01326 }
01327                 }
01328         }
01329         apt.cur_leftend += mlist.size()-1;
01330 
01331         // merge the alignments and anchors back together
01332         vector<AbstractMatch*> merged(mlist.size()*2 + 1);
01333         size_t mlistI = 0;
01334         size_t gappedI = 0;
01335         bool turn = true;
01336         size_t mJ = 0;
01337 
01338         // check if genomes are collinear and get the start and end alignments if necessary
01339         if(collinear_genomes)
01340         {
01341                 GappedAlignment ga_tmp(mlist.seq_table.size(),0);
01342                 GappedAlignment* ga = ga_tmp.Copy();
01343                 bool align_success = gal->Align( *ga, NULL, mlist[0], mlist.seq_table );
01344                 if(align_success)
01345                         merged[mJ++] = ga;
01346                 gapped_alns[mlist.size()] = ga_tmp.Copy();
01347                 align_success = gal->Align( *(gapped_alns[mlist.size()]), mlist.back(), NULL, mlist.seq_table );
01348                 if(align_success)
01349                         success[mlist.size()] = 1;
01350         }
01351         for( ; mJ < merged.size() && mlistI < mlist.size();  )
01352         {
01353                 if(turn)
01354                         merged[mJ++] = mlist[mlistI++];
01355                 else if(success[gappedI])
01356                         merged[mJ++] = gapped_alns[gappedI++];
01357                 else
01358                         gappedI++;
01359                 turn = !turn;
01360         }
01361         // add the last alignment
01362         if( success[mlist.size()]==1 )
01363                 merged[mJ++] = gapped_alns.back();
01364         merged.resize(mJ);
01365 
01366         iv.SetMatches(merged);
01367 }
01368 
01369 // compute the gapped alignments between anchors in an LCB
01370 void Aligner::AlignLCB( MatchList& mlist, Interval& iv ){
01371         // check whether this function can do anything useful...
01372         if( !collinear_genomes && mlist.size() < 2 ){
01373                 iv.SetMatches( mlist );
01374                 return;
01375         }
01376 
01377         vector< AbstractMatch* > iv_matches;
01378         boolean debug_recurse = false;
01379         int64 config_value = 138500;
01380         int print_interval = 50;
01381         try{
01382         list< Match* > match_list;
01383         match_list.insert( match_list.end(), mlist.begin(), mlist.end() );
01384         mlist.clear();
01385         MatchList r_list = mlist;
01386 
01387         list< Match* >::iterator recurse_iter = match_list.begin();
01388         list< Match* >::iterator recurse_prev = match_list.begin();
01389         // scan ahead to the first n-way matches
01390         while( recurse_prev != match_list.end() && (*recurse_prev)->Multiplicity() != seq_count )
01391                 ++recurse_prev;
01392 
01393         recurse_iter = recurse_prev;
01394         if( !collinear_genomes ){
01395                 if( recurse_iter != match_list.end() )
01396                         ++recurse_iter;
01397                 while( recurse_iter != match_list.end() && (*recurse_iter)->Multiplicity() != seq_count )
01398                         ++recurse_iter;
01399         }else
01400                 cout << "Assuming collinear genomes...\n";
01401         
01402         uint memI = 0;
01403         uint matchI = 0;
01404         while( true ){
01405                 if( memI >= print_interval && memI % print_interval == 0 || debug)
01406                         cout << "Number: " << memI << " match " << **recurse_prev << endl;
01407                 ++memI;
01408                 if( debug_recurse ){
01409                         cout << "Recursing on " << endl;
01410                         if( recurse_prev != match_list.end() )
01411                                 cout << **recurse_prev << " and " << endl;
01412                         if( recurse_iter != match_list.end() )
01413                                 cout << **recurse_iter << endl;
01414                 }
01415                 
01416                 if( recurse_prev != match_list.end() && (*recurse_prev)->Start( 0 ) == config_value )
01417                         cout << "";
01418                 
01419                 // recurse on a pair of matches! 
01420                 // this function should locate all matches between the two iterators
01421                 // and add them to r_list               
01422                 r_list.clear();
01423                 GappedAlignment* cr = NULL;
01424                 boolean align_success = false;
01425                 
01426                 Match* r_lend = NULL;
01427                 Match* r_rend = NULL;
01428                 if( recurse_iter != recurse_prev )
01429                         r_lend = *recurse_prev;
01430                 if( recurse_iter != match_list.end() )
01431                         r_rend = *recurse_iter;
01432 
01433                 // attempt a clustalW alignment
01434                 cr = new GappedAlignment();
01435                 align_success = gal->Align( *cr, r_lend, r_rend, r_list.seq_table );
01436 
01437                 // add the gapped alignment to the Interval
01438                 if( r_lend != NULL )
01439                         iv_matches.push_back( r_lend );
01440                 if( align_success )
01441                         iv_matches.push_back( cr );
01442 
01443                 // scan ahead to the next pair of n-way matches
01444                 recurse_prev = recurse_iter;
01445                 if( recurse_iter != match_list.end() )
01446                         ++recurse_iter;
01447                 while( recurse_iter != match_list.end() && (*recurse_iter)->Multiplicity() != seq_count )
01448                         ++recurse_iter;
01449 
01450                 if( ( recurse_iter == match_list.end() && !collinear_genomes ) ||
01451                                 ( recurse_prev == match_list.end() && collinear_genomes ) )
01452                                 break;
01453         }
01454         // get the last little bit at the end of the LCB.
01455         list< Match* >::iterator iter = recurse_prev;
01456         for( ; iter != recurse_iter; ++iter )
01457                 iv_matches.push_back(*iter);
01458 
01459         mlist.insert( mlist.end(), match_list.begin(), match_list.end() );
01460         iv.SetMatches(iv_matches); 
01461 
01462         }catch( gnException& gne ){
01463                 cerr << gne << endl;
01464         }catch(exception& e){
01465                 cerr << e.what();
01466         }catch(...){
01467                 cerr << "matrix exception?\n";
01468         }
01469 }
01470 
01471 // just search each intervening region once for matches, no gapped alignment...
01472 void Aligner::SearchWithinLCB( MatchList& mlist, std::vector< search_cache_t >& new_cache, bool leftmost, bool rightmost){
01473         // check whether this function can do anything useful...
01474         if( !(leftmost || rightmost) && mlist.size() < 2 )
01475                 return;
01476 
01477         boolean debug_recurse = false;
01478         int64 config_value = 138500;
01479         int print_interval = 50;
01480 
01481         try{
01482         list< Match* > match_list;
01483         match_list.insert( match_list.end(), mlist.begin(), mlist.end() );
01484         mlist.clear();
01485         MatchList r_list = mlist;
01486 
01487         list< Match* >::iterator recurse_iter = match_list.begin();
01488         list< Match* >::iterator recurse_prev = match_list.begin();
01489         if( !leftmost && recurse_iter != match_list.end() )
01490                 ++recurse_iter;
01491         
01492         uint memI = 0;
01493         uint matchI = 0;
01494         while( recurse_prev != match_list.end() ){
01495                 if( memI >= print_interval && memI % print_interval == 0 || debug)
01496                         cout << "Number: " << memI << " match " << **recurse_prev << endl;
01497                 ++memI;
01498                 if( debug_recurse ){
01499                         cout << "Recursing on " << endl;
01500                         if( recurse_prev != match_list.end() )
01501                                 cout << **recurse_prev << " and " << endl;
01502                         if( recurse_iter != match_list.end() )
01503                                 cout << **recurse_iter << endl;
01504                 }
01505                 
01506                 
01507                 // recurse on a pair of matches! 
01508                 // this function should locate all matches between the two iterators
01509                 // and add them to r_list               
01510                 r_list.clear();
01511                 Match* r_left = NULL;
01512                 Match* r_right = NULL;
01513                 if( recurse_iter == match_list.begin() && leftmost ){
01514                         r_left = NULL;
01515                         r_right = *recurse_iter;
01516                 }else if( recurse_iter == match_list.end() && rightmost ){
01517                         r_left = *recurse_prev;
01518                         r_right = NULL;
01519                 }else{
01520                         r_left = *recurse_prev;
01521                         r_right = *recurse_iter;
01522                 }
01523                 // check the cache to see whether this search has already been done!
01524 
01525                 search_cache_t cacheval = make_pair( r_left, r_right );
01526                 if( cacheval.first != NULL )
01527                         cacheval.first = cacheval.first->Copy();
01528                 if( cacheval.second != NULL )
01529                         cacheval.second = cacheval.second->Copy();
01530                 std::vector< search_cache_t >::iterator cache_entry = std::upper_bound( search_cache.begin(), search_cache.end(), cacheval, cache_comparator );
01531                 if( cache_entry == search_cache.end() || 
01532                         (cache_comparator( cacheval, *cache_entry ) || cache_comparator( *cache_entry, cacheval )) )
01533                 {
01534                         // search this region
01535                         Recursion( r_list, r_left, r_right, true );
01536                 }
01537                 new_cache.push_back( cacheval );
01538 
01539                 if( debug_recurse ){
01540                         vector< Match* >::iterator r_iter = r_list.begin();
01541                         cout << "Found matches " << endl;
01542                         for(; r_iter != r_list.end(); ++r_iter )
01543                                 cout << **r_iter << endl;
01544                 }
01545 
01546                 // insert any n-way matches into the match list
01547                 for( matchI = 0; matchI < r_list.size(); ++matchI ){
01548                         if( r_list[ matchI ]->Multiplicity() == seq_count ){
01549                                 match_list.insert( recurse_iter, r_list[ matchI ] );
01550                         }else
01551                         {
01552                                 r_list[matchI]->Free();
01553                                 r_list[matchI] = NULL;
01554                         }
01555                 }
01556 
01557                 // move ahead to the next pair of n-way matches
01558                 recurse_prev = recurse_iter;
01559                 if( recurse_iter != match_list.end() )
01560                         ++recurse_iter;
01561                 
01562                 // break early if we aren't assuming genome collinearity
01563                 if( !rightmost && recurse_iter == match_list.end() )
01564                         break;
01565                         
01566         }
01567 
01568         mlist.insert( mlist.begin(), match_list.begin(), match_list.end() );
01569 
01570         }catch( gnException& gne ){
01571                 cerr << gne << endl;
01572         }catch(exception& e){
01573                 cerr << e.what();
01574         }catch(...){
01575                 cerr << "matrix exception?\n";
01576         }
01577 
01578         // Multiplicity Filter...
01579         mlist.MultiplicityFilter( seq_count );
01580         EliminateOverlaps( mlist );
01581         // E.O. can create some matches of lower multiplicity
01582         mlist.MultiplicityFilter( seq_count );
01583 }
01584 
01585 void Aligner::consistencyCheck( uint lcb_count, vector< LCB >& adjacencies, vector< MatchList >& lcb_list, vector< int64 >& weights ){
01586         vector< LCB > tmp_adj = adjacencies;
01587         vector< MatchList > tmp_lcbs = lcb_list;
01588         vector< int64 > tmp_weights = weights;
01589         filterMatches( tmp_adj, tmp_lcbs, tmp_weights );
01590         MatchList emmlist;
01591         for( uint lcbI = 0; lcbI < tmp_lcbs.size(); lcbI++ )
01592                 emmlist.insert( emmlist.end(), tmp_lcbs[ lcbI ].begin(), tmp_lcbs[ lcbI ].end() );
01593         set< uint > breakpoints;
01594         AaronsLCB( emmlist, breakpoints );
01595         
01596         // do the correct number of LCBs exist?
01597         if( lcb_count != tmp_lcbs.size() ){
01598                 cerr << "lcb_count: " << lcb_count << "\ttmp_lcbs.size(): " << tmp_lcbs.size() << endl;
01599         }
01600         if( lcb_count != breakpoints.size() ){
01601                 cerr << "lcb_count: " << lcb_count << "\tbreakpoints.size(): " << breakpoints.size() << endl;
01602         }
01603         if( tmp_lcbs.size() != breakpoints.size() ){
01604                 cerr << "tmp_lcbs.size(): " << tmp_lcbs.size() << "\tbreakpoints.size(): " << breakpoints.size() << endl;
01605         }
01606 }
01607 
01608 
01615 int64 greedyBreakpointElimination( gnSeqI minimum_weight, vector< LCB >& adjacencies, vector< int64 >& weights, ostream* status_out ){
01616         // repeatedly remove the low weight LCBs until the minimum weight criteria is satisfied
01617         uint lcbI = 0;
01618         vector< uint > low_weight;
01619         bool have_weight = false;
01620         gnSeqI min_weight = 0;
01621         gnSeqI prev_min_weight = 0;
01622         uint min_lcb = 0;
01623         uint lcb_count = adjacencies.size();
01624         boolean debug_bp_elimination = false;
01625         uint current_lcbI = 0;  
01627         if( adjacencies.size() == 0 )
01628                 return 0;       // nothing can be done
01629         uint seq_count = adjacencies[0].left_end.size();
01630         
01631         while( min_weight < minimum_weight ){
01632                 if( lcb_count == 1 )
01633                         break;  // if only a single LCB remains, don't remove it
01634 
01635                 while(true){
01636                         have_weight = false;
01637                         min_weight = 0;
01638                         current_lcbI = 0;       // always scan the entire set
01639 
01640                         // start with current_lcbI since everything up to it has already been scanned
01641                         for( lcbI = current_lcbI; lcbI < weights.size(); lcbI++ ){
01642                                 if( adjacencies[ lcbI ].lcb_id != lcbI ){
01643                                         // this lcb has been removed or merged with another lcb
01644                                         continue;
01645                                 }
01646                                 if( weights[ lcbI ] < min_weight || !have_weight ){
01647                                         min_weight = weights[ lcbI ];
01648                                         min_lcb = lcbI;
01649                                         have_weight = true;
01650                                         if( min_weight == prev_min_weight && current_lcbI > 0 )
01651                                                 break;  // we've already found a minimum
01652                                                                 // weight LCB, stop here to save some searching
01653                                 }
01654                         }
01655                         lcbI = min_lcb;
01656                         have_weight = false;
01657                         // if the min weight changed then scan the entire set from the beginning
01658                         if( prev_min_weight != min_weight ){
01659                                 if( status_out != NULL )
01660                                         *status_out << "There are " << lcb_count << " LCBs with minimum weight " << min_weight << endl;
01661 
01662                                 current_lcbI = 0;
01663                                 prev_min_weight = min_weight;
01664                                 continue;
01665                         }
01666 
01667                         // save time by skipping LCBs that have already been scanned
01668                         current_lcbI = min_lcb;
01669                         break;
01670                 }
01671                 
01672 //              consistencyCheck( lcb_count, adjacencies, lcb_list, weights );
01673                 if( min_weight >= minimum_weight )
01674                         break;
01675 
01676                 // actually remove the LCBs now
01677                 // (only remove a single LCB for now -- it's easier to calculate adjacencies)
01678 
01679                 // remove this LCB
01680                 adjacencies[ lcbI ].lcb_id = -2;
01681                 
01682                 // update adjacencies
01683                 uint seqI;
01684                 uint left_adj;
01685                 uint right_adj;
01686                 for( seqI = 0; seqI < seq_count; seqI++ ){
01687                         left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
01688                         right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
01689                         if( debug_bp_elimination ){
01690                                 if( left_adj == -2 || right_adj == -2 ){
01691                                         cerr << "improper linking\n";
01692                                 }
01693                                 // for debugging, check for consistency:
01694                                 if( left_adj != -1 && adjacencies[ left_adj ].right_adjacency[ seqI ] != lcbI )
01695                                         cerr << "Mutiny on the bounty!\n";
01696                                 // for debugging, check for consistency
01697                                 if( right_adj == adjacencies.size() )
01698                                         cerr << "Horrible Error -399a\n";
01699                                 if( right_adj != -1 && adjacencies[ right_adj ].left_adjacency[ seqI ] != lcbI )
01700                                         cerr << "Mutiny on the bounty!\n";
01701                         }
01702                         if( left_adj != -1 )
01703                                 adjacencies[ left_adj ].right_adjacency[ seqI ] = right_adj;
01704                         if( right_adj != -1 && right_adj != adjacencies.size() )
01705                                 adjacencies[ right_adj ].left_adjacency[ seqI ] = left_adj;
01706                         
01707                 }
01708                 // just deleted an lcb, drop the lcb count
01709                 lcb_count--;
01710 
01711                 // check for collapse
01712                 for( seqI = 0; seqI < seq_count; seqI++ ){
01713                         left_adj = adjacencies[ lcbI ].left_adjacency[ seqI ];
01714                         right_adj = adjacencies[ lcbI ].right_adjacency[ seqI ];
01715                         if( left_adj == -1 || right_adj == -1 )
01716                                 continue;       // can't collapse with a non-existant LCB!
01717 
01718                         if( debug_bp_elimination ){
01719                                 if( right_adj == adjacencies.size() )
01720                                         cerr << "Horrible Error -399a\n";
01721                                 // check whether this LCB has already been merged
01722                                 if( left_adj != adjacencies[ left_adj ].lcb_id ||
01723                                         right_adj != adjacencies[ right_adj ].lcb_id ){
01724                                         // because adjacency pointers are always updated to point to the 
01725                                         // representative entry of an LCB, the lcb_id and the array index
01726                                         // should always be identical
01727                                         cerr << "improper linking\n";
01728                                         continue;
01729                                 }
01730                                 if( left_adj == -2 || right_adj == -2 ){
01731                                         cerr << "improper linking\n";
01732                                 }
01733                         }
01734 
01735                         // check whether the two LCBs are adjacent in each sequence
01736                         boolean orientation = adjacencies[ left_adj ].left_end[ seqI ] > 0 ? true : false;
01737                         uint seqJ;
01738                         for( seqJ = 0; seqJ < seq_count; seqJ++ ){
01739                                 boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
01740                                 if( j_orientation == orientation &&
01741                                         adjacencies[ left_adj ].right_adjacency[ seqJ ] != right_adj )
01742                                         break;
01743                                 if( j_orientation != orientation &&
01744                                         adjacencies[ left_adj ].left_adjacency[ seqJ ] != right_adj )
01745                                         break;
01746                                 // check that they are both in the same orientation
01747                                 if( adjacencies[ right_adj ].left_end[ seqJ ] > 0 != j_orientation )
01748                                         break;
01749                         }
01750 
01751                         if( seqJ != seq_count )
01752                                 continue;
01753                         
01754 
01755                         // these two can be collapsed
01756                         // do it.  do it now.
01757                         adjacencies[ right_adj ].lcb_id = left_adj;
01758                         if( adjacencies[ right_adj ].lcb_id == -1 ||
01759                                 adjacencies[ right_adj ].lcb_id == -2 )
01760                                 cerr << "Trouble in the eleventh circle\n";
01761                         weights[ left_adj ] += weights[ right_adj ];
01762                         // unlink right_adj from the adjacency list and
01763                         // update left and right ends of left_adj
01764                         for( seqJ = 0; seqJ < seq_count; seqJ++ ){
01765                                 boolean j_orientation = adjacencies[ left_adj ].left_end[ seqJ ] > 0;
01766                                 uint rr_adj = adjacencies[ right_adj ].right_adjacency[ seqJ ];
01767                                 uint rl_adj = adjacencies[ right_adj ].left_adjacency[ seqJ ];
01768                                 if( j_orientation == orientation ){
01769                                         adjacencies[ left_adj ].right_end[ seqJ ] = adjacencies[ right_adj ].right_end[ seqJ ];
01770                                         adjacencies[ left_adj ].right_adjacency[ seqJ ] = rr_adj;
01771                                         if( rr_adj == adjacencies.size() )
01772                                                 cerr << "Horrible Error -399a\n";
01773                                         if( rr_adj != -1 )
01774                                                 adjacencies[ rr_adj ].left_adjacency[ seqJ ] = left_adj;
01775                                 }else{
01776                                         adjacencies[ left_adj ].left_end[ seqJ ] = adjacencies[ right_adj ].left_end[ seqJ ];
01777                                         adjacencies[ left_adj ].left_adjacency[ seqJ ] = rl_adj;
01778                                         if( rl_adj == adjacencies.size() )
01779                                                 cerr << "Horrible Error -399a\n";
01780                                         if( rl_adj != -1 )
01781                                                 adjacencies[ rl_adj ].right_adjacency[ seqJ ] = left_adj;
01782                                 }
01783                                 // update lcbI's adjacency links to point nowhere
01784                                 if( adjacencies[ lcbI ].left_adjacency[ seqJ ] == right_adj )
01785                                         adjacencies[ lcbI ].left_adjacency[ seqJ ] = left_adj;
01786                                 if( adjacencies[ lcbI ].right_adjacency[ seqJ ] == right_adj )
01787                                         adjacencies[ lcbI ].right_adjacency[ seqJ ] = left_adj;
01788 
01789 
01790                         }
01791                         // just collapsed an lcb, decrement
01792                         lcb_count--;
01793                 }
01794         }
01795         return min_weight;
01796 }
01797 
01798 class LCBLeftEndComp
01799 {
01800 public:
01801         LCBLeftEndComp() : ssc(0) {};
01802         bool operator()( const MatchList& a, const MatchList& b )
01803         {
01804                 return ssc(a.front(), b.front());
01805         }
01806 protected:
01807         SingleStartComparator<AbstractMatch> ssc;
01808 };
01809 
01814 void filterMatches( vector< LCB >& adjacencies, vector< MatchList >& lcb_list, vector< int64 >& weights ){
01815         if( lcb_list.size() < 1 )
01816                 return;
01817         MatchList lcb_tmp = lcb_list[ 0 ];
01818         lcb_tmp.clear();
01819         vector< MatchList > filtered_lcbs = vector< MatchList >( lcb_list.size(), lcb_tmp );
01820         uint lcbI;
01821         for( lcbI = 0; lcbI < adjacencies.size(); lcbI++ ){
01822                 if( adjacencies[ lcbI ].lcb_id == lcbI ){
01823                         filtered_lcbs[ lcbI ].insert( filtered_lcbs[ lcbI ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
01824                         continue;
01825                 }
01826                 if( adjacencies[ lcbI ].lcb_id == -1 ){
01827                         cerr << "weird";
01828                         continue;       // this one was removed
01829                 }
01830                 if( adjacencies[ lcbI ].lcb_id == -2 )
01831                         continue;       // this one was removed
01832 
01833                 // this one points elsewhere
01834                 // search and update the union/find structure for the target
01835                 stack< uint > visited_lcbs;
01836                 visited_lcbs.push( lcbI );
01837                 uint cur_lcb = adjacencies[ lcbI ].lcb_id;
01838                 while( adjacencies[ cur_lcb ].lcb_id != cur_lcb ){
01839                         visited_lcbs.push( cur_lcb );
01840                         cur_lcb = adjacencies[ cur_lcb ].lcb_id;
01841                         if( cur_lcb == -1 || cur_lcb == -2 ){
01842 //                              cerr << "improper hoodidge\n";
01843                                 break;  // this one points to an LCB that got deleted
01844                         }
01845                 }
01846                 while( visited_lcbs.size() > 0 ){
01847                         adjacencies[ visited_lcbs.top() ].lcb_id = cur_lcb;
01848                         visited_lcbs.pop();
01849                 }
01850                 // add this LCB's matches to the target LCB.
01851                 if( cur_lcb != -1 && cur_lcb != -2 )
01852                         filtered_lcbs[ cur_lcb ].insert( filtered_lcbs[ cur_lcb ].end(), lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end() );
01853         }
01854 
01855 
01856         lcb_list.clear();
01857         vector< int64 > new_weights;
01858         for( lcbI = 0; lcbI < filtered_lcbs.size(); lcbI++ ){
01859                 if( filtered_lcbs[ lcbI ].size() > 0 ){
01860                         lcb_list.push_back( filtered_lcbs[ lcbI ] );
01861                         uint64 wt = 0;
01862                         GetLCBCoverage( filtered_lcbs[lcbI], wt );
01863                         new_weights.push_back( wt );
01864 //                      if( new_weights[ new_weights.size() - 1 ] != weights[ lcbI ] ){
01865 //                              cerr << "Error: Have you lost weight Susan? difference: " << new_weights[ new_weights.size() - 1 ] - weights[ lcbI ] << "\n";
01866 //                      }
01867                 }
01868         }
01869 
01870         // sort the matches inside consolidated LCBs
01871         MatchStartComparator<AbstractMatch> msc( 0 );
01872         for( lcbI = 0; lcbI < lcb_list.size(); lcbI++ ){
01873                 sort( lcb_list[ lcbI ].begin(), lcb_list[ lcbI ].end(), msc );
01874         }
01875 
01876         // sort the LCBs themselves
01877         LCBLeftEndComp llec;
01878         std::sort( lcb_list.begin(), lcb_list.end(), llec );
01879 
01880         // calculate the LCB adjacencies
01881         weights = new_weights;
01882         computeLCBAdjacencies_v2( lcb_list, weights, adjacencies );
01883 
01884 }
01885 
01886 void Aligner::WritePermutation( vector< LCB >& adjacencies, std::string out_filename )
01887 {
01888         ofstream permutation_out( out_filename.c_str() );
01889         if( !permutation_out.is_open() )
01890         {
01891                 cerr << "Error opening " << out_filename << endl;
01892                 return;
01893         }
01894         for( int seqI = 0; seqI < seq_count; seqI++ )
01895         {
01896                 // find the left-most LCB in this genome
01897                 int left_lcb = 0;
01898                 for( ; left_lcb < adjacencies.size(); left_lcb++ )
01899                 {
01900                         uint left_adj = adjacencies[left_lcb].left_adjacency[seqI];
01901                         if( left_adj == -1 )
01902                                 break;
01903                 }
01904                 // write out lcb id's in order
01905                 for( uint lcbI = left_lcb; lcbI < adjacencies.size(); )
01906                 {
01907                         if( lcbI != left_lcb )
01908                                 permutation_out << '\t';
01909                         if( adjacencies[lcbI].left_end[seqI] < 0 )
01910                                 permutation_out << "-";
01911                         permutation_out << adjacencies[lcbI].lcb_id;
01912                         lcbI = adjacencies[lcbI].right_adjacency[seqI];
01913                 }
01914                 permutation_out << endl;
01915         }
01916 }
01917 
01918 void WritePermutationCoordinates( IntervalList& perm_iv_list, std::string out_filename )
01919 {
01920         ofstream perm_out( out_filename.c_str() );
01921         if( !perm_out.is_open() )
01922         {
01923                 cerr << "Error opening \"" << out_filename << "\"\n";
01924                 return;
01925         }
01926         perm_out << "#";
01927         for( size_t seqI = 0; seqI < perm_iv_list.seq_table.size(); ++seqI )
01928         {
01929                 if( seqI > 0 )
01930                         perm_out << '\t';
01931                 perm_out << "seq" << seqI << "_leftend\tseq" << seqI << "_rightend";
01932         }
01933         perm_out << endl;
01934         for( size_t ivI = 0; ivI < perm_iv_list.size(); ++ivI )
01935         {
01936                 for( size_t seqI = 0; seqI < perm_iv_list.seq_table.size(); ++seqI )
01937                 {
01938                         if( seqI > 0 )
01939                                 perm_out << '\t';
01940                         if( perm_iv_list[ivI].Orientation(seqI) == AbstractMatch::reverse )
01941                                 perm_out << '-';
01942                         perm_out << perm_iv_list[ivI].LeftEnd(seqI) << '\t';
01943                         if( perm_iv_list[ivI].Orientation(seqI) == AbstractMatch::reverse )
01944                                 perm_out << '-';
01945                         perm_out << perm_iv_list[ivI].RightEnd(seqI);
01946                 }
01947                 perm_out << endl;
01948         }
01949 }
01950 
01951 void Aligner::RecursiveAnchorSearch( MatchList& mlist, gnSeqI minimum_weight, vector< MatchList >& LCB_list, boolean entire_genome, ostream* status_out ){
01952 
01953 //
01954 // Step 4) Identify regions of collinearity (LCBs) among the remaining n-way multi-MUMs
01955 //
01956         uint lcbI;
01957         set<uint> breakpoints;
01958         vector< int64 > weights;
01959         vector< LCB > adjacencies;
01960         MatchList new_matches;
01961         new_matches.seq_table = mlist.seq_table;
01962         new_matches.seq_filename = mlist.seq_filename;
01963 
01964         if( mlist.size() == 0 )
01965                 return;
01966 
01967         AaronsLCB( mlist, breakpoints );
01968         if( status_out )
01969                 *status_out << "The " << mlist.size() << " matches constitute " << breakpoints.size() << " breakpoints\n";
01970         // organize the LCBs into different MatchList instances (inside of LCB_list)
01971         ComputeLCBs( mlist, breakpoints, LCB_list, weights );
01972         uint weightI;
01973         for( weightI = 0; weightI < weights.size(); weightI++ )
01974                 if( weights[weightI] < cur_min_coverage || cur_min_coverage == -1 )
01975                         cur_min_coverage = weights[weightI];
01976 
01977         computeLCBAdjacencies_v2( LCB_list, weights, adjacencies );
01978 
01979         int cur_extension_round = 0;
01980         int64 total_weight = 0;
01981         int64 prev_total_weight = 0;
01982         weightI = 0;
01983         vector< vector< int64 > > prev_iv_regions;
01984         do {
01985 
01986 //              for( ; weightI < weights.size(); weightI++ )
01987 //                      total_weight += weights[ weightI ];
01988 
01989                 int64 extension_weight = total_weight;
01990                 int64 prev_extension_weight = total_weight;
01991 
01992                 // only search outside existing LCBs on the whole-genome scale to save time
01993                 if( entire_genome && extend_lcbs && total_weight != 0 &&
01994                         cur_extension_round < this->max_extension_iters )
01995                 {
01996                         cur_extension_round++;
01997                         if( status_out )
01998                                 *status_out << "Performing LCB extension\n";
01999                         vector< vector< int64 > > cur_iv_regions;
02000                         CreateGapSearchList( adjacencies, new_matches.seq_table, cur_iv_regions, entire_genome );
02001                         // only do the search if there's something new to search
02002                         if( prev_iv_regions != cur_iv_regions )
02003                         {
02004                                 int local_round = 0;
02005                                 do {
02006                                         local_round++;
02007                                         // search the gaps between the LCBs to extend the ends of LCBs
02008                                         new_matches.clear();
02009                                         vector< vector< int64 > > new_iv_regions;
02010                                         CreateGapSearchList( adjacencies, new_matches.seq_table, new_iv_regions, entire_genome );
02011                                         SearchLCBGaps( new_matches, new_iv_regions, nway_mh );
02012                                         mlist.insert( mlist.end(), new_matches.begin(), new_matches.end() );
02013                                         
02014                                         AaronsLCB( mlist, breakpoints );
02015                                         ComputeLCBs( mlist, breakpoints, LCB_list, weights );
02016                                         cur_min_coverage = *(std::min_element(weights.begin(), weights.end()));
02017                                         computeLCBAdjacencies_v2( LCB_list, weights, adjacencies );
02018 
02019                                         // calculate the new total LCB weight
02020                                         prev_extension_weight = extension_weight;
02021                                         extension_weight = 0;
02022                                         for( weightI = 0; weightI < weights.size(); weightI++ )
02023                                                 extension_weight += weights[ weightI ];
02024                                         if( status_out )
02025                                                 *status_out << "Previous weight: " << prev_extension_weight << " new weight: " << extension_weight << endl;
02026                                         if( prev_extension_weight > extension_weight ){
02027                                                 cerr << "Error! Previous weight: " << prev_extension_weight << " new weight: " << extension_weight << endl;
02028                                         }
02029                                 }while( extension_weight > prev_extension_weight && local_round < this->max_extension_iters);
02030                         }
02031                         swap( prev_iv_regions, cur_iv_regions );
02032                 }
02033                 
02034                 // now search within LCBs
02035                 if( currently_recursing && total_weight != 0 ){
02036                         vector< search_cache_t > new_cache;
02037                         for( lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
02038 //                              if( status_out )
02039 //                                      *status_out << "Searching in LCB: " << lcbI << endl;
02040                                 int prev_size = LCB_list[ lcbI ].size();
02041                                 bool leftmost = true;
02042                                 for( int i = 0; leftmost && i < adjacencies[lcbI].left_adjacency.size(); i++ )
02043                                         if(adjacencies[lcbI].left_adjacency[i] != NO_ADJACENCY)
02044                                                 leftmost = false;
02045                                 bool rightmost = true;
02046                                 for( int i = 0; rightmost && i < adjacencies[lcbI].right_adjacency.size(); i++ )
02047                                         if(adjacencies[lcbI].right_adjacency[i] != NO_ADJACENCY)
02048                                                 rightmost = false;
02049                                 SearchWithinLCB( LCB_list[ lcbI ], new_cache, leftmost, rightmost );
02050 //                              if( status_out )
02051 //                                      *status_out << "Gained " << LCB_list[ lcbI ].size() - prev_size << " matches\n";
02052 
02053                         }
02054 
02055                         // delete the previous search cache
02056                         swap( search_cache, new_cache );
02057                         for( size_t mI = 0; mI < new_cache.size(); mI++ )
02058                         {
02059                                 if( new_cache[mI].first != NULL )
02060                                         new_cache[mI].first->Free();
02061                                 if( new_cache[mI].second != NULL )
02062                                         new_cache[mI].second->Free();
02063                         }
02064                         new_cache.clear();
02065                         std::sort( search_cache.begin(), search_cache.end(), cache_comparator );
02066                 }
02067                 
02068                 mlist.clear();
02069                 for( lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
02070                         mlist.insert( mlist.end(), LCB_list[ lcbI ].begin(), LCB_list[ lcbI ].end() );
02071                 }
02072 
02073                 if( currently_recursing && total_weight != 0 ){
02074                         // remove low weight LCBs, while searching coalesced regions
02075                         AaronsLCB( mlist, breakpoints );
02076                         ComputeLCBs( mlist, breakpoints, LCB_list, weights );
02077                         computeLCBAdjacencies_v2( LCB_list, weights, adjacencies );
02078                         cur_min_coverage = *(std::min_element(weights.begin(), weights.end()));
02079                 }
02080 
02081                 
02082                 // write  alist for debugging
02083 //              ofstream debug_match_list( "debug_match_list.txt" );
02084 //              mlist.WriteList( debug_match_list );
02085 //              debug_match_list.close();
02086 
02087 //
02088 // Step 6) Use greedy breakpoint elimination to remove low-weight LCBs
02089 //
02090                 int64 cur_perm_weight = permutation_weight != -1 ? permutation_weight : minimum_weight;
02091                 do{
02092                         vector<double> m_weights(weights.size());
02093                         std::copy( weights.begin(), weights.end(), m_weights.begin());
02094                         SimpleBreakpointScorer sbs(adjacencies, cur_perm_weight, this->collinear_genomes);
02095                         if( status_out )
02096                                 (*status_out) << "Performing greedy breakpoint elimination (this may take some time)\n";
02097 
02098                         greedyBreakpointElimination_v4(adjacencies, m_weights, sbs, NULL, false);
02099 //                      cur_min_coverage = greedyBreakpointElimination( cur_perm_weight, adjacencies, weights, status_out );
02100 //                      MatchList deleted_matches;
02101                         filterMatches( adjacencies, LCB_list, weights );
02102                         cur_min_coverage = *(std::min_element(weights.begin(), weights.end()));
02103                         
02104                         mlist.clear();
02105                         for( lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
02106                                 mlist.insert( mlist.end(), LCB_list[ lcbI ].begin(), LCB_list[ lcbI ].end() );
02107                         }
02108                         if( status_out )
02109                                 *status_out << "Greedy breakpoint elimination leaves " << mlist.size() << " matches constituting " << LCB_list.size() << " LCBs covering at least " << cur_min_coverage << "b.p.\n";
02110                         
02111                         if( permutation_weight != -1 ){
02112                                 // construct a filename
02113                                 stringstream cur_perm_filename;
02114                                 cur_perm_filename << permutation_filename << "." << cur_perm_weight / seq_count;
02115                                 // output the permutation
02116                                 WritePermutation( adjacencies, cur_perm_filename.str() );
02117 
02118                                 // also write out condensed interval data for the permutation
02119                                 cur_perm_filename << ".lcbs";
02120                                 IntervalList perm_iv_list;
02121                                 perm_iv_list.seq_filename = mlist.seq_filename;
02122                                 perm_iv_list.seq_table = mlist.seq_table;
02123                                 for( int permI = 0; permI < LCB_list.size(); permI++ ){
02124                                         vector< AbstractMatch* > perm_vector;
02125                                         perm_vector.push_back( LCB_list[permI].front() );
02126                                         if( LCB_list[permI].size() > 1 )
02127                                                 perm_vector.push_back( LCB_list[permI].back() );
02128                                         Interval perm_iv(perm_vector.begin(), perm_vector.end());
02129                                         perm_iv_list.push_back(perm_iv);
02130                                 }
02131                                 WritePermutationCoordinates( perm_iv_list, cur_perm_filename.str() );
02132 
02133                                 // get the current min weight
02134                                 vector< int64 >::iterator min_w = std::min_element( weights.begin(), weights.end() );
02135                                 // increment the current weight
02136                                 cur_perm_weight = *min_w + seq_count;
02137                         }
02138                 }while( cur_perm_weight < minimum_weight );
02139                 // only enable recursive anchor search once we achieve
02140                 // the desired weight threshold once -- for speed's sake
02141                 if( recursive && entire_genome ){
02142                         currently_recursing = true;
02143                 }
02144 
02145                 // calculate the new total LCB weight
02146                 prev_total_weight = total_weight;
02147                 total_weight = 0;
02148                 for( weightI = 0; weightI < weights.size(); weightI++ )
02149                         total_weight += weights[ weightI ];
02150                 if( status_out )
02151                         *status_out << "Previous weight: " << prev_total_weight << " new weight: " << total_weight << endl;
02152         // the weight can shrink--this isn't an error condition
02153 //              if( prev_total_weight > total_weight ){
02154 //                      cerr << "Error! Previous weight: " << prev_total_weight << " new weight: " << total_weight << endl;
02155                         // write out the lcb lists
02156 //              }
02157 
02158 //
02159 // Step 7) Repeat 4, 5 and 6 until the total weight stabilizes
02160 //
02161         }while( total_weight != prev_total_weight );
02162 
02163         // delete the search cache
02164         for( size_t mI = 0; mI < search_cache.size(); mI++ )
02165         {
02166                 if( search_cache[mI].first != NULL )
02167                         search_cache[mI].first->Free();
02168                 if( search_cache[mI].second != NULL )
02169                         search_cache[mI].second->Free();
02170         }
02171 }
02172 
02192 void Aligner::align( MatchList& mlist, IntervalList& interval_list, double LCB_minimum_density, double LCB_minimum_range, boolean recursive, boolean extend_lcbs, boolean gapped_alignment, string tree_filename ){
02193         seq_count = mlist.seq_table.size();
02194         this->LCB_minimum_density = LCB_minimum_density;
02195         this->LCB_minimum_range = LCB_minimum_range;
02196         this->recursive = recursive;
02197         this->currently_recursing = false;
02198         this->extend_lcbs = extend_lcbs;
02199         this->gapped_alignment = gapped_alignment;
02200 
02201         // use LCB_minimum_range == -1 to indicate that all genomes are 
02202         // expected to be collinear
02203         this->collinear_genomes = LCB_minimum_range == -1;
02204         if( collinear_genomes )
02205                 cout << "\nAssuming collinear genomes...\n";
02206 
02207         // set the nway_mh mask
02208         uint64 nway_mask = 1;
02209         nway_mask <<= seq_count;
02210         nway_mask--;
02211         nway_mh.SetMask( nway_mask );
02212                 
02213         cout << "Starting with " << mlist.size() << " MUMs\n";
02214         
02215 //
02216 // Step 1) Eliminate overlaps among the multi-MUMs
02217 //      
02218         // Remove linked inclusions
02219         EliminateOverlaps( mlist );
02220         cout << "Eliminating overlaps yields " << mlist.size() << " MUMs\n";
02221 
02222 //
02223 // Step 2) Compute a phylogenetic guide tree using the multi-MUMs
02224 //
02225 
02226         bool guide_tree_loaded = false;
02227         MuscleInterface& mi = MuscleInterface::getMuscleInterface();    
02228 
02229         if( !guide_tree_loaded && (recursive || tree_filename != "") ){
02230                 // Make a phylogenetic tree for ClustalW
02231                 interval_list.seq_table = mlist.seq_table;
02232                 interval_list.seq_filename = mlist.seq_filename;
02233                 // use the identity matrix method and convert to a distance matrix
02234                 NumericMatrix< double > distance;
02235                 DistanceMatrix( mlist, distance );
02236                 if( tree_filename == "" )
02237                         tree_filename = CreateTempFileName("guide_tree");
02238                 mi.CreateTree( distance, tree_filename );
02239         }
02240 
02241 //
02242 // Step 3) Remove subset multi-MUMs
02243 //
02244         // Multiplicity Filter...
02245         mlist.MultiplicityFilter( seq_count );
02246         cout << "Multiplicity filter gives " << mlist.size() << " MUMs\n";
02247 
02248         if( mlist.size() == 0 )
02249                 return;
02250         
02251 //
02252 // Steps 4 through 7 are contained in RecursiveAnchorSearch
02253 //
02254         vector< MatchList > LCB_list;
02255         RecursiveAnchorSearch( mlist, (gnSeqI)LCB_minimum_range, LCB_list, true, &cout );
02256 
02257 
02258 //
02259 // Step 8) Perform gapped alignment on each LCB using the anchors
02260 //
02261         if( gapped_alignment && recursive )
02262                 cout << "\nMaking final gapped alignment...\n";
02263         interval_list.clear();
02264         AlnProgressTracker apt;
02265         apt.cur_leftend = 0;
02266         apt.prev_progress = 0;
02267         apt.total_len = 0;
02268         for( uint lcbI = 0; lcbI < LCB_list.size(); lcbI++ )
02269                 apt.total_len += LCB_list[lcbI].size()-1;
02270         for( uint lcbI = 0; lcbI < LCB_list.size(); lcbI++ ){
02271                 Interval new_iv;
02272                 interval_list.push_back( new_iv );
02273                 Interval& iv = interval_list.back();
02274                 if( !gapped_alignment || !recursive ){
02275                         iv.SetMatches( LCB_list[lcbI] );
02276                 }else{
02277 //                      AlignLCB( LCB_list[ lcbI ], iv );
02278                         AlignLCBInParallel( collinear_genomes || (LCB_list.size()==1), gal, LCB_list[ lcbI ], iv, apt );
02279                 }
02280         }
02281         
02282         // finally add any unaligned regions to the interval list       
02283         if( gapped_alignment )
02284                 addUnalignedIntervals( interval_list );
02285 }
02286 
02287 }       // namespace mems
02288 

Generated on Fri Mar 14 06:01:02 2008 for libMems by doxygen 1.3.6