libMems/ProgressiveAligner.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: progressiveAligner.cpp,v 1.47 2004/04/19 23:10:30 darling Exp $
00003  * BEWARE!!
00004  * This code was created in the likeness of the flying spaghetti monster
00005  *
00006  * dedicated to Loren...
00007  ******************************************************************************/
00008 
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012 
00013 
00014 #include "libMems/ProgressiveAligner.h"
00015 #include "libMems/GreedyBreakpointElimination.h"
00016 #include "libMems/Aligner.h"
00017 #include "libMems/Islands.h"
00018 #include "libMems/DNAFileSML.h"
00019 #include "libMems/MuscleInterface.h"    // it's the default gapped aligner
00020 #include "libMems/gnAlignedSequences.h"
00021 #include "libMems/CompactGappedAlignment.h"
00022 #include "libMems/MatchProjectionAdapter.h"
00023 #include "libMems/PairwiseMatchFinder.h"
00024 #include "libMems/TreeUtilities.h"
00025 #include "libMems/PairwiseMatchAdapter.h"
00026 #include "libMems/DistanceMatrix.h"
00027 
00028 #include <boost/dynamic_bitset.hpp>
00029 #include <boost/tuple/tuple.hpp>
00030 #include <boost/property_map.hpp>
00031 #include <boost/graph/graph_traits.hpp>
00032 #include <boost/graph/adjacency_list.hpp>
00033 #include <boost/graph/johnson_all_pairs_shortest.hpp>
00034 #include <boost/graph/undirected_dfs.hpp>
00035 
00036 #include <map>
00037 #include <fstream>      // for debugging
00038 #include <sstream>
00039 #include <stack>
00040 #include <algorithm>
00041 #include <limits>
00042 #include <iomanip>
00043 
00044 #include "stdlib.h"
00045 
00046 using namespace std;
00047 using namespace genome;
00048 
00049 namespace mems {
00050 
00051 
00052 bool progress_msgs = false;
00053 
00054 bool debug_me = false;
00055 static int dbg_count = 0;        
00056 
00057 
00058 double min_window_size = 200;
00059 double max_window_size = 20000;  // don't feed MUSCLE anything bigger than this
00060 double min_density = .5;
00061 double max_density = .9;
00062 size_t max_gap_length = 3000;
00063 size_t lcb_hangover = 300;
00064 
00065 
00066 void mergeUnalignedIntervals( uint seqI, vector< Interval* >& iv_list, vector< Interval* >& new_list );
00067 
00072 boolean my_validateLCB( MatchList& lcb ){
00073         vector< Match* >::iterator lcb_iter = lcb.begin();
00074         if( lcb.size() == 0 )
00075                 return true;
00076         uint seq_count = (*lcb_iter)->SeqCount();
00077         uint seqI = 0;
00078         boolean complain = false;
00079         for(; seqI < seq_count; seqI++ ){
00080                 lcb_iter = lcb.begin();
00081                 int64 prev_coord = 0;
00082                 for(; lcb_iter != lcb.end(); ++lcb_iter ){
00083                         if( (*lcb_iter)->Start( seqI ) == NO_MATCH )
00084                                 continue;
00085                         else if( prev_coord != 0 && (*lcb_iter)->Start( seqI ) < prev_coord ){
00086                                 complain = true;
00087                         }
00088                         prev_coord = (*lcb_iter)->Start( seqI );
00089                 }
00090         }
00091         return !complain;
00092 }
00093 
00094 template< class BoostMatType >
00095 void print2d_matrix( BoostMatType& mat, std::ostream& os )
00096 {
00097         for( size_t i = 0; i < mat.shape()[0]; ++i )
00098         {
00099                 for( size_t j = 0; j < mat.shape()[1]; ++j )
00100                 {
00101                         if( j > 0 )
00102                                 os << "\t";
00103                         os << mat[i][j];
00104                 }
00105                 os << endl;
00106         }
00107 }
00108 
00109 double getDefaultBreakpointPenalty( std::vector< gnSequence* >& sequences )
00110 {
00111         uint default_mer_size = MatchList::GetDefaultMerSize( sequences );
00112         double avg_seq_len = 0;
00113         for( size_t seqI = 0; seqI < sequences.size(); ++seqI )
00114                 avg_seq_len += (double)sequences[seqI]->length();
00115         avg_seq_len /= (double)sequences.size();
00116         avg_seq_len = log( avg_seq_len ) / log( 2.0 );
00117         return avg_seq_len * 7000;        // seems to work reasonably well?
00118 }
00119 
00120 
00121 double getDefaultBpDistEstimateMinScore( std::vector< gnSequence* >& sequences )
00122 {
00123         return 2.0 * getDefaultBreakpointPenalty(sequences);
00124 }
00125 
00126 
00127 
00128 /*
00129  * A progressive alignment algorithm for genomes with rearrangements.
00130  * Start simple, add complexity later.
00131  * TODO: rewrite the algorithm outline
00132  */
00133 
00134 ProgressiveAligner::ProgressiveAligner( uint seq_count ) :
00135 Aligner( seq_count ),
00136 breakpoint_penalty( -1 ),
00137 min_breakpoint_penalty( 2000 ),
00138 debug(false),
00139 refine(true),
00140 scoring_scheme(ExtantSumOfPairsScoring),
00141 use_weight_scaling(true),
00142 conservation_dist_scale(1),
00143 bp_dist_scale(.9),
00144 max_gapped_alignment_length(20000),
00145 bp_dist_estimate_score(-1),
00146 use_seed_families(false),
00147 using_cache_db(true)
00148 {
00149         gapped_alignment = true;
00150         max_window_size = max_gapped_alignment_length;
00151 }
00152 
00153 void ProgressiveAligner::SetMaxGappedAlignmentLength( size_t len )
00154 { 
00155         max_gapped_alignment_length = len; 
00156         max_window_size = max_gapped_alignment_length;
00157 }
00158 
00160 void ProgressiveAligner::getAlignedChildren( node_id_t node, vector< node_id_t >& descendants )
00161 {
00162         // do a depth first search along edges that have been aligned
00163         stack< node_id_t > node_stack;
00164         node_stack.push( node );
00165         vector< bool > visited( alignment_tree.size(), false );
00166         descendants.clear();
00167         while( node_stack.size() > 0 )
00168         {
00169                 node_id_t cur_node = node_stack.top();
00170                 if(progress_msgs) cout << "Evaluating aligned nodes linked to node " << cur_node << endl;
00171                 node_stack.pop();
00172                 visited[cur_node] = true;
00173                 for( uint childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
00174                 {
00175                         node_id_t child_id = alignment_tree[cur_node].children[childI];
00176                         if( alignment_tree[cur_node].children_aligned[childI] && !visited[child_id])
00177                                 node_stack.push( child_id );
00178                 }
00179                 if( alignment_tree[ cur_node ].sequence != NULL )
00180                         descendants.push_back( cur_node );
00181         }
00182 }
00183 
00184 
00186 void ProgressiveAligner::getPath( node_id_t first_n, node_id_t last_n, vector< node_id_t >& path )
00187 {
00188         // do a depth first search along edges that have been aligned
00189         stack< node_id_t > node_stack;
00190         node_stack.push( last_n );
00191         vector< bool > visited( alignment_tree.size(), false );
00192         while( node_stack.top() != first_n )
00193         {
00194                 node_id_t cur_node = node_stack.top();
00195                 size_t pre_size = node_stack.size();
00196                 visited[cur_node] = true;
00197                 for( uint childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
00198                 {
00199                         node_id_t child_id = alignment_tree[cur_node].children[childI];
00200                         if(!visited[child_id])
00201                         {
00202                                 node_stack.push( child_id );
00203                                 break;
00204                         }
00205                 }
00206                 if( pre_size != node_stack.size() )
00207                         continue;
00208                 for( uint parentI = 0; parentI < alignment_tree[cur_node].parents.size(); parentI++ )
00209                 {
00210                         node_id_t parent_id = alignment_tree[cur_node].parents[parentI];
00211                         if(!visited[parent_id])
00212                         {
00213                                 node_stack.push( parent_id );
00214                                 break;
00215                         }
00216                 }
00217                 if( pre_size != node_stack.size() )
00218                         continue;
00219                 node_stack.pop();       // didn't make any progress
00220         }
00221         path = vector< node_id_t >( node_stack.size() );
00222         for( size_t pI = 0; pI < path.size(); pI++ )
00223         {
00224                 path[pI] = node_stack.top();
00225                 node_stack.pop();
00226         }
00227 }
00228 
00229 
00230 
00231 
00232 
00233 
00234 template<class MatchType>
00235 void ProgressiveAligner::propagateDescendantBreakpoints( node_id_t node1, uint seqI, std::vector<MatchType*>& iv_list )
00236 {
00237         SSC<MatchType> ilc(seqI);
00238         sort( iv_list.begin(), iv_list.end(), ilc );
00239         vector< SuperInterval >& ord = alignment_tree[ node1 ].ordering;
00240         vector<gnSeqI> bp_list;
00241         for( size_t sI = 0; sI < ord.size(); sI++ )
00242                 bp_list.push_back( ord[sI].LeftEnd() );
00243 
00244         GenericMatchSeqManipulator<MatchType> ism( seqI );
00245         applyBreakpoints( bp_list, iv_list, ism );
00246 }
00247 
00248 // T should be a pointer type
00249 template<class T, class Manipulator>
00250 void applyAncestralBreakpoints( const vector< SuperInterval >& siv_list, vector<T>& ord, uint seqI, Manipulator& m )
00251 {
00252         // make bp list
00253         vector<gnSeqI> bp_list(siv_list.size()*2, 0);
00254         size_t cur = 0;
00255         for( size_t i = 0; i < siv_list.size(); i++ )
00256         {
00257                 if( siv_list[i].reference_iv.Start(seqI) == NO_MATCH )
00258                         continue;
00259                 bp_list[cur++] = siv_list[i].reference_iv.LeftEnd(seqI);
00260                 bp_list[cur++] = siv_list[i].reference_iv.LeftEnd(seqI) + siv_list[i].reference_iv.Length(seqI);
00261         }
00262         bp_list.resize(cur);
00263         // sort the breakpoints and apply...
00264         sort( bp_list.begin(), bp_list.end() );
00265         applyBreakpoints( bp_list, ord, m );
00266 }
00267 
00268 
00269 // assuming breakpoints have been propagated in both directions
00270 // there should now be a 1-to-1 correspondence between superintervals
00271 // in the ancestor and descendants.
00272 void ProgressiveAligner::linkSuperIntervals( node_id_t node1, uint seqI, node_id_t ancestor )
00273 {
00274         // TODO: speed this up by implementing O(N) instead of O(N^2)
00275         vector<SuperInterval>& a_ord = alignment_tree[ancestor].ordering;
00276         vector<SuperInterval>& c_ord = alignment_tree[node1].ordering;
00277         // initialize all linkages to nothing
00278         for( size_t aI = 0; aI < a_ord.size(); aI++ )
00279                 if( seqI == 0 )
00280                         a_ord[aI].c1_siv = (std::numeric_limits<size_t>::max)();
00281                 else
00282                         a_ord[aI].c2_siv = (std::numeric_limits<size_t>::max)();
00283         for( size_t cI = 0; cI < c_ord.size(); cI++ )
00284                 c_ord[cI].parent_siv = (std::numeric_limits<size_t>::max)();
00285 
00286         for( size_t aI = 0; aI < a_ord.size(); aI++ )
00287         {
00288                 if( a_ord[aI].reference_iv.LeftEnd(seqI) == NO_MATCH )
00289                         continue;
00290                 size_t cI = 0;
00291                 for( ; cI < c_ord.size(); cI++ )
00292                 {
00293                         if( absolut(a_ord[aI].reference_iv.Start(seqI)) != c_ord[cI].LeftEnd() )
00294                                 continue;
00295                         if( a_ord[aI].reference_iv.Length(seqI) != c_ord[cI].Length() )
00296                         {
00297                                 breakHere();
00298                                 cerr << "mapping length mismatch\n";
00299                                 cerr << "ancestor: " << ancestor << "\t node1: " << node1 << endl;
00300                                 cerr << "a_ord[" << aI << "].reference_iv.Length(" << seqI << "): " << a_ord[aI].reference_iv.Length(seqI) << endl;
00301                                 cerr << "a_ord[" << aI << "].reference_iv.LeftEnd(" << seqI << "): " << a_ord[aI].reference_iv.LeftEnd(seqI) << endl;
00302                                 cerr << "c_ord[" << cI << "].Length(): " << c_ord[cI].Length() << endl;
00303                                 cerr << "c_ord[" << cI << "].LeftEnd(): " << c_ord[cI].LeftEnd() << endl;
00304                                 cerr << "";
00305                                 cerr << "";
00306                         }
00307                         // link these
00308                         if( seqI == 0 )
00309                                 a_ord[aI].c1_siv = cI;
00310                         else
00311                                 a_ord[aI].c2_siv = cI;
00312                         c_ord[cI].parent_siv = aI;
00313                         break;
00314                 }
00315                 if( cI == c_ord.size() )
00316                 {
00317                         breakHere();
00318                         cerr << "error no mapping\n";
00319                 }
00320         }
00321 }
00322 
00323 
00324 void ProgressiveAligner::translateGappedCoordinates( vector<AbstractMatch*>& ml, uint seqI, node_id_t extant, node_id_t ancestor )
00325 {
00326         // determine the path that must be traversed
00327         vector< node_id_t > trans_path;
00328         getPath( extant, ancestor, trans_path );
00329 
00330         // set seqI to forward orientation 
00331         for( size_t mI = 0; mI < ml.size(); mI++ )
00332                 if( ml[mI]->Orientation(seqI) == AbstractMatch::reverse )
00333                         ml[mI]->Invert();
00334 
00335         // for each node on the path, construct a complete coordinate translation
00336         for( size_t nI = 1; nI < trans_path.size(); nI++ )
00337         {
00338                 // first sort matches on start pos and make them all forward oriented
00339                 // then split them on superinterval boundaries and assign each to a superinterval
00340                 // then convert each match's coordinates to be superinterval-local
00341                 // then apply the coordinate translation with transposeCoordinates
00342                 // then shift each match's coordinates to the global ancestral coordinate space
00343                 SSC<AbstractMatch> ssc(seqI);
00344                 sort(ml.begin(), ml.end(), ssc);
00345 
00346                 // split on superinterval boundaries
00347                 vector< SuperInterval >& siv_list = alignment_tree[trans_path[nI]].ordering;
00348                 vector< vector< AbstractMatch* > > siv_matches = vector< vector< AbstractMatch* > >(siv_list.size());
00349                 size_t cur_child = 0;
00350                 if( alignment_tree[trans_path[nI]].children[0] == trans_path[nI-1] )
00351                         cur_child = 0;
00352                 else if( alignment_tree[trans_path[nI]].children[1] == trans_path[nI-1] )
00353                         cur_child = 1;
00354                 else 
00355                 {
00356                         breakHere();
00357                         cerr << "forest fire\n";
00358                 }
00359 
00360                 AbstractMatchSeqManipulator amsm( seqI );
00361                 applyAncestralBreakpoints(siv_list, ml, cur_child, amsm );
00362                 
00363                 // sort matches again because new ones were added at the end
00364                 sort(ml.begin(), ml.end(), ssc);
00365 
00366                 // assign each match to a siv, and convert coords to siv-local
00367                 for( size_t mI = 0; mI < ml.size(); mI++ )
00368                 {
00369                         if( ml[mI]->LeftEnd(seqI) == 0 )
00370                         {
00371                                 breakHere();
00372                                 cerr << "fefefe";
00373                         }
00374                         size_t sivI = 0;
00375                         for( ; sivI < siv_list.size(); sivI++ )
00376                         {
00377                                 if( siv_list[sivI].reference_iv.LeftEnd(cur_child) == NO_MATCH )
00378                                         continue;
00379                                 if( ml[mI]->LeftEnd(seqI) >= siv_list[sivI].reference_iv.LeftEnd(cur_child) &&
00380                                         ml[mI]->LeftEnd(seqI) < siv_list[sivI].reference_iv.LeftEnd(cur_child) + siv_list[sivI].reference_iv.Length(cur_child) )
00381                                         break;
00382                         }
00383                         if( sivI == siv_list.size() )
00384                         {
00385                                 cerr << "nI is: "<< nI << endl;
00386                                 cerr << "trans_path: ";
00387                                 for( size_t ttI = 0; ttI < trans_path.size(); ttI++ )
00388                                         cerr << "  " << trans_path[ttI];
00389                                 cerr << endl;
00390                                 cerr << "problem seq: " << seqI << std::endl;
00391                                 cerr << "ml[" << mI << "]->Start(0) == " << ml[mI]->Start(0) << endl;
00392                                 cerr << "ml[" << mI << "]->Length(0) == " << ml[mI]->Length(1) << endl;
00393                                 cerr << "ml[" << mI << "]->Start(1) == " << ml[mI]->Start(0) << endl;
00394                                 cerr << "ml[" << mI << "]->Length(1) == " << ml[mI]->Length(1) << endl;
00395                                 cerr << "ml.size(): " << ml.size() << endl;
00396                                 for( sivI = 0; sivI < siv_list.size(); sivI++ )
00397                                 {
00398                                         cerr << "siv_list[" << sivI << "] left end 0: " << siv_list[sivI].reference_iv.LeftEnd(0)  << endl;
00399                                         if( siv_list[sivI].reference_iv.LeftEnd(0) != 0 )
00400                                                 cerr << "siv_list[" << sivI << "] right end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) + siv_list[sivI].reference_iv.Length(0) << endl;
00401                                         cerr << "siv_list[" << sivI << "] left end 1: " << siv_list[sivI].reference_iv.LeftEnd(1)  << endl;
00402                                         if( siv_list[sivI].reference_iv.LeftEnd(1) != 0 )
00403                                                 cerr << "siv_list[" << sivI << "] right end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) + siv_list[sivI].reference_iv.Length(1) << endl;
00404                                 }
00405                                 breakHere();
00406                         }
00407                         if( ml[mI]->LeftEnd(seqI) + ml[mI]->Length(seqI) > 
00408                                 siv_list[sivI].reference_iv.LeftEnd(cur_child) + siv_list[sivI].reference_iv.Length(cur_child) )
00409                         {
00410                                 cerr << "doesn't fit\n";
00411                                 cerr << "ml[" << mI << "]->LeftEnd(" << seqI << "): " << ml[mI]->LeftEnd(seqI) << endl;
00412                                 cerr << "ml[" << mI << "]->RightEnd(" << seqI << "): " << ml[mI]->RightEnd(seqI) << endl;
00413                                 cerr << "siv_list[" << sivI << "] left end 0: " << siv_list[sivI].reference_iv.LeftEnd(0)  << endl;
00414                                 if( siv_list[sivI].reference_iv.LeftEnd(0) != 0 )
00415                                         cerr << "siv_list[" << sivI << "] right end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) + siv_list[sivI].reference_iv.Length(0) << endl;
00416                                 cerr << "siv_list[" << sivI << "] left end 1: " << siv_list[sivI].reference_iv.LeftEnd(1)  << endl;
00417                                         if( siv_list[sivI].reference_iv.LeftEnd(1) != 0 )
00418                                                 cerr << "siv_list[" << sivI << "] right end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) + siv_list[sivI].reference_iv.Length(1) << endl;
00419                                 cerr << "ml.size(): " << ml.size() << endl;
00420                                 cerr << "siv_list.size(): " << siv_list.size() << endl;
00421                                 cerr << "trans_path:";
00422                                 for( size_t tI = 0; tI < trans_path.size(); tI++ )
00423                                         cerr << " " << trans_path[tI];
00424                                 cerr << endl;
00425                                 cerr << "trans_path[" << nI << "]: " << trans_path[nI] << endl;
00426                                 breakHere();
00427                         }
00428 
00429                         ml[mI]->SetLeftEnd( seqI, ml[mI]->LeftEnd(seqI) - siv_list[sivI].reference_iv.LeftEnd(cur_child) + 1 );
00430                         // if this interval matches the reverse strand then we should effectively invert all matches
00431                         if( siv_list[sivI].reference_iv.Start(cur_child) < 0 )
00432                         {
00433                                 int64 new_lend = siv_list[sivI].reference_iv.Length(cur_child) - ml[mI]->LeftEnd(seqI);
00434                                 new_lend -= ml[mI]->Length( seqI ) - 2;
00435                                 new_lend *= ml[mI]->Orientation(seqI) == AbstractMatch::forward ? 1 : -1;
00436                                 ml[mI]->Invert();
00437                                 ml[mI]->SetStart( seqI, new_lend ); 
00438                         }
00439                         siv_matches[sivI].push_back( ml[mI] );
00440                 }
00441 
00442                 // apply the coordinate translation
00443                 ml.clear();
00444                 for( size_t sivI = 0; sivI < siv_matches.size(); sivI++ )
00445                 {
00446                         if( siv_matches[sivI].size() == 0 )
00447                                 continue;
00448                         
00449                         // get a CompactGappedAlignment<> for this interval
00450                         CompactGappedAlignment<>* siv_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_list[sivI].reference_iv.GetMatches()[0]);
00451                         if( siv_list[sivI].reference_iv.GetMatches().size() > 1 )
00452                                 siv_cga = NULL;
00453                         bool alloc_new_siv = false;
00454                         CompactGappedAlignment<> tmp_cga;
00455                         if( siv_cga == NULL )
00456                         {
00457                                 alloc_new_siv = true;
00458                                 siv_cga = tmp_cga.Copy();
00459                                 CompactGappedAlignment<> dorkas(siv_list[sivI].reference_iv);
00460                                 *siv_cga = dorkas;
00461                         }
00462 
00463                         // now translate each match...
00464                         for( size_t mI = 0; mI < siv_matches[sivI].size(); mI++ )
00465                         {
00466                                 CompactGappedAlignment<>* match_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_matches[sivI][mI]);
00467                                 bool alloc_new = false;
00468                                 if( match_cga == NULL )
00469                                 {
00470                                         match_cga = tmp_cga.Copy();
00471                                         *match_cga = CompactGappedAlignment<>(*(siv_matches[sivI][mI]));
00472                                         alloc_new = true;
00473                                 }
00474                                 siv_cga->translate( *match_cga, seqI, cur_child );
00475 
00476                                 if( alloc_new )
00477                                 {
00478                                         siv_matches[sivI][mI]->Free();
00479                                         siv_matches[sivI][mI] = match_cga;
00480                                 }
00481                         }
00482 
00483                         // shift coordinates back to global space
00484                         for( size_t mI = 0; mI < siv_matches[sivI].size(); mI++ )
00485                         {
00486                                 int64 cur_start = siv_matches[sivI][mI]->Start(seqI);
00487                                 if( cur_start > 0 )
00488                                         siv_matches[sivI][mI]->SetStart( seqI, cur_start + siv_list[sivI].LeftEnd() - 1 );
00489                                 else
00490                                         siv_matches[sivI][mI]->SetStart( seqI, cur_start - siv_list[sivI].LeftEnd() + 1);
00491                                 if( (siv_matches[sivI][mI]->LeftEnd(seqI) + siv_matches[sivI][mI]->Length(seqI) > siv_list.back().LeftEnd() + siv_list.back().Length() )
00492                                          )
00493                                 {
00494                                         // is there something wrong with the translation table?
00495                                         cerr << "siv left is: " << siv_list[sivI].LeftEnd() << endl;
00496                                         cerr << "siv right is: " << siv_list[sivI].LeftEnd() + siv_list[sivI].Length() << endl;
00497                                         cerr << "match right is: " << siv_matches[sivI][mI]->LeftEnd(seqI) + siv_matches[sivI][mI]->Length(seqI) << endl;
00498                                         cerr << "superseq right is: " << siv_list.back().LeftEnd() + siv_list.back().Length() << endl;
00499                                         cerr << "";
00500                                         breakHere();
00501                                 }
00502                                 if( debug_aligner && siv_matches[sivI][mI]->Start(seqI) == 0 )
00503                                 {
00504                                         breakHere();
00505                                 }
00506                         }
00507                         if(alloc_new_siv)
00508                                 siv_cga->Free();
00509                         ml.insert( ml.end(), siv_matches[sivI].begin(), siv_matches[sivI].end() );
00510                 }
00511         }
00512         // restore forward orientation seqI
00513         for( size_t mI = 0; mI < ml.size(); mI++ )
00514                 if( ml[mI]->Orientation(seqI) == AbstractMatch::reverse )
00515                         ml[mI]->Invert();
00516 }
00517 
00518 class SuperIntervalPtrComp
00519 {
00520 public:
00521         bool operator()( const SuperInterval* a, const SuperInterval* b )
00522         {
00523                 return (*a) < (*b);
00524         }
00525 };
00526 
00527 void ProgressiveAligner::recursiveApplyAncestralBreakpoints( node_id_t ancestor )
00528 {
00529         stack<node_id_t> node_stack;
00530         node_stack.push(ancestor);
00531         while( node_stack.size() > 0 )
00532         {
00533                 // pop the current node, apply ancestral breakpoints, recurse on children
00534                 node_id_t cur_node = node_stack.top();
00535                 node_stack.pop();
00536                 SuperIntervalManipulator sim;
00537                 if( progress_msgs ) cout << "cur node: " << cur_node << endl;
00538                 for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
00539                 {
00540                         AlignmentTreeNode& atn = alignment_tree[alignment_tree[cur_node].children[childI]];
00541                         if( progress_msgs ) cout << "childI " << childI << " aab\n";
00542                         applyAncestralBreakpoints( alignment_tree[cur_node].ordering, atn.ordering, childI, sim );
00543                         if( progress_msgs ) cout << "sort childI " << childI << "\n";
00544                         vector<SuperInterval*> siv_ptr_list(atn.ordering.size());
00545                         for( size_t sivI = 0; sivI < atn.ordering.size(); ++sivI )
00546                                 siv_ptr_list[sivI] = &(atn.ordering[sivI]);
00547                         SuperIntervalPtrComp sipc;
00548                         sort( siv_ptr_list.begin(), siv_ptr_list.end(), sipc );
00549                         vector< SuperInterval > siv_list;
00550                         for( size_t sivI = 0; sivI < siv_ptr_list.size(); ++sivI )
00551                                 siv_list.push_back(*siv_ptr_list[sivI]);
00552                         swap(siv_list, atn.ordering);
00553                         node_stack.push( alignment_tree[cur_node].children[childI] );
00554                 }
00555                 if( debug_aligner && alignment_tree[cur_node].children.size() > 0 )
00556                         validateSuperIntervals(alignment_tree[cur_node].children[0], alignment_tree[cur_node].children[1], cur_node);
00557                 if( progress_msgs ) cout << "linking node " << cur_node << "'s" << alignment_tree[cur_node].ordering.size() << " superintervals\n"; 
00558                 for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
00559                         linkSuperIntervals( alignment_tree[cur_node].children[childI], childI, cur_node );
00560         }
00561 }
00562 
00563 
00564 boolean getInterveningCoordinates( const AbstractMatch* iv, uint oseqI, Match* r_begin, Match* r_end, uint seqI, int64& gap_lend, int64& gap_rend ){
00565         // skip this sequence if it's undefined
00566         if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH) ||
00567                 (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
00568                 gap_lend = 0;
00569                 gap_rend = 0;
00570                 return true;
00571         }
00572                         
00573         // determine the size of the gap
00574         gap_rend = r_end != NULL ? r_end->Start( seqI ) : iv->RightEnd( oseqI ) + 1;
00575         gap_lend = r_begin != NULL ? r_begin->End( seqI ) + 1 : iv->LeftEnd( oseqI );
00576         if( gap_rend < 0 || gap_lend < 0 ){
00577                 gap_rend = r_begin != NULL ? -r_begin->Start( seqI ) : iv->RightEnd( oseqI ) + 1;
00578                 gap_lend = r_end != NULL ? -r_end->Start( seqI ) + r_end->Length() : 1;
00579         }
00580         if( gap_rend <= 0 || gap_lend <= 0 ){
00581                 // if either is still < 0 then there's a problem...
00582                 genome::ErrorMsg( "Error constructing intervening coordinates" );
00583         }
00584         return true;
00585 }
00586 
00587 
00588 void ProgressiveAligner::pairwiseAnchorSearch( MatchList& r_list, Match* r_begin, Match* r_end, const AbstractMatch* iv, uint oseqI, uint oseqJ )
00589 {
00590         uint seqI = 0;
00591         MatchList gap_list;
00592         vector< int64 > starts;
00593 // 
00594 //      Get the sequence in the intervening gaps between these two matches
00595 //
00596         for( seqI = 0; seqI < 2; seqI++ )
00597         {
00598                 int64 gap_end = 0;
00599                 int64 gap_start = 0;
00600                 getInterveningCoordinates( iv, (seqI == 0 ? oseqI : oseqJ), r_begin, r_end, seqI, gap_start, gap_end);
00601                 int64 diff = gap_end - gap_start;
00602                 diff = diff > 0 ? diff - 1 : 0;
00603 
00604                 starts.push_back( gap_start );
00605                 gnSequence* new_seq = NULL;
00606                 if(diff > 0 && gap_start + diff - 1 <= r_list.seq_table[ seqI ]->length())
00607                         new_seq = new gnSequence( r_list.seq_table[ seqI ]->ToString( diff, gap_start ) );
00608                 else
00609                         new_seq = new gnSequence();
00610                 gap_list.seq_table.push_back( new_seq );
00611                 gap_list.sml_table.push_back( new DNAMemorySML() );
00612         }
00613 
00614         gnSeqI avg_len = (gap_list.seq_table[0]->length() + gap_list.seq_table[1]->length())/2;
00615         uint search_seed_size = getDefaultSeedWeight( avg_len );
00616         gap_mh.get().Clear();
00617         
00618         uint seed_count = use_seed_families ? 3 : 1;
00619         for( size_t seedI = 0; seedI < seed_count; seedI++ )
00620         {
00621                 //
00622                 //      Create sorted mer lists for the intervening gap region
00623                 //
00624                 uint64 default_seed = getSeed( search_seed_size, seedI );
00625                 if( search_seed_size < MIN_DNA_SEED_WEIGHT )
00626                 {
00627                         for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ )
00628                                 delete gap_list.seq_table[ seqI ];
00629                         for( uint seqI = 0; seqI < gap_list.sml_table.size(); seqI++ )
00630                                 delete gap_list.sml_table[ seqI ];
00631                         return;
00632                 }
00633                 for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ ){
00634                         gap_list.sml_table[ seqI ]->Clear();
00635                         gap_list.sml_table[ seqI ]->Create( *(gap_list.seq_table[ seqI ]), default_seed );
00636                 }
00637 
00638                 //
00639                 //      Find all matches in the gap region
00640                 //
00641                 gap_mh.get().ClearSequences();
00642                 if(seed_count>1)
00643                 {
00644                         MatchList cur_list = gap_list;
00645                         gap_mh.get().FindMatches( cur_list );
00646                         for( size_t mI = 0; mI < cur_list.size(); mI++ )
00647                                 cur_list[mI]->Free();
00648                 }else
00649                         gap_mh.get().FindMatches( gap_list );
00650         }
00651         if(seed_count>1)
00652                 gap_mh.get().GetMatchList(gap_list);
00653 
00654         EliminateOverlaps_v2( gap_list );
00655 
00656         // for anchor accuracy, throw out any anchors that are shorter than the minimum
00657         // anchor length after EliminateOverlaps()
00658         gap_list.LengthFilter( MIN_ANCHOR_LENGTH + 3 );
00659 
00660         for( size_t gI = 0; gI < gap_list.size(); gI++ )
00661         {
00662                 for( seqI = 0; seqI < 2; seqI++ )
00663                 {
00664                         int64 gap_rend = 0;
00665                         int64 gap_lend = 0;
00666                         getInterveningCoordinates( iv, (seqI == 0 ? oseqI : oseqJ), r_begin, r_end, seqI, gap_lend, gap_rend);
00667                         gap_list[gI]->SetLeftEnd(seqI, gap_list[gI]->LeftEnd(seqI) + gap_lend - 1);
00668                 }
00669         }
00670         r_list.insert(r_list.end(), gap_list.begin(), gap_list.end());
00671 
00672         // delete sequences and smls
00673         for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ )
00674                 delete gap_list.seq_table[ seqI ];
00675         for( uint seqI = 0; seqI < gap_list.sml_table.size(); seqI++ )
00676                 delete gap_list.sml_table[ seqI ];
00677 }
00678 
00679 template<class GappedAlignmentType>
00680 void ProgressiveAligner::recurseOnPairs( const vector<node_id_t>& node1_seqs, const vector<node_id_t>& node2_seqs, const GappedAlignmentType& iv, Matrix<MatchList>& matches, Matrix< std::vector< search_cache_t > >& search_cache_db, Matrix< std::vector< search_cache_t > >& new_cache_db, boost::multi_array< vector< vector< int64 > >, 2 >& iv_regions )
00681 {
00682         matches = Matrix<MatchList>(node1_seqs.size(),node2_seqs.size());
00683 
00684         std::vector< bitset_t > aln_matrix;
00685         iv.GetAlignment(aln_matrix);
00686         Match tmp(2);
00687         const size_t sizer = node1_seqs.size() * node2_seqs.size();
00688         std::vector< std::pair<size_t,size_t> > node_pairs(sizer);
00689         int nni = 0;
00690         for( size_t n1 = 0; n1 < node1_seqs.size(); n1++ )
00691                 for( size_t n2 = 0; n2 < node2_seqs.size(); n2++ )
00692                         node_pairs[nni++] = make_pair(n1,n2);
00693 
00694 #pragma omp parallel for
00695         for(int ni = 0; ni < node_pairs.size(); ni++)
00696         {
00697                 size_t n1 = node_pairs[ni].first;
00698                 size_t n2 = node_pairs[ni].second;
00699                 vector<node_id_t>::const_iterator n1_iter = node1_seqs.begin() + n1;
00700                 vector<node_id_t>::const_iterator n2_iter = node2_seqs.begin() + n2;
00701                 
00702                 uint seqI = node_sequence_map[*n1_iter];
00703                 uint seqJ = node_sequence_map[*n2_iter];
00704                 MatchList& mlist = matches(n1, n2);
00705                 std::vector< search_cache_t >& cache = search_cache_db(n1, n2);
00706                 std::vector< search_cache_t >& new_cache = new_cache_db(n1, n2);
00707                 mlist.seq_table.push_back( alignment_tree[*n1_iter].sequence );
00708                 mlist.seq_table.push_back( alignment_tree[*n2_iter].sequence );
00709 
00710                 gnSeqI charI = 0;
00711                 gnSeqI charJ = 0;
00712                 const size_t iv_aln_length = iv.AlignmentLength();
00713 
00714 // first determine the outer aligned boundaries of the LCB and record them for
00715 // later use
00716                 pair< int64, int64 > pair_1l(0,0);
00717                 pair< int64, int64 > pair_1r(0,0);
00718                 pair< int64, int64 > pair_2l(0,0);
00719                 pair< int64, int64 > pair_2r(0,0);
00720                 for( uint colI = 0; colI <= iv_aln_length; colI++ )
00721                 {
00722                         if( (aln_matrix[seqI].test(colI) && aln_matrix[seqJ].test(colI)) )
00723                         {
00724                                 if( colI == 0 )
00725                                         break;  // nothing to see here, move along...
00726                                 if( iv.Orientation(seqI) == AbstractMatch::forward )
00727                                         pair_1l = make_pair( iv.LeftEnd(seqI), iv.LeftEnd(seqI)+charI );
00728                                 else
00729                                         pair_1r = make_pair( iv.RightEnd(seqI)-charI, iv.RightEnd(seqI) );
00730                                 if( iv.Orientation(seqJ) == AbstractMatch::forward )
00731                                         pair_2l = make_pair( iv.LeftEnd(seqJ), iv.LeftEnd(seqJ)+charJ );
00732                                 else
00733                                         pair_2r = make_pair( iv.RightEnd(seqJ)-charJ, iv.RightEnd(seqJ) );
00734                                 break;
00735                         }
00736                         if( aln_matrix[seqI].test(colI) )
00737                                 ++charI;
00738                         if( aln_matrix[seqJ].test(colI) )
00739                                 ++charJ;
00740                 }
00741 
00742                 charI = 0;
00743                 charJ = 0;
00744                 for( uint colI = iv_aln_length; colI > 0 ; colI-- )
00745                 {
00746                         if( (aln_matrix[seqI].test(colI-1) && aln_matrix[seqJ].test(colI-1)) )
00747                         {
00748                                 if( colI == iv_aln_length )
00749                                         break;  // nothing to see here, move along...
00750                                 if( iv.Orientation(seqI) == AbstractMatch::forward )
00751                                         pair_1r = make_pair( iv.RightEnd(seqI)-charI, iv.RightEnd(seqI) );
00752                                 else
00753                                         pair_1l = make_pair( iv.LeftEnd(seqI), iv.LeftEnd(seqI)+charI );
00754                                 if( iv.Orientation(seqJ) == AbstractMatch::forward )
00755                                         pair_2r = make_pair( iv.RightEnd(seqJ)-charJ, iv.RightEnd(seqJ) );
00756                                 else
00757                                         pair_2l = make_pair( iv.LeftEnd(seqJ), iv.LeftEnd(seqJ)+charJ );
00758                                 break;
00759                         }
00760                         if( aln_matrix[seqI].test(colI-1) )
00761                                 ++charI;
00762                         if( aln_matrix[seqJ].test(colI-1) )
00763                                 ++charJ;
00764                 }
00765                 if( pair_1l.first < pair_1l.second )
00766                 {
00767                         iv_regions[n1][n2][0].push_back(pair_1l.first);
00768                         iv_regions[n1][n2][0].push_back(pair_1l.second);
00769                 }
00770                 if( pair_1r.first < pair_1r.second )
00771                 {
00772                         iv_regions[n1][n2][0].push_back(pair_1r.first);
00773                         iv_regions[n1][n2][0].push_back(pair_1r.second);
00774                         if( pair_1r.first <= pair_1l.second && pair_1r.second >= pair_1l.first )
00775                         {
00776                                 cerr << "Ohno.  Overlap in outside LCB search intervals\n";
00777                                 cerr << "Left: " << pair_1l.first << '\t' << pair_1l.second << " right:  " << pair_1r.first << '\t' << pair_1r.second << endl;
00778                                 genome::breakHere();
00779                         }
00780                 }
00781 
00782                 if( pair_2l.first < pair_2l.second )
00783                 {
00784                         iv_regions[n1][n2][1].push_back(pair_2l.first);
00785                         iv_regions[n1][n2][1].push_back(pair_2l.second);
00786                 }
00787                 if( pair_2r.first < pair_2r.second )
00788                 {
00789                         iv_regions[n1][n2][1].push_back(pair_2r.first);
00790                         iv_regions[n1][n2][1].push_back(pair_2r.second);
00791                         if( pair_2r.first <= pair_2l.second && pair_2r.second >= pair_2l.first )
00792                         {
00793                                 cerr << "Ohno.  Overlap in outside LCB search intervals\n";
00794                                 cerr << "Left: " << pair_2l.first << '\t' << pair_2l.second << " right:  " << pair_2r.first << '\t' << pair_2r.second << endl;
00795                                 genome::breakHere();
00796                         }
00797                 }
00798 
00799                 charI = 0;
00800                 charJ = 0;
00801                 gnSeqI prev_charI = 0;
00802                 gnSeqI prev_charJ = 0;
00803                 bool in_gap = false;
00804 
00805                 for( uint colI = 0; colI <= iv_aln_length; colI++ )
00806                 {
00807                         if( colI == iv_aln_length || 
00808                                 (aln_matrix[seqI].test(colI) && aln_matrix[seqJ].test(colI)) )
00809                         {
00810                                 if( in_gap && 
00811                                         charI - prev_charI > min_recursive_gap_length &&
00812                                         charJ - prev_charJ > min_recursive_gap_length )
00813                                 {
00814 
00815                                         Match* l_match = NULL;
00816                                         l_match = tmp.Copy();
00817                                         if(iv.Orientation(seqI) == AbstractMatch::forward)
00818                                                 l_match->SetLeftEnd(0, iv.LeftEnd(seqI)+prev_charI);
00819                                         else
00820                                         {
00821                                                 l_match->SetLeftEnd(0, iv.RightEnd(seqI)-prev_charI);
00822                                                 l_match->SetOrientation(0, AbstractMatch::reverse );
00823                                         }
00824                                         if(iv.Orientation(seqJ) == AbstractMatch::forward)
00825                                                 l_match->SetLeftEnd(1, iv.LeftEnd(seqJ)+prev_charJ);
00826                                         else
00827                                         {
00828                                                 l_match->SetLeftEnd(1, iv.RightEnd(seqJ)-prev_charJ);
00829                                                 l_match->SetOrientation(1, AbstractMatch::reverse );
00830                                         }
00831                                         l_match->SetLength(0);
00832                                         Match* r_match = NULL;
00833                                         if( charJ != iv.RightEnd(seqJ) && charI != iv.RightEnd(seqI) )
00834                                         {
00835                                                 r_match = tmp.Copy();
00836                                                 if(iv.Orientation(seqI) == AbstractMatch::forward)
00837                                                         r_match->SetLeftEnd(0, iv.LeftEnd(seqI)+charI);
00838                                                 else
00839                                                 {
00840                                                         r_match->SetLeftEnd(0, iv.RightEnd(seqI)-charI);
00841                                                         r_match->SetOrientation(0, AbstractMatch::reverse );
00842                                                 }
00843                                                 if(iv.Orientation(seqJ) == AbstractMatch::forward)
00844                                                         r_match->SetLeftEnd(1, iv.LeftEnd(seqJ)+charJ);
00845                                                 else
00846                                                 {
00847                                                         r_match->SetLeftEnd(1, iv.RightEnd(seqJ)-charJ);
00848                                                         r_match->SetOrientation(1, AbstractMatch::reverse );
00849                                                 }
00850                                                 r_match->SetLength(0);
00851                                         }
00852 
00853                                         if( iv.Orientation(seqI) == AbstractMatch::reverse )
00854                                         {
00855                                                 swap(l_match,r_match);
00856                                                 if( l_match != NULL ) l_match->Invert();
00857                                                 if( r_match != NULL ) r_match->Invert();
00858                                         }
00859                                         // check whether the current cache already has the searched region
00860                                         search_cache_t cacheval = make_pair( l_match, r_match );
00861                                         std::vector< search_cache_t >::iterator cache_entry = std::upper_bound( cache.begin(), cache.end(), cacheval, mems::cache_comparator );
00862                                         if( cache_entry == cache.end() || 
00863                                                 (mems::cache_comparator( cacheval, *cache_entry ) || mems::cache_comparator( *cache_entry, cacheval )) )
00864                                         {
00865                                                 // search this region
00866                                                         pairwiseAnchorSearch(mlist, l_match, r_match, &iv, seqI, seqJ);
00867                                         }
00868                                         if(using_cache_db)
00869                                                 new_cache.push_back( cacheval );
00870                                 }
00871                                 prev_charI = charI;
00872                                 prev_charJ = charJ;
00873                                 in_gap = false;
00874                         }
00875                         else
00876                                 in_gap = true;
00877                         if( colI < iv.AlignmentLength() )
00878                         {
00879                                 if( aln_matrix[seqI].test(colI) )
00880                                         ++charI;
00881                                 if( aln_matrix[seqJ].test(colI) )
00882                                         ++charJ;
00883                         }
00884                 }
00885         }
00886 }
00887 
00888 void ProgressiveAligner::getAncestralMatches( const vector< node_id_t > node1_seqs, const vector< node_id_t > node2_seqs, node_id_t node1, node_id_t node2, node_id_t ancestor, std::vector< AbstractMatch* >& ancestral_matches )
00889 {
00890         // to save memory, always make node1_seqs the bigger vector
00891 //      if( node1_seqs.size() < node2_seqs.size() )
00892 //              swap( node1_seqs, node2_seqs );
00893 
00894         // for each pair of genomes, extract pairwise matches and translate up
00895         // eliminate overlaps
00896         for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
00897         {
00898                 uint ii = this->node_sequence_map[node1_seqs[seqI]];
00899                 vector< AbstractMatch* > seqI_matches;
00900 
00901                 for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
00902                 {
00903                         uint jj = this->node_sequence_map[node2_seqs[seqJ]];
00904                         vector< AbstractMatch* > cur_matches;
00905                         for( size_t mI = 0; mI < original_ml.size(); mI++ )
00906                         {
00907                                 if( original_ml[mI]->LeftEnd(ii) == NO_MATCH )
00908                                         continue;
00909                                 if( original_ml[mI]->LeftEnd(jj) == NO_MATCH )
00910                                         continue;
00911                                 Match mm( 2 );
00912                                 Match* new_m = mm.Copy();
00913                                 new_m->SetStart( 0, original_ml[mI]->Start(ii));
00914                                 new_m->SetStart( 1, original_ml[mI]->Start(jj));
00915                                 new_m->SetLength(original_ml[mI]->Length());
00916                                 if( new_m->Start(0) < 0 )
00917                                         new_m->Invert();        // assign reference orientation to seq 0
00918                                 cur_matches.push_back( new_m );
00919                         }
00920                         // now translate cur_matches
00921                         translateGappedCoordinates( cur_matches, 1, node2_seqs[seqJ], node2 );
00922                         seqI_matches.insert( seqI_matches.end(), cur_matches.begin(), cur_matches.end() );
00923                 }
00924                 EliminateOverlaps_v2( seqI_matches );
00925                 translateGappedCoordinates( seqI_matches, 0, node1_seqs[seqI], node1 );
00926                 ancestral_matches.insert( ancestral_matches.end(), seqI_matches.begin(), seqI_matches.end() );
00927         }
00928         EliminateOverlaps_v2( ancestral_matches );
00929 }
00930 
00931 
00932 void ProgressiveAligner::getPairwiseMatches( const vector< node_id_t >& node1_seqs, const vector< node_id_t >& node2_seqs, Matrix<MatchList>& pairwise_matches )
00933 {
00934         pairwise_matches = Matrix< MatchList >( node1_seqs.size(), node2_seqs.size() );
00935 
00936         // copy sequence tables
00937         for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
00938         {
00939                 for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
00940                 {
00941                         uint ii = this->node_sequence_map[node1_seqs[seqI]];
00942                         uint jj = this->node_sequence_map[node2_seqs[seqJ]];
00943                         pairwise_matches(seqI, seqJ).seq_table.push_back(original_ml.seq_table[ii]);
00944                         pairwise_matches(seqI, seqJ).seq_table.push_back(original_ml.seq_table[jj]);
00945                         pairwise_matches(seqI, seqJ).seq_filename.push_back(original_ml.seq_filename[ii]);
00946                         pairwise_matches(seqI, seqJ).seq_filename.push_back(original_ml.seq_filename[jj]);
00947                 }
00948         }
00949 
00950         // now copy pairwise matches
00951         for( size_t mI = 0; mI < original_ml.size(); mI++ )
00952         {
00953                 for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
00954                 {
00955                         uint ii = this->node_sequence_map[node1_seqs[seqI]];
00956                         if( original_ml[mI]->LeftEnd(ii) == NO_MATCH )
00957                                 continue;
00958                         for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
00959                         {
00960                                 uint jj = this->node_sequence_map[node2_seqs[seqJ]];
00961                                 if( original_ml[mI]->LeftEnd(jj) == NO_MATCH )
00962                                         continue;
00963                                 Match mm( 2 );
00964                                 Match* new_m = mm.Copy();
00965                                 new_m->SetStart( 0, original_ml[mI]->Start(ii));
00966                                 new_m->SetStart( 1, original_ml[mI]->Start(jj));
00967                                 new_m->SetLength(original_ml[mI]->Length());
00968                                 if( new_m->Start(0) < 0 )
00969                                         new_m->Invert();        // assign reference orientation to seq 0
00970                                 pairwise_matches(seqI,seqJ).push_back( new_m );
00971                         }
00972                 }
00973         }
00974 }
00975 
00976 
00977 int IsDenseEnough( GappedAlignment* gal_iter )
00978 {
00979         double total_len = 0;
00980         gnSeqI seqs = 0;
00981         for( uint seqI = 0; seqI < gal_iter->SeqCount(); seqI++ )
00982         {
00983                 if( gal_iter->LeftEnd(seqI) == NO_MATCH )
00984                         continue;
00985                 total_len += gal_iter->Length(seqI);
00986         }
00987         double density = total_len / (gal_iter->AlignmentLength() * (double)gal_iter->Multiplicity());
00988         // density of 1 is ideal
00989         // the shorter the alignment, the closer we should be to 1 to allow splitting
00990         // use a linear threshold with (min_window_size,1) and (max_window_size,min_gappiness)
00991         // as endpoints of the threshold line
00992         
00993         // determine the density threshold for the given alignment length
00994         double threshold = ((max_density - min_density)/(min_window_size - max_window_size)) * ( (double)gal_iter->AlignmentLength() - max_window_size ) + min_density;
00995         if( density > max_density )     // don't bother aligning this, it's so dense we'll wait until iterative refinement.
00996                 return 2;
00997         if( density > threshold )
00998                 return 1;
00999         return 0;
01000 }
01001 
01002 void splitGappedAlignment( const GappedAlignment& ga, GappedAlignment& ga1, GappedAlignment& ga2, std::vector<size_t>& seqs1, std::vector<size_t>& seqs2 )
01003 {
01004         const vector< string >& aln = GetAlignment( ga, std::vector<gnSequence*>(ga.SeqCount()) );
01005         ga1 = ga;
01006         ga2 = ga;
01007         for( size_t seqI = 0; seqI < seqs1.size(); seqI++ )
01008                 ga2.SetLeftEnd(seqs1[seqI], NO_MATCH);
01009         for( size_t seqI = 0; seqI < seqs2.size(); seqI++ )
01010                 ga1.SetLeftEnd(seqs2[seqI], NO_MATCH);
01011 }
01012 
01013 void removeLargeGapsPP( GappedAlignment& gal, list< GappedAlignment* >& gal_list, vector<bool>& gap_iv, const vector< size_t >& group1, const vector< size_t >& group2 )
01014 {
01015         // scan through and remove any section where members of group1 aren't aligned to members of group2
01016         // for more than some number of nucleotides
01017         gap_iv.clear();
01018         gal_list.clear();
01019         const vector< string >& aln_matrix = GetAlignment(gal, vector<gnSequence*>(gal.SeqCount(),NULL));
01020         size_t gap_cols = 0;
01021         size_t last_aln_col = (std::numeric_limits<size_t>::max)();
01022         size_t col_base = 0;
01023         GappedAlignment* galp = gal.Copy();
01024         for( size_t colI = 0; colI < gal.AlignmentLength(); colI++ )
01025         {
01026                  size_t g1 = 0;
01027                  size_t g2 = 0;
01028                  for( ; g1 < group1.size(); ++g1 )
01029                  {
01030                          if( aln_matrix[group1[g1]][colI] != '-' )
01031                                  break;
01032                  }
01033                  for( ; g2 < group2.size(); ++g2 )
01034                  {
01035                          if( aln_matrix[group2[g2]][colI] != '-' )
01036                                  break;
01037                  }
01038                  if( g1 < group1.size() && g2 < group2.size() )
01039                  {
01040                          // it's an aligned col
01041                          if( gap_cols > max_gap_length )
01042                          {
01043                                 // crop out the middle gapped section
01044                                 gnSeqI split_point = 0;
01045                                 if( last_aln_col != (std::numeric_limits<size_t>::max)() )
01046                                 {
01047                                         split_point = last_aln_col + lcb_hangover - col_base;
01048                                         gal_list.push_back( galp );
01049                                         gap_iv.push_back(false);
01050                                         galp = (GappedAlignment*)galp->Split(split_point);      // set galp to the right side after splitting
01051                                         col_base += split_point;
01052                                 }
01053                                 split_point = colI - lcb_hangover - col_base;
01054                                 gal_list.push_back( galp );
01055                                 gap_iv.push_back(true);
01056                                 galp = (GappedAlignment*)galp->Split(split_point);      // set galp to the right side after splitting
01057                                 col_base += split_point;
01058                          }
01059                          last_aln_col = colI;
01060                          gap_cols = 0;
01061                  }else
01062                          ++gap_cols;
01063         }
01064 
01065         if( gap_cols > max_gap_length )
01066         {
01067                 gnSeqI split_point = 0;
01068                 if( last_aln_col != (std::numeric_limits<size_t>::max)() )
01069                 {
01070                         split_point = last_aln_col + lcb_hangover - col_base;
01071                         gal_list.push_back( galp );
01072                         gap_iv.push_back(false);
01073                         galp = (GappedAlignment*)galp->Split(split_point);      // set galp to the right side after splitting
01074                 }
01075                 gap_iv.push_back(true);
01076         }else
01077                 gap_iv.push_back(false);
01078         gal_list.push_back( galp );
01079 }
01080 
01081 void ProgressiveAligner::refineAlignment( GappedAlignment& gal, node_id_t ancestor, bool profile_aln, AlnProgressTracker& apt )
01082 {
01083         // divide the gapped alignment up into windows of a given size and have
01084         // muscle refine the alignments
01085         // when anchors are dense use smaller windows to improve speed efficiency
01086         list< GappedAlignment* > gal_list;
01087         vector<bool> gap_iv;
01088         std::vector<node_id_t> nodes1;
01089         std::vector<node_id_t> nodes2;
01090         getAlignedChildren( alignment_tree[ancestor].children[0], nodes1 );
01091         getAlignedChildren( alignment_tree[ancestor].children[1], nodes2 );
01092         std::vector<size_t> seqs1( nodes1.size() );
01093         std::vector<size_t> seqs2( nodes2.size() );
01094         for( size_t nI = 0; nI < nodes1.size(); nI++ )
01095                 seqs1[nI] = node_sequence_map[nodes1[nI]];
01096         for( size_t nI = 0; nI < nodes2.size(); nI++ )
01097                 seqs2[nI] = node_sequence_map[nodes2[nI]];
01098 //      if( profile_aln )
01099 //      {
01100                 removeLargeGapsPP( gal, gal_list, gap_iv, seqs1, seqs2 );
01101 //      }else{
01102 //              gal_list.push_back( gal.Copy() );
01103 //              gap_iv.push_back(false);
01104 //      }
01105         list< GappedAlignment* >::iterator gal_iter = gal_list.begin();
01106         vector<bool>::iterator gap_iter = gap_iv.begin();
01107         while(gal_iter != gal_list.end())
01108         {
01109                 int density = IsDenseEnough( *gal_iter );
01110                 if( (density == 0 && (*gal_iter)->AlignmentLength() > max_window_size / 3) ||
01111                         (density == 1 && (*gal_iter)->AlignmentLength() > max_window_size ) ||
01112                         (density == 2 && (*gal_iter)->AlignmentLength() > max_window_size * 3 )
01113 
01114 //                        || ( (*gal_iter)->AlignmentLength() > min_window_size && density == 1 && profile_aln == true ) 
01115                           )
01116                 {
01117                         // split in half
01118                         gnSeqI split_point = (*gal_iter)->AlignmentLength() / 2;
01119                         list< GappedAlignment* >::iterator ins_iter = gal_iter;
01120                         ++ins_iter;
01121                         ins_iter = gal_list.insert(ins_iter, new GappedAlignment(**gal_iter) );
01122 //                      ins_iter = gal_list.insert(ins_iter, (*gal_iter)->Copy());
01123                         vector<bool>::iterator gap_ins_iter = gap_iter;
01124                         size_t gap_off = gap_iter - gap_iv.begin();
01125                         ++gap_ins_iter;
01126                         gap_iv.insert( gap_ins_iter, *gap_iter );
01127                         gap_iter = gap_iv.begin() + gap_off;
01128                         (*gal_iter)->CropEnd( split_point );
01129                         (*ins_iter)->CropStart( (*ins_iter)->AlignmentLength() - split_point );
01130                         continue;
01131                 }
01132 
01133                 ++gal_iter;
01134                 ++gap_iter;
01135         }
01136         MuscleInterface& mi = MuscleInterface::getMuscleInterface();
01137         // now that the alignment is all split up use muscle to refine it
01138         gnSeqI new_len = 0;
01139 
01140         gap_iter = gap_iv.begin();
01141 
01142         const size_t gal_count = gal_list.size();
01143 #pragma omp parallel for schedule(dynamic)
01144         for( int galI = 0; galI < gal_count; galI++ )
01145         {
01146                 list<GappedAlignment*>::iterator my_g_iter = gal_list.begin();
01147                 vector<bool>::iterator my_b_iter = gap_iv.begin();
01148                 for(uint a = 0; a < galI; a++)
01149                 {
01150                         ++my_g_iter;
01151                         ++my_b_iter;
01152                 }
01153 #pragma omp critical
01154 {
01155                 apt.cur_leftend += (*my_g_iter)->AlignmentLength();
01156 }
01157                 if( profile_aln && !(*my_b_iter) )
01158                 {
01159                         GappedAlignment ga1;
01160                         GappedAlignment ga2;
01161                         splitGappedAlignment( **my_g_iter, ga1, ga2, seqs1, seqs2 );
01162                         if( ga1.Multiplicity() > 0 && ga2.Multiplicity() > 0 )
01163                         {
01164                                 mi.ProfileAlignFast( ga1, ga2, **my_g_iter, true );
01165                         }
01166                 }else if(!(*my_b_iter))
01167                 {
01168                         int density = IsDenseEnough( *my_g_iter );
01169                         if( density == 0 )
01170                                 mi.RefineFast( **my_g_iter );
01171                         else if( density == 1 )
01172                                 mi.RefineFast( **my_g_iter, 500 );
01173                         else
01174                                 mi.RefineFast( **my_g_iter, 200 );
01175                 }
01176 
01177                 new_len += (*my_g_iter)->AlignmentLength();
01178 #pragma omp critical
01179 {
01180                 // print a progress message
01181                 double cur_progress = ((double)apt.cur_leftend / (double)apt.total_len)*100.0;
01182                 printProgress((uint)apt.prev_progress, (uint)cur_progress, cout);
01183                 apt.prev_progress = cur_progress;
01184 }
01185         }
01186         gal_iter = gal_list.end();
01187 
01188         // put humpty dumpty back together
01189         vector< string > aln_matrix( gal.SeqCount(), string( new_len, '-' ) );
01190         vector< string::size_type > pos( gal.SeqCount(), 0 );
01191         for( gal_iter = gal_list.begin(); gal_iter != gal_list.end(); ++gal_iter )
01192         {
01193                 const vector< string >& tmp_mat = GetAlignment(**gal_iter, vector<gnSequence*>( gal.SeqCount() ) );
01194                 for( uint seqI = 0; seqI < tmp_mat.size(); seqI++ )
01195                 {
01196                         if( gal.LeftEnd(seqI) == 0 )
01197                                 continue;
01198                         aln_matrix[seqI].replace(pos[seqI], tmp_mat[seqI].size(), tmp_mat[seqI]);
01199                         pos[seqI] += tmp_mat[seqI].size();
01200                 }
01201 //              (*gal_iter)->Free();
01202                 delete (*gal_iter);
01203         }
01204         gal.SetAlignment(aln_matrix);
01205 }
01206 
01207 void ProgressiveAligner::doGappedAlignment( node_id_t ancestor, bool profile_aln )
01208 {
01209         AlnProgressTracker apt;
01210         gnSeqI total_len = 0;
01211         for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
01212                 total_len += alignment_tree[ancestor].ordering[aI].Length();
01213         apt.total_len = total_len;
01214         apt.prev_progress = 0;
01215 
01216         printProgress(-1, 0, cout);
01217         apt.cur_leftend = 1;
01218 
01219         for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
01220         {
01221                 if( alignment_tree[ancestor].ordering[aI].reference_iv.Multiplicity() == 1 )
01222                 {
01223                         apt.cur_leftend += alignment_tree[ancestor].ordering[aI].reference_iv.AlignmentLength();
01224                         continue;       // don't bother re-refining intervals that didn't get aligned here
01225                 }
01226 
01227 //              printMemUsage();
01228 //              cout << "extract aln\n";
01229                 GappedAlignment gal;
01230                 extractAlignment(ancestor, aI, gal);
01231 //              printMemUsage();
01232 //              cout << "refine aln\n";
01233                 if( gal.Multiplicity() > 1 )    // no point in refining intervals that are unaligned anyways
01234                         refineAlignment( gal, ancestor, profile_aln, apt );
01235                 else
01236                         apt.cur_leftend += gal.AlignmentLength();
01237 //              printMemUsage();
01238 //              cout << "construct siv\n";
01239                 ConstructSuperIntervalFromMSA(ancestor, aI, gal);
01240 //              printMemUsage();
01241 
01242                 // print a progress message
01243                 double cur_progress = ((double)apt.cur_leftend / (double)apt.total_len)*100.0;
01244                 printProgress((uint)apt.prev_progress, (uint)cur_progress, cout);
01245                 apt.prev_progress = cur_progress;
01246         }
01247         printMemUsage();
01248         cout << "Fix left ends\n";
01249         FixLeftEnds(ancestor);
01250         printMemUsage();
01251 
01252         if( debug_aligner )
01253                 validateSuperIntervals(alignment_tree[ancestor].children[0], alignment_tree[ancestor].children[1], ancestor);
01254         cout << "\ndone.\n";
01255 }
01256 
01257 void ProgressiveAligner::FixLeftEnds( node_id_t ancestor )
01258 {
01259         // fixes all SuperInterval left-end coordinates for nodes below ancestor
01260         stack< node_id_t > node_stack;
01261         node_stack.push( ancestor );
01262         vector<bool> visited( alignment_tree.size(), false );
01263         while( node_stack.size() > 0 )
01264         {
01265                 node_id_t cur_node = node_stack.top();
01266                 // visit post-order
01267                 if( !visited[cur_node] )
01268                 {
01269                         for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
01270                                 node_stack.push( alignment_tree[cur_node].children[childI] );
01271                         visited[cur_node] = true;
01272                         continue;
01273                 }
01274                 node_stack.pop();
01275                 if( alignment_tree[cur_node].sequence != NULL )
01276                         continue;       // don't do anything on leaf nodes
01277 
01278                 vector< SuperInterval >& siv_list = alignment_tree[cur_node].ordering;
01279                 gnSeqI left_end = 1;
01280                 for( size_t sivI = 0; sivI < siv_list.size(); sivI++ )
01281                 {
01282                         siv_list[sivI].SetLeftEnd(left_end);
01283                         siv_list[sivI].SetLength(siv_list[sivI].reference_iv.AlignmentLength());
01284                         left_end += siv_list[sivI].reference_iv.AlignmentLength();
01285                         CompactGappedAlignment<>* m_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_list[sivI].reference_iv.GetMatches()[0]);
01286                         
01287                         // this one wasn't refined, just move it appropriately
01288                         if( m_cga == NULL || siv_list[sivI].reference_iv.GetMatches().size() > 1 )
01289                         {
01290                                 for( uint childI = 0; childI <= 1; childI++ )
01291                                 {
01292                                         size_t cur_siv = childI == 0 ? alignment_tree[cur_node].ordering[sivI].c1_siv : alignment_tree[cur_node].ordering[sivI].c2_siv;
01293                                         if( cur_siv == (std::numeric_limits<size_t>::max)() )
01294                                                 continue;
01295                                         const SuperInterval& c_siv = alignment_tree[ alignment_tree[cur_node].children[childI] ].ordering[ cur_siv ];
01296                                         int64 diff = c_siv.LeftEnd() - siv_list[sivI].reference_iv.LeftEnd(childI);
01297                                         siv_list[sivI].reference_iv.SetLeftEnd(childI, c_siv.LeftEnd());
01298                                         const vector< AbstractMatch* >& matches = siv_list[sivI].reference_iv.GetMatches();
01299                                         for( size_t mI = 0; mI < matches.size(); mI++ )
01300                                         {
01301                                                 if( matches[mI]->LeftEnd(childI) != NO_MATCH )
01302                                                         matches[mI]->SetLeftEnd(childI, matches[mI]->LeftEnd(childI) + diff);
01303                                         }
01304                                 }
01305 
01306                         }else{
01307 
01308                                 size_t c1_siv = alignment_tree[cur_node].ordering[sivI].c1_siv;
01309                                 if( c1_siv != (std::numeric_limits<size_t>::max)() )
01310                                 {
01311                                         const SuperInterval& c_siv = alignment_tree[ alignment_tree[cur_node].children[0] ].ordering[ c1_siv ];
01312                                         m_cga->SetLeftEnd(0, c_siv.LeftEnd());
01313                                         siv_list[sivI].reference_iv.SetLeftEnd(0, c_siv.LeftEnd());
01314                                         m_cga->SetLength(c_siv.Length(), 0);
01315                                         siv_list[sivI].reference_iv.SetLength(c_siv.Length(), 0);
01316                                         siv_list[sivI].reference_iv.SetOrientation(0, m_cga->Orientation(0));
01317                                 }
01318                                 size_t c2_siv = alignment_tree[cur_node].ordering[sivI].c2_siv;
01319                                 if( c2_siv != (std::numeric_limits<size_t>::max)() )
01320                                 {
01321                                         const SuperInterval& c_siv = alignment_tree[ alignment_tree[cur_node].children[1] ].ordering[ c2_siv ];
01322                                         m_cga->SetLeftEnd(1, c_siv.LeftEnd());
01323                                         siv_list[sivI].reference_iv.SetLeftEnd(1, c_siv.LeftEnd());
01324                                         m_cga->SetLength(c_siv.Length(), 1);
01325                                         siv_list[sivI].reference_iv.SetLength(c_siv.Length(), 1);
01326                                         siv_list[sivI].reference_iv.SetOrientation(1, m_cga->Orientation(1));
01327                                 }
01328                         }
01329                         if( debug_cga && m_cga && !m_cga->validate() )
01330 //                      if( m_cga && !m_cga->validate() )
01331                                 cerr << "oh junkedy\n";
01332 
01333                 }
01334         }
01335 }
01336 
01340 void propagateInvert( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t ancestor, size_t ans_siv )
01341 {
01342         stack< pair< node_id_t, size_t > > node_siv_stack;
01343         node_siv_stack.push( make_pair(ancestor, ans_siv) );
01344         while( node_siv_stack.size() > 0 )
01345         {
01346                 pair< node_id_t, size_t > cur = node_siv_stack.top();
01347                 node_siv_stack.pop();
01348                 node_id_t cur_node = cur.first;
01349                 if( alignment_tree[cur_node].ordering[cur.second].c1_siv != (std::numeric_limits<size_t>::max)() )
01350                         node_siv_stack.push( make_pair( alignment_tree[cur_node].children[0], alignment_tree[cur_node].ordering[cur.second].c1_siv ) );
01351                 if( alignment_tree[cur_node].ordering[cur.second].c2_siv != (std::numeric_limits<size_t>::max)() )
01352                         node_siv_stack.push( make_pair( alignment_tree[cur_node].children[1], alignment_tree[cur_node].ordering[cur.second].c2_siv ) );
01353                 if( cur_node == ancestor )
01354                         continue;       // don't do anything at the ancestor
01355                 if( alignment_tree[cur_node].sequence != NULL )
01356                         continue;       // don't do anything on leaf nodes
01357 
01358                 // reverse the homology structure at this node
01359                 Interval& ref_iv = alignment_tree[cur_node].ordering[cur.second].reference_iv;
01360                 vector< AbstractMatch* > matches;
01361                 ref_iv.StealMatches( matches );
01362                 AbstractMatch::orientation o0 = matches[0]->Orientation(0);
01363                 AbstractMatch::orientation o1 = matches[0]->Orientation(1);
01364                 matches[0]->Invert();
01365                 if( o0 != AbstractMatch::undefined )
01366                         matches[0]->SetOrientation(0,o0);
01367                 if( o1 != AbstractMatch::undefined )
01368                         matches[0]->SetOrientation(1,o1);
01369                 ref_iv.SetMatches( matches );
01370                 if( o0 != AbstractMatch::undefined )
01371                 {
01372                         ref_iv.SetOrientation(0,o0);
01373                         ref_iv.SetLeftEnd(0,0);
01374                 }
01375                 if( o1 != AbstractMatch::undefined )
01376                 {
01377                         ref_iv.SetOrientation(1,o1);
01378                         ref_iv.SetLeftEnd(1,0);
01379                 }
01380         }
01381 }
01382 
01383 
01384 void ProgressiveAligner::ConstructSuperIntervalFromMSA( node_id_t ancestor, size_t ans_siv, GappedAlignment& gal )
01385 {
01386         const vector< string >& aln_matrix = GetAlignment( gal, vector< gnSequence* >() );
01387         stack< pair< node_id_t, size_t > > node_siv_stack;
01388         node_siv_stack.push( make_pair(ancestor, ans_siv) );
01389         vector<bool> visited( alignment_tree.size(), false );
01390         while( node_siv_stack.size() > 0 )
01391         {
01392                 pair< node_id_t, size_t > cur = node_siv_stack.top();
01393                 node_id_t cur_node = cur.first;
01394                 // visit post-order
01395                 if( !visited[cur_node] )
01396                 {
01397                         if( alignment_tree[cur_node].ordering[cur.second].c1_siv != (std::numeric_limits<size_t>::max)() )
01398                                 node_siv_stack.push( make_pair( alignment_tree[cur_node].children[0], alignment_tree[cur_node].ordering[cur.second].c1_siv ) );
01399                         if( alignment_tree[cur_node].ordering[cur.second].c2_siv != (std::numeric_limits<size_t>::max)() )
01400                                 node_siv_stack.push( make_pair( alignment_tree[cur_node].children[1], alignment_tree[cur_node].ordering[cur.second].c2_siv ) );
01401                         visited[cur_node] = true;
01402                         continue;
01403                 }
01404                 node_siv_stack.pop();
01405                 if( alignment_tree[cur_node].sequence != NULL )
01406                         continue;       // don't do anything on leaf nodes
01407 
01408                 // build a super-interval
01409                 vector< node_id_t > node1_seqs; 
01410                 vector< node_id_t > node2_seqs; 
01411                 getAlignedChildren( alignment_tree[cur_node].children[0], node1_seqs );
01412                 getAlignedChildren( alignment_tree[cur_node].children[1], node2_seqs );
01413                 vector< bitset_t > m_aln(2, bitset_t( aln_matrix[0].size(), false ) );
01414                 gnSeqI seqI_len = 0;
01415                 gnSeqI seqJ_len = 0;
01416                 gnSeqI cur_col = 0;
01417                 for( size_t colI = 0; colI < aln_matrix[0].size(); colI++ )
01418                 {
01419                         uint seqI = 0;
01420                         uint seqJ = 0;
01421                         for( ; seqI < node1_seqs.size(); ++seqI )
01422                                 if( aln_matrix[node_sequence_map[node1_seqs[seqI]]][colI] != '-' )
01423                                         break;
01424                         for( ; seqJ < node2_seqs.size(); ++seqJ )
01425                                 if( aln_matrix[node_sequence_map[node2_seqs[seqJ]]][colI] != '-' )
01426                                         break;
01427 
01428                         if( seqI == node1_seqs.size() && seqJ == node2_seqs.size() )
01429                                 continue;       // nothing in this column
01430                         if( seqI != node1_seqs.size() )
01431                         {
01432                                 seqI_len++;
01433                                 m_aln[0].set(cur_col);
01434                         }
01435                         if( seqJ != node2_seqs.size() )
01436                         {
01437                                 seqJ_len++;
01438                                 m_aln[1].set(cur_col);
01439                         }
01440                         cur_col++;
01441                 }
01442                 m_aln[0].resize(cur_col);
01443                 m_aln[1].resize(cur_col);
01444                 CompactGappedAlignment<> tmp_cga(m_aln.size(), cur_col);
01445                 CompactGappedAlignment<>* cga = tmp_cga.Copy();
01446                 cga->SetLeftEnd(0, seqI_len > 0 ? 1 : 0);       // at this point we have no idea where the left end should really be
01447                 cga->SetLeftEnd(1, seqJ_len > 0 ? 1 : 0);
01448                 if( cga->LeftEnd(0) != NO_MATCH )
01449                         cga->SetOrientation(0, alignment_tree[cur_node].ordering[cur.second].reference_iv.Orientation(0));
01450                 if( cga->LeftEnd(1) != NO_MATCH )
01451                         cga->SetOrientation(1, alignment_tree[cur_node].ordering[cur.second].reference_iv.Orientation(1));
01452                 cga->SetLength(seqI_len,0);
01453                 cga->SetLength(seqJ_len,1);
01454                 cga->SetAlignment(m_aln);       // do this afterwords so that it can create the bitcount
01455 
01456                 // the alignment may need to be reversed if the aligned parent is reverse
01457                 size_t p_siv = alignment_tree[cur_node].ordering[cur.second].parent_siv;
01458                 bool reverse_me = false;
01459                 if( p_siv != (std::numeric_limits<size_t>::max)() )
01460                 {
01461                         size_t p_node = alignment_tree[cur_node].parents[0];
01462                         int p_child = alignment_tree[p_node].children[0] == cur_node ? 0 : 1;
01463                         if( alignment_tree[p_node].ordering[p_siv].reference_iv.Orientation(p_child) == AbstractMatch::reverse )
01464                                 reverse_me = true;
01465                 }
01466                 if( reverse_me )
01467                 {
01468                         cga->Invert();
01469                         if( cga->LeftEnd(0) != NO_MATCH )
01470                                 cga->SetOrientation(0, alignment_tree[cur_node].ordering[cur.second].reference_iv.Orientation(0));
01471                         if( cga->LeftEnd(1) != NO_MATCH )
01472                                 cga->SetOrientation(1, alignment_tree[cur_node].ordering[cur.second].reference_iv.Orientation(1));
01473                         propagateInvert( alignment_tree, cur_node, cur.second );
01474                 }
01475 
01476                 alignment_tree[cur_node].ordering[cur.second].reference_iv = Interval();
01477                 vector< AbstractMatch* > am_list(1, cga);
01478                 alignment_tree[cur_node].ordering[cur.second].reference_iv.SetMatches( am_list );
01479                 // set these to zero so they don't interfere with coordinate translation
01480                 alignment_tree[cur_node].ordering[cur.second].reference_iv.SetLeftEnd(0, 0);
01481                 alignment_tree[cur_node].ordering[cur.second].reference_iv.SetLeftEnd(1, 0);
01482         }
01483 }
01484 
01485 typedef boost::tuple<CompactGappedAlignment<>*, vector< bitset_t >*, AbstractMatch* > _sort_tracker_type;
01486 
01487 template< class CompType >
01488 class CgaBsComp
01489 {
01490 public:
01491         CgaBsComp( CompType& c ) : comp(c) {};
01492         bool operator()( const _sort_tracker_type& a, const _sort_tracker_type& b )
01493         {
01494                 return comp( a.get<0>(), b.get<0>() );
01495         }
01496 protected:
01497         CompType& comp;
01498 };
01499 
01500 template< typename MatchVector >
01501 void multFilter( MatchVector& matches, uint mult = 2 )
01502 {
01503         // apply a multiplicity filter
01504         size_t cur = 0;
01505         for( size_t mI = 0; mI < matches.size(); ++mI )
01506         {
01507                 if( matches[mI]->Multiplicity() == mult )
01508                         matches[cur++] = matches[mI];
01509                 else
01510                         matches[mI]->Free();
01511         }
01512         matches.erase(matches.begin()+cur, matches.end());
01513 }
01514 
01515 template< typename MatchVector >
01516 void alignedNtCountFilter( MatchVector& matches, uint length )
01517 {
01518         // require at least some number of aligned pairs in the anchor
01519         size_t cur = 0;
01520         for( size_t mI = 0; mI < matches.size(); ++mI )
01521         {
01522                 size_t len_sum = 0;
01523                 for( size_t seqI = 0; seqI < matches[mI]->SeqCount(); seqI++ )
01524                         if(matches[mI]->LeftEnd(seqI) != NO_MATCH)
01525                                 len_sum += matches[mI]->Length(seqI);
01526 
01527                 if( len_sum - length > matches[mI]->AlignmentLength() )
01528                         matches[cur++] = matches[mI];
01529                 else
01530                         matches[mI]->Free();
01531         }
01532         matches.erase(matches.begin()+cur, matches.end());
01533 }
01534 
01535 
01536 bool debugging_cltm = false;
01537 void ProgressiveAligner::constructLcbTrackingMatches( 
01538         node_id_t ancestral_node, 
01539         vector< AbstractMatch* >& ancestral_matches, 
01540         vector< LcbTrackingMatch< AbstractMatch* > >& tracking_matches 
01541         )
01542 {
01543         node_id_t child_0 = alignment_tree[ancestral_node].children[0];
01544         node_id_t child_1 = alignment_tree[ancestral_node].children[1];
01545         // split up matches at descendant's breakpoints
01546         propagateDescendantBreakpoints( child_0, 0, ancestral_matches );
01547         propagateDescendantBreakpoints( child_1, 1, ancestral_matches );
01548 
01549         // store alignment bitvectors for each match...
01550         vector< bitset_t > bs_tmp(alignment_tree.size());
01551         vector< vector< bitset_t > > bs(ancestral_matches.size(), bs_tmp);
01552         vector< _sort_tracker_type > cga_list;
01553         // initialize alignment bitvectors
01554         for( size_t mI = 0; mI < ancestral_matches.size(); mI++ )
01555         {
01556                 vector< bitset_t > aln( alignment_tree.size(), bitset_t(ancestral_matches[mI]->AlignmentLength() ) );
01557                 swap( bs[mI], aln );
01558                 ancestral_matches[mI]->GetAlignment(aln);
01559                 swap( bs[mI][child_0], aln[0] );
01560                 swap( bs[mI][child_1], aln[1] );
01561                 CompactGappedAlignment<> c(alignment_tree.size(),0);
01562                 c.SetLeftEnd(child_0, ancestral_matches[mI]->LeftEnd(0));
01563                 c.SetOrientation(child_0, ancestral_matches[mI]->Orientation(0));
01564                 c.SetLength(ancestral_matches[mI]->Length(0), child_0);
01565                 c.SetLeftEnd(child_1, ancestral_matches[mI]->LeftEnd(1));
01566                 c.SetOrientation(child_1, ancestral_matches[mI]->Orientation(1));
01567                 c.SetLength(ancestral_matches[mI]->Length(1), child_1);
01568                 cga_list.push_back(make_tuple(c.Copy(), &bs[mI], ancestral_matches[mI]));
01569         }
01570 
01571         stack<node_id_t> node_stack;
01572         node_stack.push(child_0);
01573         node_stack.push(child_1);
01574         while(node_stack.size() > 0)
01575         {
01576                 node_id_t cur_node = node_stack.top();
01577                 node_stack.pop();
01578                 if( alignment_tree[cur_node].children.size() == 0 )
01579                         continue;
01580                 node_stack.push(alignment_tree[cur_node].children[0]);
01581                 node_stack.push(alignment_tree[cur_node].children[1]);
01582 
01583                 // do processing for cur_node...
01584                 // 1. determine which interval in the current node each match falls into
01585                 // 2. determine the offset of this match in that interval
01586                 // 3. translate with that interval
01587 
01588                 vector< SuperInterval >& siv_list = alignment_tree[cur_node].ordering;
01589                 SingleStartComparator< CompactGappedAlignment<> > ssc(cur_node);
01590                 CgaBsComp< SingleStartComparator< CompactGappedAlignment<> > > comp( ssc );
01591                 sort(cga_list.begin(), cga_list.end(), comp);
01592                 size_t mI = 0;
01593                 size_t sivI = 0;
01594                 while( mI < cga_list.size() && sivI < siv_list.size() )
01595                 {
01596                         CompactGappedAlignment<>* cur_match = cga_list[mI].get<0>();
01597                         if( cur_match->Start(cur_node) == 0 )
01598                         {
01599                                 mI++;
01600                                 continue;       // this one doesn't match in this lineage!!
01601                         }
01602                         if( cur_match->LeftEnd(cur_node) >= siv_list[sivI].LeftEnd() + siv_list[sivI].Length() )
01603                         {
01604                                 sivI++;
01605                                 continue;
01606                         }
01607 
01608                         if( cur_match->LeftEnd(cur_node) + cur_match->Length(cur_node) > 
01609                                 siv_list[sivI].LeftEnd() + siv_list[sivI].Length() )
01610                         {
01611                                 cerr << "doesn't fit\n";
01612                                 cerr << "cga_list[" << mI << "]->LeftEnd(" << cur_node << "): " << cur_match->LeftEnd(cur_node) << endl;
01613                                 cerr << "cga_list[" << mI << "]->RightEnd(" << cur_node << "): " << cur_match->RightEnd(cur_node) << endl;
01614                                 breakHere();
01615                         }
01616 
01617                         // extract the region of the siv matched by the current match
01618                         CompactGappedAlignment<>* siv_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_list[sivI].reference_iv.GetMatches()[0]);
01619                         if( siv_list[sivI].reference_iv.GetMatches().size() > 1 )
01620                                 siv_cga = NULL;
01621                         if( siv_cga == NULL )
01622                         {
01623                                 CompactGappedAlignment<> tmp_cga;
01624                                 siv_cga = tmp_cga.Copy();
01625                                 *siv_cga = CompactGappedAlignment<>(siv_list[sivI].reference_iv);
01626                                 vector<AbstractMatch*> tmp_matches(1,siv_cga);
01627                                 siv_list[sivI].reference_iv.SetMatches(tmp_matches);
01628                         }
01629                         CompactGappedAlignment<> new_cga;
01630                         siv_cga->copyRange(new_cga, cur_match->LeftEnd(cur_node) - siv_list[sivI].LeftEnd(), cur_match->Length(cur_node));
01631                         if( cur_match->Orientation(cur_node) == AbstractMatch::reverse )
01632                                 new_cga.Invert();
01633                         if( new_cga.Multiplicity() == 0 )
01634                         {
01635                                 cerr << "impossible!  there's no match!\n";
01636                                 genome::breakHere();
01637                         }
01638                         // set the leftend in cga_list
01639                         for( uint cur_child = 0; cur_child < 2; cur_child++ )
01640                         {
01641                                 node_id_t sweet_child = alignment_tree[cur_node].children[cur_child];
01642                                 cur_match->SetLeftEnd(sweet_child, new_cga.LeftEnd(cur_child));
01643                                 if( new_cga.LeftEnd(cur_child) != NO_MATCH )
01644                                 {
01645                                         cur_match->SetOrientation(sweet_child, new_cga.Orientation(cur_child));
01646                                         cur_match->SetLength(new_cga.Length(cur_child), sweet_child);
01647                                 }
01648                         }
01649 
01650                         // prepare a cga for translation
01651                         CompactGappedAlignment<> c(1,(*cga_list[mI].get<1>())[cur_node].size());
01652                         c.SetLeftEnd(0,1);
01653                         c.SetLength((*cga_list[mI].get<1>())[cur_node].count(),0);
01654                         vector<bitset_t> bivouac(1, (*cga_list[mI].get<1>())[cur_node]);
01655                         c.SetAlignment(bivouac);
01656 
01657                         // now translate each child
01658                         for( uint cur_child = 0; cur_child < 2; cur_child++ )
01659                         {
01660                                 if( new_cga.Orientation(cur_child) == AbstractMatch::undefined )
01661                                         continue;
01662                                 CompactGappedAlignment<> cga_tmp = new_cga;
01663                                 cga_tmp.SetStart(cur_child, 1);
01664                                 c.translate(cga_tmp, cur_child, 0, false);
01665                                 // adjust for end-gaps
01666                                 bitset_t bs = (cga_tmp.GetAlignment())[cur_child];
01667                                 bs.resize(c.GetAlignment()[0].size(), false);
01668                                 bs <<= c.GetAlignment()[0].find_first();
01669                                 node_id_t sweet_child = alignment_tree[cur_node].children[cur_child];
01670                                 swap( (*cga_list[mI].get<1>())[sweet_child], bs );
01671                                 for( size_t testI = 0; testI < cga_tmp.SeqCount(); ++testI )
01672                                 {
01673                                         if( ((*cga_list[mI].get<1>())[testI].size() != 0 && (*cga_list[mI].get<1>())[testI].size() != (*cga_list[mI].get<1>())[sweet_child].size() ) )
01674                                         {
01675                                                 cerr << "bj0rk3l\n";
01676                                                 genome::breakHere();
01677                                         }
01678                                 }
01679                         }
01680 
01681                         debugging_cltm = false;
01682                         mI++;   // advance to the next match
01683                 }
01684         }
01685         tracking_matches.resize( cga_list.size() );
01686         // finally, construct CompactGappedAlignments out of the bitsets
01687         for( size_t bsI = 0; bsI < cga_list.size(); ++bsI )
01688         {
01689                 cga_list[bsI].get<0>()->SetAlignment(*cga_list[bsI].get<1>());
01690                 cga_list[bsI].get<0>()->validate();
01691                 TrackingMatch& ltm = tracking_matches[bsI];
01692                 ltm.node_match = cga_list[bsI].get<0>();
01693                 ltm.original_match = cga_list[bsI].get<2>();
01694                 ltm.match_id = bsI;
01695 
01696                 bool found_extant = false;
01697                 for( size_t i = 0; i < alignment_tree.size()-1; ++i )
01698                 {
01699                         size_t im = node_sequence_map[i];
01700                         if( im == (std::numeric_limits<size_t>::max)() )
01701                                 continue;
01702                         if( ltm.node_match->LeftEnd(i) != NO_MATCH )
01703                                 found_extant = true;
01704                 }
01705                 if( !found_extant )
01706                 {
01707                         cout << "orig aln len: " << ltm.original_match->AlignmentLength() << endl;
01708                         cout << "orig lend 0: " << ltm.original_match->Start(0) << endl;
01709                         cout << "orig lend 1: " << ltm.original_match->Start(1) << endl;
01710                         cout << "orig length 0: " << ltm.original_match->Length(0) << endl;
01711                         cout << "orig length 1: " << ltm.original_match->Length(1) << endl;
01712 
01713                         cerr << "this is an ungrounded match!!!\n";
01714                         genome::breakHere();
01715                 }
01716         }
01717 }
01718 
01719 size_t countUnrefined( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t ancestor )
01720 {
01721         stack< node_id_t > node_stack;
01722         node_stack.push(ancestor);
01723         size_t unrefined_count = 0;
01724         while( node_stack.size() > 0 )
01725         {
01726                 node_id_t cur_node = node_stack.top();
01727                 node_stack.pop();
01728                 if( alignment_tree[cur_node].children.size() > 0 )
01729                         for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); ++childI )
01730                                 node_stack.push( alignment_tree[cur_node].children[childI] );
01731                 if( !alignment_tree[cur_node].refined )
01732                         unrefined_count++;
01733         }
01734         return unrefined_count;
01735 }
01736 
01737 void markAsRefined( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t ancestor )
01738 {
01739         stack< node_id_t > node_stack;
01740         node_stack.push(ancestor);
01741         size_t refined_count = 0;
01742         while( node_stack.size() > 0 )
01743         {
01744                 node_id_t cur_node = node_stack.top();
01745                 node_stack.pop();
01746                 if( alignment_tree[cur_node].children.size() > 0 )
01747                         for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); ++childI )
01748                                 node_stack.push( alignment_tree[cur_node].children[childI] );
01749                 alignment_tree[cur_node].refined = true;
01750         }
01751         alignment_tree[ancestor].refined = false;
01752 }
01753 
01754 
01755 
01756 void ProgressiveAligner::pairwiseScoreTrackingMatches( 
01757                                                 std::vector< TrackingMatch >& tracking_matches, 
01758                                                 std::vector<node_id_t>& node1_descendants,
01759                                                 std::vector<node_id_t>& node2_descendants,
01760                                                 boost::multi_array< double, 3 >& tm_score_array)
01761 {
01762         tm_score_array.resize( boost::extents[tracking_matches.size()][node1_descendants.size()][node2_descendants.size()] );
01763         for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
01764         {
01765                 TrackingMatch* cur_match = &tracking_matches[mI];
01766                 AbstractMatch* node_match = cur_match->node_match;
01767                 for( size_t nI = 0; nI < node1_descendants.size(); ++nI )
01768                 {
01769                         if( node_sequence_map[node1_descendants[nI]] == (std::numeric_limits<uint>::max)()  ||
01770                                 node_match->LeftEnd(node1_descendants[nI]) == NO_MATCH )
01771                                 continue;
01772                         for( size_t nJ = 0; nJ < node2_descendants.size(); ++nJ )
01773                         {
01774                                 if( node_sequence_map[node2_descendants[nJ]] == (std::numeric_limits<uint>::max)() ||
01775                                         node_match->LeftEnd(node2_descendants[nJ]) == NO_MATCH )
01776                                         continue;       // not extant or no match between this pair
01777 
01778                                 node_id_t cur_n1 = node1_descendants[nI];
01779                                 node_id_t cur_n2 = node2_descendants[nJ];
01780                                 size_t nsmI = node_sequence_map[cur_n1];
01781                                 size_t nsmJ = node_sequence_map[cur_n2];
01782                                 PairwiseMatchAdapter pma( node_match, cur_n1, cur_n2 );
01783                                 vector< AbstractMatch* > lcb_vect( 1, &pma );
01784                                 vector< gnSequence* > ex_seqs(2);
01785                                 ex_seqs[0] = alignment_tree[ cur_n1 ].sequence;
01786                                 ex_seqs[1] = alignment_tree[ cur_n2 ].sequence;
01787 
01788                                 tm_score_array[mI][nI][nJ] = GetPairwiseAnchorScore(lcb_vect, ex_seqs, subst_scoring, sol_list[nsmI], sol_list[nsmJ]);
01789                         }
01790                 }
01791         }
01792         computeAvgAncestralMatchScores(tracking_matches, node1_descendants, node2_descendants, tm_score_array);
01793 }
01794 
01795 void ProgressiveAligner::computeAvgAncestralMatchScores( 
01796                                                 std::vector< TrackingMatch >& tracking_matches, 
01797                                                 std::vector<node_id_t>& node1_descendants,
01798                                                 std::vector<node_id_t>& node2_descendants,
01799                                                 boost::multi_array< double, 3 >& tm_score_array)
01800 {
01801         // now build up the consensus (ancestral) match scores and bp distances
01802         for( uint nodeI = 0; nodeI < node1_descendants.size(); nodeI++ )
01803         {
01804                 for( uint nodeJ = 0; nodeJ < node2_descendants.size(); nodeJ++ )
01805                 {
01806                         node_id_t n1 = node1_descendants[nodeI];
01807                         node_id_t n2 = node2_descendants[nodeJ];
01808 
01809                         vector<node_id_t> n1_ext;
01810                         vector<node_id_t> n2_ext;
01811                         getAlignedChildren(n1, n1_ext);
01812                         getAlignedChildren(n2, n2_ext);
01813                         if( n1_ext.size() == 1 && n2_ext.size() == 1 )
01814                                 continue;       // this node has two extant nodes below it and was already scored
01815 
01816                         // map the nodes in n1_ext to their indices in n1_descendants
01817                         vector< node_id_t > n1_ext_map(n1_ext.size());
01818                         for( size_t i = 0; i < n1_ext.size(); ++i )
01819                         {
01820                                 vector< node_id_t >::iterator iter = std::find( node1_descendants.begin(), node1_descendants.end(), n1_ext[i] );
01821                                 n1_ext_map[i] = iter - node1_descendants.begin();
01822                         }
01823                         vector< node_id_t > n2_ext_map(n2_ext.size());
01824                         for( size_t i = 0; i < n2_ext.size(); ++i )
01825                         {
01826                                 vector< node_id_t >::iterator iter = std::find( node2_descendants.begin(), node2_descendants.end(), n2_ext[i] );
01827                                 n2_ext_map[i] = iter - node2_descendants.begin();
01828                         }
01829 
01830                         // compute scores for all matches at this node
01831                         for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
01832                         {
01833                                 uint tally = 0;
01834                                 double score_sum = 0;
01835                                 for( size_t i = 0; i < n1_ext.size(); ++i )
01836                                 {
01837                                         if( tracking_matches[mI].node_match->LeftEnd(n1_ext[i]) == NO_MATCH )
01838                                                 continue;
01839                                         for( size_t j = 0; j < n2_ext.size(); ++j )
01840                                         {
01841                                                 if( tracking_matches[mI].node_match->LeftEnd(n2_ext[j]) == NO_MATCH )
01842                                                         continue;
01843                                                 ++tally;
01844                                                 score_sum += tm_score_array[mI][n1_ext_map[i]][n2_ext_map[j]];
01845                                         }
01846                                 }
01847                                 if( tally > 0 )
01848                                         tm_score_array[mI][nodeI][nodeJ] = score_sum / (double)tally;
01849                         }
01850                 }
01851         }
01852 }
01853 
01854 
01855 void ProgressiveAligner::computeInternalNodeDistances( 
01856                                                 boost::multi_array<double, 2>& bp_dist_mat, 
01857                                                 boost::multi_array<double, 2>& cons_dist_mat, 
01858                                                 std::vector<node_id_t>& node1_descendants,
01859                                                 std::vector<node_id_t>& node2_descendants)
01860 {
01861         // bp distances for the current node.
01862         bp_dist_mat.resize(boost::extents[node1_descendants.size()][node2_descendants.size()]);
01863         cons_dist_mat.resize(boost::extents[node1_descendants.size()][node2_descendants.size()]);
01864         for( size_t nI = 0; nI < node1_descendants.size(); ++nI )
01865         {
01866                 if( node_sequence_map[node1_descendants[nI]] == (std::numeric_limits<uint>::max)() )
01867                         continue;
01868                 for( size_t nJ = 0; nJ < node2_descendants.size(); ++nJ )
01869                 {
01870                         if( node_sequence_map[node2_descendants[nJ]] == (std::numeric_limits<uint>::max)() )
01871                                 continue;
01872                         size_t i = node_sequence_map[node1_descendants[nI]];
01873                         size_t j = node_sequence_map[node2_descendants[nJ]];
01874                         bp_dist_mat[nI][nJ] = this->bp_distance[i][j];
01875                         cons_dist_mat[nI][nJ] = this->conservation_distance[i][j];
01876                 }
01877         }
01878 
01879         // now build up the consensus (ancestral) bp distances
01880         for( uint nodeI = 0; nodeI < node1_descendants.size(); nodeI++ )
01881         {
01882                 for( uint nodeJ = 0; nodeJ < node2_descendants.size(); nodeJ++ )
01883                 {
01884                         node_id_t n1 = node1_descendants[nodeI];
01885                         node_id_t n2 = node2_descendants[nodeJ];
01886 
01887                         vector<node_id_t> n1_ext;
01888                         vector<node_id_t> n2_ext;
01889                         getAlignedChildren(n1, n1_ext);
01890                         getAlignedChildren(n2, n2_ext);
01891                         if( n1_ext.size() == 1 && n2_ext.size() == 1 )
01892                                 continue;       // this node has two extant nodes below it, so already has a dist
01893 
01894                         // map the nodes in n1_ext to their indices in n1_descendants
01895                         vector< node_id_t > n1_ext_map(n1_ext.size());
01896                         for( size_t i = 0; i < n1_ext.size(); ++i )
01897                         {
01898                                 vector< node_id_t >::iterator iter = std::find( node1_descendants.begin(), node1_descendants.end(), n1_ext[i] );
01899                                 n1_ext_map[i] = iter - node1_descendants.begin();
01900                         }
01901                         vector< node_id_t > n2_ext_map(n2_ext.size());
01902                         for( size_t i = 0; i < n2_ext.size(); ++i )
01903                         {
01904                                 vector< node_id_t >::iterator iter = std::find( node2_descendants.begin(), node2_descendants.end(), n2_ext[i] );
01905                                 n2_ext_map[i] = iter - node2_descendants.begin();
01906                         }
01907 
01908                         // compute average bp distance
01909                         for( size_t i = 0; i < n1_ext.size(); ++i )
01910                         {
01911                                 for( size_t j = 0; j < n2_ext.size(); ++j )
01912                                 {
01913                                         bp_dist_mat[nodeI][nodeJ] += bp_dist_mat[n1_ext_map[i]][n2_ext_map[j]];
01914                                         cons_dist_mat[nodeI][nodeJ] += cons_dist_mat[n1_ext_map[i]][n2_ext_map[j]];
01915                                 }
01916                         }
01917                         bp_dist_mat[nodeI][nodeJ] /= (double)(n1_ext.size() * n2_ext.size());
01918                         cons_dist_mat[nodeI][nodeJ] /= (double)(n1_ext.size() * n2_ext.size());
01919                 }
01920         }
01921 
01922 }
01923 
01924 double computeID( GappedAlignment& gal, size_t seqI, size_t seqJ )
01925 {
01926         const vector< string >& aln_mat = GetAlignment( gal, vector< gnSequence* >(gal.SeqCount(), NULL ));
01927         double id = 0;
01928         double possible = 0;
01929         for( size_t colI = 0; colI < gal.AlignmentLength(); colI++ )
01930         {
01931                 if( aln_mat[seqI][colI] == '-' || aln_mat[seqJ][colI] == '-' )
01932                         continue;
01933                 if( toupper(aln_mat[seqI][colI]) == toupper(aln_mat[seqJ][colI]))
01934                         id++;
01935                 possible++;
01936         }
01937         return id / possible;
01938 }
01939 
01940 
01941 //
01942 //
01943 // different option -- just pick a representative from leaf(A) and leaf(B) to translate
01944 void ProgressiveAligner::getRepresentativeAncestralMatches( const vector< node_id_t > node1_seqs, const vector< node_id_t > node2_seqs, node_id_t node1, node_id_t node2, node_id_t ancestor, std::vector< AbstractMatch* >& ancestral_matches )
01945 {
01946         // for each match, extract a representative match from any pair of genomes in node1_seqs and node2_seqs
01947         // translate up the resulting set of matches and eliminate overlaps
01948         vector< AbstractMatch* > cur_matches;
01949         boost::multi_array< vector< AbstractMatch* >, 2 > seq_matches( boost::extents[node1_seqs.size()][node2_seqs.size()] );
01950         for( size_t mI = 0; mI < original_ml.size(); mI++ )
01951         {
01952                 for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
01953                 {
01954                         uint ii = this->node_sequence_map[node1_seqs[seqI]];
01955                         if( original_ml[mI]->LeftEnd(ii) == NO_MATCH )
01956                                 continue;
01957 
01958                         for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
01959                         {
01960                                 uint jj = this->node_sequence_map[node2_seqs[seqJ]];
01961                                 if( original_ml[mI]->LeftEnd(jj) == NO_MATCH )
01962                                         continue;
01963                                 Match mm( 2 );
01964                                 Match* new_m = mm.Copy();
01965                                 new_m->SetStart( 0, original_ml[mI]->Start(ii));
01966                                 new_m->SetStart( 1, original_ml[mI]->Start(jj));
01967                                 new_m->SetLength(original_ml[mI]->Length());
01968                                 if( new_m->Start(0) < 0 )
01969                                         new_m->Invert();        // assign reference orientation to seq 0
01970                                 seq_matches[seqI][seqJ].push_back( new_m );
01971                                 break;
01972                         }
01973                         break;
01974                 }
01975         }
01976         for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
01977                 for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
01978                 {
01979                         translateGappedCoordinates( seq_matches[seqI][seqJ], 0, node1_seqs[seqI], node1 );
01980                         translateGappedCoordinates( seq_matches[seqI][seqJ], 1, node2_seqs[seqJ], node2 );
01981                         ancestral_matches.insert( ancestral_matches.end(), seq_matches[seqI][seqJ].begin(), seq_matches[seqI][seqJ].end() );
01982                 }
01983 
01984         EliminateOverlaps_v2( ancestral_matches, true );
01985 }
01986 
01987 int cachecomp( const void* e1, const void* e2 )
01988 {
01989         bool a = mems::cache_comparator(*(search_cache_t*)e1, *(search_cache_t*)e2);
01990         bool b = mems::cache_comparator(*(search_cache_t*)e2, *(search_cache_t*)e1);
01991         if(!a && !b)
01992                 return 0;
01993         return a ? -1 : 1;
01994 }
01995 
01996 void ProgressiveAligner::alignProfileToProfile( node_id_t node1, node_id_t node2, node_id_t ancestor )
01997 {
01998         // 1) find all pairwise matches
01999         // 2) convert to pairwise matches among the ancestral sequences
02000         //    - delete inconsistently aligned regions?
02001         // 3) perform greedy b.p. elimination on the pairwise matches
02002         // 4) extend LCBs
02003         // 5)  if total alignment weight hasn't changed, go to (8)
02004         // 6) search for additional matches between each match among extant sequences
02005         // 7) go back to 2
02006         // 8) perform a MUSCLE/Clustal alignment of each intervening region
02007 
02008         vector< node_id_t > node1_seqs; 
02009         vector< node_id_t > node2_seqs; 
02010         getAlignedChildren( node1, node1_seqs );
02011         getAlignedChildren( node2, node2_seqs );
02012 
02013         uint seqI, seqJ;
02014         gnSeqI prev_ancestral_seq_len = (std::numeric_limits<gnSeqI>::max)();
02015 
02016         printMemUsage();
02017         cout << "get ancestral matches\n";
02018 
02019         Matrix<MatchList> pairwise_matches( node1_seqs.size(), node2_seqs.size() );
02020 //      getPairwiseMatches( node1_seqs, node2_seqs, pairwise_matches );
02021         vector< AbstractMatch* > anc_pairwise_matches;
02022         getRepresentativeAncestralMatches( node1_seqs, node2_seqs, node1, node2, ancestor, anc_pairwise_matches );
02023         printMemUsage();
02024         
02025         PhyloTree< AlignmentTreeNode > aln_tree_backup;
02026 
02028         Matrix< std::vector< search_cache_t > > search_cache_db(node1_seqs.size(), node2_seqs.size());
02029         double prev_anchoring_score = -(std::numeric_limits<double>::max)();
02030         double cur_anchoring_score = -(std::numeric_limits<double>::max)();
02031 
02032         while(true)
02033         {
02034                 vector<AbstractMatch*> ancestral_matches;
02035                 if( anc_pairwise_matches.size() > 0 )
02036                 {
02037                         ancestral_matches.insert( ancestral_matches.begin(), anc_pairwise_matches.begin(), anc_pairwise_matches.end() );
02038                         anc_pairwise_matches.clear();
02039                 }
02040                 else
02041                 {
02042                         // part 2, construct pairwise matches to the ancestral sequence
02043                         // A)  for each pairwise match, translate its
02044                         //     coordinates to the ancestral genome
02045                         //         -- try to use translateCoordinates
02046                         //     -- build a translation table for translateCoordinates
02047 
02048                         for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02049                         {
02050                                 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02051                                 {
02052                                         cout << node_sequence_map[node1_seqs[seqI]] << "," << node_sequence_map[node2_seqs[seqJ]] << " has " << pairwise_matches(seqI,seqJ).size() << " pairwise matches\n";
02053                                         cout.flush();
02054 
02055                                         vector< AbstractMatch* > am_list( pairwise_matches(seqI, seqJ).begin(), pairwise_matches(seqI, seqJ).end() );
02056                                         pairwise_matches(seqI, seqJ).clear();
02057                                         translateGappedCoordinates( am_list, 1, node2_seqs[seqJ], node2 );
02058                                         translateGappedCoordinates( am_list, 0, node1_seqs[seqI], node1 );
02059                                         ancestral_matches.insert( ancestral_matches.end(), am_list.begin(), am_list.end() );
02060                                 }
02061                         }
02062                 }
02063                 // include any matches from a previous iteration of this loop
02064                 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
02065                 {
02066                         Interval& ref_iv = alignment_tree[ancestor].ordering[aI].reference_iv;
02067                         if( ref_iv.Multiplicity() == 2 )
02068                                 for( size_t mI = 0; mI < ref_iv.GetMatches().size(); mI++ )
02069                                         if( ref_iv.GetMatches()[mI]->Multiplicity() > 1 )
02070                                                 ancestral_matches.push_back( ref_iv.GetMatches()[mI]->Copy() );
02071                 }
02072 
02073                 // set seq 0 to forward ref. orientation
02074                 for( size_t mI = 0; mI < ancestral_matches.size(); ++mI )
02075                         if( ancestral_matches[mI]->Start(0) < 0 )
02076                                 ancestral_matches[mI]->Invert();
02077 
02078                 // eliminate overlaps as they correspond to inconsistently or
02079                 // multiply aligned regions
02080                 EliminateOverlaps_v2( ancestral_matches );
02081                 
02082                 multFilter( ancestral_matches );
02083 
02084                 vector< vector< AbstractMatch* > > LCB_list;
02085                 vector< LCB > adjacencies;
02086                 vector< gnSeqI > breakpoints;
02087 
02088                 if( !collinear_genomes )
02089                 {
02090                         cout << "Performing Sum-of-pairs Greedy Breakpoint Elimination\n";
02091                         cout.flush();
02092                         // project the pairwise matches at this node to all-possible pairs matches at descendant nodes
02093                         // keep a mapping of ancestral to extant matches so that when an ancestral match gets removed
02094                         // the match among extant nodes also gets removed
02095                         // how should candidate matches to remove be generated?
02096                         // one possibility is to remove entire ancestral LCBs...  this may be problematic since ancestral
02097                         // LCBs don't correspond to the pairwise LCBs thus an ancestral LCB could be removed with no useful
02098                         // change in alignment score
02099                         //
02100                         //
02101                         // translate the matches into LcbTrackingMatches
02102                         printMemUsage();
02103                         cout << "construct LCB tracking matches\n";
02104                         vector< TrackingMatch > tracking_matches;
02105                         boost::multi_array< size_t, 3 > tm_lcb_id_array;
02106                         boost::multi_array< double, 3 > tm_score_array;
02107                         constructLcbTrackingMatches( ancestor, ancestral_matches, tracking_matches );
02108 
02109                         cout << "There are " << tracking_matches.size() << " tracking matches\n";
02110                         size_t used_components = 0;
02111                         for( size_t tmI = 0; tmI < tracking_matches.size(); ++tmI )
02112                         {
02113                                 for( uint ssI = 0; ssI < tracking_matches[tmI].node_match->SeqCount(); ++ssI )
02114                                         if( tracking_matches[tmI].node_match->LeftEnd(ssI) != NO_MATCH )
02115                                                 used_components++;
02116                         }
02117                         size_t total_components = tracking_matches.size() == 0 ? 0 : tracking_matches.size() * tracking_matches[0].node_match->SeqCount();
02118                         cout << "There are " << used_components << " / " << total_components << " components used\n";
02119 
02120                         vector<node_id_t> node1_descendants;
02121                         vector<node_id_t> node2_descendants;
02122                         if( scoring_scheme == ExtantSumOfPairsScoring )
02123                         {
02124                                 node1_descendants = node1_seqs;
02125                                 node2_descendants = node2_seqs;
02126                         }else{
02127                                 getDescendants(alignment_tree, node1, node1_descendants);
02128                                 getDescendants(alignment_tree, node2, node2_descendants);
02129                         }
02130 
02131                         //
02132                         // score the matches
02133                         //
02134                         printMemUsage();
02135                         cout << "init tracking match LCB tracking\n";
02136                         initTrackingMatchLCBTracking( tracking_matches, node1_descendants.size(), node2_descendants.size(), tm_lcb_id_array );
02137                         printMemUsage();
02138                         cout << "pairwise score tracking matches\n";
02139                         pairwiseScoreTrackingMatches( tracking_matches, node1_descendants, node2_descendants, tm_score_array );
02140                         printMemUsage();
02141 
02142                         // compute bp distances for the current node.
02143                         // ancestral nodes take the average distance of extant nodes
02144                         boost::multi_array<double, 2> bp_dist_mat;
02145                         boost::multi_array<double, 2> cons_dist_mat;
02146                         computeInternalNodeDistances( bp_dist_mat, cons_dist_mat, node1_descendants, node2_descendants);
02147 
02148                         vector< TrackingMatch* > t_matches(tracking_matches.size());
02149                         for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
02150                                 t_matches[mI] = &tracking_matches[mI];
02151 
02152                         // now sort these out into pairwise LCBs
02153                         cout << "get pairwise LCBs\n";
02154                         size_t pair_lcb_count = 0;
02155                         PairwiseLCBMatrix pairwise_adj_mat(boost::extents[node1_descendants.size()][node2_descendants.size()]);
02156                         for( uint nodeI = 0; nodeI < node1_descendants.size(); nodeI++ )
02157                                 for( uint nodeJ = 0; nodeJ < node2_descendants.size(); nodeJ++ )
02158                                 {
02159                                         getPairwiseLCBs( node1_descendants[nodeI], node2_descendants[nodeJ], nodeI, nodeJ, t_matches, pairwise_adj_mat[nodeI][nodeJ], tm_score_array, tm_lcb_id_array );
02160                                         pair_lcb_count += pairwise_adj_mat[nodeI][nodeJ].size();
02161                                 }
02162                         cout << "there are " << pair_lcb_count << " pairwise LCBs\n";
02163                         printMemUsage();
02164 
02165                         sort( t_matches.begin(), t_matches.end() );
02166 
02167                         // other possibility, choose pairwise LCBs to remove.  a score improvement is always guaranteed
02168                         // compute LCBs among descendant nodes
02169                         // this is a good idea.  it factors out ancestral breakpoint decisions entirely
02170                         // need a data structure to track all pairwise LCBs that contain a given match
02171                         // template <class MatchType>
02172                         // class LcbTrackingMatch <MatchType> 
02173                         // { 
02174                         // public:
02175                         //  MatchType node_match;
02176                         //      boost::multi_array< size_t, 2 > lcb_id;
02177                         // }
02178                         // all pairwise LCBs would be evaluated for removal and the one that provides the greatest
02179                         // overall score improvement gets removed.
02180                         // upon removal, matches associated with that LCB would get removed, and any LCBs in other 
02181                         // genomes would get removed if they no longer had any matches
02182                         // to pull this off, the LCB struct needs to store the set of matches directly
02183                         // 
02184                         // but what about small cycles that appear only in 3 or more-way comparisons?  are these
02185                         // important?  umm, yeah, but only if you believe in evolution.
02186                         // 
02187                         // so here's the dilly-oh: score against the ancestral ordering(s) *and* all pairwise orderings
02188                         // for an ancestor.  ancestor contributes the sum of all descendants to the score and breakpoints
02189                         // are penalized as the sum of /participating/ descendants.  a descendant is participating
02190                         // if it has some matching region defined within the LCB and if removal of that matching region
02191                         // eliminates a breakpoint in the pairwise comparison
02192                         cout << "scaling bp penalty by conservation weight:\n";
02193                         print2d_matrix(cons_dist_mat, cout);
02194                         cout << "\n\nscaling bp penalty by bp weight: \n";
02195                         print2d_matrix(bp_dist_mat, cout);
02196                         cout << "\nGreedy BPE\n";
02197                         vector< TrackingMatch* > final;
02198                         if(scoring_scheme == AncestralScoring)
02199                         {
02200                                 vector<node_id_t>::iterator d1_iter = std::find( node1_descendants.begin(), node1_descendants.end(), node1 );
02201                                 vector<node_id_t>::iterator d2_iter = std::find( node2_descendants.begin(), node2_descendants.end(), node2 );
02202                                 size_t d1_index = d1_iter - node1_descendants.begin();
02203                                 size_t d2_index = d2_iter - node2_descendants.begin();
02204                                 EvenFasterSumOfPairsBreakpointScorer spbs( breakpoint_penalty, min_breakpoint_penalty, bp_dist_mat, cons_dist_mat, 
02205                                         t_matches, pairwise_adj_mat, node1_descendants, node2_descendants, 
02206                                         tm_score_array, tm_lcb_id_array, d1_index, d1_index+1, d2_index, d2_index+1 );
02207                                 cur_anchoring_score = greedySearch( spbs );
02208                                 final = spbs.getResults();
02209                         }else{
02210                                 EvenFasterSumOfPairsBreakpointScorer spbs( breakpoint_penalty, min_breakpoint_penalty, bp_dist_mat, cons_dist_mat, 
02211                                         t_matches, pairwise_adj_mat, node1_descendants, node2_descendants, 
02212                                         tm_score_array, tm_lcb_id_array, 0, node1_descendants.size(), 0, node2_descendants.size() );
02213                                 cur_anchoring_score = greedySearch( spbs );
02214                                 final = spbs.getResults();
02215                         }
02216                         cout << "done\n";
02217 
02218                         // free memory used by pairwise projections
02219                         for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
02220                                 tracking_matches[mI].node_match->Free();
02221 
02222                         ancestral_matches.clear();
02223 
02224                         // free memory from deleted matches here
02225                         std::sort(final.begin(), final.end());
02226                         vector< TrackingMatch* > deleted_t_matches( t_matches.size(), NULL );
02227                         std::set_difference( t_matches.begin(), t_matches.end(), final.begin(), final.end(), deleted_t_matches.begin() );
02228                         for( size_t delI = 0; delI < deleted_t_matches.size(); ++delI )
02229                         {
02230                                 if( deleted_t_matches[delI] == NULL )
02231                                         break;
02232                                 deleted_t_matches[delI]->original_match->Free();
02233                         }
02234 
02235                         // convert back to an LCB list
02236                         vector< AbstractMatch* > new_matches(final.size());
02237                         for( size_t mI = 0; mI < final.size(); ++mI )
02238                                 new_matches[mI] = final[mI]->original_match;
02239 
02240                         IdentifyBreakpoints( new_matches, breakpoints );
02241                         ComputeLCBs_v2( new_matches, breakpoints, LCB_list );
02242 
02243                 } // end if !collinear
02244                 else
02245                 {       // if we are assuming all genomes are collinear, then we don't need the 
02246                         // sophisticated pairwise breakpoint scoring and can get by with simple breakpoint
02247                         // penalties
02248                         IdentifyBreakpoints( ancestral_matches, breakpoints );
02249                         ComputeLCBs_v2( ancestral_matches, breakpoints, LCB_list );
02250 
02251                         vector< double > lcb_scores( LCB_list.size() );
02252                         double score_sum = 100; // anything > 0 would work.  this will be the breakpoint penalty
02253                         for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
02254                         {
02255                                 lcb_scores[lcbI] = SimpleGetLCBCoverage( LCB_list[lcbI] );
02256                                 score_sum += lcb_scores[lcbI];
02257                         }
02258 
02259                         computeLCBAdjacencies_v3( LCB_list, lcb_scores, adjacencies );
02260 
02261                         // want to eliminate all breakpoints
02262                         SimpleBreakpointScorer wbs( adjacencies, score_sum, true );
02263                         cur_min_coverage = greedyBreakpointElimination_v4( adjacencies, lcb_scores, wbs, NULL, false );
02264                         vector<AbstractMatch*> deleted_matches;
02265                         filterMatches_v2( adjacencies, LCB_list, lcb_scores, deleted_matches );
02266                         for( size_t delI = 0; delI < deleted_matches.size(); ++delI )
02267                                 deleted_matches[delI]->Free();
02268                 }
02269                 printMemUsage();
02270 
02271                 ancestral_matches.clear();
02272 
02273                 cout << "Arrived at " << LCB_list.size() << " intervals\n";
02274                 // create an ancestral ordering
02275                 vector< Interval* > pairwise_intervals;
02276                 Interval tmp_iv;
02277                 for( size_t lcbI = 0; lcbI < LCB_list.size(); lcbI++ )
02278                 {
02279                         pairwise_intervals.push_back( tmp_iv.Copy() );
02280                         pairwise_intervals.back()->SetMatches( LCB_list[lcbI] );
02281                 }
02282                 LCB_list.clear();
02283 
02284                 vector<gnSeqI> seq_lengths = vector<gnSeqI>(2,0);
02285                 for( size_t aI = 0; aI < alignment_tree[node1].ordering.size(); ++aI )
02286                         seq_lengths[0] += alignment_tree[node1].ordering[aI].Length();
02287                 for( size_t aI = 0; aI < alignment_tree[node2].ordering.size(); ++aI )
02288                         seq_lengths[1] += alignment_tree[node2].ordering[aI].Length();
02289 
02290                 cout << "Adding unaligned intervals\n";
02291                 addUnalignedIntervals_v2(pairwise_intervals, set<uint>(), seq_lengths);
02292 
02293                 cout << "addUnalignedIntervals yields " << pairwise_intervals.size() << " intervals\n";
02294 
02295                 bool borked = false;
02296                 if(debug_aligner)
02297                         borked = validatePairwiseIntervals(node1, node2, pairwise_intervals);
02298 
02299                 // merge unaligned intervals
02300                 cout << "Merging unaligned intervals\n";
02301                 cout.flush();
02302                 vector<Interval*> new_list1;
02303                 vector<Interval*> merged_intervals;
02304                 mergeUnalignedIntervals( 1, pairwise_intervals, new_list1 );
02305                 mergeUnalignedIntervals( 0, new_list1, merged_intervals );
02306                 cout << "Marbling gaps\n";
02307                 cout.flush();
02308                 for( size_t ivI = 0; ivI < merged_intervals.size(); ivI++ )
02309                         merged_intervals[ivI]->Marble(50);
02310 
02311                 cout << "Propagating descendant breakpoints\n";
02312 
02313                 // split up intervals at descendant's breakpoints
02314                 propagateDescendantBreakpoints( node1, 0, merged_intervals );
02315                 propagateDescendantBreakpoints( node2, 1, merged_intervals );
02316 
02317                 cout << "descendant 0(" << node1 << ") has " << alignment_tree[node1].ordering.size() << " intervals\n";
02318                 cout << "descendant 1(" << node2 << ") has " << alignment_tree[node2].ordering.size() << " intervals\n";
02319                 cout << "propagateDescendantBreakpoints yields " << merged_intervals.size() << " intervals\n";
02320 
02321                 if(debug_aligner)
02322                         borked = validatePairwiseIntervals(node1, node2, merged_intervals);
02323                 cout << "Creating ancestral ordering\n";
02324                 alignment_tree[ancestor].ordering.clear();
02325                 createAncestralOrdering( merged_intervals, alignment_tree[ancestor].ordering );
02326                 for( size_t ivI = 0; ivI < merged_intervals.size(); ivI++ )
02327                         merged_intervals[ivI]->Free();
02328                 merged_intervals.clear();       // free up some memory
02329 
02330                 if(debug_aligner)
02331                         validateSuperIntervals( node1, node2, ancestor );
02332 
02333                 // if we're not making any progress then bail out...
02334                 gnSeqI cur_ancestral_seq_len = 0;
02335                 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
02336                         cur_ancestral_seq_len += alignment_tree[ancestor].ordering[aI].Length();
02337 
02338                 if( !collinear_genomes )
02339                         cout << "Previous anchoring score: " << prev_anchoring_score << ", new anchor score: " << cur_anchoring_score << endl;
02340                 else
02341                         cout << "Prev alignment len: " << prev_ancestral_seq_len << ", new alignment length: " << cur_ancestral_seq_len << endl;
02342                 // if cur_seq_len has decreased then we're improving
02343                 // if not, then we're done finding matches
02344                 if( collinear_genomes && cur_ancestral_seq_len >= prev_ancestral_seq_len )
02345                         break;
02346 
02347                 if( !collinear_genomes && cur_anchoring_score <= prev_anchoring_score )
02348                         break;
02349                 prev_anchoring_score = cur_anchoring_score;
02350                 prev_ancestral_seq_len = cur_ancestral_seq_len;
02351 
02352                 // accept the new alignment tree...
02353                 cout << "Backing up alignment tree...\n";
02354                 cout.flush();
02355                 aln_tree_backup = alignment_tree;
02356 
02357                 cout << "propagating ancestral breakpoints\n";
02358                 cout.flush();
02359                 recursiveApplyAncestralBreakpoints(ancestor);
02360 
02361 
02362                 if( debug_me )   
02363                 {
02364                         for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )        
02365                         {
02366                                 GappedAlignment gal;     
02367                                 extractAlignment(ancestor, aI, gal);     
02368 
02369                                 bool check = false;
02370                                 for( size_t ii = 0; ii < gal.SeqCount(); ++ii )
02371                                 {
02372                                         if( gal.LeftEnd(ii) == 0 )
02373                                                 continue;
02374                                         for( size_t jj = 0; jj < gal.SeqCount(); ++jj )
02375                                         {
02376                                                 if( gal.LeftEnd(jj) == 0 )
02377                                                         continue;
02378                                                 check = check || computeID( gal, ii, jj ) < .5;
02379                                         }
02380                                 }
02381                                 if( check )
02382                                         cerr << "check iv " << aI << " dbg_count " << dbg_count << endl;
02383                                 else
02384                                         continue;
02385 
02386                                 const vector< string >& aln_mat = GetAlignment(gal, this->original_ml.seq_table);        
02387                                 gnSequence seq;          
02388                                 for( size_t seqI = 0; seqI < gal.SeqCount(); ++seqI )    
02389                                         if( gal.LeftEnd(seqI) != NO_MATCH )      
02390                                                 seq += aln_mat[seqI];    
02391 
02392                                 stringstream dbg_fname;          
02393                                 dbg_fname << "prof_dbg_iv_" << aI << ".dbg." << dbg_count++ << ".fas";   
02394                                 ofstream debug_file( dbg_fname.str().c_str() );          
02395                                 gnFASSource::Write( seq, debug_file, false );    
02396                                 debug_file.close();      
02397                         }        
02398                 }
02399 
02400                 if(debug_aligner)
02401                         validateSuperIntervals( node1, node2, ancestor );
02402 
02403                 if(recursive)
02404                 {
02405                 // search for additional alignment anchors
02406                 cout << "recursive anchor search\n";
02407                 cout.flush();
02408                 Matrix<MatchList> matches;
02409                 Matrix< std::vector< search_cache_t > > new_cache_db(node1_seqs.size(), node2_seqs.size());
02410                 // initialize storage for intervening regions
02411                 boost::multi_array< std::vector< std::vector< int64 > >, 2 > iv_regions( boost::extents[node1_seqs.size()][node2_seqs.size()] );
02412                 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02413                         for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02414                                 iv_regions[seqI][seqJ].resize(2);
02415                 vector< gnSequence* > bseqs( node1_seqs.size() + node2_seqs.size() );
02416                 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
02417                 {
02418                         CompactGappedAlignment<> cga;
02419                         extractAlignment(ancestor, aI, cga);
02420                         recurseOnPairs(node1_seqs, node2_seqs, cga, matches, search_cache_db, new_cache_db, iv_regions);
02421 
02422                         // add any new matches to the pairwise_matches matrix
02423                         for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02424                                 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02425                                         pairwise_matches(seqI, seqJ).insert( pairwise_matches(seqI, seqJ).end(), matches(seqI, seqJ).begin(), matches(seqI, seqJ).end() );
02426 
02427                 }
02428 
02429                 // add seqs
02430                 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02431                         bseqs[seqI] = alignment_tree[ node1_seqs[seqI] ].sequence;
02432                 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02433                         bseqs[seqI+seqJ] =  alignment_tree[ node2_seqs[seqJ] ].sequence;
02434 
02435                 MaskedMemHash nway_mh;
02436                 // now search intervening regions
02437                 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02438                         for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02439                         {
02440                                 std::sort( iv_regions[seqI][seqJ][0].begin(), iv_regions[seqI][seqJ][0].end() );
02441                                 std::sort( iv_regions[seqI][seqJ][1].begin(), iv_regions[seqI][seqJ][1].end() );
02442                                 MatchList new_matches;
02443                                 new_matches.seq_table.resize(2);
02444                                 new_matches.seq_table[0] = bseqs[seqI];
02445                                 new_matches.seq_table[1] = bseqs[node1_seqs.size() + seqJ];
02446                                 SearchLCBGaps( new_matches, iv_regions[seqI][seqJ], nway_mh );
02447                                 pairwise_matches(seqI, seqJ).insert( pairwise_matches(seqI, seqJ).end(), new_matches.begin(), new_matches.end() );
02448                         }
02449 
02450                 if(using_cache_db)
02451                 {
02452 
02453                 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02454                 {
02455                         for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02456                         {
02457                                 for( size_t mI = 0; mI < search_cache_db(seqI,seqJ).size(); mI++ )
02458                                 {
02459                                         if( search_cache_db(seqI,seqJ)[mI].first != NULL )
02460                                                 search_cache_db(seqI,seqJ)[mI].first->Free();
02461                                         if( search_cache_db(seqI,seqJ)[mI].second != NULL )
02462                                                 search_cache_db(seqI,seqJ)[mI].second->Free();
02463                                 }
02464                                 search_cache_db(seqI,seqJ).clear();
02465                                 if(new_cache_db(seqI, seqJ).size() > 0)
02466                                 {
02467                                         // try sorting using C's qsort -- maybe there's something wrong with std::sort?
02468                                         search_cache_t* sc_array = new search_cache_t[new_cache_db(seqI,seqJ).size()];
02469                                         for( size_t i = 0; i < new_cache_db(seqI,seqJ).size(); i++ )
02470                                                 sc_array[i] = new_cache_db(seqI,seqJ)[i];
02471                                         qsort(sc_array, new_cache_db(seqI,seqJ).size(), sizeof(AbstractMatch*), cachecomp);
02472 
02473                                         search_cache_db(seqI, seqJ).resize(new_cache_db(seqI,seqJ).size());
02474                                         for( size_t i = 0; i < new_cache_db(seqI,seqJ).size(); i++ )
02475                                                 search_cache_db(seqI, seqJ)[i] = sc_array[i];
02476                                         delete[] sc_array;
02477 
02478                                         new_cache_db(seqI, seqJ).clear();
02479                                 }
02480                                 if( pairwise_matches(seqI,seqJ).size() > 0 )
02481                                         cout << seqI << "," << seqJ << " has an additional " << pairwise_matches(seqI,seqJ).size() << " matches\n";
02482                         }
02483                 }
02484                 
02485                 }
02486                 }       // if recursive
02487 
02488                 // restore backed up tree since we only want the final set of ancestral
02489                 // breakpoints applied to the descendants
02490                 cout << "Restoring backed up alignment tree...\n";
02491                 cout.flush();
02492                 swap( alignment_tree, aln_tree_backup );
02493 
02494         }       // end while(true)
02495 
02496         if( using_cache_db )
02497         {
02498         // delete the search cache
02499         for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02500                 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02501                         for( size_t mI = 0; mI < search_cache_db(seqI,seqJ).size(); mI++ )
02502                         {
02503                                 if( search_cache_db(seqI,seqJ)[mI].first != NULL )
02504                                         search_cache_db(seqI,seqJ)[mI].first->Free();
02505                                 if( search_cache_db(seqI,seqJ)[mI].second != NULL )
02506                                         search_cache_db(seqI,seqJ)[mI].second->Free();
02507                         }
02508         }
02509 
02510         printMemUsage();
02511 
02512         // aln_tree_backup has the highest scoring alignment_tree
02513         swap( alignment_tree, aln_tree_backup );
02514         cout << "propagating ancestral breakpoints\n";
02515         recursiveApplyAncestralBreakpoints(ancestor);
02516 
02517         printMemUsage();
02518 
02519         // step 8) construct a muscle alignment in each intervening region
02520         if( gapped_alignment )
02521         {
02522                 cout << "performing a gapped alignment\n";
02523                 doGappedAlignment(ancestor, true);
02524         }else
02525                 cerr << "skipping gapped alignment\n";
02526         if( refine )
02527         {
02528                 size_t unrefined = countUnrefined( alignment_tree, ancestor );
02529                 if( unrefined > 5 && ancestor != alignment_tree.root )
02530                 {
02531                         cout << "performing iterative refinement\n";
02532                         doGappedAlignment(ancestor, false);
02533                         markAsRefined( alignment_tree, ancestor );
02534                 }
02535         }
02536         printMemUsage();
02537 
02538 
02539         if( debug_me )   
02540         {
02541                 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )        
02542                 {        
02543 
02544                         static int dbg_count = 0;        
02545                         GappedAlignment gal;     
02546                         extractAlignment(ancestor, aI, gal);     
02547 
02548                         bool check = false;
02549                         for( size_t ii = 0; ii < gal.SeqCount(); ++ii )
02550                         {
02551                                 if( gal.LeftEnd(ii) == 0 )
02552                                         continue;
02553                                 for( size_t jj = 0; jj < gal.SeqCount(); ++jj )
02554                                 {
02555                                         if( gal.LeftEnd(jj) == 0 )
02556                                                 continue;
02557                                         check = check || computeID( gal, ii, jj ) < .5;
02558                                 }
02559                         }
02560                         if( check )
02561                                 cerr << "check iv " << aI << " dbg_count " << dbg_count << endl;
02562                         else
02563                                 continue;
02564 
02565                         const vector< string >& aln_mat = GetAlignment(gal, this->original_ml.seq_table);        
02566                         gnSequence seq;          
02567                         for( size_t seqI = 0; seqI < gal.SeqCount(); ++seqI )    
02568                                 if( gal.LeftEnd(seqI) != NO_MATCH )      
02569                                         seq += aln_mat[seqI];    
02570 
02571                         stringstream dbg_fname;          
02572                         dbg_fname << "prof_dbg_iv_" << aI << ".dbg." << dbg_count++ << ".fas";   
02573                         ofstream debug_file( dbg_fname.str().c_str() );          
02574                         gnFASSource::Write( seq, debug_file, false );    
02575                         debug_file.close();      
02576                 }        
02577         }
02578 
02579 
02580 }
02581 
02582 
02583 void addGuy( uint seqI, AbstractMatch::orientation orient, 
02584                         std::vector< AbstractMatch* >& new_ivs, 
02585                         vector<Interval*>& new_list )
02586 {
02587         Interval tmp_iv;
02588         // set the orientation for any unaligned intervals
02589         if( orient == AbstractMatch::reverse )
02590         {
02591                 for( size_t nI = 0; nI < new_ivs.size(); nI++ )
02592                         if( new_ivs[nI]->LeftEnd(seqI) != NO_MATCH && new_ivs[nI]->Orientation(seqI) != orient)
02593                                 new_ivs[nI]->Invert();
02594         }
02595         // add this guy
02596         Interval* added_iv = tmp_iv.Copy();
02597         added_iv->SetMatches( new_ivs );
02598         new_list.push_back(added_iv);
02599 }
02600 
02601 void mergeUnalignedIntervals( uint seqI, vector< Interval* >& iv_list, vector< Interval* >& new_list )
02602 {
02603         SSC<Interval> ivlcJ(seqI);
02604         sort( iv_list.begin(), iv_list.end(), ivlcJ );
02605 
02606         Interval tmp_iv;
02607         AbstractMatch::orientation orient = AbstractMatch::undefined;
02608         vector< AbstractMatch* > new_ivs;
02609         vector< Interval* > to_delete;
02610         for( size_t ordI = 0; ordI < iv_list.size(); ordI++ )
02611         {
02612                 if( iv_list[ordI]->LeftEnd(seqI) == NO_MATCH )
02613                 {
02614                         new_list.push_back(iv_list[ordI]);
02615                         iv_list[ordI] = NULL;
02616                         continue;
02617                 }
02618 
02619                 if( orient == AbstractMatch::undefined && iv_list[ordI]->Multiplicity() == 2 )
02620                 {
02621                         orient = iv_list[ordI]->Orientation(seqI);
02622                         vector< AbstractMatch* > matches;
02623                         iv_list[ordI]->StealMatches( matches );
02624                         if( orient == AbstractMatch::forward )
02625                                 new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
02626                         else
02627                                 new_ivs.insert( new_ivs.begin(), matches.begin(), matches.end() );
02628 
02629                         // if it's the last one then add
02630                         if( ordI + 1 == iv_list.size() )
02631                                 addGuy( seqI, orient, new_ivs, new_list );
02632                         continue;
02633                 }
02634                 if( orient != AbstractMatch::undefined && iv_list[ordI]->Multiplicity() == 2 )
02635                 {
02636                         // add this guy...
02637                         // set the orientation for any unaligned intervals
02638                         addGuy( seqI, orient, new_ivs, new_list );
02639 
02640                         // prepare a new one
02641                         vector< AbstractMatch* > matches;
02642                         orient = iv_list[ordI]->Orientation(seqI);
02643                         iv_list[ordI]->StealMatches( matches );
02644                         new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
02645                         // if it's the last one then add
02646                         if( ordI + 1 == iv_list.size() )
02647                                 addGuy( seqI, orient, new_ivs, new_list );
02648                         continue;
02649                 }
02650                 if( new_ivs.size() == 0 )
02651                 {
02652                         vector< AbstractMatch* > matches;
02653                         iv_list[ordI]->StealMatches( matches );
02654                         new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
02655                         continue;
02656                 }
02657                 // split this one in half (if its not the last one and there's something to split)...
02658                 Interval* left_iv = iv_list[ordI]->Copy();
02659                 to_delete.push_back( left_iv ); // make sure this gets deleted later
02660                 bool cropped = (ordI + 1 < iv_list.size() && iv_list[ordI]->Length(seqI) > 1);
02661                 if( cropped )
02662                 {
02663                         gnSeqI lendo = left_iv->AlignmentLength() / 2;
02664                         left_iv->CropEnd( left_iv->AlignmentLength() - lendo );
02665                         iv_list[ordI]->CropStart( lendo );
02666                 }
02667                 vector< AbstractMatch* > matches;
02668                 left_iv->StealMatches( matches );
02669                 if( orient == AbstractMatch::forward )
02670                         new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
02671                 else
02672                         new_ivs.insert( new_ivs.begin(), matches.begin(), matches.end() );
02673 
02674                 addGuy( seqI, orient, new_ivs, new_list );
02675                 // prepare for the next
02676                 orient = AbstractMatch::undefined;
02677                 if(cropped)
02678                         ordI--; // if we split a match, make sure we get the rest of this match on the next run through the loop
02679         }
02680 
02681         if( new_ivs.size() > 0 )
02682         {
02683                 // uh-oh. there must not have been anything aligned
02684                 addGuy( seqI, AbstractMatch::forward, new_ivs, new_list );
02685         }
02686 
02687         // free up any left_ivs that were allocated
02688         for( size_t delI = 0; delI < to_delete.size(); delI++ )
02689                 to_delete[delI]->Free();
02690 
02691         // free up ivs left in iv_list
02692         for( size_t ivI = 0; ivI < iv_list.size(); ivI++ )
02693                 if( iv_list[ivI] != NULL )
02694                         iv_list[ivI]->Free();
02695         iv_list.clear();
02696 }
02697 
02698 
02702 void ProgressiveAligner::createAncestralOrdering( vector<Interval*>& interval_list, vector< SuperInterval >& ancestral_sequence )
02703 {
02704         // construct an ancestral SuperSequence
02705         int64 left_end = 1;
02706         ancestral_sequence.resize( interval_list.size() );
02707         for( uint ivI = 0; ivI < interval_list.size(); ++ivI ){
02708                 if(debug_aligner)
02709                         interval_list[ivI]->ValidateMatches();
02710                 vector<AbstractMatch*> matches;
02711                 interval_list[ivI]->StealMatches(matches);
02712                 ancestral_sequence[ivI].reference_iv.SetMatches(matches);
02713                 ancestral_sequence[ivI].SetLeftEnd(left_end);
02714                 ancestral_sequence[ivI].SetLength(ancestral_sequence[ivI].reference_iv.AlignmentLength());
02715                 if(debug_aligner)
02716                         ancestral_sequence[ivI].ValidateSelf();
02717                 left_end += ancestral_sequence[ivI].Length();
02718         }
02719 }
02720 
02721 void markAligned( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t subject_node, node_id_t neighbor )
02722 {
02723         for( uint parentI = 0; parentI < alignment_tree[subject_node].parents.size(); parentI++ )
02724                 if( alignment_tree[subject_node].parents[parentI] == neighbor )
02725                         alignment_tree[subject_node].parents_aligned[parentI] = true;
02726         for( uint childI = 0; childI < alignment_tree[subject_node].children.size(); childI++ )
02727                 if( alignment_tree[subject_node].children[childI] == neighbor )
02728                         alignment_tree[subject_node].children_aligned[childI] = true;
02729 }
02730 
02731 
02732 bool
02733 ProgressiveAligner::validateSuperIntervals(node_id_t node1, node_id_t node2, node_id_t ancestor)
02734 {
02735                 // validate the ancestor
02736         bool borked = false;
02737         vector< SuperInterval >& siv_list = alignment_tree[ancestor].ordering;
02738         gnSeqI n1_len = 0;
02739         gnSeqI n2_len = 0;
02740         gnSeqI my_len = 0;
02741         gnSeqI my_iv_len = 0;
02742         for( size_t sivI = 0; sivI < siv_list.size(); sivI++ )
02743         {
02744                 if( siv_list[sivI].reference_iv.Start(0) != 0 )
02745                         n1_len += siv_list[sivI].reference_iv.Length(0);
02746                 if( siv_list[sivI].reference_iv.Start(1) != 0 )
02747                         n2_len += siv_list[sivI].reference_iv.Length(1);
02748                 my_len += siv_list[sivI].Length();
02749                 my_iv_len += siv_list[sivI].reference_iv.AlignmentLength();
02750                 siv_list[sivI].ValidateSelf();
02751         }
02752         gnSeqI real_n1len = 0;
02753         gnSeqI real_n2len = 0;
02754 
02755         vector< SuperInterval >& siv1_list = alignment_tree[node1].ordering;
02756         for( size_t sivI = 0; sivI < siv1_list.size(); sivI++ )
02757         {
02758                 if( siv1_list[sivI].Length() == 0 )
02759                         borked = true;
02760                 real_n1len += siv1_list[sivI].Length();
02761                 siv1_list[sivI].ValidateSelf();
02762         }
02763 
02764         vector< SuperInterval >& siv2_list = alignment_tree[node2].ordering;
02765         for( size_t sivI = 0; sivI < siv2_list.size(); sivI++ )
02766         {
02767                 if( siv2_list[sivI].Length() == 0 )
02768                         borked = true;
02769                 real_n2len += siv2_list[sivI].Length();
02770                 siv2_list[sivI].ValidateSelf();
02771         }
02772 
02773         if( real_n1len != n1_len || real_n2len != n2_len )
02774                         borked = true;
02775 
02776         // check that each picks up where the last left off
02777         for( size_t sivI = 1; sivI < siv1_list.size(); sivI++ )
02778                 if( siv1_list[sivI].LeftEnd() != siv1_list[sivI-1].LeftEnd() + siv1_list[sivI-1].Length() )
02779                 {
02780                         borked = true;
02781                 }
02782         for( size_t sivI = 1; sivI < siv2_list.size(); sivI++ )
02783                 if( siv2_list[sivI].LeftEnd() != siv2_list[sivI-1].LeftEnd() + siv2_list[sivI-1].Length() )
02784                 {
02785                         borked = true;
02786                 }
02787 
02788         if( my_len != my_iv_len )
02789                 borked = true;
02790 
02791         if( my_len < real_n1len || my_len < real_n2len )
02792                 borked = true;
02793 
02794         if( borked )
02795         {
02796                 breakHere();
02797                 cerr << "child1 has " << siv1_list.size() << " ivs totalling " << real_n1len << " nt\n";
02798                 cerr << "child2 has " << siv2_list.size() << " ivs totalling " << real_n2len << " nt\n";
02799                 cerr << "parent has " << siv_list.size() << " ivs, n1_len: " << n1_len << " n2_len: " << n2_len << endl;
02800         }
02801         return borked;
02802 
02803 }
02804 
02805 bool ProgressiveAligner::validatePairwiseIntervals(node_id_t node1, node_id_t node2, std::vector<Interval*>& pair_iv)
02806 {
02807                 // validate the ancestor
02808         bool borked = false;
02809         gnSeqI n1_len = 0;
02810         gnSeqI n2_len = 0;
02811         for( size_t sivI = 0; sivI < pair_iv.size(); sivI++ )
02812         {
02813                 if( pair_iv[sivI]->Start(0) != 0 )
02814                         n1_len += pair_iv[sivI]->Length(0);
02815                 if( pair_iv[sivI]->Start(1) != 0 )
02816                         n2_len += pair_iv[sivI]->Length(1);
02817 
02818                 vector< bitset_t > aln_mat;
02819                 pair_iv[sivI]->GetAlignment(aln_mat);
02820                 if( aln_mat[0].size() != pair_iv[sivI]->AlignmentLength() )
02821                 {
02822                         cerr << "broked\n";
02823                 }
02824                 pair_iv[sivI]->ValidateMatches();
02825         }
02826         gnSeqI real_n1len = 0;
02827         gnSeqI real_n2len = 0;
02828 
02829         vector< SuperInterval >& siv1_list = alignment_tree[node1].ordering;
02830         for( size_t sivI = 0; sivI < siv1_list.size(); sivI++ )
02831         {
02832                 if( siv1_list[sivI].Length() == 0 )
02833                         borked = true;
02834                 real_n1len += siv1_list[sivI].Length();
02835         }
02836 
02837         vector< SuperInterval >& siv2_list = alignment_tree[node2].ordering;
02838         for( size_t sivI = 0; sivI < siv2_list.size(); sivI++ )
02839         {
02840                 if( siv2_list[sivI].Length() == 0 )
02841                         borked = true;
02842                 real_n2len += siv2_list[sivI].Length();
02843         }
02844 
02845         if( real_n1len != n1_len || real_n2len != n2_len )
02846                         borked = true;
02847 
02848         // check for overlapping intervals
02849         vector< Interval* > tmp_iv_list = pair_iv;
02850         for( uint seqI = 0; seqI < 2; seqI++ )
02851         {
02852                 SSC<Interval> ssc(seqI);
02853                 sort( tmp_iv_list.begin(), tmp_iv_list.end(), ssc );
02854                 for( size_t ivI = 1; ivI < tmp_iv_list.size(); ivI++ )
02855                 {
02856                         if( tmp_iv_list[ivI-1]->LeftEnd(seqI) == NO_MATCH || tmp_iv_list[ivI]->LeftEnd(seqI) == NO_MATCH )
02857                                 continue;
02858                         if( tmp_iv_list[ivI-1]->RightEnd(seqI) >= tmp_iv_list[ivI]->LeftEnd(seqI) )
02859                         {
02860                                 cerr << "overlap:\n";
02861                                 cerr << "tmp_iv_list[ivI-1].RightEnd(seqI): " << tmp_iv_list[ivI-1]->RightEnd(seqI) << endl;
02862                                 cerr << "tmp_iv_list[ivI].LeftEnd(seqI): " << tmp_iv_list[ivI]->LeftEnd(seqI) << endl;
02863                                 breakHere();
02864                         }
02865                 }
02866         }
02867 
02868         if( borked )
02869         {
02870                 cerr << "child1 has " << siv1_list.size() << " ivs totalling " << real_n1len << " nt\n";
02871                 cerr << "child2 has " << siv2_list.size() << " ivs totalling " << real_n2len << " nt\n";
02872                 cerr << "parent has " << pair_iv.size() << " ivs, n1_len: " << n1_len << " n2_len: " << n2_len << endl;
02873                 if( n2_len < real_n2len )
02874                 {
02875                         SSC<Interval> sortie(1);
02876                         sort( pair_iv.begin(), pair_iv.end(), sortie );
02877                         size_t prev_iv = 9999999;
02878                         for( size_t ivI = 0; ivI < pair_iv.size(); ++ivI)
02879                         {
02880                                 if( pair_iv[ivI]->LeftEnd(1) == NO_MATCH )
02881                                         continue;
02882 
02883                                 if( prev_iv != 9999999 )
02884                                         cerr << "diff: " << pair_iv[ivI]->LeftEnd(1) - pair_iv[prev_iv]->RightEnd(1) << endl;
02885                                 cerr << "Interval " << ivI << " LeftEnd(1): " << pair_iv[ivI]->LeftEnd(1) << " RightEnd(1): " << pair_iv[ivI]->RightEnd(1) << std::endl;
02886                                 prev_iv = ivI;
02887                         }
02888                 }else if( n2_len > real_n2len )
02889                 {
02890                         SSC<Interval> sortie(1);
02891                         sort( pair_iv.begin(), pair_iv.end(), sortie );
02892                         for( size_t ivI = 0; ivI < pair_iv.size(); ++ivI)
02893                         {
02894                                 if( pair_iv[ivI]->LeftEnd(1) < real_n2len )
02895                                         continue;
02896                                 cerr << "Interval " << ivI << " LeftEnd(1): " << pair_iv[ivI]->LeftEnd(1) << " RightEnd(1): " << pair_iv[ivI]->RightEnd(1) << std::endl;
02897                         }
02898                 }
02899                 breakHere();
02900         }
02901         return borked;
02902 }
02903 
02904 void ProgressiveAligner::alignNodes( node_id_t node1, node_id_t node2, node_id_t ancestor )
02905 {
02906         cout << "Aligning node " << node1 << " to " << node2 << " via " << ancestor << "!\n";
02907         // if node1 and node2 are not already children of ancestor then make it so...
02908         if( alignment_tree[node1].parents[0] != ancestor || 
02909                 alignment_tree[node2].parents[0] != ancestor )
02910         {
02911                 breakHere();
02912                 cerr << "rotten\n";
02913         }
02914         
02915         alignProfileToProfile(node1, node2, ancestor);
02916 
02917         // mark edges as aligned
02918         markAligned( alignment_tree, node1, node2 );
02919         markAligned( alignment_tree, node2, node1 );
02920         markAligned( alignment_tree, node1, ancestor );
02921         markAligned( alignment_tree, node2, ancestor );
02922         markAligned( alignment_tree, ancestor, node1 );
02923         markAligned( alignment_tree, ancestor, node2 );
02924 }
02925 
02929 void findMidpoint( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t& n1, node_id_t& n2 )
02930 {
02931         // use boost's all pairs shortest path to find the longest path on the tree 
02932         // Then actually traverse the path to determine which edge
02933         // is halfway.
02934         double scaling_factor = 100000;
02935         using namespace boost;
02936         typedef adjacency_list<vecS, vecS, undirectedS, no_property,
02937         property< edge_weight_t, int, property< edge_color_t, default_color_type > > > Graph;
02938         const int V = alignment_tree.size();
02939         const std::size_t E = alignment_tree.size()-1;
02940         typedef std::pair < int, int >Edge;
02941         Edge* edge_array = new Edge[ alignment_tree.size() - 1 ];
02942         int* weights = new int[ alignment_tree.size() - 1 ];
02943         bitset_t child_found( alignment_tree.size(), false );
02944         size_t eI = 0;
02945         for( size_t vI = 0; vI < V; ++vI )
02946         {
02947                 if( alignment_tree[vI].parents.size() != 0 )
02948                 {
02949                         edge_array[eI] = Edge( vI, alignment_tree[vI].parents[0] );
02950                         // for some reason boost insists on using an int for weights.  need to figure that out
02951                         weights[eI] = (int)(scaling_factor * genome::absolut(alignment_tree[vI].distance)) + 1;
02952                         eI++;
02953                 }
02954         }
02955 
02956 #if defined(BOOST_MSVC) && BOOST_MSVC <= 1300
02957         // VC++ can't handle the iterator constructor
02958         Graph g(V);
02959         for (std::size_t j = 0; j < E; ++j)
02960         add_edge(edge_array[j].first, edge_array[j].second, g);
02961 #else
02962         Graph g(edge_array, edge_array + E, V);
02963 #endif
02964 
02965         property_map < Graph, edge_weight_t >::type w = get(edge_weight, g);
02966         int *wp = weights;
02967 
02968         graph_traits < Graph >::edge_iterator e, e_end;
02969         for (boost::tie(e, e_end) = edges(g); e != e_end; ++e)
02970                 w[*e] = *wp++;
02971 
02972         boost::multi_array<int,2> D( boost::extents[V][V] );
02973         bool success = johnson_all_pairs_shortest_paths(g, D);
02974         if( !success )
02975         {
02976                 cerr << "failed, is this really a tree?\n";
02977                 return;
02978         }
02979 
02980         // find the most distant pair of nodes
02981         int max_dist = (std::numeric_limits<int>::min)();
02982         for (int i = 0; i < V; ++i) {
02983                 for (int j = 0; j < V; ++j) {
02984                         if( D[i][j] > max_dist )
02985                         {
02986                                 max_dist = D[i][j];
02987                                 n1 = i;
02988                                 n2 = j;
02989                         }
02990                 }
02991         }
02992 
02993         typedef graph_traits<Graph>::vertex_descriptor vertex_t;
02994         std::vector < vertex_t > pred(num_vertices(g));
02995         std::vector < int > dist(num_vertices(g));
02996         pred[n1] = n1;
02997 
02998         undirected_dfs(g,
02999                 root_vertex( vertex( n1, g ) ).
03000                 visitor( make_dfs_visitor( make_pair(
03001                         record_predecessors(&pred[0], on_tree_edge()),
03002                         record_distances(&dist[0], on_tree_edge())
03003                 ))).
03004                 edge_color_map(get(edge_color, g))
03005                 );
03006 
03007         int cur_node = n2;
03008         int prev_node = n2;
03009         max_dist /= 2;
03010         while( cur_node != n1 && max_dist > 0 )
03011         {
03012                 if( alignment_tree[cur_node].parents.size() > 0 && 
03013                         alignment_tree[cur_node].parents[0] == pred[cur_node] )
03014                 {
03015                         max_dist -= (int)(scaling_factor * alignment_tree[cur_node].distance) + 1;
03016                         prev_node = cur_node;
03017                         cur_node = pred[cur_node];
03018                 }else
03019                 {
03020                         prev_node = cur_node;
03021                         cur_node = pred[cur_node];
03022                         max_dist -= (int)(scaling_factor * alignment_tree[cur_node].distance) + 1;
03023                 }
03024         }
03025         n1 = cur_node;
03026         n2 = prev_node;
03027 
03028         delete[] edge_array;
03029         delete[] weights;
03030 }
03031 
03032 void extendRootBranches( PhyloTree< AlignmentTreeNode >& alignment_tree )
03033 {
03034         // find the max branch length and set the root branch lengths to twice that
03035         // swap children while we're at it
03036         node_id_t ancestor = alignment_tree.root;
03037         double max_blen = -(std::numeric_limits<double>::max)();
03038         for( size_t nI = 0; nI < alignment_tree.size(); ++nI )
03039         {
03040                 if( alignment_tree[nI].distance > max_blen )
03041                         max_blen = alignment_tree[nI].distance;
03042                 if( alignment_tree[nI].children.size() > 0 &&
03043                         alignment_tree[nI].children[0] > alignment_tree[nI].children[1] )
03044                 {
03045                         std::swap( alignment_tree[nI].children[0], alignment_tree[nI].children[1] );
03046                 }
03047         }
03048         for( size_t cI = 0; cI < alignment_tree[ancestor].children.size(); ++cI )
03049                 alignment_tree[alignment_tree[ancestor].children[cI]].distance = 2.0 * max_blen;
03050 }
03051 
03052 void chooseNextAlignmentPair( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t& node1, node_id_t& node2, node_id_t& ancestor )
03053 {
03054 
03055         // find the nearest alignable neighbor
03056         node1 = 0;
03057         node2 = 0;
03058         ancestor = 0;
03059         double nearest_distance = (numeric_limits<double>::max)();
03060         for( node_id_t nodeI = 0; nodeI < alignment_tree.size(); nodeI++ )
03061         {
03062                 AlignmentTreeNode& cur_node = alignment_tree[ nodeI ];
03063 
03064                 // skip this node if it's already been completely aligned
03065                 // or is an extant sequence
03066                 boolean completely_aligned = true;
03067                 for( uint alignedI = 0; alignedI < cur_node.children_aligned.size(); alignedI++ )
03068                         completely_aligned = completely_aligned && cur_node.children_aligned[alignedI];
03069                 for( uint alignedI = 0; alignedI < cur_node.parents_aligned.size(); alignedI++ )
03070                         completely_aligned = completely_aligned && cur_node.parents_aligned[alignedI];
03071                 if( cur_node.sequence != NULL || completely_aligned )
03072                         continue;
03073                 
03074 
03075                 vector< node_id_t > neighbor_id;
03076                 vector< boolean > alignable;
03077                 vector< double > distance;
03078                 
03079                 for( uint parentI = 0; parentI < cur_node.parents.size(); parentI++ )
03080                 {
03081                         neighbor_id.push_back( cur_node.parents[parentI] );
03082                         vector< node_id_t >::iterator cur_neighbor = neighbor_id.end() - 1;
03083                         if( *cur_neighbor == alignment_tree.root )
03084                         {
03085                                 // need special handling for the root since the alignment
03086                                 // tree is supposed to be unrooted
03087                                 // add all of root's children except this one
03088                         }
03089                         distance.push_back( cur_node.distance );
03090                         alignable.push_back( !cur_node.parents_aligned[parentI] && (alignment_tree[*cur_neighbor].ordering.size() != 0 || alignment_tree[*cur_neighbor].sequence != NULL) );
03091                 }
03092 
03093                 for( uint childI = 0; childI < cur_node.children.size(); childI++ )
03094                 {
03095                         neighbor_id.push_back( cur_node.children[childI] );
03096                         vector< node_id_t >::iterator cur_neighbor = neighbor_id.end() - 1;
03097                         distance.push_back( alignment_tree[*cur_neighbor].distance );
03098                         alignable.push_back( !cur_node.children_aligned[childI] && (alignment_tree[*cur_neighbor].ordering.size() != 0 || alignment_tree[*cur_neighbor].sequence != NULL) );
03099                 }
03100 
03101                 if( cur_node.ordering.size() != 0 )
03102                 {
03103                         // this one already has at least two sequences aligned, if another
03104                         // is alignable then check its distance
03105                         for( int i = 0; i < neighbor_id.size(); i++ ){
03106                                 if( !alignable[i] )
03107                                         continue;
03108                                 if( distance[i] < nearest_distance )
03109                                 {
03110                                         nearest_distance = distance[i];
03111                                         node1 = nodeI;
03112                                         node2 = neighbor_id[i];
03113                                         ancestor = nodeI;
03114                                 }
03115                         }
03116                 }else{
03117                         // find the nearest alignable pair
03118                         for( int i = 0; i < neighbor_id.size(); i++ )
03119                         {
03120                                 if( !alignable[i] )
03121                                         continue;
03122                                 for( int j = i+1; j < neighbor_id.size(); j++ )
03123                                 {
03124                                         if( !alignable[j] )
03125                                                 continue;
03126                                         if( distance[i] + distance[j] < nearest_distance )
03127                                         {
03128                                                 nearest_distance = distance[i] + distance[j];
03129                                                 node1 = neighbor_id[i];
03130                                                 node2 = neighbor_id[j];
03131                                                 ancestor = nodeI;
03132                                         }
03133                                 }
03134                         }
03135                 }
03136         }
03137 }
03138 
03140 void ProgressiveAligner::setPairwiseMatches( MatchList& pair_ml )
03141 {
03142         original_ml = pair_ml;
03143         pair_ml.clear();        // ProgressiveAligner owns the matches now...
03144 }
03145 
03146 
03147 node_id_t createAlignmentTreeRoot( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t node1, node_id_t node2 )
03148 {
03149                 // create a new node and link it inline between node1 and node2
03150                 AlignmentTreeNode atn;
03151                 alignment_tree.push_back( atn );
03152                 AlignmentTreeNode& old_root = alignment_tree[alignment_tree.root];
03153                 AlignmentTreeNode& new_root = alignment_tree.back();
03154 
03155                 if( find( alignment_tree[node1].children.begin(), alignment_tree[node1].children.end(), node2 ) !=
03156                         alignment_tree[node1].children.end() )
03157                 {
03158                         new_root.children.push_back(node2);
03159                         new_root.parents.push_back(node1);
03160                         alignment_tree[node2].parents.push_back(alignment_tree.size()-1);
03161                         alignment_tree[node1].children.push_back(alignment_tree.size()-1);
03162                 }else{
03163                         new_root.parents.push_back(node2);
03164                         new_root.children.push_back(node1);
03165                         alignment_tree[node2].children.push_back(alignment_tree.size()-1);
03166                         alignment_tree[node1].parents.push_back(alignment_tree.size()-1);
03167                 }
03168 
03169                 // completely unlink node1 and node2 from each other
03170                 findAndErase( alignment_tree[node1].children, node2 );
03171                 findAndErase( alignment_tree[node2].children, node1 );
03172                 findAndErase( alignment_tree[node1].parents, node2 );
03173                 findAndErase( alignment_tree[node2].parents, node1 );
03174 
03175 
03176                 // re-root the tree on the new node
03177                 rerootTree( alignment_tree, alignment_tree.size()-1 );
03178 
03179                 new_root.children_aligned = vector< boolean >( new_root.children.size(), false );
03180                 old_root.children_aligned = vector< boolean >( old_root.children.size(), false );
03181                 old_root.parents_aligned = vector< boolean >( old_root.parents.size(), false );
03182                 new_root.sequence = NULL;
03183 
03184         return alignment_tree.root;
03185 }
03186 
03187 void ProgressiveAligner::extractAlignment( node_id_t ancestor, size_t super_iv, GappedAlignment& gal )
03188 {
03189         CompactGappedAlignment<> cga;
03190         extractAlignment( ancestor, super_iv, cga );
03191         vector< string > aln;
03192         GetAlignment( cga, this->original_ml.seq_table, aln );
03193         gal = GappedAlignment(cga.SeqCount(), 0);
03194         for( size_t seqI = 0; seqI < cga.SeqCount(); ++seqI )
03195         {
03196                 gal.SetStart(seqI, cga.Start(seqI));
03197                 if( cga.Orientation(seqI) != AbstractMatch::undefined )
03198                         gal.SetLength(cga.Length(seqI), seqI);
03199         }
03200         gal.SetAlignment(aln);
03201 
03202 }
03203 
03204 void ProgressiveAligner::extractAlignment( node_id_t ancestor, size_t super_iv, CompactGappedAlignment<>& cga )
03205 {
03206         // determine the leaf node intervals below this super_iv
03207         vector< pair< node_id_t, size_t > > node_siv_list;
03208         stack< pair<node_id_t,size_t> > node_stack;
03209         node_stack.push(make_pair(ancestor,super_iv));
03210         while( node_stack.size() > 0 )
03211         {
03212                 pair<node_id_t,size_t> cur = node_stack.top();
03213                 node_id_t cur_node = cur.first;
03214                 node_stack.pop();
03215                 if( alignment_tree[cur_node].children.size() == 0 )
03216                         node_siv_list.push_back( cur );
03217                 for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
03218                 {
03219                         if( alignment_tree[cur_node].ordering[cur.second].reference_iv.LeftEnd(childI) == NO_MATCH )
03220                                 continue;
03221                         size_t child_siv = childI == 0 ? alignment_tree[cur_node].ordering[cur.second].c1_siv : 
03222                                 alignment_tree[cur_node].ordering[cur.second].c2_siv;
03223                         node_stack.push(make_pair(alignment_tree[cur_node].children[childI], child_siv) );
03224                         node_id_t n = alignment_tree[cur_node].children[childI];
03225                         if( alignment_tree[cur_node].ordering[cur.second].reference_iv.Length(childI) != alignment_tree[n].ordering[child_siv].Length() )
03226                         {
03227                                 breakHere();
03228                                 cerr << "alignment_tree[cur_node].ordering[cur.second].reference_iv.Length(childI): " << alignment_tree[cur_node].ordering[cur.second].reference_iv.Length(childI) << endl;
03229                                 cerr << "rotten in the state of denmark...\n";
03230                         }
03231                 }
03232         }
03233 
03234         // armed with the list of pairs, extract each one...
03235 
03236         // for each interval at the root write out the alignment
03237         SuperInterval& a_iv = alignment_tree[ancestor].ordering[super_iv];
03238         cga = CompactGappedAlignment<>(seq_count, a_iv.Length());
03239         vector< bitset_t > aln_mats( seq_count );
03240 
03241         // use translateCoordinates to map out each sequence's original coordinates
03242         // to the alignment coordinates
03243         for( size_t pairI = 0; pairI < node_siv_list.size(); pairI++ )
03244         {
03245                 node_id_t nodeI = node_siv_list[pairI].first;
03246                 size_t seq_siv = node_siv_list[pairI].second;
03247                 
03248                 // translate seq_siv into ancestor alignment coordinates?  
03249                 // we can abuse translateCoordinates and the Match data structure :
03250                 //   - add a single "match" covering the entire sequence
03251                 //   - translate it up to alignment root coordinates
03252                 uint seqI = node_sequence_map[nodeI];
03253                 Match mm(2);
03254                 mm.SetStart(0, alignment_tree[nodeI].ordering[seq_siv].LeftEnd());
03255                 mm.SetStart(1, alignment_tree[nodeI].ordering[seq_siv].LeftEnd());
03256                 mm.SetLength( alignment_tree[nodeI].ordering[seq_siv].Length() );
03257                 
03258                 vector< AbstractMatch* > aml( 1, mm.Copy() );
03259                 translateGappedCoordinates( aml, 0, nodeI, ancestor );
03260 
03261                 if( aml.size() > 1 )
03262                 {
03263                         cerr << "huh?";
03264                         genome::breakHere();
03265                         SingleStartComparator<AbstractMatch> ssc( 0 );
03266                         sort( aml.begin(), aml.end(), ssc );    // huh?
03267                 }
03268                 CompactGappedAlignment<>* trans_cga = dynamic_cast<CompactGappedAlignment<>*>(aml[0]);
03269                 if( trans_cga == NULL )
03270                 {
03271                         CompactGappedAlignment<> tmp_cga;
03272                         trans_cga = tmp_cga.Copy();
03273                         *trans_cga = CompactGappedAlignment<>(*aml[0]);
03274                 }
03275 
03276                 if( trans_cga->LeftEnd(0) + trans_cga->Length(0) > a_iv.LeftEnd() + a_iv.Length() )
03277                 {
03278                         cerr << "trans_cga->Start(0): " << trans_cga->Start(0) << " trans_cga->Length(0): " << trans_cga->Length(0) << endl;
03279                         cerr << "a_iv.LeftEnd(): " << a_iv.LeftEnd() << " a_iv.Length(): " << a_iv.Length() << endl;
03280                         breakHere();
03281                 }
03282                 bool parity = trans_cga->Orientation(0) == trans_cga->Orientation(1);
03283                 cga.SetLeftEnd(seqI, trans_cga->LeftEnd(1));
03284                 AbstractMatch::orientation o = parity ? AbstractMatch::forward : AbstractMatch::reverse;
03285                 cga.SetOrientation(seqI, o);
03286                 const vector< bitset_t >& tmp = trans_cga->GetAlignment();
03287                 aln_mats[seqI] = tmp[1];
03288 
03289                 size_t offset = trans_cga->LeftEnd(0) - a_iv.LeftEnd();
03290                 if( aln_mats[seqI].size() < a_iv.Length() )
03291                 {
03292                         // need to resize and shift appropriately
03293                         aln_mats[seqI].resize( a_iv.Length() );
03294                         aln_mats[seqI] <<= offset;      // this is backwards in boost::dynamic_bitset for some reason...
03295                 }
03296                 if( trans_cga->LeftEnd(0) < a_iv.LeftEnd() )
03297                 {
03298                         cerr << "trans_cga->LeftEnd(0): " << trans_cga->LeftEnd(0) << endl;
03299                         cerr << "a_iv.LeftEnd(): " << a_iv.LeftEnd() << endl;
03300                         breakHere();
03301                 }
03302 
03303                 // validate match lengths
03304                 if( trans_cga->Length(1) != alignment_tree[nodeI].ordering[seq_siv].Length() )
03305                 {
03306                         cerr << "b0rked\n";
03307                         breakHere();
03308                 }
03309                 // set the length and alignment appropriately
03310                 cga.SetLength(trans_cga->Length(1), seqI);
03311 
03312                 // free storage used by trans_cga
03313                 trans_cga->Free();
03314         }
03315         for( uint seqI = 0; seqI < aln_mats.size(); seqI++ )
03316                 if( aln_mats[seqI].size() == 0 )
03317                         aln_mats[seqI].resize( a_iv.Length() );
03318         cga.SetAlignment(aln_mats);
03319 }
03320 
03321 unsigned getDefaultBreakpointMax( const std::vector< genome::gnSequence* >& seq_table )
03322 {
03323         double avg_len = 0;
03324         for( size_t seqI = 0; seqI < seq_table.size(); ++seqI )
03325                 avg_len += seq_table[seqI]->length();
03326         avg_len /= (double)(seq_table.size());
03327         // heavily rearranged, recently diverged genomes like yersinia have up to 15 rearrangements per megabase of sequence
03328         avg_len /= 1000000.0;   // convert to number of megabases
03329         avg_len *= 15.0;        // "lots" of rearrangement
03330         return (unsigned)avg_len;
03331 }
03332 
03333 // get a pairwise bp distance
03334 void ProgressiveAligner::CreatePairwiseBPDistance( boost::multi_array<double, 2>& bp_distmat )
03335 {
03336         uint seq_count = original_ml.seq_table.size();
03337         bp_distmat.resize(boost::extents[seq_count][seq_count]);
03338         for( size_t i = 0; i < seq_count; ++i )
03339                 for( size_t j = 0; j < seq_count; ++j )
03340                         bp_distmat[i][j] = 1;
03341 
03342 #ifdef LCB_WEIGHT_LOSS_PLOT
03343         stringstream pair_bp_ofname;
03344         pair_bp_ofname << "pair_bp_log.txt";
03345         ofstream pair_bp_out( pair_bp_ofname.str().c_str() );
03346 #endif
03347 
03348         vector< pair<uint, uint> > seq_pairs( (seq_count * (seq_count-1))/2 );
03349         int ii = 0;
03350         for( uint seqI = 0; seqI < seq_count; seqI++ )
03351                 for( uint seqJ = seqI + 1; seqJ < seq_count; seqJ++ )
03352                         seq_pairs[ii++] = make_pair(seqI,seqJ);
03353 
03354 #pragma omp parallel for
03355         for(int i = 0; i < seq_pairs.size(); i++)
03356         {
03357                 uint seqI = seq_pairs[i].first;
03358                 uint seqJ = seq_pairs[i].second;
03359                 vector<uint>::iterator n1 = find( node_sequence_map.begin(), node_sequence_map.end(), seqI );
03360                 vector<uint>::iterator n2 = find( node_sequence_map.begin(), node_sequence_map.end(), seqJ );
03361                 vector<node_id_t> n1_seqs( 1, n1-node_sequence_map.begin() );
03362                 vector<node_id_t> n2_seqs( 1, n2-node_sequence_map.begin() );
03363                 Matrix<MatchList> mml;
03364                 getPairwiseMatches(n1_seqs, n2_seqs, mml);
03365                 MatchList& ml = mml(0,0);
03366 
03367                 // eliminate overlaps as they correspond to inconsistently or
03368                 // multiply aligned regions
03369                 EliminateOverlaps_v2( ml, true );
03370                 ml.MultiplicityFilter(2);
03371 
03372                 // do greedy b.p. elimination on the matches
03373                 vector< MatchList > LCB_list;
03374                 vector< LCB > adjacencies;
03375                 vector< gnSeqI > breakpoints;
03376                 IdentifyBreakpoints( ml, breakpoints );
03377                 ComputeLCBs_v2( ml, breakpoints, LCB_list );
03378                 vector< double > lcb_scores( LCB_list.size() );
03379                 for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
03380                         lcb_scores[lcbI] = GetPairwiseAnchorScore( LCB_list[lcbI], ml.seq_table, this->subst_scoring, sol_list[seqI], sol_list[seqJ] );
03381 
03382                 computeLCBAdjacencies_v3( LCB_list, lcb_scores, adjacencies );
03383 
03384                 // want to discard all low-weight LCBs
03385                 // to arrive at a set of reliable LCBs
03386                 double cons_id = 1 - this->conservation_distance[seqI][seqJ];
03387                 double scaled_score = max( bp_dist_estimate_score * cons_id * cons_id * cons_id * cons_id, min_breakpoint_penalty);
03388                 cout << "Using scaled bp penalty: " << scaled_score << endl;
03389                 GreedyRemovalScorer wbs( adjacencies, scaled_score );
03390 #ifdef LCB_WEIGHT_LOSS_PLOT
03391                 cur_min_coverage = greedyBreakpointElimination_v4( adjacencies, lcb_scores, wbs, &pair_bp_out, seqI, seqJ );
03392                 pair_bp_out.flush();
03393 #else
03394                 cur_min_coverage = greedyBreakpointElimination_v4( adjacencies, lcb_scores, wbs, NULL );
03395 #endif
03396                 MatchList deleted_matches;
03397                 filterMatches_v2( adjacencies, LCB_list, lcb_scores, deleted_matches );
03398                 cout << "Pair (" << seqI << "," << seqJ << ") has " << LCB_list.size() << " well-supported breakpoints\n";
03399                 
03400                 // now set the distance entry
03401                 bp_distmat[seqI][seqJ] = LCB_list.size();
03402                 bp_distmat[seqJ][seqI] = LCB_list.size();
03403 
03404                 // free the matches
03405                 for( size_t dI = 0; dI < ml.size(); dI++ )
03406                         ml[dI]->Free();
03407         }
03408         // normalize to [0,1]
03409         double bp_max = 0;
03410         for( uint i = 0; i < bp_distmat.shape()[0]; ++i )
03411                 for( uint j = 0; j < bp_distmat.shape()[1]; ++j )
03412                 {
03413                         if( bp_distmat[i][j] > bp_max )
03414                                 bp_max = bp_distmat[i][j];
03415                 }
03416 
03417         double default_max = getDefaultBreakpointMax(original_ml.seq_table);
03418         bp_max = bp_max > default_max ? bp_max : default_max;
03419 
03420         for( uint i = 0; i < bp_distmat.shape()[0]; ++i )
03421                 for( uint j = 0; j < bp_distmat.shape()[1]; ++j )
03422                 {
03423                         if( i != j )
03424                                 bp_distmat[i][j] /= bp_max;
03425                         bp_distmat[i][j] *= bp_dist_scale;
03426                 }
03427 }
03428 
03429 template< typename MatchListType >
03430 void makeAlignmentTree( PhyloTree< AlignmentTreeNode >& alignment_tree, MatchListType& mlist, vector< uint >& node_sequence_map )
03431 {
03432         // initialize all nodes to unaligned
03433         for( node_id_t nodeI = 0; nodeI < alignment_tree.size(); nodeI++ )
03434         {
03435                 alignment_tree[nodeI].children_aligned = vector< boolean >( alignment_tree[nodeI].children.size(), false );
03436                 alignment_tree[nodeI].parents_aligned = vector< boolean >( alignment_tree[nodeI].parents.size(), false );
03437                 alignment_tree[nodeI].sequence = NULL;
03438                 alignment_tree[nodeI].refined = false;
03439         }
03440 
03441         // set the sequence appropriately for extant sequences
03442         node_sequence_map = vector< uint >( alignment_tree.size(), -1 );
03443         for( uint seqI = 0; seqI < mlist.seq_table.size(); seqI++ )
03444         {
03445                 stringstream seq_name;
03446                 seq_name << "seq" << seqI + 1;
03447                 node_id_t nodeI = 0;
03448                 for( ; nodeI < alignment_tree.size(); nodeI++ )
03449                 {
03450                         if( seq_name.str() == alignment_tree[nodeI].name )
03451                         {
03452                                 alignment_tree[nodeI].sequence = mlist.seq_table[seqI];
03453                                 Match mm(1);
03454                                 Match* m = mm.Copy();
03455                                 m->SetStart(0,1);
03456                                 m->SetLength(alignment_tree[nodeI].sequence->length(),0);
03457                                 vector<AbstractMatch*> tmp(1,m);
03458                                 Interval iv( tmp.begin(), tmp.end() );
03459                                 m->Free();
03460                                 SuperInterval si( iv );
03461                                 si.SetLeftEnd(1);
03462                                 si.SetLength(alignment_tree[nodeI].sequence->length());
03463                                 alignment_tree[nodeI].ordering.push_back( si );
03464                                 node_sequence_map[nodeI] = seqI;
03465                                 break;
03466                         }
03467                 }
03468                 if( nodeI == alignment_tree.size() )
03469                         throw "Phylogenetic tree names unrecognized.  Should follow seqN naming format\n";
03470         }
03471 }
03472 
03473 void DistanceMatrix( IntervalList& iv_list, NumericMatrix<double>& distmat )
03474 {
03475         IdentityMatrix( iv_list, distmat );
03476         TransformDistanceIdentity(distmat);
03477 }
03478 
03479 /*
03480 void makeSuperIntervals( IntervalList& iv_list, PhyloTree< TreeNode >& alignment_tree, vector< uint >& node_sequence_map )
03481 {
03482         std::stack< node_id_t > node_stack;
03483         node_stack.push( alignment_tree.root );
03484         bitset_t visited( alignment_tree.size(), false );
03485         while( node_stack.size() > 0 )
03486         {
03487                 node_id_t cur_node = node_stack.top();
03488                 // visit post-order
03489                 for( size_t cI = 0; cI < alignment_tree[cur_node].children.size(); ++cI )
03490                 {
03491                         if( !visited[alignment_tree[cur_node].children[cI]] )
03492                                 node_stack.push(alignment_tree[cur_node].children[cI]);
03493                 }
03494                 if( node_stack.top() != cur_node )
03495                         continue;
03496                 node_stack.pop();
03497                 if( alignment_tree[cur_node].children.size() == 0 )
03498                         continue;       // only process internal nodes
03499 
03500                 // process this node
03501                 // construct pairwise LCBs
03502 
03503                 uint seqI = node_sequence_map[alignment_tree[cur_node].children[0]];
03504                 uint seqJ = node_sequence_map[alignment_tree[cur_node].children[0]];
03505                 vector< uint > projection( 2 );
03506                 projection[0] = seqI;
03507                 projection[1] = seqJ;
03508 
03509                 vector< vector< MatchProjectionAdapter* > > LCB_list;
03510                 vector< LCB > projected_adjs;
03511                 projectIntervalList( iv_list, projection, LCB_list, projected_adjs );
03512 
03513                 // create a superinterval for each adj 
03514 //              alignment_tree[cur_node].ordering.resize(adjs.size());
03515 //              for( size_t adjI = 0; adjI < adjs.size(); ++adjI )
03516 //              {
03517 //                      SuperInterval& siv = alignment_tree[cur_node].ordering[adjI];
03518 //                      Match mleft(2);
03519 //                      mleft.SetStart(0,adjI);
03520 //                      mleft.SetStart(1,adjI);
03521 //                      mleft.SetLength(1);
03522 //                      siv.SetLeftEnd( adjI );
03523 //                      siv.SetLength(1);
03524 //              }
03525 
03526         }
03527 }
03528 */
03529 
03530 void ProgressiveAligner::alignPP(IntervalList& prof1, IntervalList& prof2, IntervalList& interval_list )
03531 {
03532         if( debug_aligner )
03533         {
03534                 debug_interval = true;
03535                 debug_cga = true;
03536         }
03537 
03538         seq_count = prof1.seq_table.size() + prof2.seq_table.size();
03539 
03540         if( this->breakpoint_penalty == -1 )
03541                 this->breakpoint_penalty = getDefaultBreakpointPenalty( original_ml.seq_table );
03542 
03543         if( this->bp_dist_estimate_score == -1 )
03544                 this->bp_dist_estimate_score = getDefaultBpDistEstimateMinScore( original_ml.seq_table );
03545         cout << "using default bp penalty: " << breakpoint_penalty << endl;
03546         cout << "using default bp estimate min score: " << bp_dist_estimate_score << endl;
03547 
03548         if( this->collinear_genomes )
03549                 this->breakpoint_penalty = -1;
03550 
03551         if( collinear_genomes )
03552                 cout << "\nAssuming collinear genomes...\n";
03553                 
03554         EliminateOverlaps_v2( original_ml );
03555         // use existing pairwise matches
03556         MatchList mlist;
03557         mlist.clear();
03558         mlist = original_ml;
03559         cout << "Starting with " << mlist.size() << " multi-matches\n";
03560 
03561 //
03562 // Step 1) Compute guide trees for each profile and join them
03563 //
03564         NumericMatrix< double > distance1;
03565         DistanceMatrix( prof1, distance1 );
03566         NumericMatrix< double > distance2;
03567         DistanceMatrix( prof2, distance2 );
03568 
03569         // Make a phylogenetic tree
03570         // use the identity matrix method and convert to a distance matrix
03571         MuscleInterface& ci = MuscleInterface::getMuscleInterface();    
03572         string guide_tree_fname1 = CreateTempFileName("guide_tree");
03573         registerFileToDelete( guide_tree_fname1 );
03574         ci.CreateTree( distance1, guide_tree_fname1 );
03575         string guide_tree_fname2 = CreateTempFileName("guide_tree");
03576         registerFileToDelete( guide_tree_fname2 );
03577         ci.CreateTree( distance2, guide_tree_fname2 );
03578 
03579         // read the trees
03580         ifstream tree_file1( guide_tree_fname1.c_str() );
03581         if( !tree_file1.is_open() )
03582                 throw "Error opening guide tree file";
03583         PhyloTree< AlignmentTreeNode > tree1;
03584         tree1.readTree( tree_file1 );
03585         tree_file1.close();
03586         ifstream tree_file2( guide_tree_fname2.c_str() );
03587         if( !tree_file2.is_open() )
03588                 throw "Error opening guide tree file";
03589         PhyloTree< AlignmentTreeNode > tree2;
03590         tree2.readTree( tree_file2 );
03591         tree_file2.close();
03592 
03593 
03594         // compute pairwise distances among all nodes
03595         NumericMatrix< double > distance;
03596         DistanceMatrix( mlist, distance );
03597         conservation_distance.resize(boost::extents[seq_count][seq_count]);
03598         for( uint seqI = 0; seqI < seq_count; ++seqI )
03599                 for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03600                         if( seqJ > seqI )
03601                                 conservation_distance[seqI][seqJ] = distance(seqI,seqJ);
03602                         else
03603                                 conservation_distance[seqI][seqJ] = distance(seqJ,seqI);
03604 
03605 
03606         if( !collinear_genomes )
03607         {
03608                 cout << "Calculating pairwise breakpoint distances\n";
03609                 CreatePairwiseBPDistance(bp_distance);
03610                 cout << "bp distance matrix:\n";
03611                 print2d_matrix(bp_distance, cout);
03612                 cout << endl;
03613         }
03614 
03615         // rescale the conservation distance
03616         double conservation_range = 1;
03617         double bp_range = 1;
03618         for( uint seqI = 0; seqI < seq_count; ++seqI )
03619                 for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03620                         conservation_distance[seqI][seqJ] = distance(seqI,seqJ) / conservation_range;
03621 
03622         if( !(collinear_genomes && seq_count > 20 ) )
03623         {
03624                 cout << "genome content distance matrix:\n";
03625                 print2d_matrix(conservation_distance, cout);
03626                 cout << endl;
03627         }
03628 
03629 //
03630 // construct the alignment tree by joining trees from each profile
03631 //
03632         vector< uint > nsmap1;
03633         vector< uint > nsmap2;
03634         makeAlignmentTree( tree1, prof1, nsmap1 );
03635 //      prepareAlignmentTree(tree1);
03636         makeAlignmentTree( tree2, prof2, nsmap2 );
03637 //      prepareAlignmentTree(tree2);
03638 
03639         alignment_tree.resize( tree1.size() + tree2.size() + 1 );
03640         // set the sequence appropriately for extant sequences
03641         node_sequence_map = vector< uint >( alignment_tree.size(), -1 );
03642 
03643         // initialize all nodes to unaligned
03644         for( node_id_t nodeI = 0; nodeI < alignment_tree.size()-1; nodeI++ )
03645         {
03646                 if( nodeI < tree1.size() )
03647                 {
03648                         alignment_tree[nodeI].sequence = tree1[nodeI].sequence;
03649                         alignment_tree[nodeI].children = tree1[nodeI].children;
03650                         alignment_tree[nodeI].parents = tree1[nodeI].parents;
03651                         alignment_tree[nodeI].ordering = tree1[nodeI].ordering;
03652                         alignment_tree[nodeI].distance = tree1[nodeI].distance;
03653                         alignment_tree[nodeI].name = tree1[nodeI].name;
03654                         node_sequence_map[nodeI] = nsmap1[nodeI];
03655                 }else{
03656                         alignment_tree[nodeI].sequence = tree2[nodeI-tree1.size()].sequence;
03657                         alignment_tree[nodeI].children = tree2[nodeI-tree1.size()].children;
03658                         alignment_tree[nodeI].parents = tree2[nodeI-tree1.size()].parents;
03659                         alignment_tree[nodeI].ordering = tree2[nodeI-tree1.size()].ordering;
03660                         alignment_tree[nodeI].distance = tree2[nodeI-tree1.size()].distance;
03661                         alignment_tree[nodeI].name = tree2[nodeI-tree1.size()].name;
03662                         for( size_t cI = 0; cI < alignment_tree[nodeI].children.size(); cI++ )
03663                                 alignment_tree[nodeI].children[cI] += tree1.size();
03664                         for( size_t pI = 0; pI < alignment_tree[nodeI].parents.size(); pI++ )
03665                                 alignment_tree[nodeI].parents[pI] += tree1.size();
03666                         node_sequence_map[nodeI] = nsmap2[nodeI-tree1.size()];
03667                         if( node_sequence_map[nodeI] != (std::numeric_limits<uint>::max)() )
03668                                 node_sequence_map[nodeI] += prof1.seq_table.size();
03669                 }
03670 
03671                 alignment_tree[nodeI].children_aligned = vector< boolean >( alignment_tree[nodeI].children.size(), true );
03672                 alignment_tree[nodeI].parents_aligned = vector< boolean >( alignment_tree[nodeI].parents.size(), true );
03673                 alignment_tree[nodeI].refined = true;
03674         }
03675 
03676         alignment_tree.back().children.push_back( tree1.size()-1 );
03677         alignment_tree.back().children.push_back( alignment_tree.size()-2 );
03678         alignment_tree.back().distance = 100;
03679         alignment_tree.back().children_aligned = vector< boolean >( alignment_tree.back().children.size(), true );
03680         alignment_tree.back().parents_aligned = vector< boolean >( alignment_tree.back().parents.size(), true );
03681         alignment_tree.back().refined = false;
03682 
03683 
03684         getAlignment( interval_list );
03685 
03686 }
03687 
03688 void ProgressiveAligner::getAlignment( IntervalList& interval_list )
03689 {
03690         cout << "Aligning...\n";
03691         // pick each pair of sequences and align until none are left
03692         while(true)
03693         {
03694                 node_id_t node1;
03695                 node_id_t node2;
03696                 node_id_t ancestor;
03697                 chooseNextAlignmentPair( alignment_tree, node1, node2, ancestor );
03698                 if( node1 == node2 )
03699                         break;  // all pairs have been aligned
03700 
03701                 // this is the last alignable pair in the unrooted tree
03702                 // create a root from which the complete alignment can be extracted
03703                 alignNodes( node1, node2, ancestor );
03704                 if( ancestor == alignment_tree.root )
03705                         break;  // all done
03706         }
03707 
03708         if( refine )
03709         {
03710                 // perform iterative refinement
03711                 cout << "Performing final pass iterative refinement\n";
03712                 doGappedAlignment(alignment_tree.root, false);
03713         }
03714 
03715         // peel off the alignment from the root node
03716         cout << "root alignment has " << alignment_tree[alignment_tree.root].ordering.size() << " superintervals\n";
03717         vector< SuperInterval >& a_ivs = alignment_tree[alignment_tree.root].ordering;
03718         gnSeqI len = 0;
03719         for( size_t ivI = 0; ivI < a_ivs.size(); ivI++ )
03720         {
03721                 len += a_ivs[ivI].Length();
03722         }
03723         cout << "root alignment length: " << len << endl;
03724 
03725 
03726         // for each interval at the root write out the alignment
03727         for( size_t ivI = 0; ivI < a_ivs.size(); ivI++ )
03728         {
03729                 GappedAlignment ga(seq_count, a_ivs[ivI].Length());
03730                 extractAlignment(alignment_tree.root, ivI, ga);
03731                 vector<AbstractMatch*> tmp(1, &ga);
03732                 interval_list.push_back( Interval(tmp.begin(), tmp.end()) );
03733         }
03734 }
03735 
03740 void ProgressiveAligner::align( vector< gnSequence* >& seq_table, IntervalList& interval_list ){
03741         if( debug_aligner )
03742         {
03743                 debug_interval = true;
03744                 debug_cga = true;
03745         }
03746 
03747         seq_count = seq_table.size();
03748         this->currently_recursing = false;
03749         interval_list.seq_table = seq_table;
03750 
03751         // find pairwise matches
03752         MatchList mlist;
03753         mlist.seq_table = seq_table;
03754 
03755         if( this->breakpoint_penalty == -1 )
03756                 this->breakpoint_penalty = getDefaultBreakpointPenalty( seq_table );
03757         if( this->bp_dist_estimate_score == -1 )
03758                 this->bp_dist_estimate_score = getDefaultBpDistEstimateMinScore( original_ml.seq_table );
03759         cout << "using default bp penalty: " << breakpoint_penalty << endl;
03760         cout << "using default bp estimate min score: " << bp_dist_estimate_score << endl;
03761 
03762         if( this->collinear_genomes )
03763                 this->breakpoint_penalty = -1;
03764 
03765         if( collinear_genomes )
03766                 cout << "\nAssuming collinear genomes...\n";
03767 
03768         mlist.clear();
03769         mlist = original_ml;
03770         cout << "Starting with " << mlist.size() << " multi-matches\n";
03771         cout << "Computing genome content distance matrix...\n";
03772 
03773 //
03774 // Step 2) Compute a phylogenetic guide tree using the pairwise matches
03775 //
03776         NumericMatrix< double > distance;
03777         SingleCopyDistanceMatrix( mlist, mlist.seq_table, distance );
03778         cout << "\n\nGenome conservation distance matrix: " << endl;
03779         distance.print(cout);
03780         cout << endl;
03781 
03782         bool input_tree_specified = input_guide_tree_fname != "";
03783         bool output_tree_specified = output_guide_tree_fname != "";
03784         if( !input_tree_specified )
03785         {
03786                 // Make a phylogenetic guide tree
03787                 if( !output_tree_specified )
03788                         output_guide_tree_fname = CreateTempFileName("guide_tree");
03789                 input_guide_tree_fname = output_guide_tree_fname;
03790                 cout << "Writing guide tree to " << output_guide_tree_fname << endl;
03791                 MuscleInterface& mi = MuscleInterface::getMuscleInterface();
03792                 mi.CreateTree( distance, output_guide_tree_fname );
03793 
03794         //      ci.SetDistanceMatrix( distance, output_guide_tree_fname );
03795         }
03796 
03797         conservation_distance.resize(boost::extents[seq_count][seq_count]);
03798         for( uint seqI = 0; seqI < seq_count; ++seqI )
03799                 for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03800                         if( seqJ > seqI )
03801                         {
03802                                 conservation_distance[seqI][seqJ] = distance(seqI,seqJ);
03803                                 conservation_distance[seqJ][seqI] = distance(seqI,seqJ);
03804                         }
03805                         else
03806                         {
03807                                 conservation_distance[seqI][seqJ] = distance(seqJ,seqI);
03808                                 conservation_distance[seqJ][seqI] = distance(seqJ,seqI);
03809                         }
03810 
03811         cout << "reading tree...\n";
03812         // load the guide tree
03813         ifstream tree_file( input_guide_tree_fname.c_str() );
03814         if( !tree_file.is_open() )
03815                 throw "Error opening guide tree file";
03816         alignment_tree.readTree( tree_file );
03817         tree_file.close();
03818 
03819         cout << "initializing alignment tree...\n";
03820 
03821         makeAlignmentTree( alignment_tree, mlist, node_sequence_map );
03822         node_id_t node1;
03823         node_id_t node2;
03824         // midpoint root the tree
03825 //      findMidpoint( alignment_tree, node1, node2 );
03826 //      node_id_t ancestor = 0;
03827 //      if( seq_count > 2 )     // if only two sequences then the tree already has a root
03828 //              ancestor = createAlignmentTreeRoot( alignment_tree, node1, node2 );
03829 
03830         // write out the rooted guide tree, but don't clobber the user's input tree
03831         if( !input_tree_specified || output_tree_specified )
03832         {
03833                 ofstream out_tree_file( output_guide_tree_fname.c_str() );
03834                 if( !out_tree_file.is_open() )
03835                         throw "Error opening guide tree file for write";
03836                 alignment_tree.writeTree( out_tree_file );
03837                 out_tree_file.close();
03838         }
03839 
03840         // ensure the root is the last to get aligned and swap children to canonical order
03841         extendRootBranches(alignment_tree);
03842 
03843 
03844         if( !collinear_genomes )
03845         {
03846                 // need sol lists for scoring
03847                 vector<SeedOccurrenceList> blah(seq_count);
03848                 swap( blah, sol_list );
03849 //              sol_list = ;
03850                 // temporarily create a weight 11 SML
03851 /*              MatchList w11_mlist;
03852                 w11_mlist.seq_filename = original_ml.seq_filename;
03853                 w11_mlist.seq_table = original_ml.seq_table;
03854                 cout << "Creating weight 11 SMLs for repeat detection\n";
03855                 w11_mlist.CreateMemorySMLs( 11, NULL );
03856 */
03857                 cout << "Constructing seed occurrence lists for repeat detection\n";
03858 #pragma omp parallel for
03859                 for( int seqI = 0; seqI < seq_count; seqI++ )
03860                 {
03861                         sol_list[seqI].construct(*(mlist.sml_table[seqI]));
03862 //                      delete w11_mlist.sml_table[seqI];
03863                 }
03864 //              w11_mlist.sml_table.clear();
03865         }
03866         if( !collinear_genomes && use_weight_scaling )
03867         {
03868                 cout << "Calculating pairwise breakpoint distances\n";
03869                 CreatePairwiseBPDistance(bp_distance);
03870         }
03871 
03872         // rescale the conservation distance
03873         if( use_weight_scaling )
03874         {
03875                 for( uint seqI = 0; seqI < seq_count; ++seqI )
03876                         for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03877                                 conservation_distance[seqI][seqJ] = distance(seqI,seqJ) * conservation_dist_scale;
03878         }else{
03879                 bp_distance.resize(boost::extents[seq_count][seq_count]);
03880                 for( uint seqI = 0; seqI < seq_count; ++seqI )
03881                         for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03882                         {
03883                                 conservation_distance[seqI][seqJ] = 0;
03884                                 bp_distance[seqI][seqJ] = 0;
03885                         }
03886         }
03887 
03888         if( !collinear_genomes )
03889         {
03890                 cout << "genome content distance matrix:\n";
03891                 print2d_matrix(conservation_distance, cout);
03892                 cout << endl;
03893                 cout << "bp distance matrix:\n";
03894                 print2d_matrix(bp_distance, cout);
03895                 cout << endl;
03896         }
03897 
03898         getAlignment( interval_list );
03899 }
03900 
03901 
03902 // broken and unused function graveyard
03903 
03904 }

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6