00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012
00013
00014 #include "libMems/ProgressiveAligner.h"
00015 #include "libMems/GreedyBreakpointElimination.h"
00016 #include "libMems/Aligner.h"
00017 #include "libMems/Islands.h"
00018 #include "libMems/DNAFileSML.h"
00019 #include "libMems/MuscleInterface.h"
00020 #include "libMems/gnAlignedSequences.h"
00021 #include "libMems/CompactGappedAlignment.h"
00022 #include "libMems/MatchProjectionAdapter.h"
00023 #include "libMems/PairwiseMatchFinder.h"
00024 #include "libMems/TreeUtilities.h"
00025 #include "libMems/PairwiseMatchAdapter.h"
00026 #include "libMems/DistanceMatrix.h"
00027
00028 #include <boost/dynamic_bitset.hpp>
00029 #include <boost/tuple/tuple.hpp>
00030 #include <boost/property_map.hpp>
00031 #include <boost/graph/graph_traits.hpp>
00032 #include <boost/graph/adjacency_list.hpp>
00033 #include <boost/graph/johnson_all_pairs_shortest.hpp>
00034 #include <boost/graph/undirected_dfs.hpp>
00035
00036 #include <map>
00037 #include <fstream>
00038 #include <sstream>
00039 #include <stack>
00040 #include <algorithm>
00041 #include <limits>
00042 #include <iomanip>
00043
00044 #include "stdlib.h"
00045
00046 using namespace std;
00047 using namespace genome;
00048
00049 namespace mems {
00050
00051
00052 bool progress_msgs = false;
00053
00054 bool debug_me = false;
00055 static int dbg_count = 0;
00056
00057
00058 double min_window_size = 200;
00059 double max_window_size = 20000;
00060 double min_density = .5;
00061 double max_density = .9;
00062 size_t max_gap_length = 3000;
00063 size_t lcb_hangover = 300;
00064
00065
00066 void mergeUnalignedIntervals( uint seqI, vector< Interval* >& iv_list, vector< Interval* >& new_list );
00067
00072 boolean my_validateLCB( MatchList& lcb ){
00073 vector< Match* >::iterator lcb_iter = lcb.begin();
00074 if( lcb.size() == 0 )
00075 return true;
00076 uint seq_count = (*lcb_iter)->SeqCount();
00077 uint seqI = 0;
00078 boolean complain = false;
00079 for(; seqI < seq_count; seqI++ ){
00080 lcb_iter = lcb.begin();
00081 int64 prev_coord = 0;
00082 for(; lcb_iter != lcb.end(); ++lcb_iter ){
00083 if( (*lcb_iter)->Start( seqI ) == NO_MATCH )
00084 continue;
00085 else if( prev_coord != 0 && (*lcb_iter)->Start( seqI ) < prev_coord ){
00086 complain = true;
00087 }
00088 prev_coord = (*lcb_iter)->Start( seqI );
00089 }
00090 }
00091 return !complain;
00092 }
00093
00094 template< class BoostMatType >
00095 void print2d_matrix( BoostMatType& mat, std::ostream& os )
00096 {
00097 for( size_t i = 0; i < mat.shape()[0]; ++i )
00098 {
00099 for( size_t j = 0; j < mat.shape()[1]; ++j )
00100 {
00101 if( j > 0 )
00102 os << "\t";
00103 os << mat[i][j];
00104 }
00105 os << endl;
00106 }
00107 }
00108
00109 double getDefaultBreakpointPenalty( std::vector< gnSequence* >& sequences )
00110 {
00111 uint default_mer_size = MatchList::GetDefaultMerSize( sequences );
00112 double avg_seq_len = 0;
00113 for( size_t seqI = 0; seqI < sequences.size(); ++seqI )
00114 avg_seq_len += (double)sequences[seqI]->length();
00115 avg_seq_len /= (double)sequences.size();
00116 avg_seq_len = log( avg_seq_len ) / log( 2.0 );
00117 return avg_seq_len * 7000;
00118 }
00119
00120
00121 double getDefaultBpDistEstimateMinScore( std::vector< gnSequence* >& sequences )
00122 {
00123 return 2.0 * getDefaultBreakpointPenalty(sequences);
00124 }
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134 ProgressiveAligner::ProgressiveAligner( uint seq_count ) :
00135 Aligner( seq_count ),
00136 breakpoint_penalty( -1 ),
00137 min_breakpoint_penalty( 2000 ),
00138 debug(false),
00139 refine(true),
00140 scoring_scheme(ExtantSumOfPairsScoring),
00141 use_weight_scaling(true),
00142 conservation_dist_scale(1),
00143 bp_dist_scale(.9),
00144 max_gapped_alignment_length(20000),
00145 bp_dist_estimate_score(-1),
00146 use_seed_families(false),
00147 using_cache_db(true)
00148 {
00149 gapped_alignment = true;
00150 max_window_size = max_gapped_alignment_length;
00151 }
00152
00153 void ProgressiveAligner::SetMaxGappedAlignmentLength( size_t len )
00154 {
00155 max_gapped_alignment_length = len;
00156 max_window_size = max_gapped_alignment_length;
00157 }
00158
00160 void ProgressiveAligner::getAlignedChildren( node_id_t node, vector< node_id_t >& descendants )
00161 {
00162
00163 stack< node_id_t > node_stack;
00164 node_stack.push( node );
00165 vector< bool > visited( alignment_tree.size(), false );
00166 descendants.clear();
00167 while( node_stack.size() > 0 )
00168 {
00169 node_id_t cur_node = node_stack.top();
00170 if(progress_msgs) cout << "Evaluating aligned nodes linked to node " << cur_node << endl;
00171 node_stack.pop();
00172 visited[cur_node] = true;
00173 for( uint childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
00174 {
00175 node_id_t child_id = alignment_tree[cur_node].children[childI];
00176 if( alignment_tree[cur_node].children_aligned[childI] && !visited[child_id])
00177 node_stack.push( child_id );
00178 }
00179 if( alignment_tree[ cur_node ].sequence != NULL )
00180 descendants.push_back( cur_node );
00181 }
00182 }
00183
00184
00186 void ProgressiveAligner::getPath( node_id_t first_n, node_id_t last_n, vector< node_id_t >& path )
00187 {
00188
00189 stack< node_id_t > node_stack;
00190 node_stack.push( last_n );
00191 vector< bool > visited( alignment_tree.size(), false );
00192 while( node_stack.top() != first_n )
00193 {
00194 node_id_t cur_node = node_stack.top();
00195 size_t pre_size = node_stack.size();
00196 visited[cur_node] = true;
00197 for( uint childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
00198 {
00199 node_id_t child_id = alignment_tree[cur_node].children[childI];
00200 if(!visited[child_id])
00201 {
00202 node_stack.push( child_id );
00203 break;
00204 }
00205 }
00206 if( pre_size != node_stack.size() )
00207 continue;
00208 for( uint parentI = 0; parentI < alignment_tree[cur_node].parents.size(); parentI++ )
00209 {
00210 node_id_t parent_id = alignment_tree[cur_node].parents[parentI];
00211 if(!visited[parent_id])
00212 {
00213 node_stack.push( parent_id );
00214 break;
00215 }
00216 }
00217 if( pre_size != node_stack.size() )
00218 continue;
00219 node_stack.pop();
00220 }
00221 path = vector< node_id_t >( node_stack.size() );
00222 for( size_t pI = 0; pI < path.size(); pI++ )
00223 {
00224 path[pI] = node_stack.top();
00225 node_stack.pop();
00226 }
00227 }
00228
00229
00230
00231
00232
00233
00234 template<class MatchType>
00235 void ProgressiveAligner::propagateDescendantBreakpoints( node_id_t node1, uint seqI, std::vector<MatchType*>& iv_list )
00236 {
00237 SSC<MatchType> ilc(seqI);
00238 sort( iv_list.begin(), iv_list.end(), ilc );
00239 vector< SuperInterval >& ord = alignment_tree[ node1 ].ordering;
00240 vector<gnSeqI> bp_list;
00241 for( size_t sI = 0; sI < ord.size(); sI++ )
00242 bp_list.push_back( ord[sI].LeftEnd() );
00243
00244 GenericMatchSeqManipulator<MatchType> ism( seqI );
00245 applyBreakpoints( bp_list, iv_list, ism );
00246 }
00247
00248
00249 template<class T, class Manipulator>
00250 void applyAncestralBreakpoints( const vector< SuperInterval >& siv_list, vector<T>& ord, uint seqI, Manipulator& m )
00251 {
00252
00253 vector<gnSeqI> bp_list(siv_list.size()*2, 0);
00254 size_t cur = 0;
00255 for( size_t i = 0; i < siv_list.size(); i++ )
00256 {
00257 if( siv_list[i].reference_iv.Start(seqI) == NO_MATCH )
00258 continue;
00259 bp_list[cur++] = siv_list[i].reference_iv.LeftEnd(seqI);
00260 bp_list[cur++] = siv_list[i].reference_iv.LeftEnd(seqI) + siv_list[i].reference_iv.Length(seqI);
00261 }
00262 bp_list.resize(cur);
00263
00264 sort( bp_list.begin(), bp_list.end() );
00265 applyBreakpoints( bp_list, ord, m );
00266 }
00267
00268
00269
00270
00271
00272 void ProgressiveAligner::linkSuperIntervals( node_id_t node1, uint seqI, node_id_t ancestor )
00273 {
00274
00275 vector<SuperInterval>& a_ord = alignment_tree[ancestor].ordering;
00276 vector<SuperInterval>& c_ord = alignment_tree[node1].ordering;
00277
00278 for( size_t aI = 0; aI < a_ord.size(); aI++ )
00279 if( seqI == 0 )
00280 a_ord[aI].c1_siv = (std::numeric_limits<size_t>::max)();
00281 else
00282 a_ord[aI].c2_siv = (std::numeric_limits<size_t>::max)();
00283 for( size_t cI = 0; cI < c_ord.size(); cI++ )
00284 c_ord[cI].parent_siv = (std::numeric_limits<size_t>::max)();
00285
00286 for( size_t aI = 0; aI < a_ord.size(); aI++ )
00287 {
00288 if( a_ord[aI].reference_iv.LeftEnd(seqI) == NO_MATCH )
00289 continue;
00290 size_t cI = 0;
00291 for( ; cI < c_ord.size(); cI++ )
00292 {
00293 if( absolut(a_ord[aI].reference_iv.Start(seqI)) != c_ord[cI].LeftEnd() )
00294 continue;
00295 if( a_ord[aI].reference_iv.Length(seqI) != c_ord[cI].Length() )
00296 {
00297 breakHere();
00298 cerr << "mapping length mismatch\n";
00299 cerr << "ancestor: " << ancestor << "\t node1: " << node1 << endl;
00300 cerr << "a_ord[" << aI << "].reference_iv.Length(" << seqI << "): " << a_ord[aI].reference_iv.Length(seqI) << endl;
00301 cerr << "a_ord[" << aI << "].reference_iv.LeftEnd(" << seqI << "): " << a_ord[aI].reference_iv.LeftEnd(seqI) << endl;
00302 cerr << "c_ord[" << cI << "].Length(): " << c_ord[cI].Length() << endl;
00303 cerr << "c_ord[" << cI << "].LeftEnd(): " << c_ord[cI].LeftEnd() << endl;
00304 cerr << "";
00305 cerr << "";
00306 }
00307
00308 if( seqI == 0 )
00309 a_ord[aI].c1_siv = cI;
00310 else
00311 a_ord[aI].c2_siv = cI;
00312 c_ord[cI].parent_siv = aI;
00313 break;
00314 }
00315 if( cI == c_ord.size() )
00316 {
00317 breakHere();
00318 cerr << "error no mapping\n";
00319 }
00320 }
00321 }
00322
00323
00324 void ProgressiveAligner::translateGappedCoordinates( vector<AbstractMatch*>& ml, uint seqI, node_id_t extant, node_id_t ancestor )
00325 {
00326
00327 vector< node_id_t > trans_path;
00328 getPath( extant, ancestor, trans_path );
00329
00330
00331 for( size_t mI = 0; mI < ml.size(); mI++ )
00332 if( ml[mI]->Orientation(seqI) == AbstractMatch::reverse )
00333 ml[mI]->Invert();
00334
00335
00336 for( size_t nI = 1; nI < trans_path.size(); nI++ )
00337 {
00338
00339
00340
00341
00342
00343 SSC<AbstractMatch> ssc(seqI);
00344 sort(ml.begin(), ml.end(), ssc);
00345
00346
00347 vector< SuperInterval >& siv_list = alignment_tree[trans_path[nI]].ordering;
00348 vector< vector< AbstractMatch* > > siv_matches = vector< vector< AbstractMatch* > >(siv_list.size());
00349 size_t cur_child = 0;
00350 if( alignment_tree[trans_path[nI]].children[0] == trans_path[nI-1] )
00351 cur_child = 0;
00352 else if( alignment_tree[trans_path[nI]].children[1] == trans_path[nI-1] )
00353 cur_child = 1;
00354 else
00355 {
00356 breakHere();
00357 cerr << "forest fire\n";
00358 }
00359
00360 AbstractMatchSeqManipulator amsm( seqI );
00361 applyAncestralBreakpoints(siv_list, ml, cur_child, amsm );
00362
00363
00364 sort(ml.begin(), ml.end(), ssc);
00365
00366
00367 for( size_t mI = 0; mI < ml.size(); mI++ )
00368 {
00369 if( ml[mI]->LeftEnd(seqI) == 0 )
00370 {
00371 breakHere();
00372 cerr << "fefefe";
00373 }
00374 size_t sivI = 0;
00375 for( ; sivI < siv_list.size(); sivI++ )
00376 {
00377 if( siv_list[sivI].reference_iv.LeftEnd(cur_child) == NO_MATCH )
00378 continue;
00379 if( ml[mI]->LeftEnd(seqI) >= siv_list[sivI].reference_iv.LeftEnd(cur_child) &&
00380 ml[mI]->LeftEnd(seqI) < siv_list[sivI].reference_iv.LeftEnd(cur_child) + siv_list[sivI].reference_iv.Length(cur_child) )
00381 break;
00382 }
00383 if( sivI == siv_list.size() )
00384 {
00385 cerr << "nI is: "<< nI << endl;
00386 cerr << "trans_path: ";
00387 for( size_t ttI = 0; ttI < trans_path.size(); ttI++ )
00388 cerr << " " << trans_path[ttI];
00389 cerr << endl;
00390 cerr << "problem seq: " << seqI << std::endl;
00391 cerr << "ml[" << mI << "]->Start(0) == " << ml[mI]->Start(0) << endl;
00392 cerr << "ml[" << mI << "]->Length(0) == " << ml[mI]->Length(1) << endl;
00393 cerr << "ml[" << mI << "]->Start(1) == " << ml[mI]->Start(0) << endl;
00394 cerr << "ml[" << mI << "]->Length(1) == " << ml[mI]->Length(1) << endl;
00395 cerr << "ml.size(): " << ml.size() << endl;
00396 for( sivI = 0; sivI < siv_list.size(); sivI++ )
00397 {
00398 cerr << "siv_list[" << sivI << "] left end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) << endl;
00399 if( siv_list[sivI].reference_iv.LeftEnd(0) != 0 )
00400 cerr << "siv_list[" << sivI << "] right end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) + siv_list[sivI].reference_iv.Length(0) << endl;
00401 cerr << "siv_list[" << sivI << "] left end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) << endl;
00402 if( siv_list[sivI].reference_iv.LeftEnd(1) != 0 )
00403 cerr << "siv_list[" << sivI << "] right end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) + siv_list[sivI].reference_iv.Length(1) << endl;
00404 }
00405 breakHere();
00406 }
00407 if( ml[mI]->LeftEnd(seqI) + ml[mI]->Length(seqI) >
00408 siv_list[sivI].reference_iv.LeftEnd(cur_child) + siv_list[sivI].reference_iv.Length(cur_child) )
00409 {
00410 cerr << "doesn't fit\n";
00411 cerr << "ml[" << mI << "]->LeftEnd(" << seqI << "): " << ml[mI]->LeftEnd(seqI) << endl;
00412 cerr << "ml[" << mI << "]->RightEnd(" << seqI << "): " << ml[mI]->RightEnd(seqI) << endl;
00413 cerr << "siv_list[" << sivI << "] left end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) << endl;
00414 if( siv_list[sivI].reference_iv.LeftEnd(0) != 0 )
00415 cerr << "siv_list[" << sivI << "] right end 0: " << siv_list[sivI].reference_iv.LeftEnd(0) + siv_list[sivI].reference_iv.Length(0) << endl;
00416 cerr << "siv_list[" << sivI << "] left end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) << endl;
00417 if( siv_list[sivI].reference_iv.LeftEnd(1) != 0 )
00418 cerr << "siv_list[" << sivI << "] right end 1: " << siv_list[sivI].reference_iv.LeftEnd(1) + siv_list[sivI].reference_iv.Length(1) << endl;
00419 cerr << "ml.size(): " << ml.size() << endl;
00420 cerr << "siv_list.size(): " << siv_list.size() << endl;
00421 cerr << "trans_path:";
00422 for( size_t tI = 0; tI < trans_path.size(); tI++ )
00423 cerr << " " << trans_path[tI];
00424 cerr << endl;
00425 cerr << "trans_path[" << nI << "]: " << trans_path[nI] << endl;
00426 breakHere();
00427 }
00428
00429 ml[mI]->SetLeftEnd( seqI, ml[mI]->LeftEnd(seqI) - siv_list[sivI].reference_iv.LeftEnd(cur_child) + 1 );
00430
00431 if( siv_list[sivI].reference_iv.Start(cur_child) < 0 )
00432 {
00433 int64 new_lend = siv_list[sivI].reference_iv.Length(cur_child) - ml[mI]->LeftEnd(seqI);
00434 new_lend -= ml[mI]->Length( seqI ) - 2;
00435 new_lend *= ml[mI]->Orientation(seqI) == AbstractMatch::forward ? 1 : -1;
00436 ml[mI]->Invert();
00437 ml[mI]->SetStart( seqI, new_lend );
00438 }
00439 siv_matches[sivI].push_back( ml[mI] );
00440 }
00441
00442
00443 ml.clear();
00444 for( size_t sivI = 0; sivI < siv_matches.size(); sivI++ )
00445 {
00446 if( siv_matches[sivI].size() == 0 )
00447 continue;
00448
00449
00450 CompactGappedAlignment<>* siv_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_list[sivI].reference_iv.GetMatches()[0]);
00451 if( siv_list[sivI].reference_iv.GetMatches().size() > 1 )
00452 siv_cga = NULL;
00453 bool alloc_new_siv = false;
00454 CompactGappedAlignment<> tmp_cga;
00455 if( siv_cga == NULL )
00456 {
00457 alloc_new_siv = true;
00458 siv_cga = tmp_cga.Copy();
00459 CompactGappedAlignment<> dorkas(siv_list[sivI].reference_iv);
00460 *siv_cga = dorkas;
00461 }
00462
00463
00464 for( size_t mI = 0; mI < siv_matches[sivI].size(); mI++ )
00465 {
00466 CompactGappedAlignment<>* match_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_matches[sivI][mI]);
00467 bool alloc_new = false;
00468 if( match_cga == NULL )
00469 {
00470 match_cga = tmp_cga.Copy();
00471 *match_cga = CompactGappedAlignment<>(*(siv_matches[sivI][mI]));
00472 alloc_new = true;
00473 }
00474 siv_cga->translate( *match_cga, seqI, cur_child );
00475
00476 if( alloc_new )
00477 {
00478 siv_matches[sivI][mI]->Free();
00479 siv_matches[sivI][mI] = match_cga;
00480 }
00481 }
00482
00483
00484 for( size_t mI = 0; mI < siv_matches[sivI].size(); mI++ )
00485 {
00486 int64 cur_start = siv_matches[sivI][mI]->Start(seqI);
00487 if( cur_start > 0 )
00488 siv_matches[sivI][mI]->SetStart( seqI, cur_start + siv_list[sivI].LeftEnd() - 1 );
00489 else
00490 siv_matches[sivI][mI]->SetStart( seqI, cur_start - siv_list[sivI].LeftEnd() + 1);
00491 if( (siv_matches[sivI][mI]->LeftEnd(seqI) + siv_matches[sivI][mI]->Length(seqI) > siv_list.back().LeftEnd() + siv_list.back().Length() )
00492 )
00493 {
00494
00495 cerr << "siv left is: " << siv_list[sivI].LeftEnd() << endl;
00496 cerr << "siv right is: " << siv_list[sivI].LeftEnd() + siv_list[sivI].Length() << endl;
00497 cerr << "match right is: " << siv_matches[sivI][mI]->LeftEnd(seqI) + siv_matches[sivI][mI]->Length(seqI) << endl;
00498 cerr << "superseq right is: " << siv_list.back().LeftEnd() + siv_list.back().Length() << endl;
00499 cerr << "";
00500 breakHere();
00501 }
00502 if( debug_aligner && siv_matches[sivI][mI]->Start(seqI) == 0 )
00503 {
00504 breakHere();
00505 }
00506 }
00507 if(alloc_new_siv)
00508 siv_cga->Free();
00509 ml.insert( ml.end(), siv_matches[sivI].begin(), siv_matches[sivI].end() );
00510 }
00511 }
00512
00513 for( size_t mI = 0; mI < ml.size(); mI++ )
00514 if( ml[mI]->Orientation(seqI) == AbstractMatch::reverse )
00515 ml[mI]->Invert();
00516 }
00517
00518 class SuperIntervalPtrComp
00519 {
00520 public:
00521 bool operator()( const SuperInterval* a, const SuperInterval* b )
00522 {
00523 return (*a) < (*b);
00524 }
00525 };
00526
00527 void ProgressiveAligner::recursiveApplyAncestralBreakpoints( node_id_t ancestor )
00528 {
00529 stack<node_id_t> node_stack;
00530 node_stack.push(ancestor);
00531 while( node_stack.size() > 0 )
00532 {
00533
00534 node_id_t cur_node = node_stack.top();
00535 node_stack.pop();
00536 SuperIntervalManipulator sim;
00537 if( progress_msgs ) cout << "cur node: " << cur_node << endl;
00538 for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
00539 {
00540 AlignmentTreeNode& atn = alignment_tree[alignment_tree[cur_node].children[childI]];
00541 if( progress_msgs ) cout << "childI " << childI << " aab\n";
00542 applyAncestralBreakpoints( alignment_tree[cur_node].ordering, atn.ordering, childI, sim );
00543 if( progress_msgs ) cout << "sort childI " << childI << "\n";
00544 vector<SuperInterval*> siv_ptr_list(atn.ordering.size());
00545 for( size_t sivI = 0; sivI < atn.ordering.size(); ++sivI )
00546 siv_ptr_list[sivI] = &(atn.ordering[sivI]);
00547 SuperIntervalPtrComp sipc;
00548 sort( siv_ptr_list.begin(), siv_ptr_list.end(), sipc );
00549 vector< SuperInterval > siv_list;
00550 for( size_t sivI = 0; sivI < siv_ptr_list.size(); ++sivI )
00551 siv_list.push_back(*siv_ptr_list[sivI]);
00552 swap(siv_list, atn.ordering);
00553 node_stack.push( alignment_tree[cur_node].children[childI] );
00554 }
00555 if( debug_aligner && alignment_tree[cur_node].children.size() > 0 )
00556 validateSuperIntervals(alignment_tree[cur_node].children[0], alignment_tree[cur_node].children[1], cur_node);
00557 if( progress_msgs ) cout << "linking node " << cur_node << "'s" << alignment_tree[cur_node].ordering.size() << " superintervals\n";
00558 for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
00559 linkSuperIntervals( alignment_tree[cur_node].children[childI], childI, cur_node );
00560 }
00561 }
00562
00563
00564 boolean getInterveningCoordinates( const AbstractMatch* iv, uint oseqI, Match* r_begin, Match* r_end, uint seqI, int64& gap_lend, int64& gap_rend ){
00565
00566 if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH) ||
00567 (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
00568 gap_lend = 0;
00569 gap_rend = 0;
00570 return true;
00571 }
00572
00573
00574 gap_rend = r_end != NULL ? r_end->Start( seqI ) : iv->RightEnd( oseqI ) + 1;
00575 gap_lend = r_begin != NULL ? r_begin->End( seqI ) + 1 : iv->LeftEnd( oseqI );
00576 if( gap_rend < 0 || gap_lend < 0 ){
00577 gap_rend = r_begin != NULL ? -r_begin->Start( seqI ) : iv->RightEnd( oseqI ) + 1;
00578 gap_lend = r_end != NULL ? -r_end->Start( seqI ) + r_end->Length() : 1;
00579 }
00580 if( gap_rend <= 0 || gap_lend <= 0 ){
00581
00582 genome::ErrorMsg( "Error constructing intervening coordinates" );
00583 }
00584 return true;
00585 }
00586
00587
00588 void ProgressiveAligner::pairwiseAnchorSearch( MatchList& r_list, Match* r_begin, Match* r_end, const AbstractMatch* iv, uint oseqI, uint oseqJ )
00589 {
00590 uint seqI = 0;
00591 MatchList gap_list;
00592 vector< int64 > starts;
00593
00594
00595
00596 for( seqI = 0; seqI < 2; seqI++ )
00597 {
00598 int64 gap_end = 0;
00599 int64 gap_start = 0;
00600 getInterveningCoordinates( iv, (seqI == 0 ? oseqI : oseqJ), r_begin, r_end, seqI, gap_start, gap_end);
00601 int64 diff = gap_end - gap_start;
00602 diff = diff > 0 ? diff - 1 : 0;
00603
00604 starts.push_back( gap_start );
00605 gnSequence* new_seq = NULL;
00606 if(diff > 0 && gap_start + diff - 1 <= r_list.seq_table[ seqI ]->length())
00607 new_seq = new gnSequence( r_list.seq_table[ seqI ]->ToString( diff, gap_start ) );
00608 else
00609 new_seq = new gnSequence();
00610 gap_list.seq_table.push_back( new_seq );
00611 gap_list.sml_table.push_back( new DNAMemorySML() );
00612 }
00613
00614 gnSeqI avg_len = (gap_list.seq_table[0]->length() + gap_list.seq_table[1]->length())/2;
00615 uint search_seed_size = getDefaultSeedWeight( avg_len );
00616 gap_mh.get().Clear();
00617
00618 uint seed_count = use_seed_families ? 3 : 1;
00619 for( size_t seedI = 0; seedI < seed_count; seedI++ )
00620 {
00621
00622
00623
00624 uint64 default_seed = getSeed( search_seed_size, seedI );
00625 if( search_seed_size < MIN_DNA_SEED_WEIGHT )
00626 {
00627 for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ )
00628 delete gap_list.seq_table[ seqI ];
00629 for( uint seqI = 0; seqI < gap_list.sml_table.size(); seqI++ )
00630 delete gap_list.sml_table[ seqI ];
00631 return;
00632 }
00633 for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ ){
00634 gap_list.sml_table[ seqI ]->Clear();
00635 gap_list.sml_table[ seqI ]->Create( *(gap_list.seq_table[ seqI ]), default_seed );
00636 }
00637
00638
00639
00640
00641 gap_mh.get().ClearSequences();
00642 if(seed_count>1)
00643 {
00644 MatchList cur_list = gap_list;
00645 gap_mh.get().FindMatches( cur_list );
00646 for( size_t mI = 0; mI < cur_list.size(); mI++ )
00647 cur_list[mI]->Free();
00648 }else
00649 gap_mh.get().FindMatches( gap_list );
00650 }
00651 if(seed_count>1)
00652 gap_mh.get().GetMatchList(gap_list);
00653
00654 EliminateOverlaps_v2( gap_list );
00655
00656
00657
00658 gap_list.LengthFilter( MIN_ANCHOR_LENGTH + 3 );
00659
00660 for( size_t gI = 0; gI < gap_list.size(); gI++ )
00661 {
00662 for( seqI = 0; seqI < 2; seqI++ )
00663 {
00664 int64 gap_rend = 0;
00665 int64 gap_lend = 0;
00666 getInterveningCoordinates( iv, (seqI == 0 ? oseqI : oseqJ), r_begin, r_end, seqI, gap_lend, gap_rend);
00667 gap_list[gI]->SetLeftEnd(seqI, gap_list[gI]->LeftEnd(seqI) + gap_lend - 1);
00668 }
00669 }
00670 r_list.insert(r_list.end(), gap_list.begin(), gap_list.end());
00671
00672
00673 for( uint seqI = 0; seqI < gap_list.seq_table.size(); seqI++ )
00674 delete gap_list.seq_table[ seqI ];
00675 for( uint seqI = 0; seqI < gap_list.sml_table.size(); seqI++ )
00676 delete gap_list.sml_table[ seqI ];
00677 }
00678
00679 template<class GappedAlignmentType>
00680 void ProgressiveAligner::recurseOnPairs( const vector<node_id_t>& node1_seqs, const vector<node_id_t>& node2_seqs, const GappedAlignmentType& iv, Matrix<MatchList>& matches, Matrix< std::vector< search_cache_t > >& search_cache_db, Matrix< std::vector< search_cache_t > >& new_cache_db, boost::multi_array< vector< vector< int64 > >, 2 >& iv_regions )
00681 {
00682 matches = Matrix<MatchList>(node1_seqs.size(),node2_seqs.size());
00683
00684 std::vector< bitset_t > aln_matrix;
00685 iv.GetAlignment(aln_matrix);
00686 Match tmp(2);
00687 const size_t sizer = node1_seqs.size() * node2_seqs.size();
00688 std::vector< std::pair<size_t,size_t> > node_pairs(sizer);
00689 int nni = 0;
00690 for( size_t n1 = 0; n1 < node1_seqs.size(); n1++ )
00691 for( size_t n2 = 0; n2 < node2_seqs.size(); n2++ )
00692 node_pairs[nni++] = make_pair(n1,n2);
00693
00694 #pragma omp parallel for
00695 for(int ni = 0; ni < node_pairs.size(); ni++)
00696 {
00697 size_t n1 = node_pairs[ni].first;
00698 size_t n2 = node_pairs[ni].second;
00699 vector<node_id_t>::const_iterator n1_iter = node1_seqs.begin() + n1;
00700 vector<node_id_t>::const_iterator n2_iter = node2_seqs.begin() + n2;
00701
00702 uint seqI = node_sequence_map[*n1_iter];
00703 uint seqJ = node_sequence_map[*n2_iter];
00704 MatchList& mlist = matches(n1, n2);
00705 std::vector< search_cache_t >& cache = search_cache_db(n1, n2);
00706 std::vector< search_cache_t >& new_cache = new_cache_db(n1, n2);
00707 mlist.seq_table.push_back( alignment_tree[*n1_iter].sequence );
00708 mlist.seq_table.push_back( alignment_tree[*n2_iter].sequence );
00709
00710 gnSeqI charI = 0;
00711 gnSeqI charJ = 0;
00712 const size_t iv_aln_length = iv.AlignmentLength();
00713
00714
00715
00716 pair< int64, int64 > pair_1l(0,0);
00717 pair< int64, int64 > pair_1r(0,0);
00718 pair< int64, int64 > pair_2l(0,0);
00719 pair< int64, int64 > pair_2r(0,0);
00720 for( uint colI = 0; colI <= iv_aln_length; colI++ )
00721 {
00722 if( (aln_matrix[seqI].test(colI) && aln_matrix[seqJ].test(colI)) )
00723 {
00724 if( colI == 0 )
00725 break;
00726 if( iv.Orientation(seqI) == AbstractMatch::forward )
00727 pair_1l = make_pair( iv.LeftEnd(seqI), iv.LeftEnd(seqI)+charI );
00728 else
00729 pair_1r = make_pair( iv.RightEnd(seqI)-charI, iv.RightEnd(seqI) );
00730 if( iv.Orientation(seqJ) == AbstractMatch::forward )
00731 pair_2l = make_pair( iv.LeftEnd(seqJ), iv.LeftEnd(seqJ)+charJ );
00732 else
00733 pair_2r = make_pair( iv.RightEnd(seqJ)-charJ, iv.RightEnd(seqJ) );
00734 break;
00735 }
00736 if( aln_matrix[seqI].test(colI) )
00737 ++charI;
00738 if( aln_matrix[seqJ].test(colI) )
00739 ++charJ;
00740 }
00741
00742 charI = 0;
00743 charJ = 0;
00744 for( uint colI = iv_aln_length; colI > 0 ; colI-- )
00745 {
00746 if( (aln_matrix[seqI].test(colI-1) && aln_matrix[seqJ].test(colI-1)) )
00747 {
00748 if( colI == iv_aln_length )
00749 break;
00750 if( iv.Orientation(seqI) == AbstractMatch::forward )
00751 pair_1r = make_pair( iv.RightEnd(seqI)-charI, iv.RightEnd(seqI) );
00752 else
00753 pair_1l = make_pair( iv.LeftEnd(seqI), iv.LeftEnd(seqI)+charI );
00754 if( iv.Orientation(seqJ) == AbstractMatch::forward )
00755 pair_2r = make_pair( iv.RightEnd(seqJ)-charJ, iv.RightEnd(seqJ) );
00756 else
00757 pair_2l = make_pair( iv.LeftEnd(seqJ), iv.LeftEnd(seqJ)+charJ );
00758 break;
00759 }
00760 if( aln_matrix[seqI].test(colI-1) )
00761 ++charI;
00762 if( aln_matrix[seqJ].test(colI-1) )
00763 ++charJ;
00764 }
00765 if( pair_1l.first < pair_1l.second )
00766 {
00767 iv_regions[n1][n2][0].push_back(pair_1l.first);
00768 iv_regions[n1][n2][0].push_back(pair_1l.second);
00769 }
00770 if( pair_1r.first < pair_1r.second )
00771 {
00772 iv_regions[n1][n2][0].push_back(pair_1r.first);
00773 iv_regions[n1][n2][0].push_back(pair_1r.second);
00774 if( pair_1r.first <= pair_1l.second && pair_1r.second >= pair_1l.first )
00775 {
00776 cerr << "Ohno. Overlap in outside LCB search intervals\n";
00777 cerr << "Left: " << pair_1l.first << '\t' << pair_1l.second << " right: " << pair_1r.first << '\t' << pair_1r.second << endl;
00778 genome::breakHere();
00779 }
00780 }
00781
00782 if( pair_2l.first < pair_2l.second )
00783 {
00784 iv_regions[n1][n2][1].push_back(pair_2l.first);
00785 iv_regions[n1][n2][1].push_back(pair_2l.second);
00786 }
00787 if( pair_2r.first < pair_2r.second )
00788 {
00789 iv_regions[n1][n2][1].push_back(pair_2r.first);
00790 iv_regions[n1][n2][1].push_back(pair_2r.second);
00791 if( pair_2r.first <= pair_2l.second && pair_2r.second >= pair_2l.first )
00792 {
00793 cerr << "Ohno. Overlap in outside LCB search intervals\n";
00794 cerr << "Left: " << pair_2l.first << '\t' << pair_2l.second << " right: " << pair_2r.first << '\t' << pair_2r.second << endl;
00795 genome::breakHere();
00796 }
00797 }
00798
00799 charI = 0;
00800 charJ = 0;
00801 gnSeqI prev_charI = 0;
00802 gnSeqI prev_charJ = 0;
00803 bool in_gap = false;
00804
00805 for( uint colI = 0; colI <= iv_aln_length; colI++ )
00806 {
00807 if( colI == iv_aln_length ||
00808 (aln_matrix[seqI].test(colI) && aln_matrix[seqJ].test(colI)) )
00809 {
00810 if( in_gap &&
00811 charI - prev_charI > min_recursive_gap_length &&
00812 charJ - prev_charJ > min_recursive_gap_length )
00813 {
00814
00815 Match* l_match = NULL;
00816 l_match = tmp.Copy();
00817 if(iv.Orientation(seqI) == AbstractMatch::forward)
00818 l_match->SetLeftEnd(0, iv.LeftEnd(seqI)+prev_charI);
00819 else
00820 {
00821 l_match->SetLeftEnd(0, iv.RightEnd(seqI)-prev_charI);
00822 l_match->SetOrientation(0, AbstractMatch::reverse );
00823 }
00824 if(iv.Orientation(seqJ) == AbstractMatch::forward)
00825 l_match->SetLeftEnd(1, iv.LeftEnd(seqJ)+prev_charJ);
00826 else
00827 {
00828 l_match->SetLeftEnd(1, iv.RightEnd(seqJ)-prev_charJ);
00829 l_match->SetOrientation(1, AbstractMatch::reverse );
00830 }
00831 l_match->SetLength(0);
00832 Match* r_match = NULL;
00833 if( charJ != iv.RightEnd(seqJ) && charI != iv.RightEnd(seqI) )
00834 {
00835 r_match = tmp.Copy();
00836 if(iv.Orientation(seqI) == AbstractMatch::forward)
00837 r_match->SetLeftEnd(0, iv.LeftEnd(seqI)+charI);
00838 else
00839 {
00840 r_match->SetLeftEnd(0, iv.RightEnd(seqI)-charI);
00841 r_match->SetOrientation(0, AbstractMatch::reverse );
00842 }
00843 if(iv.Orientation(seqJ) == AbstractMatch::forward)
00844 r_match->SetLeftEnd(1, iv.LeftEnd(seqJ)+charJ);
00845 else
00846 {
00847 r_match->SetLeftEnd(1, iv.RightEnd(seqJ)-charJ);
00848 r_match->SetOrientation(1, AbstractMatch::reverse );
00849 }
00850 r_match->SetLength(0);
00851 }
00852
00853 if( iv.Orientation(seqI) == AbstractMatch::reverse )
00854 {
00855 swap(l_match,r_match);
00856 if( l_match != NULL ) l_match->Invert();
00857 if( r_match != NULL ) r_match->Invert();
00858 }
00859
00860 search_cache_t cacheval = make_pair( l_match, r_match );
00861 std::vector< search_cache_t >::iterator cache_entry = std::upper_bound( cache.begin(), cache.end(), cacheval, mems::cache_comparator );
00862 if( cache_entry == cache.end() ||
00863 (mems::cache_comparator( cacheval, *cache_entry ) || mems::cache_comparator( *cache_entry, cacheval )) )
00864 {
00865
00866 pairwiseAnchorSearch(mlist, l_match, r_match, &iv, seqI, seqJ);
00867 }
00868 if(using_cache_db)
00869 new_cache.push_back( cacheval );
00870 }
00871 prev_charI = charI;
00872 prev_charJ = charJ;
00873 in_gap = false;
00874 }
00875 else
00876 in_gap = true;
00877 if( colI < iv.AlignmentLength() )
00878 {
00879 if( aln_matrix[seqI].test(colI) )
00880 ++charI;
00881 if( aln_matrix[seqJ].test(colI) )
00882 ++charJ;
00883 }
00884 }
00885 }
00886 }
00887
00888 void ProgressiveAligner::getAncestralMatches( const vector< node_id_t > node1_seqs, const vector< node_id_t > node2_seqs, node_id_t node1, node_id_t node2, node_id_t ancestor, std::vector< AbstractMatch* >& ancestral_matches )
00889 {
00890
00891
00892
00893
00894
00895
00896 for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
00897 {
00898 uint ii = this->node_sequence_map[node1_seqs[seqI]];
00899 vector< AbstractMatch* > seqI_matches;
00900
00901 for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
00902 {
00903 uint jj = this->node_sequence_map[node2_seqs[seqJ]];
00904 vector< AbstractMatch* > cur_matches;
00905 for( size_t mI = 0; mI < original_ml.size(); mI++ )
00906 {
00907 if( original_ml[mI]->LeftEnd(ii) == NO_MATCH )
00908 continue;
00909 if( original_ml[mI]->LeftEnd(jj) == NO_MATCH )
00910 continue;
00911 Match mm( 2 );
00912 Match* new_m = mm.Copy();
00913 new_m->SetStart( 0, original_ml[mI]->Start(ii));
00914 new_m->SetStart( 1, original_ml[mI]->Start(jj));
00915 new_m->SetLength(original_ml[mI]->Length());
00916 if( new_m->Start(0) < 0 )
00917 new_m->Invert();
00918 cur_matches.push_back( new_m );
00919 }
00920
00921 translateGappedCoordinates( cur_matches, 1, node2_seqs[seqJ], node2 );
00922 seqI_matches.insert( seqI_matches.end(), cur_matches.begin(), cur_matches.end() );
00923 }
00924 EliminateOverlaps_v2( seqI_matches );
00925 translateGappedCoordinates( seqI_matches, 0, node1_seqs[seqI], node1 );
00926 ancestral_matches.insert( ancestral_matches.end(), seqI_matches.begin(), seqI_matches.end() );
00927 }
00928 EliminateOverlaps_v2( ancestral_matches );
00929 }
00930
00931
00932 void ProgressiveAligner::getPairwiseMatches( const vector< node_id_t >& node1_seqs, const vector< node_id_t >& node2_seqs, Matrix<MatchList>& pairwise_matches )
00933 {
00934 pairwise_matches = Matrix< MatchList >( node1_seqs.size(), node2_seqs.size() );
00935
00936
00937 for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
00938 {
00939 for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
00940 {
00941 uint ii = this->node_sequence_map[node1_seqs[seqI]];
00942 uint jj = this->node_sequence_map[node2_seqs[seqJ]];
00943 pairwise_matches(seqI, seqJ).seq_table.push_back(original_ml.seq_table[ii]);
00944 pairwise_matches(seqI, seqJ).seq_table.push_back(original_ml.seq_table[jj]);
00945 pairwise_matches(seqI, seqJ).seq_filename.push_back(original_ml.seq_filename[ii]);
00946 pairwise_matches(seqI, seqJ).seq_filename.push_back(original_ml.seq_filename[jj]);
00947 }
00948 }
00949
00950
00951 for( size_t mI = 0; mI < original_ml.size(); mI++ )
00952 {
00953 for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
00954 {
00955 uint ii = this->node_sequence_map[node1_seqs[seqI]];
00956 if( original_ml[mI]->LeftEnd(ii) == NO_MATCH )
00957 continue;
00958 for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
00959 {
00960 uint jj = this->node_sequence_map[node2_seqs[seqJ]];
00961 if( original_ml[mI]->LeftEnd(jj) == NO_MATCH )
00962 continue;
00963 Match mm( 2 );
00964 Match* new_m = mm.Copy();
00965 new_m->SetStart( 0, original_ml[mI]->Start(ii));
00966 new_m->SetStart( 1, original_ml[mI]->Start(jj));
00967 new_m->SetLength(original_ml[mI]->Length());
00968 if( new_m->Start(0) < 0 )
00969 new_m->Invert();
00970 pairwise_matches(seqI,seqJ).push_back( new_m );
00971 }
00972 }
00973 }
00974 }
00975
00976
00977 int IsDenseEnough( GappedAlignment* gal_iter )
00978 {
00979 double total_len = 0;
00980 gnSeqI seqs = 0;
00981 for( uint seqI = 0; seqI < gal_iter->SeqCount(); seqI++ )
00982 {
00983 if( gal_iter->LeftEnd(seqI) == NO_MATCH )
00984 continue;
00985 total_len += gal_iter->Length(seqI);
00986 }
00987 double density = total_len / (gal_iter->AlignmentLength() * (double)gal_iter->Multiplicity());
00988
00989
00990
00991
00992
00993
00994 double threshold = ((max_density - min_density)/(min_window_size - max_window_size)) * ( (double)gal_iter->AlignmentLength() - max_window_size ) + min_density;
00995 if( density > max_density )
00996 return 2;
00997 if( density > threshold )
00998 return 1;
00999 return 0;
01000 }
01001
01002 void splitGappedAlignment( const GappedAlignment& ga, GappedAlignment& ga1, GappedAlignment& ga2, std::vector<size_t>& seqs1, std::vector<size_t>& seqs2 )
01003 {
01004 const vector< string >& aln = GetAlignment( ga, std::vector<gnSequence*>(ga.SeqCount()) );
01005 ga1 = ga;
01006 ga2 = ga;
01007 for( size_t seqI = 0; seqI < seqs1.size(); seqI++ )
01008 ga2.SetLeftEnd(seqs1[seqI], NO_MATCH);
01009 for( size_t seqI = 0; seqI < seqs2.size(); seqI++ )
01010 ga1.SetLeftEnd(seqs2[seqI], NO_MATCH);
01011 }
01012
01013 void removeLargeGapsPP( GappedAlignment& gal, list< GappedAlignment* >& gal_list, vector<bool>& gap_iv, const vector< size_t >& group1, const vector< size_t >& group2 )
01014 {
01015
01016
01017 gap_iv.clear();
01018 gal_list.clear();
01019 const vector< string >& aln_matrix = GetAlignment(gal, vector<gnSequence*>(gal.SeqCount(),NULL));
01020 size_t gap_cols = 0;
01021 size_t last_aln_col = (std::numeric_limits<size_t>::max)();
01022 size_t col_base = 0;
01023 GappedAlignment* galp = gal.Copy();
01024 for( size_t colI = 0; colI < gal.AlignmentLength(); colI++ )
01025 {
01026 size_t g1 = 0;
01027 size_t g2 = 0;
01028 for( ; g1 < group1.size(); ++g1 )
01029 {
01030 if( aln_matrix[group1[g1]][colI] != '-' )
01031 break;
01032 }
01033 for( ; g2 < group2.size(); ++g2 )
01034 {
01035 if( aln_matrix[group2[g2]][colI] != '-' )
01036 break;
01037 }
01038 if( g1 < group1.size() && g2 < group2.size() )
01039 {
01040
01041 if( gap_cols > max_gap_length )
01042 {
01043
01044 gnSeqI split_point = 0;
01045 if( last_aln_col != (std::numeric_limits<size_t>::max)() )
01046 {
01047 split_point = last_aln_col + lcb_hangover - col_base;
01048 gal_list.push_back( galp );
01049 gap_iv.push_back(false);
01050 galp = (GappedAlignment*)galp->Split(split_point);
01051 col_base += split_point;
01052 }
01053 split_point = colI - lcb_hangover - col_base;
01054 gal_list.push_back( galp );
01055 gap_iv.push_back(true);
01056 galp = (GappedAlignment*)galp->Split(split_point);
01057 col_base += split_point;
01058 }
01059 last_aln_col = colI;
01060 gap_cols = 0;
01061 }else
01062 ++gap_cols;
01063 }
01064
01065 if( gap_cols > max_gap_length )
01066 {
01067 gnSeqI split_point = 0;
01068 if( last_aln_col != (std::numeric_limits<size_t>::max)() )
01069 {
01070 split_point = last_aln_col + lcb_hangover - col_base;
01071 gal_list.push_back( galp );
01072 gap_iv.push_back(false);
01073 galp = (GappedAlignment*)galp->Split(split_point);
01074 }
01075 gap_iv.push_back(true);
01076 }else
01077 gap_iv.push_back(false);
01078 gal_list.push_back( galp );
01079 }
01080
01081 void ProgressiveAligner::refineAlignment( GappedAlignment& gal, node_id_t ancestor, bool profile_aln, AlnProgressTracker& apt )
01082 {
01083
01084
01085
01086 list< GappedAlignment* > gal_list;
01087 vector<bool> gap_iv;
01088 std::vector<node_id_t> nodes1;
01089 std::vector<node_id_t> nodes2;
01090 getAlignedChildren( alignment_tree[ancestor].children[0], nodes1 );
01091 getAlignedChildren( alignment_tree[ancestor].children[1], nodes2 );
01092 std::vector<size_t> seqs1( nodes1.size() );
01093 std::vector<size_t> seqs2( nodes2.size() );
01094 for( size_t nI = 0; nI < nodes1.size(); nI++ )
01095 seqs1[nI] = node_sequence_map[nodes1[nI]];
01096 for( size_t nI = 0; nI < nodes2.size(); nI++ )
01097 seqs2[nI] = node_sequence_map[nodes2[nI]];
01098
01099
01100 removeLargeGapsPP( gal, gal_list, gap_iv, seqs1, seqs2 );
01101
01102
01103
01104
01105 list< GappedAlignment* >::iterator gal_iter = gal_list.begin();
01106 vector<bool>::iterator gap_iter = gap_iv.begin();
01107 while(gal_iter != gal_list.end())
01108 {
01109 int density = IsDenseEnough( *gal_iter );
01110 if( (density == 0 && (*gal_iter)->AlignmentLength() > max_window_size / 3) ||
01111 (density == 1 && (*gal_iter)->AlignmentLength() > max_window_size ) ||
01112 (density == 2 && (*gal_iter)->AlignmentLength() > max_window_size * 3 )
01113
01114
01115 )
01116 {
01117
01118 gnSeqI split_point = (*gal_iter)->AlignmentLength() / 2;
01119 list< GappedAlignment* >::iterator ins_iter = gal_iter;
01120 ++ins_iter;
01121 ins_iter = gal_list.insert(ins_iter, new GappedAlignment(**gal_iter) );
01122
01123 vector<bool>::iterator gap_ins_iter = gap_iter;
01124 size_t gap_off = gap_iter - gap_iv.begin();
01125 ++gap_ins_iter;
01126 gap_iv.insert( gap_ins_iter, *gap_iter );
01127 gap_iter = gap_iv.begin() + gap_off;
01128 (*gal_iter)->CropEnd( split_point );
01129 (*ins_iter)->CropStart( (*ins_iter)->AlignmentLength() - split_point );
01130 continue;
01131 }
01132
01133 ++gal_iter;
01134 ++gap_iter;
01135 }
01136 MuscleInterface& mi = MuscleInterface::getMuscleInterface();
01137
01138 gnSeqI new_len = 0;
01139
01140 gap_iter = gap_iv.begin();
01141
01142 const size_t gal_count = gal_list.size();
01143 #pragma omp parallel for schedule(dynamic)
01144 for( int galI = 0; galI < gal_count; galI++ )
01145 {
01146 list<GappedAlignment*>::iterator my_g_iter = gal_list.begin();
01147 vector<bool>::iterator my_b_iter = gap_iv.begin();
01148 for(uint a = 0; a < galI; a++)
01149 {
01150 ++my_g_iter;
01151 ++my_b_iter;
01152 }
01153 #pragma omp critical
01154 {
01155 apt.cur_leftend += (*my_g_iter)->AlignmentLength();
01156 }
01157 if( profile_aln && !(*my_b_iter) )
01158 {
01159 GappedAlignment ga1;
01160 GappedAlignment ga2;
01161 splitGappedAlignment( **my_g_iter, ga1, ga2, seqs1, seqs2 );
01162 if( ga1.Multiplicity() > 0 && ga2.Multiplicity() > 0 )
01163 {
01164 mi.ProfileAlignFast( ga1, ga2, **my_g_iter, true );
01165 }
01166 }else if(!(*my_b_iter))
01167 {
01168 int density = IsDenseEnough( *my_g_iter );
01169 if( density == 0 )
01170 mi.RefineFast( **my_g_iter );
01171 else if( density == 1 )
01172 mi.RefineFast( **my_g_iter, 500 );
01173 else
01174 mi.RefineFast( **my_g_iter, 200 );
01175 }
01176
01177 new_len += (*my_g_iter)->AlignmentLength();
01178 #pragma omp critical
01179 {
01180
01181 double cur_progress = ((double)apt.cur_leftend / (double)apt.total_len)*100.0;
01182 printProgress((uint)apt.prev_progress, (uint)cur_progress, cout);
01183 apt.prev_progress = cur_progress;
01184 }
01185 }
01186 gal_iter = gal_list.end();
01187
01188
01189 vector< string > aln_matrix( gal.SeqCount(), string( new_len, '-' ) );
01190 vector< string::size_type > pos( gal.SeqCount(), 0 );
01191 for( gal_iter = gal_list.begin(); gal_iter != gal_list.end(); ++gal_iter )
01192 {
01193 const vector< string >& tmp_mat = GetAlignment(**gal_iter, vector<gnSequence*>( gal.SeqCount() ) );
01194 for( uint seqI = 0; seqI < tmp_mat.size(); seqI++ )
01195 {
01196 if( gal.LeftEnd(seqI) == 0 )
01197 continue;
01198 aln_matrix[seqI].replace(pos[seqI], tmp_mat[seqI].size(), tmp_mat[seqI]);
01199 pos[seqI] += tmp_mat[seqI].size();
01200 }
01201
01202 delete (*gal_iter);
01203 }
01204 gal.SetAlignment(aln_matrix);
01205 }
01206
01207 void ProgressiveAligner::doGappedAlignment( node_id_t ancestor, bool profile_aln )
01208 {
01209 AlnProgressTracker apt;
01210 gnSeqI total_len = 0;
01211 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
01212 total_len += alignment_tree[ancestor].ordering[aI].Length();
01213 apt.total_len = total_len;
01214 apt.prev_progress = 0;
01215
01216 printProgress(-1, 0, cout);
01217 apt.cur_leftend = 1;
01218
01219 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
01220 {
01221 if( alignment_tree[ancestor].ordering[aI].reference_iv.Multiplicity() == 1 )
01222 {
01223 apt.cur_leftend += alignment_tree[ancestor].ordering[aI].reference_iv.AlignmentLength();
01224 continue;
01225 }
01226
01227
01228
01229 GappedAlignment gal;
01230 extractAlignment(ancestor, aI, gal);
01231
01232
01233 if( gal.Multiplicity() > 1 )
01234 refineAlignment( gal, ancestor, profile_aln, apt );
01235 else
01236 apt.cur_leftend += gal.AlignmentLength();
01237
01238
01239 ConstructSuperIntervalFromMSA(ancestor, aI, gal);
01240
01241
01242
01243 double cur_progress = ((double)apt.cur_leftend / (double)apt.total_len)*100.0;
01244 printProgress((uint)apt.prev_progress, (uint)cur_progress, cout);
01245 apt.prev_progress = cur_progress;
01246 }
01247 printMemUsage();
01248 cout << "Fix left ends\n";
01249 FixLeftEnds(ancestor);
01250 printMemUsage();
01251
01252 if( debug_aligner )
01253 validateSuperIntervals(alignment_tree[ancestor].children[0], alignment_tree[ancestor].children[1], ancestor);
01254 cout << "\ndone.\n";
01255 }
01256
01257 void ProgressiveAligner::FixLeftEnds( node_id_t ancestor )
01258 {
01259
01260 stack< node_id_t > node_stack;
01261 node_stack.push( ancestor );
01262 vector<bool> visited( alignment_tree.size(), false );
01263 while( node_stack.size() > 0 )
01264 {
01265 node_id_t cur_node = node_stack.top();
01266
01267 if( !visited[cur_node] )
01268 {
01269 for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
01270 node_stack.push( alignment_tree[cur_node].children[childI] );
01271 visited[cur_node] = true;
01272 continue;
01273 }
01274 node_stack.pop();
01275 if( alignment_tree[cur_node].sequence != NULL )
01276 continue;
01277
01278 vector< SuperInterval >& siv_list = alignment_tree[cur_node].ordering;
01279 gnSeqI left_end = 1;
01280 for( size_t sivI = 0; sivI < siv_list.size(); sivI++ )
01281 {
01282 siv_list[sivI].SetLeftEnd(left_end);
01283 siv_list[sivI].SetLength(siv_list[sivI].reference_iv.AlignmentLength());
01284 left_end += siv_list[sivI].reference_iv.AlignmentLength();
01285 CompactGappedAlignment<>* m_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_list[sivI].reference_iv.GetMatches()[0]);
01286
01287
01288 if( m_cga == NULL || siv_list[sivI].reference_iv.GetMatches().size() > 1 )
01289 {
01290 for( uint childI = 0; childI <= 1; childI++ )
01291 {
01292 size_t cur_siv = childI == 0 ? alignment_tree[cur_node].ordering[sivI].c1_siv : alignment_tree[cur_node].ordering[sivI].c2_siv;
01293 if( cur_siv == (std::numeric_limits<size_t>::max)() )
01294 continue;
01295 const SuperInterval& c_siv = alignment_tree[ alignment_tree[cur_node].children[childI] ].ordering[ cur_siv ];
01296 int64 diff = c_siv.LeftEnd() - siv_list[sivI].reference_iv.LeftEnd(childI);
01297 siv_list[sivI].reference_iv.SetLeftEnd(childI, c_siv.LeftEnd());
01298 const vector< AbstractMatch* >& matches = siv_list[sivI].reference_iv.GetMatches();
01299 for( size_t mI = 0; mI < matches.size(); mI++ )
01300 {
01301 if( matches[mI]->LeftEnd(childI) != NO_MATCH )
01302 matches[mI]->SetLeftEnd(childI, matches[mI]->LeftEnd(childI) + diff);
01303 }
01304 }
01305
01306 }else{
01307
01308 size_t c1_siv = alignment_tree[cur_node].ordering[sivI].c1_siv;
01309 if( c1_siv != (std::numeric_limits<size_t>::max)() )
01310 {
01311 const SuperInterval& c_siv = alignment_tree[ alignment_tree[cur_node].children[0] ].ordering[ c1_siv ];
01312 m_cga->SetLeftEnd(0, c_siv.LeftEnd());
01313 siv_list[sivI].reference_iv.SetLeftEnd(0, c_siv.LeftEnd());
01314 m_cga->SetLength(c_siv.Length(), 0);
01315 siv_list[sivI].reference_iv.SetLength(c_siv.Length(), 0);
01316 siv_list[sivI].reference_iv.SetOrientation(0, m_cga->Orientation(0));
01317 }
01318 size_t c2_siv = alignment_tree[cur_node].ordering[sivI].c2_siv;
01319 if( c2_siv != (std::numeric_limits<size_t>::max)() )
01320 {
01321 const SuperInterval& c_siv = alignment_tree[ alignment_tree[cur_node].children[1] ].ordering[ c2_siv ];
01322 m_cga->SetLeftEnd(1, c_siv.LeftEnd());
01323 siv_list[sivI].reference_iv.SetLeftEnd(1, c_siv.LeftEnd());
01324 m_cga->SetLength(c_siv.Length(), 1);
01325 siv_list[sivI].reference_iv.SetLength(c_siv.Length(), 1);
01326 siv_list[sivI].reference_iv.SetOrientation(1, m_cga->Orientation(1));
01327 }
01328 }
01329 if( debug_cga && m_cga && !m_cga->validate() )
01330
01331 cerr << "oh junkedy\n";
01332
01333 }
01334 }
01335 }
01336
01340 void propagateInvert( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t ancestor, size_t ans_siv )
01341 {
01342 stack< pair< node_id_t, size_t > > node_siv_stack;
01343 node_siv_stack.push( make_pair(ancestor, ans_siv) );
01344 while( node_siv_stack.size() > 0 )
01345 {
01346 pair< node_id_t, size_t > cur = node_siv_stack.top();
01347 node_siv_stack.pop();
01348 node_id_t cur_node = cur.first;
01349 if( alignment_tree[cur_node].ordering[cur.second].c1_siv != (std::numeric_limits<size_t>::max)() )
01350 node_siv_stack.push( make_pair( alignment_tree[cur_node].children[0], alignment_tree[cur_node].ordering[cur.second].c1_siv ) );
01351 if( alignment_tree[cur_node].ordering[cur.second].c2_siv != (std::numeric_limits<size_t>::max)() )
01352 node_siv_stack.push( make_pair( alignment_tree[cur_node].children[1], alignment_tree[cur_node].ordering[cur.second].c2_siv ) );
01353 if( cur_node == ancestor )
01354 continue;
01355 if( alignment_tree[cur_node].sequence != NULL )
01356 continue;
01357
01358
01359 Interval& ref_iv = alignment_tree[cur_node].ordering[cur.second].reference_iv;
01360 vector< AbstractMatch* > matches;
01361 ref_iv.StealMatches( matches );
01362 AbstractMatch::orientation o0 = matches[0]->Orientation(0);
01363 AbstractMatch::orientation o1 = matches[0]->Orientation(1);
01364 matches[0]->Invert();
01365 if( o0 != AbstractMatch::undefined )
01366 matches[0]->SetOrientation(0,o0);
01367 if( o1 != AbstractMatch::undefined )
01368 matches[0]->SetOrientation(1,o1);
01369 ref_iv.SetMatches( matches );
01370 if( o0 != AbstractMatch::undefined )
01371 {
01372 ref_iv.SetOrientation(0,o0);
01373 ref_iv.SetLeftEnd(0,0);
01374 }
01375 if( o1 != AbstractMatch::undefined )
01376 {
01377 ref_iv.SetOrientation(1,o1);
01378 ref_iv.SetLeftEnd(1,0);
01379 }
01380 }
01381 }
01382
01383
01384 void ProgressiveAligner::ConstructSuperIntervalFromMSA( node_id_t ancestor, size_t ans_siv, GappedAlignment& gal )
01385 {
01386 const vector< string >& aln_matrix = GetAlignment( gal, vector< gnSequence* >() );
01387 stack< pair< node_id_t, size_t > > node_siv_stack;
01388 node_siv_stack.push( make_pair(ancestor, ans_siv) );
01389 vector<bool> visited( alignment_tree.size(), false );
01390 while( node_siv_stack.size() > 0 )
01391 {
01392 pair< node_id_t, size_t > cur = node_siv_stack.top();
01393 node_id_t cur_node = cur.first;
01394
01395 if( !visited[cur_node] )
01396 {
01397 if( alignment_tree[cur_node].ordering[cur.second].c1_siv != (std::numeric_limits<size_t>::max)() )
01398 node_siv_stack.push( make_pair( alignment_tree[cur_node].children[0], alignment_tree[cur_node].ordering[cur.second].c1_siv ) );
01399 if( alignment_tree[cur_node].ordering[cur.second].c2_siv != (std::numeric_limits<size_t>::max)() )
01400 node_siv_stack.push( make_pair( alignment_tree[cur_node].children[1], alignment_tree[cur_node].ordering[cur.second].c2_siv ) );
01401 visited[cur_node] = true;
01402 continue;
01403 }
01404 node_siv_stack.pop();
01405 if( alignment_tree[cur_node].sequence != NULL )
01406 continue;
01407
01408
01409 vector< node_id_t > node1_seqs;
01410 vector< node_id_t > node2_seqs;
01411 getAlignedChildren( alignment_tree[cur_node].children[0], node1_seqs );
01412 getAlignedChildren( alignment_tree[cur_node].children[1], node2_seqs );
01413 vector< bitset_t > m_aln(2, bitset_t( aln_matrix[0].size(), false ) );
01414 gnSeqI seqI_len = 0;
01415 gnSeqI seqJ_len = 0;
01416 gnSeqI cur_col = 0;
01417 for( size_t colI = 0; colI < aln_matrix[0].size(); colI++ )
01418 {
01419 uint seqI = 0;
01420 uint seqJ = 0;
01421 for( ; seqI < node1_seqs.size(); ++seqI )
01422 if( aln_matrix[node_sequence_map[node1_seqs[seqI]]][colI] != '-' )
01423 break;
01424 for( ; seqJ < node2_seqs.size(); ++seqJ )
01425 if( aln_matrix[node_sequence_map[node2_seqs[seqJ]]][colI] != '-' )
01426 break;
01427
01428 if( seqI == node1_seqs.size() && seqJ == node2_seqs.size() )
01429 continue;
01430 if( seqI != node1_seqs.size() )
01431 {
01432 seqI_len++;
01433 m_aln[0].set(cur_col);
01434 }
01435 if( seqJ != node2_seqs.size() )
01436 {
01437 seqJ_len++;
01438 m_aln[1].set(cur_col);
01439 }
01440 cur_col++;
01441 }
01442 m_aln[0].resize(cur_col);
01443 m_aln[1].resize(cur_col);
01444 CompactGappedAlignment<> tmp_cga(m_aln.size(), cur_col);
01445 CompactGappedAlignment<>* cga = tmp_cga.Copy();
01446 cga->SetLeftEnd(0, seqI_len > 0 ? 1 : 0);
01447 cga->SetLeftEnd(1, seqJ_len > 0 ? 1 : 0);
01448 if( cga->LeftEnd(0) != NO_MATCH )
01449 cga->SetOrientation(0, alignment_tree[cur_node].ordering[cur.second].reference_iv.Orientation(0));
01450 if( cga->LeftEnd(1) != NO_MATCH )
01451 cga->SetOrientation(1, alignment_tree[cur_node].ordering[cur.second].reference_iv.Orientation(1));
01452 cga->SetLength(seqI_len,0);
01453 cga->SetLength(seqJ_len,1);
01454 cga->SetAlignment(m_aln);
01455
01456
01457 size_t p_siv = alignment_tree[cur_node].ordering[cur.second].parent_siv;
01458 bool reverse_me = false;
01459 if( p_siv != (std::numeric_limits<size_t>::max)() )
01460 {
01461 size_t p_node = alignment_tree[cur_node].parents[0];
01462 int p_child = alignment_tree[p_node].children[0] == cur_node ? 0 : 1;
01463 if( alignment_tree[p_node].ordering[p_siv].reference_iv.Orientation(p_child) == AbstractMatch::reverse )
01464 reverse_me = true;
01465 }
01466 if( reverse_me )
01467 {
01468 cga->Invert();
01469 if( cga->LeftEnd(0) != NO_MATCH )
01470 cga->SetOrientation(0, alignment_tree[cur_node].ordering[cur.second].reference_iv.Orientation(0));
01471 if( cga->LeftEnd(1) != NO_MATCH )
01472 cga->SetOrientation(1, alignment_tree[cur_node].ordering[cur.second].reference_iv.Orientation(1));
01473 propagateInvert( alignment_tree, cur_node, cur.second );
01474 }
01475
01476 alignment_tree[cur_node].ordering[cur.second].reference_iv = Interval();
01477 vector< AbstractMatch* > am_list(1, cga);
01478 alignment_tree[cur_node].ordering[cur.second].reference_iv.SetMatches( am_list );
01479
01480 alignment_tree[cur_node].ordering[cur.second].reference_iv.SetLeftEnd(0, 0);
01481 alignment_tree[cur_node].ordering[cur.second].reference_iv.SetLeftEnd(1, 0);
01482 }
01483 }
01484
01485 typedef boost::tuple<CompactGappedAlignment<>*, vector< bitset_t >*, AbstractMatch* > _sort_tracker_type;
01486
01487 template< class CompType >
01488 class CgaBsComp
01489 {
01490 public:
01491 CgaBsComp( CompType& c ) : comp(c) {};
01492 bool operator()( const _sort_tracker_type& a, const _sort_tracker_type& b )
01493 {
01494 return comp( a.get<0>(), b.get<0>() );
01495 }
01496 protected:
01497 CompType& comp;
01498 };
01499
01500 template< typename MatchVector >
01501 void multFilter( MatchVector& matches, uint mult = 2 )
01502 {
01503
01504 size_t cur = 0;
01505 for( size_t mI = 0; mI < matches.size(); ++mI )
01506 {
01507 if( matches[mI]->Multiplicity() == mult )
01508 matches[cur++] = matches[mI];
01509 else
01510 matches[mI]->Free();
01511 }
01512 matches.erase(matches.begin()+cur, matches.end());
01513 }
01514
01515 template< typename MatchVector >
01516 void alignedNtCountFilter( MatchVector& matches, uint length )
01517 {
01518
01519 size_t cur = 0;
01520 for( size_t mI = 0; mI < matches.size(); ++mI )
01521 {
01522 size_t len_sum = 0;
01523 for( size_t seqI = 0; seqI < matches[mI]->SeqCount(); seqI++ )
01524 if(matches[mI]->LeftEnd(seqI) != NO_MATCH)
01525 len_sum += matches[mI]->Length(seqI);
01526
01527 if( len_sum - length > matches[mI]->AlignmentLength() )
01528 matches[cur++] = matches[mI];
01529 else
01530 matches[mI]->Free();
01531 }
01532 matches.erase(matches.begin()+cur, matches.end());
01533 }
01534
01535
01536 bool debugging_cltm = false;
01537 void ProgressiveAligner::constructLcbTrackingMatches(
01538 node_id_t ancestral_node,
01539 vector< AbstractMatch* >& ancestral_matches,
01540 vector< LcbTrackingMatch< AbstractMatch* > >& tracking_matches
01541 )
01542 {
01543 node_id_t child_0 = alignment_tree[ancestral_node].children[0];
01544 node_id_t child_1 = alignment_tree[ancestral_node].children[1];
01545
01546 propagateDescendantBreakpoints( child_0, 0, ancestral_matches );
01547 propagateDescendantBreakpoints( child_1, 1, ancestral_matches );
01548
01549
01550 vector< bitset_t > bs_tmp(alignment_tree.size());
01551 vector< vector< bitset_t > > bs(ancestral_matches.size(), bs_tmp);
01552 vector< _sort_tracker_type > cga_list;
01553
01554 for( size_t mI = 0; mI < ancestral_matches.size(); mI++ )
01555 {
01556 vector< bitset_t > aln( alignment_tree.size(), bitset_t(ancestral_matches[mI]->AlignmentLength() ) );
01557 swap( bs[mI], aln );
01558 ancestral_matches[mI]->GetAlignment(aln);
01559 swap( bs[mI][child_0], aln[0] );
01560 swap( bs[mI][child_1], aln[1] );
01561 CompactGappedAlignment<> c(alignment_tree.size(),0);
01562 c.SetLeftEnd(child_0, ancestral_matches[mI]->LeftEnd(0));
01563 c.SetOrientation(child_0, ancestral_matches[mI]->Orientation(0));
01564 c.SetLength(ancestral_matches[mI]->Length(0), child_0);
01565 c.SetLeftEnd(child_1, ancestral_matches[mI]->LeftEnd(1));
01566 c.SetOrientation(child_1, ancestral_matches[mI]->Orientation(1));
01567 c.SetLength(ancestral_matches[mI]->Length(1), child_1);
01568 cga_list.push_back(make_tuple(c.Copy(), &bs[mI], ancestral_matches[mI]));
01569 }
01570
01571 stack<node_id_t> node_stack;
01572 node_stack.push(child_0);
01573 node_stack.push(child_1);
01574 while(node_stack.size() > 0)
01575 {
01576 node_id_t cur_node = node_stack.top();
01577 node_stack.pop();
01578 if( alignment_tree[cur_node].children.size() == 0 )
01579 continue;
01580 node_stack.push(alignment_tree[cur_node].children[0]);
01581 node_stack.push(alignment_tree[cur_node].children[1]);
01582
01583
01584
01585
01586
01587
01588 vector< SuperInterval >& siv_list = alignment_tree[cur_node].ordering;
01589 SingleStartComparator< CompactGappedAlignment<> > ssc(cur_node);
01590 CgaBsComp< SingleStartComparator< CompactGappedAlignment<> > > comp( ssc );
01591 sort(cga_list.begin(), cga_list.end(), comp);
01592 size_t mI = 0;
01593 size_t sivI = 0;
01594 while( mI < cga_list.size() && sivI < siv_list.size() )
01595 {
01596 CompactGappedAlignment<>* cur_match = cga_list[mI].get<0>();
01597 if( cur_match->Start(cur_node) == 0 )
01598 {
01599 mI++;
01600 continue;
01601 }
01602 if( cur_match->LeftEnd(cur_node) >= siv_list[sivI].LeftEnd() + siv_list[sivI].Length() )
01603 {
01604 sivI++;
01605 continue;
01606 }
01607
01608 if( cur_match->LeftEnd(cur_node) + cur_match->Length(cur_node) >
01609 siv_list[sivI].LeftEnd() + siv_list[sivI].Length() )
01610 {
01611 cerr << "doesn't fit\n";
01612 cerr << "cga_list[" << mI << "]->LeftEnd(" << cur_node << "): " << cur_match->LeftEnd(cur_node) << endl;
01613 cerr << "cga_list[" << mI << "]->RightEnd(" << cur_node << "): " << cur_match->RightEnd(cur_node) << endl;
01614 breakHere();
01615 }
01616
01617
01618 CompactGappedAlignment<>* siv_cga = dynamic_cast<CompactGappedAlignment<>*>(siv_list[sivI].reference_iv.GetMatches()[0]);
01619 if( siv_list[sivI].reference_iv.GetMatches().size() > 1 )
01620 siv_cga = NULL;
01621 if( siv_cga == NULL )
01622 {
01623 CompactGappedAlignment<> tmp_cga;
01624 siv_cga = tmp_cga.Copy();
01625 *siv_cga = CompactGappedAlignment<>(siv_list[sivI].reference_iv);
01626 vector<AbstractMatch*> tmp_matches(1,siv_cga);
01627 siv_list[sivI].reference_iv.SetMatches(tmp_matches);
01628 }
01629 CompactGappedAlignment<> new_cga;
01630 siv_cga->copyRange(new_cga, cur_match->LeftEnd(cur_node) - siv_list[sivI].LeftEnd(), cur_match->Length(cur_node));
01631 if( cur_match->Orientation(cur_node) == AbstractMatch::reverse )
01632 new_cga.Invert();
01633 if( new_cga.Multiplicity() == 0 )
01634 {
01635 cerr << "impossible! there's no match!\n";
01636 genome::breakHere();
01637 }
01638
01639 for( uint cur_child = 0; cur_child < 2; cur_child++ )
01640 {
01641 node_id_t sweet_child = alignment_tree[cur_node].children[cur_child];
01642 cur_match->SetLeftEnd(sweet_child, new_cga.LeftEnd(cur_child));
01643 if( new_cga.LeftEnd(cur_child) != NO_MATCH )
01644 {
01645 cur_match->SetOrientation(sweet_child, new_cga.Orientation(cur_child));
01646 cur_match->SetLength(new_cga.Length(cur_child), sweet_child);
01647 }
01648 }
01649
01650
01651 CompactGappedAlignment<> c(1,(*cga_list[mI].get<1>())[cur_node].size());
01652 c.SetLeftEnd(0,1);
01653 c.SetLength((*cga_list[mI].get<1>())[cur_node].count(),0);
01654 vector<bitset_t> bivouac(1, (*cga_list[mI].get<1>())[cur_node]);
01655 c.SetAlignment(bivouac);
01656
01657
01658 for( uint cur_child = 0; cur_child < 2; cur_child++ )
01659 {
01660 if( new_cga.Orientation(cur_child) == AbstractMatch::undefined )
01661 continue;
01662 CompactGappedAlignment<> cga_tmp = new_cga;
01663 cga_tmp.SetStart(cur_child, 1);
01664 c.translate(cga_tmp, cur_child, 0, false);
01665
01666 bitset_t bs = (cga_tmp.GetAlignment())[cur_child];
01667 bs.resize(c.GetAlignment()[0].size(), false);
01668 bs <<= c.GetAlignment()[0].find_first();
01669 node_id_t sweet_child = alignment_tree[cur_node].children[cur_child];
01670 swap( (*cga_list[mI].get<1>())[sweet_child], bs );
01671 for( size_t testI = 0; testI < cga_tmp.SeqCount(); ++testI )
01672 {
01673 if( ((*cga_list[mI].get<1>())[testI].size() != 0 && (*cga_list[mI].get<1>())[testI].size() != (*cga_list[mI].get<1>())[sweet_child].size() ) )
01674 {
01675 cerr << "bj0rk3l\n";
01676 genome::breakHere();
01677 }
01678 }
01679 }
01680
01681 debugging_cltm = false;
01682 mI++;
01683 }
01684 }
01685 tracking_matches.resize( cga_list.size() );
01686
01687 for( size_t bsI = 0; bsI < cga_list.size(); ++bsI )
01688 {
01689 cga_list[bsI].get<0>()->SetAlignment(*cga_list[bsI].get<1>());
01690 cga_list[bsI].get<0>()->validate();
01691 TrackingMatch& ltm = tracking_matches[bsI];
01692 ltm.node_match = cga_list[bsI].get<0>();
01693 ltm.original_match = cga_list[bsI].get<2>();
01694 ltm.match_id = bsI;
01695
01696 bool found_extant = false;
01697 for( size_t i = 0; i < alignment_tree.size()-1; ++i )
01698 {
01699 size_t im = node_sequence_map[i];
01700 if( im == (std::numeric_limits<size_t>::max)() )
01701 continue;
01702 if( ltm.node_match->LeftEnd(i) != NO_MATCH )
01703 found_extant = true;
01704 }
01705 if( !found_extant )
01706 {
01707 cout << "orig aln len: " << ltm.original_match->AlignmentLength() << endl;
01708 cout << "orig lend 0: " << ltm.original_match->Start(0) << endl;
01709 cout << "orig lend 1: " << ltm.original_match->Start(1) << endl;
01710 cout << "orig length 0: " << ltm.original_match->Length(0) << endl;
01711 cout << "orig length 1: " << ltm.original_match->Length(1) << endl;
01712
01713 cerr << "this is an ungrounded match!!!\n";
01714 genome::breakHere();
01715 }
01716 }
01717 }
01718
01719 size_t countUnrefined( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t ancestor )
01720 {
01721 stack< node_id_t > node_stack;
01722 node_stack.push(ancestor);
01723 size_t unrefined_count = 0;
01724 while( node_stack.size() > 0 )
01725 {
01726 node_id_t cur_node = node_stack.top();
01727 node_stack.pop();
01728 if( alignment_tree[cur_node].children.size() > 0 )
01729 for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); ++childI )
01730 node_stack.push( alignment_tree[cur_node].children[childI] );
01731 if( !alignment_tree[cur_node].refined )
01732 unrefined_count++;
01733 }
01734 return unrefined_count;
01735 }
01736
01737 void markAsRefined( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t ancestor )
01738 {
01739 stack< node_id_t > node_stack;
01740 node_stack.push(ancestor);
01741 size_t refined_count = 0;
01742 while( node_stack.size() > 0 )
01743 {
01744 node_id_t cur_node = node_stack.top();
01745 node_stack.pop();
01746 if( alignment_tree[cur_node].children.size() > 0 )
01747 for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); ++childI )
01748 node_stack.push( alignment_tree[cur_node].children[childI] );
01749 alignment_tree[cur_node].refined = true;
01750 }
01751 alignment_tree[ancestor].refined = false;
01752 }
01753
01754
01755
01756 void ProgressiveAligner::pairwiseScoreTrackingMatches(
01757 std::vector< TrackingMatch >& tracking_matches,
01758 std::vector<node_id_t>& node1_descendants,
01759 std::vector<node_id_t>& node2_descendants,
01760 boost::multi_array< double, 3 >& tm_score_array)
01761 {
01762 tm_score_array.resize( boost::extents[tracking_matches.size()][node1_descendants.size()][node2_descendants.size()] );
01763 for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
01764 {
01765 TrackingMatch* cur_match = &tracking_matches[mI];
01766 AbstractMatch* node_match = cur_match->node_match;
01767 for( size_t nI = 0; nI < node1_descendants.size(); ++nI )
01768 {
01769 if( node_sequence_map[node1_descendants[nI]] == (std::numeric_limits<uint>::max)() ||
01770 node_match->LeftEnd(node1_descendants[nI]) == NO_MATCH )
01771 continue;
01772 for( size_t nJ = 0; nJ < node2_descendants.size(); ++nJ )
01773 {
01774 if( node_sequence_map[node2_descendants[nJ]] == (std::numeric_limits<uint>::max)() ||
01775 node_match->LeftEnd(node2_descendants[nJ]) == NO_MATCH )
01776 continue;
01777
01778 node_id_t cur_n1 = node1_descendants[nI];
01779 node_id_t cur_n2 = node2_descendants[nJ];
01780 size_t nsmI = node_sequence_map[cur_n1];
01781 size_t nsmJ = node_sequence_map[cur_n2];
01782 PairwiseMatchAdapter pma( node_match, cur_n1, cur_n2 );
01783 vector< AbstractMatch* > lcb_vect( 1, &pma );
01784 vector< gnSequence* > ex_seqs(2);
01785 ex_seqs[0] = alignment_tree[ cur_n1 ].sequence;
01786 ex_seqs[1] = alignment_tree[ cur_n2 ].sequence;
01787
01788 tm_score_array[mI][nI][nJ] = GetPairwiseAnchorScore(lcb_vect, ex_seqs, subst_scoring, sol_list[nsmI], sol_list[nsmJ]);
01789 }
01790 }
01791 }
01792 computeAvgAncestralMatchScores(tracking_matches, node1_descendants, node2_descendants, tm_score_array);
01793 }
01794
01795 void ProgressiveAligner::computeAvgAncestralMatchScores(
01796 std::vector< TrackingMatch >& tracking_matches,
01797 std::vector<node_id_t>& node1_descendants,
01798 std::vector<node_id_t>& node2_descendants,
01799 boost::multi_array< double, 3 >& tm_score_array)
01800 {
01801
01802 for( uint nodeI = 0; nodeI < node1_descendants.size(); nodeI++ )
01803 {
01804 for( uint nodeJ = 0; nodeJ < node2_descendants.size(); nodeJ++ )
01805 {
01806 node_id_t n1 = node1_descendants[nodeI];
01807 node_id_t n2 = node2_descendants[nodeJ];
01808
01809 vector<node_id_t> n1_ext;
01810 vector<node_id_t> n2_ext;
01811 getAlignedChildren(n1, n1_ext);
01812 getAlignedChildren(n2, n2_ext);
01813 if( n1_ext.size() == 1 && n2_ext.size() == 1 )
01814 continue;
01815
01816
01817 vector< node_id_t > n1_ext_map(n1_ext.size());
01818 for( size_t i = 0; i < n1_ext.size(); ++i )
01819 {
01820 vector< node_id_t >::iterator iter = std::find( node1_descendants.begin(), node1_descendants.end(), n1_ext[i] );
01821 n1_ext_map[i] = iter - node1_descendants.begin();
01822 }
01823 vector< node_id_t > n2_ext_map(n2_ext.size());
01824 for( size_t i = 0; i < n2_ext.size(); ++i )
01825 {
01826 vector< node_id_t >::iterator iter = std::find( node2_descendants.begin(), node2_descendants.end(), n2_ext[i] );
01827 n2_ext_map[i] = iter - node2_descendants.begin();
01828 }
01829
01830
01831 for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
01832 {
01833 uint tally = 0;
01834 double score_sum = 0;
01835 for( size_t i = 0; i < n1_ext.size(); ++i )
01836 {
01837 if( tracking_matches[mI].node_match->LeftEnd(n1_ext[i]) == NO_MATCH )
01838 continue;
01839 for( size_t j = 0; j < n2_ext.size(); ++j )
01840 {
01841 if( tracking_matches[mI].node_match->LeftEnd(n2_ext[j]) == NO_MATCH )
01842 continue;
01843 ++tally;
01844 score_sum += tm_score_array[mI][n1_ext_map[i]][n2_ext_map[j]];
01845 }
01846 }
01847 if( tally > 0 )
01848 tm_score_array[mI][nodeI][nodeJ] = score_sum / (double)tally;
01849 }
01850 }
01851 }
01852 }
01853
01854
01855 void ProgressiveAligner::computeInternalNodeDistances(
01856 boost::multi_array<double, 2>& bp_dist_mat,
01857 boost::multi_array<double, 2>& cons_dist_mat,
01858 std::vector<node_id_t>& node1_descendants,
01859 std::vector<node_id_t>& node2_descendants)
01860 {
01861
01862 bp_dist_mat.resize(boost::extents[node1_descendants.size()][node2_descendants.size()]);
01863 cons_dist_mat.resize(boost::extents[node1_descendants.size()][node2_descendants.size()]);
01864 for( size_t nI = 0; nI < node1_descendants.size(); ++nI )
01865 {
01866 if( node_sequence_map[node1_descendants[nI]] == (std::numeric_limits<uint>::max)() )
01867 continue;
01868 for( size_t nJ = 0; nJ < node2_descendants.size(); ++nJ )
01869 {
01870 if( node_sequence_map[node2_descendants[nJ]] == (std::numeric_limits<uint>::max)() )
01871 continue;
01872 size_t i = node_sequence_map[node1_descendants[nI]];
01873 size_t j = node_sequence_map[node2_descendants[nJ]];
01874 bp_dist_mat[nI][nJ] = this->bp_distance[i][j];
01875 cons_dist_mat[nI][nJ] = this->conservation_distance[i][j];
01876 }
01877 }
01878
01879
01880 for( uint nodeI = 0; nodeI < node1_descendants.size(); nodeI++ )
01881 {
01882 for( uint nodeJ = 0; nodeJ < node2_descendants.size(); nodeJ++ )
01883 {
01884 node_id_t n1 = node1_descendants[nodeI];
01885 node_id_t n2 = node2_descendants[nodeJ];
01886
01887 vector<node_id_t> n1_ext;
01888 vector<node_id_t> n2_ext;
01889 getAlignedChildren(n1, n1_ext);
01890 getAlignedChildren(n2, n2_ext);
01891 if( n1_ext.size() == 1 && n2_ext.size() == 1 )
01892 continue;
01893
01894
01895 vector< node_id_t > n1_ext_map(n1_ext.size());
01896 for( size_t i = 0; i < n1_ext.size(); ++i )
01897 {
01898 vector< node_id_t >::iterator iter = std::find( node1_descendants.begin(), node1_descendants.end(), n1_ext[i] );
01899 n1_ext_map[i] = iter - node1_descendants.begin();
01900 }
01901 vector< node_id_t > n2_ext_map(n2_ext.size());
01902 for( size_t i = 0; i < n2_ext.size(); ++i )
01903 {
01904 vector< node_id_t >::iterator iter = std::find( node2_descendants.begin(), node2_descendants.end(), n2_ext[i] );
01905 n2_ext_map[i] = iter - node2_descendants.begin();
01906 }
01907
01908
01909 for( size_t i = 0; i < n1_ext.size(); ++i )
01910 {
01911 for( size_t j = 0; j < n2_ext.size(); ++j )
01912 {
01913 bp_dist_mat[nodeI][nodeJ] += bp_dist_mat[n1_ext_map[i]][n2_ext_map[j]];
01914 cons_dist_mat[nodeI][nodeJ] += cons_dist_mat[n1_ext_map[i]][n2_ext_map[j]];
01915 }
01916 }
01917 bp_dist_mat[nodeI][nodeJ] /= (double)(n1_ext.size() * n2_ext.size());
01918 cons_dist_mat[nodeI][nodeJ] /= (double)(n1_ext.size() * n2_ext.size());
01919 }
01920 }
01921
01922 }
01923
01924 double computeID( GappedAlignment& gal, size_t seqI, size_t seqJ )
01925 {
01926 const vector< string >& aln_mat = GetAlignment( gal, vector< gnSequence* >(gal.SeqCount(), NULL ));
01927 double id = 0;
01928 double possible = 0;
01929 for( size_t colI = 0; colI < gal.AlignmentLength(); colI++ )
01930 {
01931 if( aln_mat[seqI][colI] == '-' || aln_mat[seqJ][colI] == '-' )
01932 continue;
01933 if( toupper(aln_mat[seqI][colI]) == toupper(aln_mat[seqJ][colI]))
01934 id++;
01935 possible++;
01936 }
01937 return id / possible;
01938 }
01939
01940
01941
01942
01943
01944 void ProgressiveAligner::getRepresentativeAncestralMatches( const vector< node_id_t > node1_seqs, const vector< node_id_t > node2_seqs, node_id_t node1, node_id_t node2, node_id_t ancestor, std::vector< AbstractMatch* >& ancestral_matches )
01945 {
01946
01947
01948 vector< AbstractMatch* > cur_matches;
01949 boost::multi_array< vector< AbstractMatch* >, 2 > seq_matches( boost::extents[node1_seqs.size()][node2_seqs.size()] );
01950 for( size_t mI = 0; mI < original_ml.size(); mI++ )
01951 {
01952 for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
01953 {
01954 uint ii = this->node_sequence_map[node1_seqs[seqI]];
01955 if( original_ml[mI]->LeftEnd(ii) == NO_MATCH )
01956 continue;
01957
01958 for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
01959 {
01960 uint jj = this->node_sequence_map[node2_seqs[seqJ]];
01961 if( original_ml[mI]->LeftEnd(jj) == NO_MATCH )
01962 continue;
01963 Match mm( 2 );
01964 Match* new_m = mm.Copy();
01965 new_m->SetStart( 0, original_ml[mI]->Start(ii));
01966 new_m->SetStart( 1, original_ml[mI]->Start(jj));
01967 new_m->SetLength(original_ml[mI]->Length());
01968 if( new_m->Start(0) < 0 )
01969 new_m->Invert();
01970 seq_matches[seqI][seqJ].push_back( new_m );
01971 break;
01972 }
01973 break;
01974 }
01975 }
01976 for( uint seqI = 0; seqI < node1_seqs.size(); seqI++ )
01977 for( uint seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
01978 {
01979 translateGappedCoordinates( seq_matches[seqI][seqJ], 0, node1_seqs[seqI], node1 );
01980 translateGappedCoordinates( seq_matches[seqI][seqJ], 1, node2_seqs[seqJ], node2 );
01981 ancestral_matches.insert( ancestral_matches.end(), seq_matches[seqI][seqJ].begin(), seq_matches[seqI][seqJ].end() );
01982 }
01983
01984 EliminateOverlaps_v2( ancestral_matches, true );
01985 }
01986
01987 int cachecomp( const void* e1, const void* e2 )
01988 {
01989 bool a = mems::cache_comparator(*(search_cache_t*)e1, *(search_cache_t*)e2);
01990 bool b = mems::cache_comparator(*(search_cache_t*)e2, *(search_cache_t*)e1);
01991 if(!a && !b)
01992 return 0;
01993 return a ? -1 : 1;
01994 }
01995
01996 void ProgressiveAligner::alignProfileToProfile( node_id_t node1, node_id_t node2, node_id_t ancestor )
01997 {
01998
01999
02000
02001
02002
02003
02004
02005
02006
02007
02008 vector< node_id_t > node1_seqs;
02009 vector< node_id_t > node2_seqs;
02010 getAlignedChildren( node1, node1_seqs );
02011 getAlignedChildren( node2, node2_seqs );
02012
02013 uint seqI, seqJ;
02014 gnSeqI prev_ancestral_seq_len = (std::numeric_limits<gnSeqI>::max)();
02015
02016 printMemUsage();
02017 cout << "get ancestral matches\n";
02018
02019 Matrix<MatchList> pairwise_matches( node1_seqs.size(), node2_seqs.size() );
02020
02021 vector< AbstractMatch* > anc_pairwise_matches;
02022 getRepresentativeAncestralMatches( node1_seqs, node2_seqs, node1, node2, ancestor, anc_pairwise_matches );
02023 printMemUsage();
02024
02025 PhyloTree< AlignmentTreeNode > aln_tree_backup;
02026
02028 Matrix< std::vector< search_cache_t > > search_cache_db(node1_seqs.size(), node2_seqs.size());
02029 double prev_anchoring_score = -(std::numeric_limits<double>::max)();
02030 double cur_anchoring_score = -(std::numeric_limits<double>::max)();
02031
02032 while(true)
02033 {
02034 vector<AbstractMatch*> ancestral_matches;
02035 if( anc_pairwise_matches.size() > 0 )
02036 {
02037 ancestral_matches.insert( ancestral_matches.begin(), anc_pairwise_matches.begin(), anc_pairwise_matches.end() );
02038 anc_pairwise_matches.clear();
02039 }
02040 else
02041 {
02042
02043
02044
02045
02046
02047
02048 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02049 {
02050 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02051 {
02052 cout << node_sequence_map[node1_seqs[seqI]] << "," << node_sequence_map[node2_seqs[seqJ]] << " has " << pairwise_matches(seqI,seqJ).size() << " pairwise matches\n";
02053 cout.flush();
02054
02055 vector< AbstractMatch* > am_list( pairwise_matches(seqI, seqJ).begin(), pairwise_matches(seqI, seqJ).end() );
02056 pairwise_matches(seqI, seqJ).clear();
02057 translateGappedCoordinates( am_list, 1, node2_seqs[seqJ], node2 );
02058 translateGappedCoordinates( am_list, 0, node1_seqs[seqI], node1 );
02059 ancestral_matches.insert( ancestral_matches.end(), am_list.begin(), am_list.end() );
02060 }
02061 }
02062 }
02063
02064 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
02065 {
02066 Interval& ref_iv = alignment_tree[ancestor].ordering[aI].reference_iv;
02067 if( ref_iv.Multiplicity() == 2 )
02068 for( size_t mI = 0; mI < ref_iv.GetMatches().size(); mI++ )
02069 if( ref_iv.GetMatches()[mI]->Multiplicity() > 1 )
02070 ancestral_matches.push_back( ref_iv.GetMatches()[mI]->Copy() );
02071 }
02072
02073
02074 for( size_t mI = 0; mI < ancestral_matches.size(); ++mI )
02075 if( ancestral_matches[mI]->Start(0) < 0 )
02076 ancestral_matches[mI]->Invert();
02077
02078
02079
02080 EliminateOverlaps_v2( ancestral_matches );
02081
02082 multFilter( ancestral_matches );
02083
02084 vector< vector< AbstractMatch* > > LCB_list;
02085 vector< LCB > adjacencies;
02086 vector< gnSeqI > breakpoints;
02087
02088 if( !collinear_genomes )
02089 {
02090 cout << "Performing Sum-of-pairs Greedy Breakpoint Elimination\n";
02091 cout.flush();
02092
02093
02094
02095
02096
02097
02098
02099
02100
02101
02102 printMemUsage();
02103 cout << "construct LCB tracking matches\n";
02104 vector< TrackingMatch > tracking_matches;
02105 boost::multi_array< size_t, 3 > tm_lcb_id_array;
02106 boost::multi_array< double, 3 > tm_score_array;
02107 constructLcbTrackingMatches( ancestor, ancestral_matches, tracking_matches );
02108
02109 cout << "There are " << tracking_matches.size() << " tracking matches\n";
02110 size_t used_components = 0;
02111 for( size_t tmI = 0; tmI < tracking_matches.size(); ++tmI )
02112 {
02113 for( uint ssI = 0; ssI < tracking_matches[tmI].node_match->SeqCount(); ++ssI )
02114 if( tracking_matches[tmI].node_match->LeftEnd(ssI) != NO_MATCH )
02115 used_components++;
02116 }
02117 size_t total_components = tracking_matches.size() == 0 ? 0 : tracking_matches.size() * tracking_matches[0].node_match->SeqCount();
02118 cout << "There are " << used_components << " / " << total_components << " components used\n";
02119
02120 vector<node_id_t> node1_descendants;
02121 vector<node_id_t> node2_descendants;
02122 if( scoring_scheme == ExtantSumOfPairsScoring )
02123 {
02124 node1_descendants = node1_seqs;
02125 node2_descendants = node2_seqs;
02126 }else{
02127 getDescendants(alignment_tree, node1, node1_descendants);
02128 getDescendants(alignment_tree, node2, node2_descendants);
02129 }
02130
02131
02132
02133
02134 printMemUsage();
02135 cout << "init tracking match LCB tracking\n";
02136 initTrackingMatchLCBTracking( tracking_matches, node1_descendants.size(), node2_descendants.size(), tm_lcb_id_array );
02137 printMemUsage();
02138 cout << "pairwise score tracking matches\n";
02139 pairwiseScoreTrackingMatches( tracking_matches, node1_descendants, node2_descendants, tm_score_array );
02140 printMemUsage();
02141
02142
02143
02144 boost::multi_array<double, 2> bp_dist_mat;
02145 boost::multi_array<double, 2> cons_dist_mat;
02146 computeInternalNodeDistances( bp_dist_mat, cons_dist_mat, node1_descendants, node2_descendants);
02147
02148 vector< TrackingMatch* > t_matches(tracking_matches.size());
02149 for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
02150 t_matches[mI] = &tracking_matches[mI];
02151
02152
02153 cout << "get pairwise LCBs\n";
02154 size_t pair_lcb_count = 0;
02155 PairwiseLCBMatrix pairwise_adj_mat(boost::extents[node1_descendants.size()][node2_descendants.size()]);
02156 for( uint nodeI = 0; nodeI < node1_descendants.size(); nodeI++ )
02157 for( uint nodeJ = 0; nodeJ < node2_descendants.size(); nodeJ++ )
02158 {
02159 getPairwiseLCBs( node1_descendants[nodeI], node2_descendants[nodeJ], nodeI, nodeJ, t_matches, pairwise_adj_mat[nodeI][nodeJ], tm_score_array, tm_lcb_id_array );
02160 pair_lcb_count += pairwise_adj_mat[nodeI][nodeJ].size();
02161 }
02162 cout << "there are " << pair_lcb_count << " pairwise LCBs\n";
02163 printMemUsage();
02164
02165 sort( t_matches.begin(), t_matches.end() );
02166
02167
02168
02169
02170
02171
02172
02173
02174
02175
02176
02177
02178
02179
02180
02181
02182
02183
02184
02185
02186
02187
02188
02189
02190
02191
02192 cout << "scaling bp penalty by conservation weight:\n";
02193 print2d_matrix(cons_dist_mat, cout);
02194 cout << "\n\nscaling bp penalty by bp weight: \n";
02195 print2d_matrix(bp_dist_mat, cout);
02196 cout << "\nGreedy BPE\n";
02197 vector< TrackingMatch* > final;
02198 if(scoring_scheme == AncestralScoring)
02199 {
02200 vector<node_id_t>::iterator d1_iter = std::find( node1_descendants.begin(), node1_descendants.end(), node1 );
02201 vector<node_id_t>::iterator d2_iter = std::find( node2_descendants.begin(), node2_descendants.end(), node2 );
02202 size_t d1_index = d1_iter - node1_descendants.begin();
02203 size_t d2_index = d2_iter - node2_descendants.begin();
02204 EvenFasterSumOfPairsBreakpointScorer spbs( breakpoint_penalty, min_breakpoint_penalty, bp_dist_mat, cons_dist_mat,
02205 t_matches, pairwise_adj_mat, node1_descendants, node2_descendants,
02206 tm_score_array, tm_lcb_id_array, d1_index, d1_index+1, d2_index, d2_index+1 );
02207 cur_anchoring_score = greedySearch( spbs );
02208 final = spbs.getResults();
02209 }else{
02210 EvenFasterSumOfPairsBreakpointScorer spbs( breakpoint_penalty, min_breakpoint_penalty, bp_dist_mat, cons_dist_mat,
02211 t_matches, pairwise_adj_mat, node1_descendants, node2_descendants,
02212 tm_score_array, tm_lcb_id_array, 0, node1_descendants.size(), 0, node2_descendants.size() );
02213 cur_anchoring_score = greedySearch( spbs );
02214 final = spbs.getResults();
02215 }
02216 cout << "done\n";
02217
02218
02219 for( size_t mI = 0; mI < tracking_matches.size(); ++mI )
02220 tracking_matches[mI].node_match->Free();
02221
02222 ancestral_matches.clear();
02223
02224
02225 std::sort(final.begin(), final.end());
02226 vector< TrackingMatch* > deleted_t_matches( t_matches.size(), NULL );
02227 std::set_difference( t_matches.begin(), t_matches.end(), final.begin(), final.end(), deleted_t_matches.begin() );
02228 for( size_t delI = 0; delI < deleted_t_matches.size(); ++delI )
02229 {
02230 if( deleted_t_matches[delI] == NULL )
02231 break;
02232 deleted_t_matches[delI]->original_match->Free();
02233 }
02234
02235
02236 vector< AbstractMatch* > new_matches(final.size());
02237 for( size_t mI = 0; mI < final.size(); ++mI )
02238 new_matches[mI] = final[mI]->original_match;
02239
02240 IdentifyBreakpoints( new_matches, breakpoints );
02241 ComputeLCBs_v2( new_matches, breakpoints, LCB_list );
02242
02243 }
02244 else
02245 {
02246
02247
02248 IdentifyBreakpoints( ancestral_matches, breakpoints );
02249 ComputeLCBs_v2( ancestral_matches, breakpoints, LCB_list );
02250
02251 vector< double > lcb_scores( LCB_list.size() );
02252 double score_sum = 100;
02253 for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
02254 {
02255 lcb_scores[lcbI] = SimpleGetLCBCoverage( LCB_list[lcbI] );
02256 score_sum += lcb_scores[lcbI];
02257 }
02258
02259 computeLCBAdjacencies_v3( LCB_list, lcb_scores, adjacencies );
02260
02261
02262 SimpleBreakpointScorer wbs( adjacencies, score_sum, true );
02263 cur_min_coverage = greedyBreakpointElimination_v4( adjacencies, lcb_scores, wbs, NULL, false );
02264 vector<AbstractMatch*> deleted_matches;
02265 filterMatches_v2( adjacencies, LCB_list, lcb_scores, deleted_matches );
02266 for( size_t delI = 0; delI < deleted_matches.size(); ++delI )
02267 deleted_matches[delI]->Free();
02268 }
02269 printMemUsage();
02270
02271 ancestral_matches.clear();
02272
02273 cout << "Arrived at " << LCB_list.size() << " intervals\n";
02274
02275 vector< Interval* > pairwise_intervals;
02276 Interval tmp_iv;
02277 for( size_t lcbI = 0; lcbI < LCB_list.size(); lcbI++ )
02278 {
02279 pairwise_intervals.push_back( tmp_iv.Copy() );
02280 pairwise_intervals.back()->SetMatches( LCB_list[lcbI] );
02281 }
02282 LCB_list.clear();
02283
02284 vector<gnSeqI> seq_lengths = vector<gnSeqI>(2,0);
02285 for( size_t aI = 0; aI < alignment_tree[node1].ordering.size(); ++aI )
02286 seq_lengths[0] += alignment_tree[node1].ordering[aI].Length();
02287 for( size_t aI = 0; aI < alignment_tree[node2].ordering.size(); ++aI )
02288 seq_lengths[1] += alignment_tree[node2].ordering[aI].Length();
02289
02290 cout << "Adding unaligned intervals\n";
02291 addUnalignedIntervals_v2(pairwise_intervals, set<uint>(), seq_lengths);
02292
02293 cout << "addUnalignedIntervals yields " << pairwise_intervals.size() << " intervals\n";
02294
02295 bool borked = false;
02296 if(debug_aligner)
02297 borked = validatePairwiseIntervals(node1, node2, pairwise_intervals);
02298
02299
02300 cout << "Merging unaligned intervals\n";
02301 cout.flush();
02302 vector<Interval*> new_list1;
02303 vector<Interval*> merged_intervals;
02304 mergeUnalignedIntervals( 1, pairwise_intervals, new_list1 );
02305 mergeUnalignedIntervals( 0, new_list1, merged_intervals );
02306 cout << "Marbling gaps\n";
02307 cout.flush();
02308 for( size_t ivI = 0; ivI < merged_intervals.size(); ivI++ )
02309 merged_intervals[ivI]->Marble(50);
02310
02311 cout << "Propagating descendant breakpoints\n";
02312
02313
02314 propagateDescendantBreakpoints( node1, 0, merged_intervals );
02315 propagateDescendantBreakpoints( node2, 1, merged_intervals );
02316
02317 cout << "descendant 0(" << node1 << ") has " << alignment_tree[node1].ordering.size() << " intervals\n";
02318 cout << "descendant 1(" << node2 << ") has " << alignment_tree[node2].ordering.size() << " intervals\n";
02319 cout << "propagateDescendantBreakpoints yields " << merged_intervals.size() << " intervals\n";
02320
02321 if(debug_aligner)
02322 borked = validatePairwiseIntervals(node1, node2, merged_intervals);
02323 cout << "Creating ancestral ordering\n";
02324 alignment_tree[ancestor].ordering.clear();
02325 createAncestralOrdering( merged_intervals, alignment_tree[ancestor].ordering );
02326 for( size_t ivI = 0; ivI < merged_intervals.size(); ivI++ )
02327 merged_intervals[ivI]->Free();
02328 merged_intervals.clear();
02329
02330 if(debug_aligner)
02331 validateSuperIntervals( node1, node2, ancestor );
02332
02333
02334 gnSeqI cur_ancestral_seq_len = 0;
02335 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
02336 cur_ancestral_seq_len += alignment_tree[ancestor].ordering[aI].Length();
02337
02338 if( !collinear_genomes )
02339 cout << "Previous anchoring score: " << prev_anchoring_score << ", new anchor score: " << cur_anchoring_score << endl;
02340 else
02341 cout << "Prev alignment len: " << prev_ancestral_seq_len << ", new alignment length: " << cur_ancestral_seq_len << endl;
02342
02343
02344 if( collinear_genomes && cur_ancestral_seq_len >= prev_ancestral_seq_len )
02345 break;
02346
02347 if( !collinear_genomes && cur_anchoring_score <= prev_anchoring_score )
02348 break;
02349 prev_anchoring_score = cur_anchoring_score;
02350 prev_ancestral_seq_len = cur_ancestral_seq_len;
02351
02352
02353 cout << "Backing up alignment tree...\n";
02354 cout.flush();
02355 aln_tree_backup = alignment_tree;
02356
02357 cout << "propagating ancestral breakpoints\n";
02358 cout.flush();
02359 recursiveApplyAncestralBreakpoints(ancestor);
02360
02361
02362 if( debug_me )
02363 {
02364 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
02365 {
02366 GappedAlignment gal;
02367 extractAlignment(ancestor, aI, gal);
02368
02369 bool check = false;
02370 for( size_t ii = 0; ii < gal.SeqCount(); ++ii )
02371 {
02372 if( gal.LeftEnd(ii) == 0 )
02373 continue;
02374 for( size_t jj = 0; jj < gal.SeqCount(); ++jj )
02375 {
02376 if( gal.LeftEnd(jj) == 0 )
02377 continue;
02378 check = check || computeID( gal, ii, jj ) < .5;
02379 }
02380 }
02381 if( check )
02382 cerr << "check iv " << aI << " dbg_count " << dbg_count << endl;
02383 else
02384 continue;
02385
02386 const vector< string >& aln_mat = GetAlignment(gal, this->original_ml.seq_table);
02387 gnSequence seq;
02388 for( size_t seqI = 0; seqI < gal.SeqCount(); ++seqI )
02389 if( gal.LeftEnd(seqI) != NO_MATCH )
02390 seq += aln_mat[seqI];
02391
02392 stringstream dbg_fname;
02393 dbg_fname << "prof_dbg_iv_" << aI << ".dbg." << dbg_count++ << ".fas";
02394 ofstream debug_file( dbg_fname.str().c_str() );
02395 gnFASSource::Write( seq, debug_file, false );
02396 debug_file.close();
02397 }
02398 }
02399
02400 if(debug_aligner)
02401 validateSuperIntervals( node1, node2, ancestor );
02402
02403 if(recursive)
02404 {
02405
02406 cout << "recursive anchor search\n";
02407 cout.flush();
02408 Matrix<MatchList> matches;
02409 Matrix< std::vector< search_cache_t > > new_cache_db(node1_seqs.size(), node2_seqs.size());
02410
02411 boost::multi_array< std::vector< std::vector< int64 > >, 2 > iv_regions( boost::extents[node1_seqs.size()][node2_seqs.size()] );
02412 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02413 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02414 iv_regions[seqI][seqJ].resize(2);
02415 vector< gnSequence* > bseqs( node1_seqs.size() + node2_seqs.size() );
02416 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
02417 {
02418 CompactGappedAlignment<> cga;
02419 extractAlignment(ancestor, aI, cga);
02420 recurseOnPairs(node1_seqs, node2_seqs, cga, matches, search_cache_db, new_cache_db, iv_regions);
02421
02422
02423 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02424 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02425 pairwise_matches(seqI, seqJ).insert( pairwise_matches(seqI, seqJ).end(), matches(seqI, seqJ).begin(), matches(seqI, seqJ).end() );
02426
02427 }
02428
02429
02430 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02431 bseqs[seqI] = alignment_tree[ node1_seqs[seqI] ].sequence;
02432 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02433 bseqs[seqI+seqJ] = alignment_tree[ node2_seqs[seqJ] ].sequence;
02434
02435 MaskedMemHash nway_mh;
02436
02437 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02438 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02439 {
02440 std::sort( iv_regions[seqI][seqJ][0].begin(), iv_regions[seqI][seqJ][0].end() );
02441 std::sort( iv_regions[seqI][seqJ][1].begin(), iv_regions[seqI][seqJ][1].end() );
02442 MatchList new_matches;
02443 new_matches.seq_table.resize(2);
02444 new_matches.seq_table[0] = bseqs[seqI];
02445 new_matches.seq_table[1] = bseqs[node1_seqs.size() + seqJ];
02446 SearchLCBGaps( new_matches, iv_regions[seqI][seqJ], nway_mh );
02447 pairwise_matches(seqI, seqJ).insert( pairwise_matches(seqI, seqJ).end(), new_matches.begin(), new_matches.end() );
02448 }
02449
02450 if(using_cache_db)
02451 {
02452
02453 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02454 {
02455 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02456 {
02457 for( size_t mI = 0; mI < search_cache_db(seqI,seqJ).size(); mI++ )
02458 {
02459 if( search_cache_db(seqI,seqJ)[mI].first != NULL )
02460 search_cache_db(seqI,seqJ)[mI].first->Free();
02461 if( search_cache_db(seqI,seqJ)[mI].second != NULL )
02462 search_cache_db(seqI,seqJ)[mI].second->Free();
02463 }
02464 search_cache_db(seqI,seqJ).clear();
02465 if(new_cache_db(seqI, seqJ).size() > 0)
02466 {
02467
02468 search_cache_t* sc_array = new search_cache_t[new_cache_db(seqI,seqJ).size()];
02469 for( size_t i = 0; i < new_cache_db(seqI,seqJ).size(); i++ )
02470 sc_array[i] = new_cache_db(seqI,seqJ)[i];
02471 qsort(sc_array, new_cache_db(seqI,seqJ).size(), sizeof(AbstractMatch*), cachecomp);
02472
02473 search_cache_db(seqI, seqJ).resize(new_cache_db(seqI,seqJ).size());
02474 for( size_t i = 0; i < new_cache_db(seqI,seqJ).size(); i++ )
02475 search_cache_db(seqI, seqJ)[i] = sc_array[i];
02476 delete[] sc_array;
02477
02478 new_cache_db(seqI, seqJ).clear();
02479 }
02480 if( pairwise_matches(seqI,seqJ).size() > 0 )
02481 cout << seqI << "," << seqJ << " has an additional " << pairwise_matches(seqI,seqJ).size() << " matches\n";
02482 }
02483 }
02484
02485 }
02486 }
02487
02488
02489
02490 cout << "Restoring backed up alignment tree...\n";
02491 cout.flush();
02492 swap( alignment_tree, aln_tree_backup );
02493
02494 }
02495
02496 if( using_cache_db )
02497 {
02498
02499 for( seqI = 0; seqI < node1_seqs.size(); seqI++ )
02500 for( seqJ = 0; seqJ < node2_seqs.size(); seqJ++ )
02501 for( size_t mI = 0; mI < search_cache_db(seqI,seqJ).size(); mI++ )
02502 {
02503 if( search_cache_db(seqI,seqJ)[mI].first != NULL )
02504 search_cache_db(seqI,seqJ)[mI].first->Free();
02505 if( search_cache_db(seqI,seqJ)[mI].second != NULL )
02506 search_cache_db(seqI,seqJ)[mI].second->Free();
02507 }
02508 }
02509
02510 printMemUsage();
02511
02512
02513 swap( alignment_tree, aln_tree_backup );
02514 cout << "propagating ancestral breakpoints\n";
02515 recursiveApplyAncestralBreakpoints(ancestor);
02516
02517 printMemUsage();
02518
02519
02520 if( gapped_alignment )
02521 {
02522 cout << "performing a gapped alignment\n";
02523 doGappedAlignment(ancestor, true);
02524 }else
02525 cerr << "skipping gapped alignment\n";
02526 if( refine )
02527 {
02528 size_t unrefined = countUnrefined( alignment_tree, ancestor );
02529 if( unrefined > 5 && ancestor != alignment_tree.root )
02530 {
02531 cout << "performing iterative refinement\n";
02532 doGappedAlignment(ancestor, false);
02533 markAsRefined( alignment_tree, ancestor );
02534 }
02535 }
02536 printMemUsage();
02537
02538
02539 if( debug_me )
02540 {
02541 for( size_t aI = 0; aI < alignment_tree[ancestor].ordering.size(); aI++ )
02542 {
02543
02544 static int dbg_count = 0;
02545 GappedAlignment gal;
02546 extractAlignment(ancestor, aI, gal);
02547
02548 bool check = false;
02549 for( size_t ii = 0; ii < gal.SeqCount(); ++ii )
02550 {
02551 if( gal.LeftEnd(ii) == 0 )
02552 continue;
02553 for( size_t jj = 0; jj < gal.SeqCount(); ++jj )
02554 {
02555 if( gal.LeftEnd(jj) == 0 )
02556 continue;
02557 check = check || computeID( gal, ii, jj ) < .5;
02558 }
02559 }
02560 if( check )
02561 cerr << "check iv " << aI << " dbg_count " << dbg_count << endl;
02562 else
02563 continue;
02564
02565 const vector< string >& aln_mat = GetAlignment(gal, this->original_ml.seq_table);
02566 gnSequence seq;
02567 for( size_t seqI = 0; seqI < gal.SeqCount(); ++seqI )
02568 if( gal.LeftEnd(seqI) != NO_MATCH )
02569 seq += aln_mat[seqI];
02570
02571 stringstream dbg_fname;
02572 dbg_fname << "prof_dbg_iv_" << aI << ".dbg." << dbg_count++ << ".fas";
02573 ofstream debug_file( dbg_fname.str().c_str() );
02574 gnFASSource::Write( seq, debug_file, false );
02575 debug_file.close();
02576 }
02577 }
02578
02579
02580 }
02581
02582
02583 void addGuy( uint seqI, AbstractMatch::orientation orient,
02584 std::vector< AbstractMatch* >& new_ivs,
02585 vector<Interval*>& new_list )
02586 {
02587 Interval tmp_iv;
02588
02589 if( orient == AbstractMatch::reverse )
02590 {
02591 for( size_t nI = 0; nI < new_ivs.size(); nI++ )
02592 if( new_ivs[nI]->LeftEnd(seqI) != NO_MATCH && new_ivs[nI]->Orientation(seqI) != orient)
02593 new_ivs[nI]->Invert();
02594 }
02595
02596 Interval* added_iv = tmp_iv.Copy();
02597 added_iv->SetMatches( new_ivs );
02598 new_list.push_back(added_iv);
02599 }
02600
02601 void mergeUnalignedIntervals( uint seqI, vector< Interval* >& iv_list, vector< Interval* >& new_list )
02602 {
02603 SSC<Interval> ivlcJ(seqI);
02604 sort( iv_list.begin(), iv_list.end(), ivlcJ );
02605
02606 Interval tmp_iv;
02607 AbstractMatch::orientation orient = AbstractMatch::undefined;
02608 vector< AbstractMatch* > new_ivs;
02609 vector< Interval* > to_delete;
02610 for( size_t ordI = 0; ordI < iv_list.size(); ordI++ )
02611 {
02612 if( iv_list[ordI]->LeftEnd(seqI) == NO_MATCH )
02613 {
02614 new_list.push_back(iv_list[ordI]);
02615 iv_list[ordI] = NULL;
02616 continue;
02617 }
02618
02619 if( orient == AbstractMatch::undefined && iv_list[ordI]->Multiplicity() == 2 )
02620 {
02621 orient = iv_list[ordI]->Orientation(seqI);
02622 vector< AbstractMatch* > matches;
02623 iv_list[ordI]->StealMatches( matches );
02624 if( orient == AbstractMatch::forward )
02625 new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
02626 else
02627 new_ivs.insert( new_ivs.begin(), matches.begin(), matches.end() );
02628
02629
02630 if( ordI + 1 == iv_list.size() )
02631 addGuy( seqI, orient, new_ivs, new_list );
02632 continue;
02633 }
02634 if( orient != AbstractMatch::undefined && iv_list[ordI]->Multiplicity() == 2 )
02635 {
02636
02637
02638 addGuy( seqI, orient, new_ivs, new_list );
02639
02640
02641 vector< AbstractMatch* > matches;
02642 orient = iv_list[ordI]->Orientation(seqI);
02643 iv_list[ordI]->StealMatches( matches );
02644 new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
02645
02646 if( ordI + 1 == iv_list.size() )
02647 addGuy( seqI, orient, new_ivs, new_list );
02648 continue;
02649 }
02650 if( new_ivs.size() == 0 )
02651 {
02652 vector< AbstractMatch* > matches;
02653 iv_list[ordI]->StealMatches( matches );
02654 new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
02655 continue;
02656 }
02657
02658 Interval* left_iv = iv_list[ordI]->Copy();
02659 to_delete.push_back( left_iv );
02660 bool cropped = (ordI + 1 < iv_list.size() && iv_list[ordI]->Length(seqI) > 1);
02661 if( cropped )
02662 {
02663 gnSeqI lendo = left_iv->AlignmentLength() / 2;
02664 left_iv->CropEnd( left_iv->AlignmentLength() - lendo );
02665 iv_list[ordI]->CropStart( lendo );
02666 }
02667 vector< AbstractMatch* > matches;
02668 left_iv->StealMatches( matches );
02669 if( orient == AbstractMatch::forward )
02670 new_ivs.insert( new_ivs.end(), matches.begin(), matches.end() );
02671 else
02672 new_ivs.insert( new_ivs.begin(), matches.begin(), matches.end() );
02673
02674 addGuy( seqI, orient, new_ivs, new_list );
02675
02676 orient = AbstractMatch::undefined;
02677 if(cropped)
02678 ordI--;
02679 }
02680
02681 if( new_ivs.size() > 0 )
02682 {
02683
02684 addGuy( seqI, AbstractMatch::forward, new_ivs, new_list );
02685 }
02686
02687
02688 for( size_t delI = 0; delI < to_delete.size(); delI++ )
02689 to_delete[delI]->Free();
02690
02691
02692 for( size_t ivI = 0; ivI < iv_list.size(); ivI++ )
02693 if( iv_list[ivI] != NULL )
02694 iv_list[ivI]->Free();
02695 iv_list.clear();
02696 }
02697
02698
02702 void ProgressiveAligner::createAncestralOrdering( vector<Interval*>& interval_list, vector< SuperInterval >& ancestral_sequence )
02703 {
02704
02705 int64 left_end = 1;
02706 ancestral_sequence.resize( interval_list.size() );
02707 for( uint ivI = 0; ivI < interval_list.size(); ++ivI ){
02708 if(debug_aligner)
02709 interval_list[ivI]->ValidateMatches();
02710 vector<AbstractMatch*> matches;
02711 interval_list[ivI]->StealMatches(matches);
02712 ancestral_sequence[ivI].reference_iv.SetMatches(matches);
02713 ancestral_sequence[ivI].SetLeftEnd(left_end);
02714 ancestral_sequence[ivI].SetLength(ancestral_sequence[ivI].reference_iv.AlignmentLength());
02715 if(debug_aligner)
02716 ancestral_sequence[ivI].ValidateSelf();
02717 left_end += ancestral_sequence[ivI].Length();
02718 }
02719 }
02720
02721 void markAligned( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t subject_node, node_id_t neighbor )
02722 {
02723 for( uint parentI = 0; parentI < alignment_tree[subject_node].parents.size(); parentI++ )
02724 if( alignment_tree[subject_node].parents[parentI] == neighbor )
02725 alignment_tree[subject_node].parents_aligned[parentI] = true;
02726 for( uint childI = 0; childI < alignment_tree[subject_node].children.size(); childI++ )
02727 if( alignment_tree[subject_node].children[childI] == neighbor )
02728 alignment_tree[subject_node].children_aligned[childI] = true;
02729 }
02730
02731
02732 bool
02733 ProgressiveAligner::validateSuperIntervals(node_id_t node1, node_id_t node2, node_id_t ancestor)
02734 {
02735
02736 bool borked = false;
02737 vector< SuperInterval >& siv_list = alignment_tree[ancestor].ordering;
02738 gnSeqI n1_len = 0;
02739 gnSeqI n2_len = 0;
02740 gnSeqI my_len = 0;
02741 gnSeqI my_iv_len = 0;
02742 for( size_t sivI = 0; sivI < siv_list.size(); sivI++ )
02743 {
02744 if( siv_list[sivI].reference_iv.Start(0) != 0 )
02745 n1_len += siv_list[sivI].reference_iv.Length(0);
02746 if( siv_list[sivI].reference_iv.Start(1) != 0 )
02747 n2_len += siv_list[sivI].reference_iv.Length(1);
02748 my_len += siv_list[sivI].Length();
02749 my_iv_len += siv_list[sivI].reference_iv.AlignmentLength();
02750 siv_list[sivI].ValidateSelf();
02751 }
02752 gnSeqI real_n1len = 0;
02753 gnSeqI real_n2len = 0;
02754
02755 vector< SuperInterval >& siv1_list = alignment_tree[node1].ordering;
02756 for( size_t sivI = 0; sivI < siv1_list.size(); sivI++ )
02757 {
02758 if( siv1_list[sivI].Length() == 0 )
02759 borked = true;
02760 real_n1len += siv1_list[sivI].Length();
02761 siv1_list[sivI].ValidateSelf();
02762 }
02763
02764 vector< SuperInterval >& siv2_list = alignment_tree[node2].ordering;
02765 for( size_t sivI = 0; sivI < siv2_list.size(); sivI++ )
02766 {
02767 if( siv2_list[sivI].Length() == 0 )
02768 borked = true;
02769 real_n2len += siv2_list[sivI].Length();
02770 siv2_list[sivI].ValidateSelf();
02771 }
02772
02773 if( real_n1len != n1_len || real_n2len != n2_len )
02774 borked = true;
02775
02776
02777 for( size_t sivI = 1; sivI < siv1_list.size(); sivI++ )
02778 if( siv1_list[sivI].LeftEnd() != siv1_list[sivI-1].LeftEnd() + siv1_list[sivI-1].Length() )
02779 {
02780 borked = true;
02781 }
02782 for( size_t sivI = 1; sivI < siv2_list.size(); sivI++ )
02783 if( siv2_list[sivI].LeftEnd() != siv2_list[sivI-1].LeftEnd() + siv2_list[sivI-1].Length() )
02784 {
02785 borked = true;
02786 }
02787
02788 if( my_len != my_iv_len )
02789 borked = true;
02790
02791 if( my_len < real_n1len || my_len < real_n2len )
02792 borked = true;
02793
02794 if( borked )
02795 {
02796 breakHere();
02797 cerr << "child1 has " << siv1_list.size() << " ivs totalling " << real_n1len << " nt\n";
02798 cerr << "child2 has " << siv2_list.size() << " ivs totalling " << real_n2len << " nt\n";
02799 cerr << "parent has " << siv_list.size() << " ivs, n1_len: " << n1_len << " n2_len: " << n2_len << endl;
02800 }
02801 return borked;
02802
02803 }
02804
02805 bool ProgressiveAligner::validatePairwiseIntervals(node_id_t node1, node_id_t node2, std::vector<Interval*>& pair_iv)
02806 {
02807
02808 bool borked = false;
02809 gnSeqI n1_len = 0;
02810 gnSeqI n2_len = 0;
02811 for( size_t sivI = 0; sivI < pair_iv.size(); sivI++ )
02812 {
02813 if( pair_iv[sivI]->Start(0) != 0 )
02814 n1_len += pair_iv[sivI]->Length(0);
02815 if( pair_iv[sivI]->Start(1) != 0 )
02816 n2_len += pair_iv[sivI]->Length(1);
02817
02818 vector< bitset_t > aln_mat;
02819 pair_iv[sivI]->GetAlignment(aln_mat);
02820 if( aln_mat[0].size() != pair_iv[sivI]->AlignmentLength() )
02821 {
02822 cerr << "broked\n";
02823 }
02824 pair_iv[sivI]->ValidateMatches();
02825 }
02826 gnSeqI real_n1len = 0;
02827 gnSeqI real_n2len = 0;
02828
02829 vector< SuperInterval >& siv1_list = alignment_tree[node1].ordering;
02830 for( size_t sivI = 0; sivI < siv1_list.size(); sivI++ )
02831 {
02832 if( siv1_list[sivI].Length() == 0 )
02833 borked = true;
02834 real_n1len += siv1_list[sivI].Length();
02835 }
02836
02837 vector< SuperInterval >& siv2_list = alignment_tree[node2].ordering;
02838 for( size_t sivI = 0; sivI < siv2_list.size(); sivI++ )
02839 {
02840 if( siv2_list[sivI].Length() == 0 )
02841 borked = true;
02842 real_n2len += siv2_list[sivI].Length();
02843 }
02844
02845 if( real_n1len != n1_len || real_n2len != n2_len )
02846 borked = true;
02847
02848
02849 vector< Interval* > tmp_iv_list = pair_iv;
02850 for( uint seqI = 0; seqI < 2; seqI++ )
02851 {
02852 SSC<Interval> ssc(seqI);
02853 sort( tmp_iv_list.begin(), tmp_iv_list.end(), ssc );
02854 for( size_t ivI = 1; ivI < tmp_iv_list.size(); ivI++ )
02855 {
02856 if( tmp_iv_list[ivI-1]->LeftEnd(seqI) == NO_MATCH || tmp_iv_list[ivI]->LeftEnd(seqI) == NO_MATCH )
02857 continue;
02858 if( tmp_iv_list[ivI-1]->RightEnd(seqI) >= tmp_iv_list[ivI]->LeftEnd(seqI) )
02859 {
02860 cerr << "overlap:\n";
02861 cerr << "tmp_iv_list[ivI-1].RightEnd(seqI): " << tmp_iv_list[ivI-1]->RightEnd(seqI) << endl;
02862 cerr << "tmp_iv_list[ivI].LeftEnd(seqI): " << tmp_iv_list[ivI]->LeftEnd(seqI) << endl;
02863 breakHere();
02864 }
02865 }
02866 }
02867
02868 if( borked )
02869 {
02870 cerr << "child1 has " << siv1_list.size() << " ivs totalling " << real_n1len << " nt\n";
02871 cerr << "child2 has " << siv2_list.size() << " ivs totalling " << real_n2len << " nt\n";
02872 cerr << "parent has " << pair_iv.size() << " ivs, n1_len: " << n1_len << " n2_len: " << n2_len << endl;
02873 if( n2_len < real_n2len )
02874 {
02875 SSC<Interval> sortie(1);
02876 sort( pair_iv.begin(), pair_iv.end(), sortie );
02877 size_t prev_iv = 9999999;
02878 for( size_t ivI = 0; ivI < pair_iv.size(); ++ivI)
02879 {
02880 if( pair_iv[ivI]->LeftEnd(1) == NO_MATCH )
02881 continue;
02882
02883 if( prev_iv != 9999999 )
02884 cerr << "diff: " << pair_iv[ivI]->LeftEnd(1) - pair_iv[prev_iv]->RightEnd(1) << endl;
02885 cerr << "Interval " << ivI << " LeftEnd(1): " << pair_iv[ivI]->LeftEnd(1) << " RightEnd(1): " << pair_iv[ivI]->RightEnd(1) << std::endl;
02886 prev_iv = ivI;
02887 }
02888 }else if( n2_len > real_n2len )
02889 {
02890 SSC<Interval> sortie(1);
02891 sort( pair_iv.begin(), pair_iv.end(), sortie );
02892 for( size_t ivI = 0; ivI < pair_iv.size(); ++ivI)
02893 {
02894 if( pair_iv[ivI]->LeftEnd(1) < real_n2len )
02895 continue;
02896 cerr << "Interval " << ivI << " LeftEnd(1): " << pair_iv[ivI]->LeftEnd(1) << " RightEnd(1): " << pair_iv[ivI]->RightEnd(1) << std::endl;
02897 }
02898 }
02899 breakHere();
02900 }
02901 return borked;
02902 }
02903
02904 void ProgressiveAligner::alignNodes( node_id_t node1, node_id_t node2, node_id_t ancestor )
02905 {
02906 cout << "Aligning node " << node1 << " to " << node2 << " via " << ancestor << "!\n";
02907
02908 if( alignment_tree[node1].parents[0] != ancestor ||
02909 alignment_tree[node2].parents[0] != ancestor )
02910 {
02911 breakHere();
02912 cerr << "rotten\n";
02913 }
02914
02915 alignProfileToProfile(node1, node2, ancestor);
02916
02917
02918 markAligned( alignment_tree, node1, node2 );
02919 markAligned( alignment_tree, node2, node1 );
02920 markAligned( alignment_tree, node1, ancestor );
02921 markAligned( alignment_tree, node2, ancestor );
02922 markAligned( alignment_tree, ancestor, node1 );
02923 markAligned( alignment_tree, ancestor, node2 );
02924 }
02925
02929 void findMidpoint( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t& n1, node_id_t& n2 )
02930 {
02931
02932
02933
02934 double scaling_factor = 100000;
02935 using namespace boost;
02936 typedef adjacency_list<vecS, vecS, undirectedS, no_property,
02937 property< edge_weight_t, int, property< edge_color_t, default_color_type > > > Graph;
02938 const int V = alignment_tree.size();
02939 const std::size_t E = alignment_tree.size()-1;
02940 typedef std::pair < int, int >Edge;
02941 Edge* edge_array = new Edge[ alignment_tree.size() - 1 ];
02942 int* weights = new int[ alignment_tree.size() - 1 ];
02943 bitset_t child_found( alignment_tree.size(), false );
02944 size_t eI = 0;
02945 for( size_t vI = 0; vI < V; ++vI )
02946 {
02947 if( alignment_tree[vI].parents.size() != 0 )
02948 {
02949 edge_array[eI] = Edge( vI, alignment_tree[vI].parents[0] );
02950
02951 weights[eI] = (int)(scaling_factor * genome::absolut(alignment_tree[vI].distance)) + 1;
02952 eI++;
02953 }
02954 }
02955
02956 #if defined(BOOST_MSVC) && BOOST_MSVC <= 1300
02957
02958 Graph g(V);
02959 for (std::size_t j = 0; j < E; ++j)
02960 add_edge(edge_array[j].first, edge_array[j].second, g);
02961 #else
02962 Graph g(edge_array, edge_array + E, V);
02963 #endif
02964
02965 property_map < Graph, edge_weight_t >::type w = get(edge_weight, g);
02966 int *wp = weights;
02967
02968 graph_traits < Graph >::edge_iterator e, e_end;
02969 for (boost::tie(e, e_end) = edges(g); e != e_end; ++e)
02970 w[*e] = *wp++;
02971
02972 boost::multi_array<int,2> D( boost::extents[V][V] );
02973 bool success = johnson_all_pairs_shortest_paths(g, D);
02974 if( !success )
02975 {
02976 cerr << "failed, is this really a tree?\n";
02977 return;
02978 }
02979
02980
02981 int max_dist = (std::numeric_limits<int>::min)();
02982 for (int i = 0; i < V; ++i) {
02983 for (int j = 0; j < V; ++j) {
02984 if( D[i][j] > max_dist )
02985 {
02986 max_dist = D[i][j];
02987 n1 = i;
02988 n2 = j;
02989 }
02990 }
02991 }
02992
02993 typedef graph_traits<Graph>::vertex_descriptor vertex_t;
02994 std::vector < vertex_t > pred(num_vertices(g));
02995 std::vector < int > dist(num_vertices(g));
02996 pred[n1] = n1;
02997
02998 undirected_dfs(g,
02999 root_vertex( vertex( n1, g ) ).
03000 visitor( make_dfs_visitor( make_pair(
03001 record_predecessors(&pred[0], on_tree_edge()),
03002 record_distances(&dist[0], on_tree_edge())
03003 ))).
03004 edge_color_map(get(edge_color, g))
03005 );
03006
03007 int cur_node = n2;
03008 int prev_node = n2;
03009 max_dist /= 2;
03010 while( cur_node != n1 && max_dist > 0 )
03011 {
03012 if( alignment_tree[cur_node].parents.size() > 0 &&
03013 alignment_tree[cur_node].parents[0] == pred[cur_node] )
03014 {
03015 max_dist -= (int)(scaling_factor * alignment_tree[cur_node].distance) + 1;
03016 prev_node = cur_node;
03017 cur_node = pred[cur_node];
03018 }else
03019 {
03020 prev_node = cur_node;
03021 cur_node = pred[cur_node];
03022 max_dist -= (int)(scaling_factor * alignment_tree[cur_node].distance) + 1;
03023 }
03024 }
03025 n1 = cur_node;
03026 n2 = prev_node;
03027
03028 delete[] edge_array;
03029 delete[] weights;
03030 }
03031
03032 void extendRootBranches( PhyloTree< AlignmentTreeNode >& alignment_tree )
03033 {
03034
03035
03036 node_id_t ancestor = alignment_tree.root;
03037 double max_blen = -(std::numeric_limits<double>::max)();
03038 for( size_t nI = 0; nI < alignment_tree.size(); ++nI )
03039 {
03040 if( alignment_tree[nI].distance > max_blen )
03041 max_blen = alignment_tree[nI].distance;
03042 if( alignment_tree[nI].children.size() > 0 &&
03043 alignment_tree[nI].children[0] > alignment_tree[nI].children[1] )
03044 {
03045 std::swap( alignment_tree[nI].children[0], alignment_tree[nI].children[1] );
03046 }
03047 }
03048 for( size_t cI = 0; cI < alignment_tree[ancestor].children.size(); ++cI )
03049 alignment_tree[alignment_tree[ancestor].children[cI]].distance = 2.0 * max_blen;
03050 }
03051
03052 void chooseNextAlignmentPair( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t& node1, node_id_t& node2, node_id_t& ancestor )
03053 {
03054
03055
03056 node1 = 0;
03057 node2 = 0;
03058 ancestor = 0;
03059 double nearest_distance = (numeric_limits<double>::max)();
03060 for( node_id_t nodeI = 0; nodeI < alignment_tree.size(); nodeI++ )
03061 {
03062 AlignmentTreeNode& cur_node = alignment_tree[ nodeI ];
03063
03064
03065
03066 boolean completely_aligned = true;
03067 for( uint alignedI = 0; alignedI < cur_node.children_aligned.size(); alignedI++ )
03068 completely_aligned = completely_aligned && cur_node.children_aligned[alignedI];
03069 for( uint alignedI = 0; alignedI < cur_node.parents_aligned.size(); alignedI++ )
03070 completely_aligned = completely_aligned && cur_node.parents_aligned[alignedI];
03071 if( cur_node.sequence != NULL || completely_aligned )
03072 continue;
03073
03074
03075 vector< node_id_t > neighbor_id;
03076 vector< boolean > alignable;
03077 vector< double > distance;
03078
03079 for( uint parentI = 0; parentI < cur_node.parents.size(); parentI++ )
03080 {
03081 neighbor_id.push_back( cur_node.parents[parentI] );
03082 vector< node_id_t >::iterator cur_neighbor = neighbor_id.end() - 1;
03083 if( *cur_neighbor == alignment_tree.root )
03084 {
03085
03086
03087
03088 }
03089 distance.push_back( cur_node.distance );
03090 alignable.push_back( !cur_node.parents_aligned[parentI] && (alignment_tree[*cur_neighbor].ordering.size() != 0 || alignment_tree[*cur_neighbor].sequence != NULL) );
03091 }
03092
03093 for( uint childI = 0; childI < cur_node.children.size(); childI++ )
03094 {
03095 neighbor_id.push_back( cur_node.children[childI] );
03096 vector< node_id_t >::iterator cur_neighbor = neighbor_id.end() - 1;
03097 distance.push_back( alignment_tree[*cur_neighbor].distance );
03098 alignable.push_back( !cur_node.children_aligned[childI] && (alignment_tree[*cur_neighbor].ordering.size() != 0 || alignment_tree[*cur_neighbor].sequence != NULL) );
03099 }
03100
03101 if( cur_node.ordering.size() != 0 )
03102 {
03103
03104
03105 for( int i = 0; i < neighbor_id.size(); i++ ){
03106 if( !alignable[i] )
03107 continue;
03108 if( distance[i] < nearest_distance )
03109 {
03110 nearest_distance = distance[i];
03111 node1 = nodeI;
03112 node2 = neighbor_id[i];
03113 ancestor = nodeI;
03114 }
03115 }
03116 }else{
03117
03118 for( int i = 0; i < neighbor_id.size(); i++ )
03119 {
03120 if( !alignable[i] )
03121 continue;
03122 for( int j = i+1; j < neighbor_id.size(); j++ )
03123 {
03124 if( !alignable[j] )
03125 continue;
03126 if( distance[i] + distance[j] < nearest_distance )
03127 {
03128 nearest_distance = distance[i] + distance[j];
03129 node1 = neighbor_id[i];
03130 node2 = neighbor_id[j];
03131 ancestor = nodeI;
03132 }
03133 }
03134 }
03135 }
03136 }
03137 }
03138
03140 void ProgressiveAligner::setPairwiseMatches( MatchList& pair_ml )
03141 {
03142 original_ml = pair_ml;
03143 pair_ml.clear();
03144 }
03145
03146
03147 node_id_t createAlignmentTreeRoot( PhyloTree< AlignmentTreeNode >& alignment_tree, node_id_t node1, node_id_t node2 )
03148 {
03149
03150 AlignmentTreeNode atn;
03151 alignment_tree.push_back( atn );
03152 AlignmentTreeNode& old_root = alignment_tree[alignment_tree.root];
03153 AlignmentTreeNode& new_root = alignment_tree.back();
03154
03155 if( find( alignment_tree[node1].children.begin(), alignment_tree[node1].children.end(), node2 ) !=
03156 alignment_tree[node1].children.end() )
03157 {
03158 new_root.children.push_back(node2);
03159 new_root.parents.push_back(node1);
03160 alignment_tree[node2].parents.push_back(alignment_tree.size()-1);
03161 alignment_tree[node1].children.push_back(alignment_tree.size()-1);
03162 }else{
03163 new_root.parents.push_back(node2);
03164 new_root.children.push_back(node1);
03165 alignment_tree[node2].children.push_back(alignment_tree.size()-1);
03166 alignment_tree[node1].parents.push_back(alignment_tree.size()-1);
03167 }
03168
03169
03170 findAndErase( alignment_tree[node1].children, node2 );
03171 findAndErase( alignment_tree[node2].children, node1 );
03172 findAndErase( alignment_tree[node1].parents, node2 );
03173 findAndErase( alignment_tree[node2].parents, node1 );
03174
03175
03176
03177 rerootTree( alignment_tree, alignment_tree.size()-1 );
03178
03179 new_root.children_aligned = vector< boolean >( new_root.children.size(), false );
03180 old_root.children_aligned = vector< boolean >( old_root.children.size(), false );
03181 old_root.parents_aligned = vector< boolean >( old_root.parents.size(), false );
03182 new_root.sequence = NULL;
03183
03184 return alignment_tree.root;
03185 }
03186
03187 void ProgressiveAligner::extractAlignment( node_id_t ancestor, size_t super_iv, GappedAlignment& gal )
03188 {
03189 CompactGappedAlignment<> cga;
03190 extractAlignment( ancestor, super_iv, cga );
03191 vector< string > aln;
03192 GetAlignment( cga, this->original_ml.seq_table, aln );
03193 gal = GappedAlignment(cga.SeqCount(), 0);
03194 for( size_t seqI = 0; seqI < cga.SeqCount(); ++seqI )
03195 {
03196 gal.SetStart(seqI, cga.Start(seqI));
03197 if( cga.Orientation(seqI) != AbstractMatch::undefined )
03198 gal.SetLength(cga.Length(seqI), seqI);
03199 }
03200 gal.SetAlignment(aln);
03201
03202 }
03203
03204 void ProgressiveAligner::extractAlignment( node_id_t ancestor, size_t super_iv, CompactGappedAlignment<>& cga )
03205 {
03206
03207 vector< pair< node_id_t, size_t > > node_siv_list;
03208 stack< pair<node_id_t,size_t> > node_stack;
03209 node_stack.push(make_pair(ancestor,super_iv));
03210 while( node_stack.size() > 0 )
03211 {
03212 pair<node_id_t,size_t> cur = node_stack.top();
03213 node_id_t cur_node = cur.first;
03214 node_stack.pop();
03215 if( alignment_tree[cur_node].children.size() == 0 )
03216 node_siv_list.push_back( cur );
03217 for( size_t childI = 0; childI < alignment_tree[cur_node].children.size(); childI++ )
03218 {
03219 if( alignment_tree[cur_node].ordering[cur.second].reference_iv.LeftEnd(childI) == NO_MATCH )
03220 continue;
03221 size_t child_siv = childI == 0 ? alignment_tree[cur_node].ordering[cur.second].c1_siv :
03222 alignment_tree[cur_node].ordering[cur.second].c2_siv;
03223 node_stack.push(make_pair(alignment_tree[cur_node].children[childI], child_siv) );
03224 node_id_t n = alignment_tree[cur_node].children[childI];
03225 if( alignment_tree[cur_node].ordering[cur.second].reference_iv.Length(childI) != alignment_tree[n].ordering[child_siv].Length() )
03226 {
03227 breakHere();
03228 cerr << "alignment_tree[cur_node].ordering[cur.second].reference_iv.Length(childI): " << alignment_tree[cur_node].ordering[cur.second].reference_iv.Length(childI) << endl;
03229 cerr << "rotten in the state of denmark...\n";
03230 }
03231 }
03232 }
03233
03234
03235
03236
03237 SuperInterval& a_iv = alignment_tree[ancestor].ordering[super_iv];
03238 cga = CompactGappedAlignment<>(seq_count, a_iv.Length());
03239 vector< bitset_t > aln_mats( seq_count );
03240
03241
03242
03243 for( size_t pairI = 0; pairI < node_siv_list.size(); pairI++ )
03244 {
03245 node_id_t nodeI = node_siv_list[pairI].first;
03246 size_t seq_siv = node_siv_list[pairI].second;
03247
03248
03249
03250
03251
03252 uint seqI = node_sequence_map[nodeI];
03253 Match mm(2);
03254 mm.SetStart(0, alignment_tree[nodeI].ordering[seq_siv].LeftEnd());
03255 mm.SetStart(1, alignment_tree[nodeI].ordering[seq_siv].LeftEnd());
03256 mm.SetLength( alignment_tree[nodeI].ordering[seq_siv].Length() );
03257
03258 vector< AbstractMatch* > aml( 1, mm.Copy() );
03259 translateGappedCoordinates( aml, 0, nodeI, ancestor );
03260
03261 if( aml.size() > 1 )
03262 {
03263 cerr << "huh?";
03264 genome::breakHere();
03265 SingleStartComparator<AbstractMatch> ssc( 0 );
03266 sort( aml.begin(), aml.end(), ssc );
03267 }
03268 CompactGappedAlignment<>* trans_cga = dynamic_cast<CompactGappedAlignment<>*>(aml[0]);
03269 if( trans_cga == NULL )
03270 {
03271 CompactGappedAlignment<> tmp_cga;
03272 trans_cga = tmp_cga.Copy();
03273 *trans_cga = CompactGappedAlignment<>(*aml[0]);
03274 }
03275
03276 if( trans_cga->LeftEnd(0) + trans_cga->Length(0) > a_iv.LeftEnd() + a_iv.Length() )
03277 {
03278 cerr << "trans_cga->Start(0): " << trans_cga->Start(0) << " trans_cga->Length(0): " << trans_cga->Length(0) << endl;
03279 cerr << "a_iv.LeftEnd(): " << a_iv.LeftEnd() << " a_iv.Length(): " << a_iv.Length() << endl;
03280 breakHere();
03281 }
03282 bool parity = trans_cga->Orientation(0) == trans_cga->Orientation(1);
03283 cga.SetLeftEnd(seqI, trans_cga->LeftEnd(1));
03284 AbstractMatch::orientation o = parity ? AbstractMatch::forward : AbstractMatch::reverse;
03285 cga.SetOrientation(seqI, o);
03286 const vector< bitset_t >& tmp = trans_cga->GetAlignment();
03287 aln_mats[seqI] = tmp[1];
03288
03289 size_t offset = trans_cga->LeftEnd(0) - a_iv.LeftEnd();
03290 if( aln_mats[seqI].size() < a_iv.Length() )
03291 {
03292
03293 aln_mats[seqI].resize( a_iv.Length() );
03294 aln_mats[seqI] <<= offset;
03295 }
03296 if( trans_cga->LeftEnd(0) < a_iv.LeftEnd() )
03297 {
03298 cerr << "trans_cga->LeftEnd(0): " << trans_cga->LeftEnd(0) << endl;
03299 cerr << "a_iv.LeftEnd(): " << a_iv.LeftEnd() << endl;
03300 breakHere();
03301 }
03302
03303
03304 if( trans_cga->Length(1) != alignment_tree[nodeI].ordering[seq_siv].Length() )
03305 {
03306 cerr << "b0rked\n";
03307 breakHere();
03308 }
03309
03310 cga.SetLength(trans_cga->Length(1), seqI);
03311
03312
03313 trans_cga->Free();
03314 }
03315 for( uint seqI = 0; seqI < aln_mats.size(); seqI++ )
03316 if( aln_mats[seqI].size() == 0 )
03317 aln_mats[seqI].resize( a_iv.Length() );
03318 cga.SetAlignment(aln_mats);
03319 }
03320
03321 unsigned getDefaultBreakpointMax( const std::vector< genome::gnSequence* >& seq_table )
03322 {
03323 double avg_len = 0;
03324 for( size_t seqI = 0; seqI < seq_table.size(); ++seqI )
03325 avg_len += seq_table[seqI]->length();
03326 avg_len /= (double)(seq_table.size());
03327
03328 avg_len /= 1000000.0;
03329 avg_len *= 15.0;
03330 return (unsigned)avg_len;
03331 }
03332
03333
03334 void ProgressiveAligner::CreatePairwiseBPDistance( boost::multi_array<double, 2>& bp_distmat )
03335 {
03336 uint seq_count = original_ml.seq_table.size();
03337 bp_distmat.resize(boost::extents[seq_count][seq_count]);
03338 for( size_t i = 0; i < seq_count; ++i )
03339 for( size_t j = 0; j < seq_count; ++j )
03340 bp_distmat[i][j] = 1;
03341
03342 #ifdef LCB_WEIGHT_LOSS_PLOT
03343 stringstream pair_bp_ofname;
03344 pair_bp_ofname << "pair_bp_log.txt";
03345 ofstream pair_bp_out( pair_bp_ofname.str().c_str() );
03346 #endif
03347
03348 vector< pair<uint, uint> > seq_pairs( (seq_count * (seq_count-1))/2 );
03349 int ii = 0;
03350 for( uint seqI = 0; seqI < seq_count; seqI++ )
03351 for( uint seqJ = seqI + 1; seqJ < seq_count; seqJ++ )
03352 seq_pairs[ii++] = make_pair(seqI,seqJ);
03353
03354 #pragma omp parallel for
03355 for(int i = 0; i < seq_pairs.size(); i++)
03356 {
03357 uint seqI = seq_pairs[i].first;
03358 uint seqJ = seq_pairs[i].second;
03359 vector<uint>::iterator n1 = find( node_sequence_map.begin(), node_sequence_map.end(), seqI );
03360 vector<uint>::iterator n2 = find( node_sequence_map.begin(), node_sequence_map.end(), seqJ );
03361 vector<node_id_t> n1_seqs( 1, n1-node_sequence_map.begin() );
03362 vector<node_id_t> n2_seqs( 1, n2-node_sequence_map.begin() );
03363 Matrix<MatchList> mml;
03364 getPairwiseMatches(n1_seqs, n2_seqs, mml);
03365 MatchList& ml = mml(0,0);
03366
03367
03368
03369 EliminateOverlaps_v2( ml, true );
03370 ml.MultiplicityFilter(2);
03371
03372
03373 vector< MatchList > LCB_list;
03374 vector< LCB > adjacencies;
03375 vector< gnSeqI > breakpoints;
03376 IdentifyBreakpoints( ml, breakpoints );
03377 ComputeLCBs_v2( ml, breakpoints, LCB_list );
03378 vector< double > lcb_scores( LCB_list.size() );
03379 for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
03380 lcb_scores[lcbI] = GetPairwiseAnchorScore( LCB_list[lcbI], ml.seq_table, this->subst_scoring, sol_list[seqI], sol_list[seqJ] );
03381
03382 computeLCBAdjacencies_v3( LCB_list, lcb_scores, adjacencies );
03383
03384
03385
03386 double cons_id = 1 - this->conservation_distance[seqI][seqJ];
03387 double scaled_score = max( bp_dist_estimate_score * cons_id * cons_id * cons_id * cons_id, min_breakpoint_penalty);
03388 cout << "Using scaled bp penalty: " << scaled_score << endl;
03389 GreedyRemovalScorer wbs( adjacencies, scaled_score );
03390 #ifdef LCB_WEIGHT_LOSS_PLOT
03391 cur_min_coverage = greedyBreakpointElimination_v4( adjacencies, lcb_scores, wbs, &pair_bp_out, seqI, seqJ );
03392 pair_bp_out.flush();
03393 #else
03394 cur_min_coverage = greedyBreakpointElimination_v4( adjacencies, lcb_scores, wbs, NULL );
03395 #endif
03396 MatchList deleted_matches;
03397 filterMatches_v2( adjacencies, LCB_list, lcb_scores, deleted_matches );
03398 cout << "Pair (" << seqI << "," << seqJ << ") has " << LCB_list.size() << " well-supported breakpoints\n";
03399
03400
03401 bp_distmat[seqI][seqJ] = LCB_list.size();
03402 bp_distmat[seqJ][seqI] = LCB_list.size();
03403
03404
03405 for( size_t dI = 0; dI < ml.size(); dI++ )
03406 ml[dI]->Free();
03407 }
03408
03409 double bp_max = 0;
03410 for( uint i = 0; i < bp_distmat.shape()[0]; ++i )
03411 for( uint j = 0; j < bp_distmat.shape()[1]; ++j )
03412 {
03413 if( bp_distmat[i][j] > bp_max )
03414 bp_max = bp_distmat[i][j];
03415 }
03416
03417 double default_max = getDefaultBreakpointMax(original_ml.seq_table);
03418 bp_max = bp_max > default_max ? bp_max : default_max;
03419
03420 for( uint i = 0; i < bp_distmat.shape()[0]; ++i )
03421 for( uint j = 0; j < bp_distmat.shape()[1]; ++j )
03422 {
03423 if( i != j )
03424 bp_distmat[i][j] /= bp_max;
03425 bp_distmat[i][j] *= bp_dist_scale;
03426 }
03427 }
03428
03429 template< typename MatchListType >
03430 void makeAlignmentTree( PhyloTree< AlignmentTreeNode >& alignment_tree, MatchListType& mlist, vector< uint >& node_sequence_map )
03431 {
03432
03433 for( node_id_t nodeI = 0; nodeI < alignment_tree.size(); nodeI++ )
03434 {
03435 alignment_tree[nodeI].children_aligned = vector< boolean >( alignment_tree[nodeI].children.size(), false );
03436 alignment_tree[nodeI].parents_aligned = vector< boolean >( alignment_tree[nodeI].parents.size(), false );
03437 alignment_tree[nodeI].sequence = NULL;
03438 alignment_tree[nodeI].refined = false;
03439 }
03440
03441
03442 node_sequence_map = vector< uint >( alignment_tree.size(), -1 );
03443 for( uint seqI = 0; seqI < mlist.seq_table.size(); seqI++ )
03444 {
03445 stringstream seq_name;
03446 seq_name << "seq" << seqI + 1;
03447 node_id_t nodeI = 0;
03448 for( ; nodeI < alignment_tree.size(); nodeI++ )
03449 {
03450 if( seq_name.str() == alignment_tree[nodeI].name )
03451 {
03452 alignment_tree[nodeI].sequence = mlist.seq_table[seqI];
03453 Match mm(1);
03454 Match* m = mm.Copy();
03455 m->SetStart(0,1);
03456 m->SetLength(alignment_tree[nodeI].sequence->length(),0);
03457 vector<AbstractMatch*> tmp(1,m);
03458 Interval iv( tmp.begin(), tmp.end() );
03459 m->Free();
03460 SuperInterval si( iv );
03461 si.SetLeftEnd(1);
03462 si.SetLength(alignment_tree[nodeI].sequence->length());
03463 alignment_tree[nodeI].ordering.push_back( si );
03464 node_sequence_map[nodeI] = seqI;
03465 break;
03466 }
03467 }
03468 if( nodeI == alignment_tree.size() )
03469 throw "Phylogenetic tree names unrecognized. Should follow seqN naming format\n";
03470 }
03471 }
03472
03473 void DistanceMatrix( IntervalList& iv_list, NumericMatrix<double>& distmat )
03474 {
03475 IdentityMatrix( iv_list, distmat );
03476 TransformDistanceIdentity(distmat);
03477 }
03478
03479
03480
03481
03482
03483
03484
03485
03486
03487
03488
03489
03490
03491
03492
03493
03494
03495
03496
03497
03498
03499
03500
03501
03502
03503
03504
03505
03506
03507
03508
03509
03510
03511
03512
03513
03514
03515
03516
03517
03518
03519
03520
03521
03522
03523
03524
03525
03526
03527
03528
03529
03530 void ProgressiveAligner::alignPP(IntervalList& prof1, IntervalList& prof2, IntervalList& interval_list )
03531 {
03532 if( debug_aligner )
03533 {
03534 debug_interval = true;
03535 debug_cga = true;
03536 }
03537
03538 seq_count = prof1.seq_table.size() + prof2.seq_table.size();
03539
03540 if( this->breakpoint_penalty == -1 )
03541 this->breakpoint_penalty = getDefaultBreakpointPenalty( original_ml.seq_table );
03542
03543 if( this->bp_dist_estimate_score == -1 )
03544 this->bp_dist_estimate_score = getDefaultBpDistEstimateMinScore( original_ml.seq_table );
03545 cout << "using default bp penalty: " << breakpoint_penalty << endl;
03546 cout << "using default bp estimate min score: " << bp_dist_estimate_score << endl;
03547
03548 if( this->collinear_genomes )
03549 this->breakpoint_penalty = -1;
03550
03551 if( collinear_genomes )
03552 cout << "\nAssuming collinear genomes...\n";
03553
03554 EliminateOverlaps_v2( original_ml );
03555
03556 MatchList mlist;
03557 mlist.clear();
03558 mlist = original_ml;
03559 cout << "Starting with " << mlist.size() << " multi-matches\n";
03560
03561
03562
03563
03564 NumericMatrix< double > distance1;
03565 DistanceMatrix( prof1, distance1 );
03566 NumericMatrix< double > distance2;
03567 DistanceMatrix( prof2, distance2 );
03568
03569
03570
03571 MuscleInterface& ci = MuscleInterface::getMuscleInterface();
03572 string guide_tree_fname1 = CreateTempFileName("guide_tree");
03573 registerFileToDelete( guide_tree_fname1 );
03574 ci.CreateTree( distance1, guide_tree_fname1 );
03575 string guide_tree_fname2 = CreateTempFileName("guide_tree");
03576 registerFileToDelete( guide_tree_fname2 );
03577 ci.CreateTree( distance2, guide_tree_fname2 );
03578
03579
03580 ifstream tree_file1( guide_tree_fname1.c_str() );
03581 if( !tree_file1.is_open() )
03582 throw "Error opening guide tree file";
03583 PhyloTree< AlignmentTreeNode > tree1;
03584 tree1.readTree( tree_file1 );
03585 tree_file1.close();
03586 ifstream tree_file2( guide_tree_fname2.c_str() );
03587 if( !tree_file2.is_open() )
03588 throw "Error opening guide tree file";
03589 PhyloTree< AlignmentTreeNode > tree2;
03590 tree2.readTree( tree_file2 );
03591 tree_file2.close();
03592
03593
03594
03595 NumericMatrix< double > distance;
03596 DistanceMatrix( mlist, distance );
03597 conservation_distance.resize(boost::extents[seq_count][seq_count]);
03598 for( uint seqI = 0; seqI < seq_count; ++seqI )
03599 for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03600 if( seqJ > seqI )
03601 conservation_distance[seqI][seqJ] = distance(seqI,seqJ);
03602 else
03603 conservation_distance[seqI][seqJ] = distance(seqJ,seqI);
03604
03605
03606 if( !collinear_genomes )
03607 {
03608 cout << "Calculating pairwise breakpoint distances\n";
03609 CreatePairwiseBPDistance(bp_distance);
03610 cout << "bp distance matrix:\n";
03611 print2d_matrix(bp_distance, cout);
03612 cout << endl;
03613 }
03614
03615
03616 double conservation_range = 1;
03617 double bp_range = 1;
03618 for( uint seqI = 0; seqI < seq_count; ++seqI )
03619 for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03620 conservation_distance[seqI][seqJ] = distance(seqI,seqJ) / conservation_range;
03621
03622 if( !(collinear_genomes && seq_count > 20 ) )
03623 {
03624 cout << "genome content distance matrix:\n";
03625 print2d_matrix(conservation_distance, cout);
03626 cout << endl;
03627 }
03628
03629
03630
03631
03632 vector< uint > nsmap1;
03633 vector< uint > nsmap2;
03634 makeAlignmentTree( tree1, prof1, nsmap1 );
03635
03636 makeAlignmentTree( tree2, prof2, nsmap2 );
03637
03638
03639 alignment_tree.resize( tree1.size() + tree2.size() + 1 );
03640
03641 node_sequence_map = vector< uint >( alignment_tree.size(), -1 );
03642
03643
03644 for( node_id_t nodeI = 0; nodeI < alignment_tree.size()-1; nodeI++ )
03645 {
03646 if( nodeI < tree1.size() )
03647 {
03648 alignment_tree[nodeI].sequence = tree1[nodeI].sequence;
03649 alignment_tree[nodeI].children = tree1[nodeI].children;
03650 alignment_tree[nodeI].parents = tree1[nodeI].parents;
03651 alignment_tree[nodeI].ordering = tree1[nodeI].ordering;
03652 alignment_tree[nodeI].distance = tree1[nodeI].distance;
03653 alignment_tree[nodeI].name = tree1[nodeI].name;
03654 node_sequence_map[nodeI] = nsmap1[nodeI];
03655 }else{
03656 alignment_tree[nodeI].sequence = tree2[nodeI-tree1.size()].sequence;
03657 alignment_tree[nodeI].children = tree2[nodeI-tree1.size()].children;
03658 alignment_tree[nodeI].parents = tree2[nodeI-tree1.size()].parents;
03659 alignment_tree[nodeI].ordering = tree2[nodeI-tree1.size()].ordering;
03660 alignment_tree[nodeI].distance = tree2[nodeI-tree1.size()].distance;
03661 alignment_tree[nodeI].name = tree2[nodeI-tree1.size()].name;
03662 for( size_t cI = 0; cI < alignment_tree[nodeI].children.size(); cI++ )
03663 alignment_tree[nodeI].children[cI] += tree1.size();
03664 for( size_t pI = 0; pI < alignment_tree[nodeI].parents.size(); pI++ )
03665 alignment_tree[nodeI].parents[pI] += tree1.size();
03666 node_sequence_map[nodeI] = nsmap2[nodeI-tree1.size()];
03667 if( node_sequence_map[nodeI] != (std::numeric_limits<uint>::max)() )
03668 node_sequence_map[nodeI] += prof1.seq_table.size();
03669 }
03670
03671 alignment_tree[nodeI].children_aligned = vector< boolean >( alignment_tree[nodeI].children.size(), true );
03672 alignment_tree[nodeI].parents_aligned = vector< boolean >( alignment_tree[nodeI].parents.size(), true );
03673 alignment_tree[nodeI].refined = true;
03674 }
03675
03676 alignment_tree.back().children.push_back( tree1.size()-1 );
03677 alignment_tree.back().children.push_back( alignment_tree.size()-2 );
03678 alignment_tree.back().distance = 100;
03679 alignment_tree.back().children_aligned = vector< boolean >( alignment_tree.back().children.size(), true );
03680 alignment_tree.back().parents_aligned = vector< boolean >( alignment_tree.back().parents.size(), true );
03681 alignment_tree.back().refined = false;
03682
03683
03684 getAlignment( interval_list );
03685
03686 }
03687
03688 void ProgressiveAligner::getAlignment( IntervalList& interval_list )
03689 {
03690 cout << "Aligning...\n";
03691
03692 while(true)
03693 {
03694 node_id_t node1;
03695 node_id_t node2;
03696 node_id_t ancestor;
03697 chooseNextAlignmentPair( alignment_tree, node1, node2, ancestor );
03698 if( node1 == node2 )
03699 break;
03700
03701
03702
03703 alignNodes( node1, node2, ancestor );
03704 if( ancestor == alignment_tree.root )
03705 break;
03706 }
03707
03708 if( refine )
03709 {
03710
03711 cout << "Performing final pass iterative refinement\n";
03712 doGappedAlignment(alignment_tree.root, false);
03713 }
03714
03715
03716 cout << "root alignment has " << alignment_tree[alignment_tree.root].ordering.size() << " superintervals\n";
03717 vector< SuperInterval >& a_ivs = alignment_tree[alignment_tree.root].ordering;
03718 gnSeqI len = 0;
03719 for( size_t ivI = 0; ivI < a_ivs.size(); ivI++ )
03720 {
03721 len += a_ivs[ivI].Length();
03722 }
03723 cout << "root alignment length: " << len << endl;
03724
03725
03726
03727 for( size_t ivI = 0; ivI < a_ivs.size(); ivI++ )
03728 {
03729 GappedAlignment ga(seq_count, a_ivs[ivI].Length());
03730 extractAlignment(alignment_tree.root, ivI, ga);
03731 vector<AbstractMatch*> tmp(1, &ga);
03732 interval_list.push_back( Interval(tmp.begin(), tmp.end()) );
03733 }
03734 }
03735
03740 void ProgressiveAligner::align( vector< gnSequence* >& seq_table, IntervalList& interval_list ){
03741 if( debug_aligner )
03742 {
03743 debug_interval = true;
03744 debug_cga = true;
03745 }
03746
03747 seq_count = seq_table.size();
03748 this->currently_recursing = false;
03749 interval_list.seq_table = seq_table;
03750
03751
03752 MatchList mlist;
03753 mlist.seq_table = seq_table;
03754
03755 if( this->breakpoint_penalty == -1 )
03756 this->breakpoint_penalty = getDefaultBreakpointPenalty( seq_table );
03757 if( this->bp_dist_estimate_score == -1 )
03758 this->bp_dist_estimate_score = getDefaultBpDistEstimateMinScore( original_ml.seq_table );
03759 cout << "using default bp penalty: " << breakpoint_penalty << endl;
03760 cout << "using default bp estimate min score: " << bp_dist_estimate_score << endl;
03761
03762 if( this->collinear_genomes )
03763 this->breakpoint_penalty = -1;
03764
03765 if( collinear_genomes )
03766 cout << "\nAssuming collinear genomes...\n";
03767
03768 mlist.clear();
03769 mlist = original_ml;
03770 cout << "Starting with " << mlist.size() << " multi-matches\n";
03771 cout << "Computing genome content distance matrix...\n";
03772
03773
03774
03775
03776 NumericMatrix< double > distance;
03777 SingleCopyDistanceMatrix( mlist, mlist.seq_table, distance );
03778 cout << "\n\nGenome conservation distance matrix: " << endl;
03779 distance.print(cout);
03780 cout << endl;
03781
03782 bool input_tree_specified = input_guide_tree_fname != "";
03783 bool output_tree_specified = output_guide_tree_fname != "";
03784 if( !input_tree_specified )
03785 {
03786
03787 if( !output_tree_specified )
03788 output_guide_tree_fname = CreateTempFileName("guide_tree");
03789 input_guide_tree_fname = output_guide_tree_fname;
03790 cout << "Writing guide tree to " << output_guide_tree_fname << endl;
03791 MuscleInterface& mi = MuscleInterface::getMuscleInterface();
03792 mi.CreateTree( distance, output_guide_tree_fname );
03793
03794
03795 }
03796
03797 conservation_distance.resize(boost::extents[seq_count][seq_count]);
03798 for( uint seqI = 0; seqI < seq_count; ++seqI )
03799 for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03800 if( seqJ > seqI )
03801 {
03802 conservation_distance[seqI][seqJ] = distance(seqI,seqJ);
03803 conservation_distance[seqJ][seqI] = distance(seqI,seqJ);
03804 }
03805 else
03806 {
03807 conservation_distance[seqI][seqJ] = distance(seqJ,seqI);
03808 conservation_distance[seqJ][seqI] = distance(seqJ,seqI);
03809 }
03810
03811 cout << "reading tree...\n";
03812
03813 ifstream tree_file( input_guide_tree_fname.c_str() );
03814 if( !tree_file.is_open() )
03815 throw "Error opening guide tree file";
03816 alignment_tree.readTree( tree_file );
03817 tree_file.close();
03818
03819 cout << "initializing alignment tree...\n";
03820
03821 makeAlignmentTree( alignment_tree, mlist, node_sequence_map );
03822 node_id_t node1;
03823 node_id_t node2;
03824
03825
03826
03827
03828
03829
03830
03831 if( !input_tree_specified || output_tree_specified )
03832 {
03833 ofstream out_tree_file( output_guide_tree_fname.c_str() );
03834 if( !out_tree_file.is_open() )
03835 throw "Error opening guide tree file for write";
03836 alignment_tree.writeTree( out_tree_file );
03837 out_tree_file.close();
03838 }
03839
03840
03841 extendRootBranches(alignment_tree);
03842
03843
03844 if( !collinear_genomes )
03845 {
03846
03847 vector<SeedOccurrenceList> blah(seq_count);
03848 swap( blah, sol_list );
03849
03850
03851
03852
03853
03854
03855
03856
03857 cout << "Constructing seed occurrence lists for repeat detection\n";
03858 #pragma omp parallel for
03859 for( int seqI = 0; seqI < seq_count; seqI++ )
03860 {
03861 sol_list[seqI].construct(*(mlist.sml_table[seqI]));
03862
03863 }
03864
03865 }
03866 if( !collinear_genomes && use_weight_scaling )
03867 {
03868 cout << "Calculating pairwise breakpoint distances\n";
03869 CreatePairwiseBPDistance(bp_distance);
03870 }
03871
03872
03873 if( use_weight_scaling )
03874 {
03875 for( uint seqI = 0; seqI < seq_count; ++seqI )
03876 for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03877 conservation_distance[seqI][seqJ] = distance(seqI,seqJ) * conservation_dist_scale;
03878 }else{
03879 bp_distance.resize(boost::extents[seq_count][seq_count]);
03880 for( uint seqI = 0; seqI < seq_count; ++seqI )
03881 for( uint seqJ = 0; seqJ < seq_count; ++seqJ )
03882 {
03883 conservation_distance[seqI][seqJ] = 0;
03884 bp_distance[seqI][seqJ] = 0;
03885 }
03886 }
03887
03888 if( !collinear_genomes )
03889 {
03890 cout << "genome content distance matrix:\n";
03891 print2d_matrix(conservation_distance, cout);
03892 cout << endl;
03893 cout << "bp distance matrix:\n";
03894 print2d_matrix(bp_distance, cout);
03895 cout << endl;
03896 }
03897
03898 getAlignment( interval_list );
03899 }
03900
03901
03902
03903
03904 }