00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef __Islands_h__
00010 #define __Islands_h__
00011
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015
00016 #include "libGenome/gnSequence.h"
00017 #include "libMems/SubstitutionMatrix.h"
00018 #include "libMems/IntervalList.h"
00019 #include "libMems/NumericMatrix.h"
00020 #include "libMems/MatchList.h"
00021 #include "libMems/GappedAlignment.h"
00022 #include "libMems/CompactGappedAlignment.h"
00023 #include "libMems/Aligner.h"
00024 #include <boost/multi_array.hpp>
00025 #include "libMems/HomologyHMM/homology.h"
00026 #include "libMems/Scoring.h"
00027
00028 namespace mems {
00029
00035 class Island{
00036 public:
00037 uint seqI;
00038 uint seqJ;
00039 int64 leftI;
00040 int64 leftJ;
00041 int64 rightI;
00042 int64 rightJ;
00043 };
00044
00049 void simpleFindIslands( IntervalList& iv_list, uint island_size, std::ostream& island_out );
00050 void findIslandsBetweenLCBs( IntervalList& iv_list, uint island_size, std::ostream& island_out );
00051 void simpleFindIslands( IntervalList& iv_list, uint island_size, std::vector< Island >& island_list );
00052
00053 class HssCols{
00054 public:
00055 uint seqI;
00056 uint seqJ;
00057 size_t left_col;
00058 size_t right_col;
00059 };
00060
00061 typedef std::vector< HssCols > hss_list_t;
00062 typedef boost::multi_array< hss_list_t, 3 > hss_array_t;
00063
00064 typedef HssCols IslandCols;
00065
00066 void findHssRandomWalkScoreVector( std::vector< score_t > scores, score_t significance_threshold, hss_list_t& hss_list, uint seqI = 0, uint seqJ = 0, boolean left_homologous = false, boolean right_homologous = false );
00067
00068 template<typename MatchVector>
00069 void findHssRandomWalk( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, hss_array_t& hss_array, boolean left_homologous = false, boolean right_homologous = false );
00070
00071 template<typename MatchVector>
00072 void hssColsToIslandCols( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, std::vector< HssCols >& hss_list, std::vector< IslandCols >& island_col_list );
00073
00074 template<typename MatchVector>
00075 void findHssRandomWalkCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< CompactGappedAlignment<>* >& hss_list );
00076
00077 template<typename MatchVector>
00078 void findIslandsRandomWalkCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< CompactGappedAlignment<>* >& island_list );
00079
00080 template<typename MatchVector>
00081 void findIslandsRandomWalk( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< Island >& island_list );
00082
00087 void addUnalignedIntervals( IntervalList& iv_list, std::set< uint > seq_set = std::set< uint >(), std::vector<gnSeqI> seq_lengths = std::vector<gnSeqI>() );
00088
00094 void simpleFindBackbone( IntervalList& iv_list, uint backbone_size, uint max_gap_size, std::vector< GappedAlignment >& backbone_regions );
00095
00099 void outputBackbone( const std::vector< GappedAlignment >& backbone_regions, std::ostream& backbone_out );
00100
00101 void getGapBounds( std::vector<gnSeqI>& seq_lengths, std::vector< LCB >& adjacencies, uint seqJ, int leftI, int rightI, int64& left_start, int64& right_start );
00102
00103
00104 inline
00105 void findRightEndpoint( size_t seqI, size_t seqJ, score_t significance_threshold, std::vector< score_t >& scores, hss_list_t& hss_list )
00106 {
00107
00108 score_t score_sum = significance_threshold;
00109 size_t colI = scores.size();
00110 for( ; colI > 0; --colI )
00111 {
00112 if( scores[colI-1] == INVALID_SCORE )
00113 continue;
00114
00115 if( score_sum >= 0 && score_sum + scores[colI-1] < 0 )
00116 {
00117
00118 score_sum = 0;
00119
00120
00121 score_t rev_score_sum = 0;
00122 size_t rev_ladder_point = colI-1;
00123 size_t rcolI = colI-1;
00124 for( ; ((int64)rcolI) < scores.size(); ++rcolI )
00125 {
00126 if( scores[rcolI] == INVALID_SCORE )
00127 continue;
00128 if( rev_score_sum > significance_threshold )
00129 break;
00130 if( rev_score_sum >= 0 && rev_score_sum + scores[rcolI] < 0 )
00131 {
00132 rev_score_sum = 0;
00133 }else if( rev_score_sum == 0 && scores[rcolI] > 0 )
00134 {
00135
00136 rev_score_sum += scores[rcolI];
00137 rev_ladder_point = rcolI;
00138 }else
00139 rev_score_sum += scores[rcolI];
00140 }
00141
00142 if( rcolI < scores.size() )
00143 {
00144 HssCols ic;
00145 ic.seqI = seqI;
00146 ic.seqJ = seqJ;
00147 ic.left_col = rev_ladder_point;
00148 ic.right_col = scores.size()-1;
00149 hss_list.push_back( ic );
00150 }
00151 break;
00152 }else
00153 score_sum += scores[colI-1];
00154 }
00155 }
00156
00157
00158 inline
00159 void findHssExcursions( std::vector< score_t > scores, score_t significance_threshold, hss_list_t& hss_list, uint seqI, uint seqJ, boolean left_hss, boolean right_hss )
00160 {
00161 score_t score_sum = left_hss ? significance_threshold : 0;
00162 int64 ladder_point = 0;
00163 bool fwd_hss = left_hss;
00164
00165
00166 for( size_t colI = 0; colI <= scores.size(); ++colI )
00167 {
00168 if( colI < scores.size() && scores[colI] == INVALID_SCORE )
00169 continue;
00170
00171 if( colI == scores.size() || (score_sum >= 0 && score_sum + scores[colI] < 0) )
00172 {
00173
00174 if( colI == scores.size() && right_hss )
00175 fwd_hss = true;
00176
00177 score_sum = 0;
00178 if( fwd_hss )
00179 {
00180
00181
00182 HssCols ic;
00183 ic.seqI = seqI;
00184 ic.seqJ = seqJ;
00185 ic.left_col = ladder_point;
00186 if( colI == scores.size() )
00187 ic.right_col = colI - 1;
00188 else
00189 ic.right_col = colI;
00190 hss_list.push_back( ic );
00191 }
00192 fwd_hss = false;
00193 }else if( score_sum == 0 && scores[colI] > 0 )
00194 {
00195
00196 score_sum += scores[colI];
00197 ladder_point = colI;
00198 }else
00199 score_sum += scores[colI];
00200
00201 if( score_sum > significance_threshold )
00202 fwd_hss = true;
00203 }
00204 }
00205
00206
00207 inline
00208 void findMscFromExcursions( std::vector< score_t > scores, score_t significance_threshold, hss_list_t& hss_list, hss_list_t& msc_list, uint seqI, uint seqJ, boolean left_hss, boolean right_hss )
00209 {
00210 score_t left_end_score = left_hss ? significance_threshold : 0;
00211 score_t right_end_score = right_hss ? significance_threshold : 0;
00212 score_t score_sum = left_end_score;
00213 int64 ladder_point = 0;
00214 bool fwd_hss = true;
00215 if( left_hss )
00216 scores.front() = significance_threshold;
00217 if( right_hss )
00218 scores.back() = significance_threshold;
00219
00220
00221 for( size_t exI = 0; exI < hss_list.size(); exI++ )
00222 {
00223
00224 size_t col_base = hss_list[exI].left_col;
00225 size_t col_count = hss_list[exI].right_col - hss_list[exI].left_col + 1;
00226
00227 size_t positive_count = 0;
00228 for( size_t colI = hss_list[exI].left_col; colI < hss_list[exI].right_col + 1; colI++ )
00229 {
00230 if( scores[colI] <= 0 || scores[colI] == INVALID_SCORE )
00231 continue;
00232 positive_count++;
00233 }
00234 std::vector< size_t > pos_map(positive_count);
00235 positive_count = 0;
00236 for( size_t colI = hss_list[exI].left_col; colI < hss_list[exI].right_col + 1; colI++ )
00237 {
00238 if( scores[colI] <= 0 || scores[colI] == INVALID_SCORE )
00239 continue;
00240 pos_map[positive_count] = colI;
00241 positive_count++;
00242 }
00243
00244 std::vector< score_t > score_sums(positive_count, 0);
00245 size_t sum_base = 0;
00246 std::vector<HssCols> cur_msc_list;
00247 size_t invalid_count = 0;
00248 for( size_t colI = hss_list[exI].left_col; colI < hss_list[exI].right_col + 1; colI++ )
00249 {
00250
00251 if( scores[colI] == INVALID_SCORE )
00252 continue;
00253
00254 int64 msc_col = -1;
00255 size_t sumI = sum_base;
00256 for( ; sumI < score_sums.size(); sumI++ )
00257 {
00258
00259 if( pos_map[sumI] > colI )
00260 break;
00261
00262 if( score_sums[sumI] < 0 )
00263 continue;
00264 score_sums[sumI] += scores[colI];
00265
00266 if( score_sums[sumI] < 0 )
00267 invalid_count++;
00268
00269 if( score_sums[sumI] >= significance_threshold )
00270 msc_col = sumI;
00271 }
00272
00273 if( msc_col != -1 )
00274 {
00275 HssCols ic;
00276 ic.seqI = seqI;
00277 ic.seqJ = seqJ;
00278 ic.left_col = pos_map[msc_col];
00279 ic.right_col = colI;
00280 cur_msc_list.push_back( ic );
00281 sum_base = msc_col + 1;
00282 }
00283
00284
00285
00286
00287
00288 }
00289
00290 size_t disjoint = cur_msc_list.size() > 0 ? 1 : 0;
00291 for( size_t mscI = 1; mscI < cur_msc_list.size(); mscI++ )
00292 {
00293 if( cur_msc_list[mscI].left_col <= cur_msc_list[mscI-1].right_col )
00294 {
00295 cur_msc_list[mscI].left_col = cur_msc_list[mscI-1].left_col;
00296 cur_msc_list[mscI-1].left_col = (std::numeric_limits<size_t>::max)();
00297 cur_msc_list[mscI-1].right_col = (std::numeric_limits<size_t>::max)();
00298 }else
00299 disjoint++;
00300 }
00301 std::vector<HssCols> cur_msc_list2( disjoint );
00302 size_t mI = 0;
00303 for( size_t mscI = 0; mscI < cur_msc_list.size(); mscI++ )
00304 {
00305 if( cur_msc_list[mscI].left_col != (std::numeric_limits<size_t>::max)() )
00306 cur_msc_list2[mI++] = cur_msc_list[mscI];
00307 }
00308
00309
00310 for( mI = 0; mI < cur_msc_list2.size(); mI++ )
00311 {
00312 HssCols& hss = cur_msc_list2[mI];
00313
00314 size_t lcolI = hss.left_col + 1;
00315 for( ; lcolI > 0 && scores[lcolI - 1] > 0; lcolI-- );
00316 size_t rcolI = hss.right_col;
00317 for( ; rcolI < scores.size() && scores[rcolI] > 0; rcolI++ );
00318 hss.left_col = lcolI;
00319 hss.right_col = rcolI - 1;
00320 }
00321 swap( cur_msc_list, cur_msc_list2 );
00322
00323
00324
00325
00326 disjoint = cur_msc_list.size() > 0 ? 1 : 0;
00327 for( size_t mscI = 1; mscI < cur_msc_list.size(); mscI++ )
00328 {
00329 if( cur_msc_list[mscI].left_col <= cur_msc_list[mscI-1].right_col )
00330 {
00331 cur_msc_list[mscI].left_col = cur_msc_list[mscI-1].left_col;
00332 cur_msc_list[mscI-1].left_col = (std::numeric_limits<size_t>::max)();
00333 cur_msc_list[mscI-1].right_col = (std::numeric_limits<size_t>::max)();
00334 }else
00335 disjoint++;
00336 }
00337 mI = msc_list.size();
00338 msc_list.resize( mI + disjoint );
00339 for( size_t mscI = 0; mscI < cur_msc_list.size(); mscI++ )
00340 {
00341 if( cur_msc_list[mscI].left_col != (std::numeric_limits<size_t>::max)() )
00342 msc_list[mI++] = cur_msc_list[mscI];
00343 }
00344 }
00345 }
00346
00347 static char charmap[128];
00348 inline
00349 char* getCharmap()
00350 {
00351 static bool initialized = false;
00352 if(initialized)
00353 return charmap;
00354 memset(charmap, 0, 128);
00355 charmap['a'] = 0;
00356 charmap['c'] = 1;
00357 charmap['g'] = 2;
00358 charmap['t'] = 3;
00359 charmap['-'] = 4;
00360 charmap['A'] = 0;
00361 charmap['C'] = 1;
00362 charmap['G'] = 2;
00363 charmap['T'] = 3;
00364 charmap['-'] = 4;
00365 initialized = true;
00366 return charmap;
00367 }
00368
00369
00370 static char colmap[5][5] = {
00371
00372 {'1','3','4','5','7'},
00373 {'3','2','6','4','7'},
00374 {'4','6','2','3','7'},
00375 {'5','4','3','1','7'},
00376 {'7','7','7','7','\0'},
00377 };
00378
00379
00380 inline
00381 void findHssHomologyHMM( std::vector< std::string >& aln_table, hss_list_t& hss_list, uint seqI, uint seqJ, const Params& hmm_params,
00382 boolean left_homologous, boolean right_homologous )
00383 {
00384 static char* charmap = getCharmap();
00385
00386
00387 std::string column_states(aln_table[0].size(),'q');
00388 vector< size_t > col_reference(column_states.size(), (std::numeric_limits<size_t>::max)() );
00389 size_t refI = 0;
00390 for( size_t colI = 0; colI < column_states.size(); colI++ )
00391 {
00392 char a = charmap[aln_table[seqI][colI]];
00393 char b = charmap[aln_table[seqJ][colI]];
00394 column_states[colI] = colmap[a][b];
00395 if(column_states[colI] != 0 )
00396 col_reference[refI++] = colI;
00397 }
00398
00399 std::string::iterator sitr = std::remove(column_states.begin(), column_states.end(), 0);
00400 column_states.resize(sitr - column_states.begin());
00401
00402 for( size_t colI = 2; colI < column_states.size(); colI++ )
00403 {
00404 if( column_states[colI] == '7' &&
00405 column_states[colI-1] == '7' &&
00406 (column_states[colI-2] == '7' || column_states[colI-2] == '8') )
00407 column_states[colI-1] = '8';
00408 }
00409 if( column_states.size() > 1 && column_states[0] == '7' && (column_states[1] == '7' || column_states[1] == '8'))
00410 column_states[0] = '8';
00411 if( column_states.size() > 1 && column_states[column_states.size()-1] == '7' && (column_states[column_states.size()-2] == '7'|| column_states[column_states.size()-2] == '8') )
00412 column_states[column_states.size()-1] = '8';
00413
00414 string prediction;
00415 if( right_homologous && !left_homologous )
00416 std::reverse(column_states.begin(), column_states.end());
00417
00418 run(column_states, prediction, hmm_params);
00419
00420 if( right_homologous && !left_homologous )
00421 std::reverse(prediction.begin(), prediction.end());
00422 size_t prev_h = 0;
00423 size_t i = 1;
00424 for( ; i < prediction.size(); i++ )
00425 {
00426 if( prediction[i] == 'H' && prediction[i-1] == 'N' )
00427 {
00428 prev_h = i;
00429 }
00430 if( prediction[i] == 'N' && prediction[i-1] == 'H' )
00431 {
00432 HssCols hc;
00433 hc.seqI = seqI;
00434 hc.seqJ = seqJ;
00435 hc.left_col = col_reference[prev_h];
00436 hc.right_col = col_reference[i-1];
00437 hss_list.push_back(hc);
00438 prev_h = i;
00439 }
00440 }
00441
00442 if( prediction[i-1] == 'H' )
00443 {
00444 HssCols hc;
00445 hc.seqI = seqI;
00446 hc.seqJ = seqJ;
00447 hc.left_col = col_reference[prev_h];
00448 hc.right_col = col_reference[i-1];
00449 hss_list.push_back(hc);
00450 }
00451 }
00452
00453 inline
00454 void findHssRandomWalkScoreVector( std::vector< score_t > scores, score_t significance_threshold, hss_list_t& hss_list, uint seqI, uint seqJ, boolean left_homologous, boolean right_homologous )
00455 {
00456
00457 score_t left_end_score = left_homologous ? 0 : significance_threshold;
00458 score_t right_end_score = right_homologous ? 0 : significance_threshold;
00459 score_t score_sum = left_end_score;
00460 score_t lrh = score_sum;
00461 int64 ladder_point = -1;
00462 int64 rev_ladder_point = 0;
00463 bool fwd_hss = !left_homologous;
00464
00465 for( size_t colI = 0; colI <= scores.size(); ++colI )
00466 {
00467 if( colI < scores.size() && scores[colI] == INVALID_SCORE )
00468 continue;
00469
00470 if( colI == scores.size() || (score_sum >= 0 && score_sum + scores[colI] < 0) ||
00471 (score_sum >= lrh - significance_threshold && score_sum + scores[colI] < lrh - significance_threshold ) )
00472 {
00473 if( !fwd_hss && colI == scores.size() && !right_homologous )
00474 {
00475 fwd_hss = true;
00476 if( score_sum <= 0 )
00477 ladder_point = colI - 1;
00478 }
00479
00480 score_sum = 0;
00481 lrh = 0;
00482 if( fwd_hss )
00483 {
00484
00485
00486 score_t rev_score_sum = 0;
00487 if( colI == scores.size() && !right_homologous )
00488 rev_score_sum = significance_threshold;
00489 size_t rev_ladder_point = ladder_point;
00490 if( colI == scores.size() && !right_homologous )
00491 rev_ladder_point = colI - 1;
00492 for( size_t rcolI = colI; ((int64)rcolI) > ladder_point && rcolI > 0; --rcolI )
00493 {
00494 if( scores[rcolI-1] == INVALID_SCORE )
00495 continue;
00496 if( rev_score_sum > significance_threshold )
00497 break;
00498 if( rev_score_sum >= 0 && rev_score_sum + scores[rcolI-1] < 0 )
00499 {
00500 rev_score_sum = 0;
00501 }else if( rev_score_sum == 0 && scores[rcolI-1] > 0 )
00502 {
00503
00504 rev_score_sum += scores[rcolI-1];
00505 rev_ladder_point = rcolI-1;
00506 }else
00507 rev_score_sum += scores[rcolI-1];
00508
00509 }
00510
00511
00512 if( ((rev_ladder_point != 0 || ladder_point != -1) ||
00513 colI == scores.size()) && rev_ladder_point != -1 )
00514 {
00515 if( colI == scores.size() && ladder_point == -1 )
00516 {
00517 rev_ladder_point = scores.size()-1;
00518 }
00519 if( ladder_point == -1 )
00520 ladder_point = 0;
00521
00522 HssCols ic;
00523 ic.seqI = seqI;
00524 ic.seqJ = seqJ;
00525 ic.left_col = ladder_point;
00526 ic.right_col = rev_ladder_point;
00527 hss_list.push_back( ic );
00528 }
00529 }
00530 fwd_hss = false;
00531 }else if( score_sum == 0 && scores[colI] > 0 )
00532 {
00533
00534 score_sum += scores[colI];
00535 ladder_point = colI;
00536 }else
00537 score_sum += scores[colI];
00538
00539 if( score_sum > significance_threshold )
00540 fwd_hss = true;
00541 if( score_sum > lrh )
00542 lrh = score_sum;
00543 }
00544 }
00545
00546 template< typename MatchVector >
00547 void findHssRandomWalk_v2( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, hss_array_t& hss_array, boolean left_homologous, boolean right_homologous )
00548 {
00549 typedef typename MatchVector::value_type MatchType;
00550 if( iv_list.size() == 0 )
00551 return;
00552 uint seq_count = seq_table.size();
00553 hss_array.resize( boost::extents[seq_count][seq_count][iv_list.size()] );
00554 for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00555 const MatchType& iv = iv_list[ iv_listI ];
00556 std::vector< std::string > aln_table;
00557 GetAlignment( *iv, seq_table, aln_table );
00558
00559 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00560 uint seqJ;
00561 for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00562
00563 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00564 hss_list.clear();
00565
00566 std::vector< score_t > scores;
00567 computeMatchScores( aln_table[seqI], aln_table[seqJ], scoring, scores );
00568 computeGapScores( aln_table[seqI], aln_table[seqJ], scoring, scores );
00569
00570
00571 for( size_t sI = 0; sI < scores.size(); ++sI )
00572 if( scores[sI] != INVALID_SCORE)
00573 scores[sI] = -scores[sI];
00574
00575 hss_list_t excursion_list;
00576 findHssExcursions( scores, significance_threshold, excursion_list, seqI, seqJ, !left_homologous, !right_homologous );
00577 findMscFromExcursions( scores, significance_threshold, excursion_list, hss_list, seqI, seqJ, !left_homologous, !right_homologous );
00578 }
00579 }
00580 }
00581 }
00582
00583 template< typename MatchVector >
00584 void findHssRandomWalk( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, hss_array_t& hss_array, boolean left_homologous, boolean right_homologous )
00585 {
00586 typedef typename MatchVector::value_type MatchType;
00587 if( iv_list.size() == 0 )
00588 return;
00589 uint seq_count = seq_table.size();
00590 hss_array.resize( boost::extents[seq_count][seq_count][iv_list.size()] );
00591 for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00592 const MatchType& iv = iv_list[ iv_listI ];
00593 std::vector< std::string > aln_table;
00594 GetAlignment( *iv, seq_table, aln_table );
00595
00596 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00597 uint seqJ;
00598 for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00599
00600 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00601 hss_list.clear();
00602
00603 std::vector< score_t > scores;
00604 computeMatchScores( aln_table[seqI], aln_table[seqJ], scoring, scores );
00605 computeGapScores( aln_table[seqI], aln_table[seqJ], scoring, scores );
00606
00607
00608 for( size_t sI = 0; sI < scores.size(); ++sI )
00609 if( scores[sI] != INVALID_SCORE)
00610 scores[sI] = -scores[sI];
00611 findHssRandomWalkScoreVector( scores, significance_threshold, hss_list, seqI, seqJ, left_homologous, right_homologous );
00612 }
00613 }
00614 }
00615 }
00616
00617 template< typename MatchVector >
00618 void findHssHomologyHMM( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, hss_array_t& hss_array, const Params& hmm_params, boolean left_homologous, boolean right_homologous )
00619 {
00620 typedef typename MatchVector::value_type MatchType;
00621 if( iv_list.size() == 0 )
00622 return;
00623 uint seq_count = seq_table.size();
00624 hss_array.resize( boost::extents[seq_count][seq_count][iv_list.size()] );
00625 for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00626 const MatchType& iv = iv_list[ iv_listI ];
00627 std::vector< std::string > aln_table;
00628 GetAlignment( *iv, seq_table, aln_table );
00629
00630 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00631 uint seqJ;
00632 for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00633
00634 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00635 hss_list.clear();
00636 findHssHomologyHMM( aln_table, hss_list, seqI, seqJ, hmm_params, left_homologous, right_homologous );
00637 }
00638 }
00639 }
00640 }
00641
00642
00643 template< typename MatchVector >
00644 void HssColsToIslandCols( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, hss_array_t& hss_array, hss_array_t& island_col_array )
00645 {
00646
00647 typedef typename MatchVector::value_type MatchType;
00648 uint seq_count = seq_table.size();
00649 island_col_array.resize( boost::extents[seq_count][seq_count][iv_list.size()] );
00650 for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00651 const MatchType& iv = iv_list[ iv_listI ];
00652 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00653 uint seqJ;
00654 for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00655 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00656 hss_list_t& island_col_list = island_col_array[seqI][seqJ][iv_listI];
00657 ComplementHss(iv_list[iv_listI]->AlignmentLength(),hss_list,island_col_list,seqI,seqJ);
00658 }
00659 }
00660 }
00661 }
00662 inline
00663 void ComplementHss( const size_t alignment_length, hss_list_t& hss_list, hss_list_t& island_col_list, uint seqI=0, uint seqJ=0 )
00664 {
00665
00666
00667 size_t left_col = 0;
00668 for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
00669 {
00670 if( left_col >= hss_list[hssI].left_col )
00671 {
00672 left_col = hss_list[hssI].right_col + 1;
00673 continue;
00674 }
00675
00676 IslandCols isle;
00677 isle.seqI = seqI;
00678 isle.seqJ = seqJ;
00679 isle.left_col = left_col;
00680 isle.right_col = hss_list[hssI].left_col;
00681 island_col_list.push_back(isle);
00682 left_col = hss_list[hssI].right_col + 1;
00683 }
00684
00685 if( left_col < alignment_length )
00686 {
00687
00688 IslandCols isle;
00689 isle.seqI = seqI;
00690 isle.seqJ = seqJ;
00691 isle.left_col = left_col;
00692 isle.right_col = alignment_length-1;
00693 island_col_list.push_back(isle);
00694 }
00695 }
00696
00697 template< typename MatchVector >
00698 void HssArrayToCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, hss_array_t& hss_array, std::vector< CompactGappedAlignment<>* >& cga_list )
00699 {
00700 typedef typename MatchVector::value_type MatchType;
00701 uint seq_count = seq_table.size();
00702 for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00703 const MatchType& iv = iv_list[ iv_listI ];
00704
00705 CompactGappedAlignment<>* iv_cga = dynamic_cast< CompactGappedAlignment<>* >(iv);
00706 bool allocated = false;
00707 if( iv_cga == NULL )
00708 {
00709 CompactGappedAlignment<> tmp_cga;
00710 iv_cga = tmp_cga.Copy();
00711 new (iv_cga) CompactGappedAlignment<>(*iv);
00712 allocated = true;
00713 }
00714 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00715 for( uint seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00716 hss_list_t& isle_list = hss_array[seqI][seqJ][iv_listI];
00717 for( size_t curI = 0; curI < isle_list.size(); ++curI )
00718 {
00719
00720 CompactGappedAlignment<> tmp_cga;
00721 cga_list.push_back( tmp_cga.Copy() );
00722 iv_cga->copyRange( *(cga_list.back()), isle_list[curI].left_col, isle_list[curI].right_col - isle_list[curI].left_col + 1 );
00723 if( cga_list.back()->LeftEnd(0) == NO_MATCH )
00724 {
00725
00726 cga_list.back()->Free();
00727 cga_list.erase( cga_list.end()-1 );
00728 }
00729 }
00730 }
00731 }
00732 if( allocated )
00733 iv_cga->Free();
00734 }
00735 }
00736
00737 template< typename MatchVector >
00738 void findHssRandomWalkCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< CompactGappedAlignment<>* >& hss_list )
00739 {
00740 if( iv_list.size() == 0 )
00741 return;
00742 hss_array_t hss_array;
00743 findHssRandomWalk( iv_list, seq_table, scoring, significance_threshold, hss_array );
00744 hss_array_t homo_array;
00745 HssColsToIslandCols( iv_list, seq_table, hss_array, homo_array );
00746 HssArrayToCga(iv_list, seq_table, homo_array, hss_list);
00747 }
00748 template< typename MatchVector >
00749 void findIslandsRandomWalkCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< CompactGappedAlignment<>* >& island_list )
00750 {
00751 if( iv_list.size() == 0 )
00752 return;
00753 hss_array_t hss_array;
00754 findHssRandomWalk( iv_list, seq_table, scoring, significance_threshold, hss_array );
00755 hss_array_t island_col_array;
00756
00757 HssArrayToCga(iv_list, seq_table, hss_array, island_list);
00758 }
00759
00760 template< typename MatchVector >
00761 void findIslandsRandomWalk( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< Island >& island_list )
00762 {
00763 if( iv_list.size() == 0 )
00764 return;
00765 hss_array_t hss_array;
00766 findHssRandomWalk( iv_list, seq_table, scoring, significance_threshold, hss_array );
00767
00768 typedef typename MatchVector::value_type MatchType;
00769 for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00770 const MatchType& iv = iv_list[ iv_listI ];
00771 uint seq_count = seq_table.size();
00772 std::vector< std::string > aln_table;
00773 GetAlignment( *iv, seq_table, aln_table );
00774
00775 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00776 uint seqJ;
00777 for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00778 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00779
00780 size_t cur_hss = 0;
00781 uint columnI = 0;
00782 gnSeqI curI = 0;
00783 gnSeqI curJ = 0;
00784 gnSeqI lastI = 0;
00785 gnSeqI lastJ = 0;
00786 for( columnI = 0; columnI < aln_table[0].size(); columnI++ ){
00787 if( aln_table[ seqI ][ columnI ] != '-' )
00788 curI++;
00789 if( aln_table[ seqJ ][ columnI ] != '-' )
00790 curJ++;
00791
00792 if( cur_hss < hss_list.size() &&
00793 columnI == hss_list[cur_hss].left_col )
00794 {
00795
00796 int64 leftI = iv.Start( seqI );
00797 int64 rightI = leftI < 0 ? leftI - curI : leftI + curI;
00798 leftI = leftI < 0 ? leftI - lastI : leftI + lastI;
00799 int64 leftJ = iv.Start( seqJ );
00800 int64 rightJ = leftJ < 0 ? leftJ - curJ : leftJ + curJ;
00801 leftJ = leftJ < 0 ? leftJ - lastJ : leftJ + lastJ;
00802 Island isle;
00803 isle.seqI = seqI;
00804 isle.seqJ = seqJ;
00805 isle.leftI = leftI;
00806 isle.leftJ = leftJ;
00807 isle.rightI = rightI;
00808 isle.rightJ = rightJ;
00809 island_list.push_back(isle);
00810 }
00811 else if( cur_hss < hss_list.size() &&
00812 columnI == hss_list[cur_hss].right_col )
00813 {
00814
00815 lastI = curI;
00816 lastJ = curJ;
00817 cur_hss++;
00818 }
00819 }
00820
00821
00822 int64 leftI = iv.Start( seqI );
00823 int64 rightI = leftI < 0 ? leftI - curI : leftI + curI;
00824 leftI = leftI < 0 ? leftI - lastI : leftI + lastI;
00825 int64 leftJ = iv.Start( seqJ );
00826 int64 rightJ = leftJ < 0 ? leftJ - curJ : leftJ + curJ;
00827 leftJ = leftJ < 0 ? leftJ - lastJ : leftJ + lastJ;
00828 Island isle;
00829 isle.seqI = seqI;
00830 isle.seqJ = seqJ;
00831 isle.leftI = leftI;
00832 isle.leftJ = leftJ;
00833 isle.rightI = rightI;
00834 isle.rightJ = rightJ;
00835 island_list.push_back(isle);
00836 }
00837 }
00838 }
00839 }
00840
00841 template< class IntervalListType >
00842 void addUnalignedRegions( IntervalListType& iv_list)
00843 {
00844 std::vector< AbstractMatch* > new_ivs;
00845 std::vector< AbstractMatch* > iv_ptrs(iv_list.size());
00846 for( size_t i = 0; i < iv_list.size(); ++i )
00847 iv_ptrs[i] = &iv_list[i];
00848 for( size_t seqI = 0; seqI < iv_list.seq_table.size(); ++seqI )
00849 {
00850 SingleStartComparator< AbstractMatch > ssc( seqI );
00851 std::sort( iv_ptrs.begin(), iv_ptrs.end(), ssc );
00852 size_t ivI = 0;
00853 for( ; ivI < iv_ptrs.size(); ++ivI )
00854 if( iv_ptrs[ivI]->LeftEnd(seqI) != NO_MATCH )
00855 break;
00856 std::list< AbstractMatch* > iv_ptr_list;
00857 iv_ptr_list.insert( iv_ptr_list.end(), iv_ptrs.begin()+ivI, iv_ptrs.end() );
00858 AddGapMatches( iv_ptr_list, iv_ptr_list.begin(), iv_ptr_list.end(), seqI, 1, iv_list.seq_table[seqI]->length()+1, AbstractMatch::forward, iv_list.seq_table.size() );
00859 std::list< AbstractMatch* >::iterator iter = iv_ptr_list.begin();
00860 while( ivI != iv_ptrs.size() && iter != iv_ptr_list.end() )
00861 {
00862 if( iv_ptrs[ivI] == *iter )
00863 ivI++;
00864 else
00865 new_ivs.push_back( *iter );
00866 ++iter;
00867 }
00868 while( iter != iv_ptr_list.end() )
00869 {
00870 new_ivs.push_back( *iter );
00871 ++iter;
00872 }
00873 }
00874
00875 size_t prev_size = iv_list.size();
00876 iv_list.resize( iv_list.size() + new_ivs.size() );
00877 for( size_t newI = 0; newI < new_ivs.size(); ++newI )
00878 {
00879 Interval iv( new_ivs.begin() + newI, new_ivs.begin() + newI + 1 );
00880 iv_list[prev_size + newI] = iv;
00881 new_ivs[newI]->Free();
00882 }
00883 }
00884
00885
00886 }
00887
00888 #endif // __Islands_h__