libMems/Islands.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: Islands.h,v 1.7 2004/03/01 02:40:08 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * This file is licensed under the GPL.
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifndef __Islands_h__
00010 #define __Islands_h__
00011 
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015 
00016 #include "libGenome/gnSequence.h"
00017 #include "libMems/SubstitutionMatrix.h"
00018 #include "libMems/IntervalList.h"
00019 #include "libMems/NumericMatrix.h"
00020 #include "libMems/MatchList.h"
00021 #include "libMems/GappedAlignment.h"
00022 #include "libMems/CompactGappedAlignment.h"
00023 #include "libMems/Aligner.h"
00024 #include <boost/multi_array.hpp>
00025 #include "libMems/HomologyHMM/homology.h"
00026 #include "libMems/Scoring.h"
00027 
00028 namespace mems {
00029 
00035 class Island{
00036 public:
00037         uint seqI;
00038         uint seqJ;
00039         int64 leftI;
00040         int64 leftJ;
00041         int64 rightI;
00042         int64 rightJ;
00043 };
00044 
00049 void simpleFindIslands( IntervalList& iv_list, uint island_size, std::ostream& island_out );
00050 void findIslandsBetweenLCBs( IntervalList& iv_list, uint island_size, std::ostream& island_out );
00051 void simpleFindIslands( IntervalList& iv_list, uint island_size, std::vector< Island >& island_list );
00052 
00053 class HssCols{
00054 public:
00055         uint seqI;
00056         uint seqJ;
00057         size_t left_col;
00058         size_t right_col;
00059 };
00060 
00061 typedef std::vector< HssCols > hss_list_t;
00062 typedef boost::multi_array< hss_list_t, 3 > hss_array_t;
00063 
00064 typedef HssCols IslandCols;     // use the same structure for island segs
00065 
00066 void findHssRandomWalkScoreVector( std::vector< score_t > scores, score_t significance_threshold, hss_list_t& hss_list, uint seqI = 0, uint seqJ = 0, boolean left_homologous = false, boolean right_homologous = false );
00067 
00068 template<typename MatchVector>
00069 void findHssRandomWalk( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, hss_array_t& hss_array, boolean left_homologous = false, boolean right_homologous = false );
00070 
00071 template<typename MatchVector>
00072 void hssColsToIslandCols( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, std::vector< HssCols >& hss_list, std::vector< IslandCols >& island_col_list );
00073 
00074 template<typename MatchVector>
00075 void findHssRandomWalkCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< CompactGappedAlignment<>* >& hss_list );
00076 
00077 template<typename MatchVector>
00078 void findIslandsRandomWalkCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< CompactGappedAlignment<>* >& island_list );
00079 
00080 template<typename MatchVector>
00081 void findIslandsRandomWalk( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< Island >& island_list );
00082 
00087 void addUnalignedIntervals( IntervalList& iv_list, std::set< uint > seq_set = std::set< uint >(), std::vector<gnSeqI> seq_lengths = std::vector<gnSeqI>() );
00088 
00094 void simpleFindBackbone( IntervalList& iv_list, uint backbone_size, uint max_gap_size, std::vector< GappedAlignment >& backbone_regions );
00095 
00099 void outputBackbone( const std::vector< GappedAlignment >& backbone_regions, std::ostream& backbone_out );
00100 
00101 void getGapBounds( std::vector<gnSeqI>& seq_lengths, std::vector< LCB >& adjacencies, uint seqJ, int leftI, int rightI, int64& left_start, int64& right_start );
00102 
00103 
00104 inline
00105 void findRightEndpoint( size_t seqI, size_t seqJ, score_t significance_threshold, std::vector< score_t >& scores, hss_list_t& hss_list )
00106 {
00107         // see how long it takes score_sum to go to 0, then scan forward to determine where the hss begins
00108         score_t score_sum = significance_threshold;
00109         size_t colI = scores.size();
00110         for( ; colI > 0; --colI )
00111         {
00112                 if( scores[colI-1] == INVALID_SCORE )
00113                         continue;
00114 
00115                 if( score_sum >= 0 && score_sum + scores[colI-1] < 0 )
00116                 {
00117                         // end of an excursion
00118                         score_sum = 0;
00119                         // backtrack to find the MSC in the other direction?
00120                         // call the entire segment between MSCs the HSS?
00121                         score_t rev_score_sum = 0;
00122                         size_t rev_ladder_point = colI-1;
00123                         size_t rcolI = colI-1;
00124                         for( ; ((int64)rcolI) < scores.size(); ++rcolI )
00125                         {
00126                                 if( scores[rcolI] == INVALID_SCORE )
00127                                         continue;
00128                                 if( rev_score_sum > significance_threshold )
00129                                         break;
00130                                 if( rev_score_sum >= 0 && rev_score_sum + scores[rcolI] < 0 )
00131                                 {
00132                                         rev_score_sum = 0;
00133                                 }else if( rev_score_sum == 0 && scores[rcolI] > 0 )
00134                                 {
00135                                         // start a new excursion
00136                                         rev_score_sum += scores[rcolI];
00137                                         rev_ladder_point = rcolI;
00138                                 }else
00139                                         rev_score_sum += scores[rcolI];
00140                         }
00141                         // the segment between ladder_point and rev_ladder_point is an HSS
00142                         if( rcolI < scores.size() )
00143                         {
00144                                 HssCols ic;
00145                                 ic.seqI = seqI;
00146                                 ic.seqJ = seqJ;
00147                                 ic.left_col = rev_ladder_point;
00148                                 ic.right_col = scores.size()-1;
00149                                 hss_list.push_back( ic );
00150                         }
00151                         break;
00152                 }else
00153                         score_sum += scores[colI-1];
00154         }
00155 }
00156 
00157 
00158 inline
00159 void findHssExcursions( std::vector< score_t > scores, score_t significance_threshold, hss_list_t& hss_list, uint seqI, uint seqJ, boolean left_hss, boolean right_hss )
00160 {
00161         score_t score_sum = left_hss ? significance_threshold : 0;      // start in an hss if non-homologous
00162         int64 ladder_point = 0;
00163         bool fwd_hss = left_hss;
00164 
00165         // scan left to right over the columns to identify HSS
00166         for( size_t colI = 0; colI <= scores.size(); ++colI )
00167         {
00168                 if( colI < scores.size() && scores[colI] == INVALID_SCORE )
00169                         continue;
00170 
00171                 if( colI == scores.size() || (score_sum >= 0 && score_sum + scores[colI] < 0)  )
00172                 {
00173                         // end of an excursion
00174                         if( colI == scores.size() && right_hss )
00175                                 fwd_hss = true;
00176 
00177                         score_sum = 0;
00178                         if( fwd_hss )
00179                         {
00180                                 // call the entire segment between the current column and the ladder point
00181                                 // an excursion
00182                                 HssCols ic;
00183                                 ic.seqI = seqI;
00184                                 ic.seqJ = seqJ;
00185                                 ic.left_col = ladder_point;
00186                                 if( colI == scores.size() )
00187                                         ic.right_col = colI - 1;
00188                                 else
00189                                         ic.right_col = colI;
00190                                 hss_list.push_back( ic );
00191                         }
00192                         fwd_hss = false;
00193                 }else if( score_sum == 0 && scores[colI] > 0 )
00194                 {
00195                         // start a new excursion
00196                         score_sum += scores[colI];
00197                         ladder_point = colI;
00198                 }else
00199                         score_sum += scores[colI];
00200 
00201                 if( score_sum > significance_threshold )
00202                         fwd_hss = true;
00203         }
00204 }
00205 
00206 
00207 inline
00208 void findMscFromExcursions( std::vector< score_t > scores, score_t significance_threshold, hss_list_t& hss_list, hss_list_t& msc_list, uint seqI, uint seqJ, boolean left_hss, boolean right_hss )
00209 {
00210         score_t left_end_score = left_hss ? significance_threshold : 0;
00211         score_t right_end_score = right_hss ? significance_threshold : 0;
00212         score_t score_sum = left_end_score;     // start in an hss if non-homologous
00213         int64 ladder_point = 0;
00214         bool fwd_hss = true;
00215         if( left_hss )
00216                 scores.front() = significance_threshold;
00217         if( right_hss )
00218                 scores.back() = significance_threshold;
00219 
00220         // for each excursion in hss_list
00221         for( size_t exI = 0; exI < hss_list.size(); exI++ )
00222         {
00223                 // create a vector of score sums
00224                 size_t col_base = hss_list[exI].left_col;
00225                 size_t col_count = hss_list[exI].right_col - hss_list[exI].left_col + 1;
00226                 // find cols with positive scores
00227                 size_t positive_count = 0;
00228                 for( size_t colI = hss_list[exI].left_col; colI < hss_list[exI].right_col + 1; colI++ )
00229                 {
00230                         if( scores[colI] <= 0 || scores[colI] == INVALID_SCORE  )
00231                                 continue;
00232                         positive_count++;
00233                 }
00234                 std::vector< size_t > pos_map(positive_count);
00235                 positive_count = 0;
00236                 for( size_t colI = hss_list[exI].left_col; colI < hss_list[exI].right_col + 1; colI++ )
00237                 {
00238                         if( scores[colI] <= 0 || scores[colI] == INVALID_SCORE )
00239                                 continue;
00240                         pos_map[positive_count] = colI;
00241                         positive_count++;
00242                 }
00243 
00244                 std::vector< score_t > score_sums(positive_count, 0);
00245                 size_t sum_base = 0;
00246                 std::vector<HssCols> cur_msc_list;
00247                 size_t invalid_count = 0;
00248                 for( size_t colI = hss_list[exI].left_col; colI < hss_list[exI].right_col + 1; colI++ )
00249                 {
00250                         // skip this column if it has an invalid score
00251                         if( scores[colI] == INVALID_SCORE )
00252                                 continue;
00253                         // otherwise add the current column score to all relevant score sums
00254                         int64 msc_col = -1;
00255                         size_t sumI = sum_base;
00256                         for( ; sumI < score_sums.size(); sumI++ )
00257                         {
00258                                 // break the loop if the next positive column is past colI
00259                                 if( pos_map[sumI] > colI )
00260                                         break;
00261                                 // don't worry about this one if it's invalid
00262                                 if( score_sums[sumI] < 0 )
00263                                         continue;
00264                                 score_sums[sumI] += scores[colI];
00265                                 // if the local score bottoms out to 0 then this one has failed
00266                                 if( score_sums[sumI] < 0 )
00267                                         invalid_count++;
00268                                 // take the right-most starting column which yields an msc
00269                                 if( score_sums[sumI] >= significance_threshold )
00270                                         msc_col = sumI;
00271                         }
00272                         // did we find a minimum significant cluster?
00273                         if( msc_col != -1 )
00274                         {
00275                                 HssCols ic;
00276                                 ic.seqI = seqI;
00277                                 ic.seqJ = seqJ;
00278                                 ic.left_col = pos_map[msc_col];
00279                                 ic.right_col = colI;
00280                                 cur_msc_list.push_back( ic );
00281                                 sum_base = msc_col + 1; // any new MSC needs to be to the right of this one's left-end
00282                         }
00283                         // or did all of our local sums become invalid?
00284 //                      if( invalid_count == sumI - sum_base )
00285 //                      {
00286 //                              sum_base = sumI + 1;
00287 //                      }
00288                 }
00289                 // merge any overlapping MSCs
00290                 size_t disjoint = cur_msc_list.size() > 0 ? 1 : 0;
00291                 for( size_t mscI = 1; mscI < cur_msc_list.size(); mscI++ )
00292                 {
00293                         if( cur_msc_list[mscI].left_col <= cur_msc_list[mscI-1].right_col )
00294                         {
00295                                 cur_msc_list[mscI].left_col = cur_msc_list[mscI-1].left_col;
00296                                 cur_msc_list[mscI-1].left_col = (std::numeric_limits<size_t>::max)();
00297                                 cur_msc_list[mscI-1].right_col = (std::numeric_limits<size_t>::max)();
00298                         }else
00299                                 disjoint++;
00300                 }
00301                 std::vector<HssCols> cur_msc_list2( disjoint );
00302                 size_t mI = 0;
00303                 for( size_t mscI = 0; mscI < cur_msc_list.size(); mscI++ )
00304                 {
00305                         if( cur_msc_list[mscI].left_col != (std::numeric_limits<size_t>::max)() )
00306                                 cur_msc_list2[mI++] = cur_msc_list[mscI];
00307                 }
00308 
00309                 // TODO: grow MSC boundaries to include all surrounding positively scoring regions
00310                 for( mI = 0; mI < cur_msc_list2.size(); mI++ )
00311                 {
00312                         HssCols& hss = cur_msc_list2[mI];
00313                         // first left, then right
00314                         size_t lcolI = hss.left_col + 1;
00315                         for( ; lcolI > 0 && scores[lcolI - 1] > 0; lcolI-- );
00316                         size_t rcolI = hss.right_col;
00317                         for( ; rcolI < scores.size() && scores[rcolI] > 0; rcolI++ );
00318                         hss.left_col = lcolI;
00319                         hss.right_col = rcolI - 1;
00320                 }
00321                 swap( cur_msc_list, cur_msc_list2 );
00322 
00323                 // merge overlapping MSCs again
00324                 // BAD: copied code from above
00325 
00326                 disjoint = cur_msc_list.size() > 0 ? 1 : 0;
00327                 for( size_t mscI = 1; mscI < cur_msc_list.size(); mscI++ )
00328                 {
00329                         if( cur_msc_list[mscI].left_col <= cur_msc_list[mscI-1].right_col )
00330                         {
00331                                 cur_msc_list[mscI].left_col = cur_msc_list[mscI-1].left_col;
00332                                 cur_msc_list[mscI-1].left_col = (std::numeric_limits<size_t>::max)();
00333                                 cur_msc_list[mscI-1].right_col = (std::numeric_limits<size_t>::max)();
00334                         }else
00335                                 disjoint++;
00336                 }
00337                 mI = msc_list.size();
00338                 msc_list.resize( mI + disjoint );
00339                 for( size_t mscI = 0; mscI < cur_msc_list.size(); mscI++ )
00340                 {
00341                         if( cur_msc_list[mscI].left_col != (std::numeric_limits<size_t>::max)() )
00342                                 msc_list[mI++] = cur_msc_list[mscI];
00343                 }
00344         }
00345 }
00346 
00347 static char charmap[128];
00348 inline
00349 char* getCharmap()
00350 {
00351         static bool initialized = false;
00352         if(initialized)
00353                 return charmap;
00354         memset(charmap, 0, 128);
00355         charmap['a'] = 0;
00356         charmap['c'] = 1;
00357         charmap['g'] = 2;
00358         charmap['t'] = 3;
00359         charmap['-'] = 4;
00360         charmap['A'] = 0;
00361         charmap['C'] = 1;
00362         charmap['G'] = 2;
00363         charmap['T'] = 3;
00364         charmap['-'] = 4;
00365         initialized = true;
00366         return charmap;
00367 }
00368 // a mapping from pairwise alignment columns to HomologyHMM emission codes
00369 // row/column indices are as given by the charmap above (ACGT- == 01234).
00370 static char colmap[5][5] = {
00371 //    A   C   G   T   -
00372         {'1','3','4','5','7'},  // A
00373         {'3','2','6','4','7'},  // C
00374         {'4','6','2','3','7'},  // G
00375         {'5','4','3','1','7'},  // T
00376         {'7','7','7','7','\0'},  // -
00377 };
00378 
00379 
00380 inline
00381 void findHssHomologyHMM( std::vector< std::string >& aln_table, hss_list_t& hss_list, uint seqI, uint seqJ, const Params& hmm_params,
00382                                                 boolean left_homologous, boolean right_homologous )
00383 {
00384         static char* charmap = getCharmap();
00385 
00386         // encode the alignment as column states
00387         std::string column_states(aln_table[0].size(),'q');
00388         vector< size_t > col_reference(column_states.size(), (std::numeric_limits<size_t>::max)() );
00389         size_t refI = 0;
00390         for( size_t colI = 0; colI < column_states.size(); colI++ )
00391         {
00392                 char a = charmap[aln_table[seqI][colI]];
00393                 char b = charmap[aln_table[seqJ][colI]];
00394                 column_states[colI] = colmap[a][b];
00395                 if(column_states[colI] != 0 )
00396                         col_reference[refI++] = colI;
00397         }
00398         // filter out the gap/gap cols
00399         std::string::iterator sitr = std::remove(column_states.begin(), column_states.end(), 0);
00400         column_states.resize(sitr - column_states.begin());
00401 
00402         for( size_t colI = 2; colI < column_states.size(); colI++ )
00403         {
00404                 if( column_states[colI] == '7' &&
00405                         column_states[colI-1] == '7' &&
00406                         (column_states[colI-2] == '7' || column_states[colI-2] == '8') )
00407                         column_states[colI-1] = '8';
00408         }
00409         if( column_states.size() > 1 && column_states[0] == '7' && (column_states[1] == '7' || column_states[1] == '8'))
00410                 column_states[0] = '8';
00411         if( column_states.size() > 1 && column_states[column_states.size()-1] == '7' && (column_states[column_states.size()-2] == '7'|| column_states[column_states.size()-2] == '8') )
00412                 column_states[column_states.size()-1] = '8';
00413         // now feed it to the Homology prediction HMM
00414         string prediction;
00415         if( right_homologous && !left_homologous )
00416                 std::reverse(column_states.begin(), column_states.end());
00417 
00418         run(column_states, prediction, hmm_params);
00419 
00420         if( right_homologous && !left_homologous )
00421                 std::reverse(prediction.begin(), prediction.end());
00422         size_t prev_h = 0;
00423         size_t i = 1;
00424         for( ; i < prediction.size(); i++ )
00425         {
00426                 if( prediction[i] == 'H' && prediction[i-1] == 'N' )
00427                 {
00428                         prev_h = i;
00429                 }
00430                 if( prediction[i] == 'N' && prediction[i-1] == 'H' )
00431                 {
00432                         HssCols hc;
00433                         hc.seqI = seqI;
00434                         hc.seqJ = seqJ;
00435                         hc.left_col = col_reference[prev_h];
00436                         hc.right_col = col_reference[i-1];
00437                         hss_list.push_back(hc);
00438                         prev_h = i;
00439                 }
00440         }
00441         // get the last one
00442         if( prediction[i-1] == 'H' )
00443         {
00444                 HssCols hc;
00445                 hc.seqI = seqI;
00446                 hc.seqJ = seqJ;
00447                 hc.left_col = col_reference[prev_h];
00448                 hc.right_col = col_reference[i-1];
00449                 hss_list.push_back(hc);
00450         }
00451 }
00452 
00453 inline
00454 void findHssRandomWalkScoreVector( std::vector< score_t > scores, score_t significance_threshold, hss_list_t& hss_list, uint seqI, uint seqJ, boolean left_homologous, boolean right_homologous )
00455 {
00456 
00457         score_t left_end_score = left_homologous ? 0 : significance_threshold;
00458         score_t right_end_score = right_homologous ? 0 : significance_threshold;
00459         score_t score_sum = left_end_score;     // start in an hss if non-homologous
00460         score_t lrh = score_sum;
00461         int64 ladder_point = -1;
00462         int64 rev_ladder_point = 0;
00463         bool fwd_hss = !left_homologous;
00464 
00465         for( size_t colI = 0; colI <= scores.size(); ++colI )
00466         {
00467                 if( colI < scores.size() && scores[colI] == INVALID_SCORE )
00468                         continue;
00469 
00470                 if( colI == scores.size() || (score_sum >= 0 && score_sum + scores[colI] < 0) ||
00471                         (score_sum >= lrh - significance_threshold && score_sum + scores[colI] < lrh - significance_threshold ) )
00472                 {
00473                         if( !fwd_hss && colI == scores.size() && !right_homologous )
00474                         {
00475                                 fwd_hss = true;
00476                                 if( score_sum <= 0 )
00477                                         ladder_point = colI - 1;
00478                         }
00479                         // end of an excursion
00480                         score_sum = 0;
00481                         lrh = 0;
00482                         if( fwd_hss )
00483                         {
00484                                 // backtrack to find the MSC in the other direction?
00485                                 // call the entire segment between MSCs the HSS?
00486                                 score_t rev_score_sum = 0;
00487                                 if( colI == scores.size() && !right_homologous )
00488                                         rev_score_sum = significance_threshold;
00489                                 size_t rev_ladder_point = ladder_point;
00490                                 if( colI == scores.size() && !right_homologous )
00491                                         rev_ladder_point = colI - 1;
00492                                 for( size_t rcolI = colI; ((int64)rcolI) > ladder_point && rcolI > 0; --rcolI )
00493                                 {
00494                                         if( scores[rcolI-1] == INVALID_SCORE )
00495                                                 continue;
00496                                         if( rev_score_sum > significance_threshold )
00497                                                 break;
00498                                         if( rev_score_sum >= 0 && rev_score_sum + scores[rcolI-1] < 0 )
00499                                         {
00500                                                 rev_score_sum = 0;
00501                                         }else if( rev_score_sum == 0 && scores[rcolI-1] > 0 )
00502                                         {
00503                                                 // start a new excursion
00504                                                 rev_score_sum += scores[rcolI-1];
00505                                                 rev_ladder_point = rcolI-1;
00506                                         }else
00507                                                 rev_score_sum += scores[rcolI-1];
00508 
00509                                 }
00510                                 // don't make an HSS if there was no reverse HSS, unless we
00511                                 // ended the excursion artificially because we hit the end of the block...
00512                                 if( ((rev_ladder_point != 0 || ladder_point != -1) ||
00513                                         colI == scores.size()) && rev_ladder_point != -1 )
00514                                 {
00515                                         if( colI == scores.size() && ladder_point == -1 )
00516                                         {
00517                                                 rev_ladder_point = scores.size()-1;
00518                                         }
00519                                         if( ladder_point == -1 )
00520                                                 ladder_point = 0;
00521                                         // the segment between ladder_point and rev_ladder_point is an HSS
00522                                         HssCols ic;
00523                                         ic.seqI = seqI;
00524                                         ic.seqJ = seqJ;
00525                                         ic.left_col = ladder_point;
00526                                         ic.right_col = rev_ladder_point;
00527                                         hss_list.push_back( ic );
00528                                 }
00529                         }
00530                         fwd_hss = false;
00531                 }else if( score_sum == 0 && scores[colI] > 0 )
00532                 {
00533                         // start a new excursion
00534                         score_sum += scores[colI];
00535                         ladder_point = colI;
00536                 }else
00537                         score_sum += scores[colI];
00538 
00539                 if( score_sum > significance_threshold )
00540                         fwd_hss = true;
00541                 if( score_sum > lrh )
00542                         lrh = score_sum;
00543         }
00544 }
00545 
00546 template< typename MatchVector >
00547 void findHssRandomWalk_v2( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, hss_array_t& hss_array, boolean left_homologous, boolean right_homologous )
00548 {
00549         typedef typename MatchVector::value_type MatchType;
00550         if( iv_list.size() == 0 )
00551                 return;
00552         uint seq_count = seq_table.size();
00553         hss_array.resize( boost::extents[seq_count][seq_count][iv_list.size()] );
00554         for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00555                 const MatchType& iv = iv_list[ iv_listI ];
00556                 std::vector< std::string > aln_table;
00557                 GetAlignment( *iv, seq_table, aln_table );
00558                 
00559                 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00560                         uint seqJ;
00561                         for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00562 
00563                                 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00564                                 hss_list.clear();
00565 
00566                                 std::vector< score_t > scores;
00567                                 computeMatchScores( aln_table[seqI], aln_table[seqJ], scoring, scores );
00568                                 computeGapScores( aln_table[seqI], aln_table[seqJ], scoring, scores );
00569 
00570                                 // Invert the scores since we're trying to detect rare bouts of non-homologous sequence
00571                                 for( size_t sI = 0; sI < scores.size(); ++sI )
00572                                         if( scores[sI] != INVALID_SCORE)
00573                                                 scores[sI] = -scores[sI];
00574 
00575                                 hss_list_t excursion_list;
00576                                 findHssExcursions( scores, significance_threshold, excursion_list, seqI, seqJ, !left_homologous, !right_homologous );
00577                                 findMscFromExcursions( scores, significance_threshold, excursion_list, hss_list, seqI, seqJ, !left_homologous, !right_homologous );
00578                         }
00579                 }
00580         }
00581 }
00582 
00583 template< typename MatchVector >
00584 void findHssRandomWalk( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, hss_array_t& hss_array, boolean left_homologous, boolean right_homologous )
00585 {
00586         typedef typename MatchVector::value_type MatchType;
00587         if( iv_list.size() == 0 )
00588                 return;
00589         uint seq_count = seq_table.size();
00590         hss_array.resize( boost::extents[seq_count][seq_count][iv_list.size()] );
00591         for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00592                 const MatchType& iv = iv_list[ iv_listI ];
00593                 std::vector< std::string > aln_table;
00594                 GetAlignment( *iv, seq_table, aln_table );
00595                 
00596                 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00597                         uint seqJ;
00598                         for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00599 
00600                                 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00601                                 hss_list.clear();
00602 
00603                                 std::vector< score_t > scores;
00604                                 computeMatchScores( aln_table[seqI], aln_table[seqJ], scoring, scores );
00605                                 computeGapScores( aln_table[seqI], aln_table[seqJ], scoring, scores );
00606 
00607                                 // Invert the scores since we're trying to detect rare bouts of non-homologous sequence
00608                                 for( size_t sI = 0; sI < scores.size(); ++sI )
00609                                         if( scores[sI] != INVALID_SCORE)
00610                                                 scores[sI] = -scores[sI];
00611                                 findHssRandomWalkScoreVector( scores, significance_threshold, hss_list, seqI, seqJ, left_homologous, right_homologous );
00612                         }
00613                 }
00614         }
00615 }
00616 
00617 template< typename MatchVector >
00618 void findHssHomologyHMM( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table,  hss_array_t& hss_array, const Params& hmm_params, boolean left_homologous, boolean right_homologous )
00619 {
00620         typedef typename MatchVector::value_type MatchType;
00621         if( iv_list.size() == 0 )
00622                 return;
00623         uint seq_count = seq_table.size();
00624         hss_array.resize( boost::extents[seq_count][seq_count][iv_list.size()] );
00625         for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00626                 const MatchType& iv = iv_list[ iv_listI ];
00627                 std::vector< std::string > aln_table;
00628                 GetAlignment( *iv, seq_table, aln_table );
00629                 
00630                 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00631                         uint seqJ;
00632                         for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00633 
00634                                 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00635                                 hss_list.clear();
00636                                 findHssHomologyHMM( aln_table, hss_list, seqI, seqJ, hmm_params, left_homologous, right_homologous );
00637                         }
00638                 }
00639         }
00640 }
00641 
00642 
00643 template< typename MatchVector >
00644 void HssColsToIslandCols( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, hss_array_t& hss_array, hss_array_t& island_col_array )
00645 {
00646 
00647         typedef typename MatchVector::value_type MatchType;
00648         uint seq_count = seq_table.size();
00649         island_col_array.resize( boost::extents[seq_count][seq_count][iv_list.size()] );
00650         for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00651                 const MatchType& iv = iv_list[ iv_listI ];
00652                 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00653                         uint seqJ;
00654                         for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00655                                 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00656                                 hss_list_t& island_col_list = island_col_array[seqI][seqJ][iv_listI];
00657                                 ComplementHss(iv_list[iv_listI]->AlignmentLength(),hss_list,island_col_list,seqI,seqJ);
00658                         }
00659                 }
00660         }
00661 }
00662 inline
00663 void ComplementHss( const size_t alignment_length, hss_list_t& hss_list, hss_list_t& island_col_list, uint seqI=0, uint seqJ=0 )
00664 {
00665 
00666 
00667         size_t left_col = 0;
00668         for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
00669         {
00670                 if( left_col >= hss_list[hssI].left_col ) 
00671                 {
00672                         left_col = hss_list[hssI].right_col + 1;
00673                         continue;       // handle the case where the HSS starts at col 0
00674                 }
00675                 // ending an island
00676                 IslandCols isle;
00677                 isle.seqI = seqI;
00678                 isle.seqJ = seqJ;
00679                 isle.left_col = left_col;
00680                 isle.right_col = hss_list[hssI].left_col;
00681                 island_col_list.push_back(isle);
00682                 left_col = hss_list[hssI].right_col + 1;
00683         }
00684 
00685         if( left_col < alignment_length )
00686         {
00687                 // add the last island
00688                 IslandCols isle;
00689                 isle.seqI = seqI;
00690                 isle.seqJ = seqJ;
00691                 isle.left_col = left_col;
00692                 isle.right_col = alignment_length-1;
00693                 island_col_list.push_back(isle);
00694         }
00695 }
00696 
00697 template< typename MatchVector >
00698 void HssArrayToCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, hss_array_t& hss_array, std::vector< CompactGappedAlignment<>* >& cga_list )
00699 {
00700         typedef typename MatchVector::value_type MatchType;
00701         uint seq_count = seq_table.size();
00702         for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00703                 const MatchType& iv = iv_list[ iv_listI ];
00704                 
00705                 CompactGappedAlignment<>* iv_cga = dynamic_cast< CompactGappedAlignment<>* >(iv);
00706                 bool allocated = false;
00707                 if( iv_cga == NULL )
00708                 {
00709                         CompactGappedAlignment<> tmp_cga;
00710                         iv_cga = tmp_cga.Copy();
00711                         new (iv_cga) CompactGappedAlignment<>(*iv);
00712                         allocated = true;
00713                 }
00714                 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00715                         for( uint seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00716                                 hss_list_t& isle_list = hss_array[seqI][seqJ][iv_listI];
00717                                 for( size_t curI = 0; curI < isle_list.size(); ++curI )
00718                                 {
00719                                         // extract a cga
00720                                         CompactGappedAlignment<> tmp_cga;
00721                                         cga_list.push_back( tmp_cga.Copy() );
00722                                         iv_cga->copyRange( *(cga_list.back()), isle_list[curI].left_col, isle_list[curI].right_col - isle_list[curI].left_col + 1 );
00723                                         if( cga_list.back()->LeftEnd(0) == NO_MATCH )
00724                                         {
00725                                                 // this one must have been covering an invalid region (gaps aligned to gaps)
00726                                                 cga_list.back()->Free();
00727                                                 cga_list.erase( cga_list.end()-1 );
00728                                         }
00729                                 }
00730                         }
00731                 }
00732                 if( allocated )
00733                         iv_cga->Free();
00734         }
00735 }
00736 
00737 template< typename MatchVector >
00738 void findHssRandomWalkCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< CompactGappedAlignment<>* >& hss_list )
00739 {
00740         if( iv_list.size() == 0 )
00741                 return;
00742         hss_array_t hss_array;
00743         findHssRandomWalk( iv_list, seq_table, scoring, significance_threshold, hss_array );
00744         hss_array_t homo_array;
00745         HssColsToIslandCols( iv_list, seq_table, hss_array, homo_array );
00746         HssArrayToCga(iv_list, seq_table, homo_array, hss_list);
00747 }
00748 template< typename MatchVector >
00749 void findIslandsRandomWalkCga( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< CompactGappedAlignment<>* >& island_list )
00750 {
00751         if( iv_list.size() == 0 )
00752                 return;
00753         hss_array_t hss_array;
00754         findHssRandomWalk( iv_list, seq_table, scoring, significance_threshold, hss_array );
00755         hss_array_t island_col_array;
00756 //      HssColsToIslandCols( iv_list, hss_array, island_col_array );
00757         HssArrayToCga(iv_list, seq_table, hss_array, island_list);
00758 }
00759 
00760 template< typename MatchVector >
00761 void findIslandsRandomWalk( const MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, const PairwiseScoringScheme& scoring, score_t significance_threshold, std::vector< Island >& island_list )
00762 {
00763         if( iv_list.size() == 0 )
00764                 return;
00765         hss_array_t hss_array;
00766         findHssRandomWalk( iv_list, seq_table, scoring, significance_threshold, hss_array );
00767 
00768         typedef typename MatchVector::value_type MatchType;
00769         for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00770                 const MatchType& iv = iv_list[ iv_listI ];
00771                 uint seq_count = seq_table.size();
00772                 std::vector< std::string > aln_table;
00773                 GetAlignment( *iv, seq_table, aln_table );
00774                 
00775                 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00776                         uint seqJ;
00777                         for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00778                                 hss_list_t& hss_list = hss_array[seqI][seqJ][iv_listI];
00779 
00780                                 size_t cur_hss = 0;
00781                                 uint columnI = 0;
00782                                 gnSeqI curI = 0;
00783                                 gnSeqI curJ = 0;
00784                                 gnSeqI lastI = 0;
00785                                 gnSeqI lastJ = 0;
00786                                 for( columnI = 0; columnI < aln_table[0].size(); columnI++ ){
00787                                         if( aln_table[ seqI ][ columnI ] != '-' )
00788                                                 curI++;
00789                                         if( aln_table[ seqJ ][ columnI ] != '-' )
00790                                                 curJ++;
00791 
00792                                         if( cur_hss < hss_list.size() && 
00793                                                 columnI == hss_list[cur_hss].left_col )
00794                                         {
00795                                                 // ending an island
00796                                                 int64 leftI = iv.Start( seqI );
00797                                                 int64 rightI = leftI < 0 ? leftI - curI : leftI + curI;
00798                                                 leftI = leftI < 0 ? leftI - lastI : leftI + lastI;
00799                                                 int64 leftJ = iv.Start( seqJ );
00800                                                 int64 rightJ = leftJ < 0 ? leftJ - curJ : leftJ + curJ;
00801                                                 leftJ = leftJ < 0 ? leftJ - lastJ : leftJ + lastJ;
00802                                                 Island isle;
00803                                                 isle.seqI = seqI;
00804                                                 isle.seqJ = seqJ;
00805                                                 isle.leftI = leftI;
00806                                                 isle.leftJ = leftJ;
00807                                                 isle.rightI = rightI;
00808                                                 isle.rightJ = rightJ;
00809                                                 island_list.push_back(isle);
00810                                         }
00811                                         else if( cur_hss < hss_list.size() &&
00812                                                 columnI == hss_list[cur_hss].right_col )
00813                                         {
00814                                                 // starting an island
00815                                                 lastI = curI;
00816                                                 lastJ = curJ;
00817                                                 cur_hss++;
00818                                         }
00819                                 }
00820 
00821                                 // add the last island
00822                                 int64 leftI = iv.Start( seqI );
00823                                 int64 rightI = leftI < 0 ? leftI - curI : leftI + curI;
00824                                 leftI = leftI < 0 ? leftI - lastI : leftI + lastI;
00825                                 int64 leftJ = iv.Start( seqJ );
00826                                 int64 rightJ = leftJ < 0 ? leftJ - curJ : leftJ + curJ;
00827                                 leftJ = leftJ < 0 ? leftJ - lastJ : leftJ + lastJ;
00828                                 Island isle;
00829                                 isle.seqI = seqI;
00830                                 isle.seqJ = seqJ;
00831                                 isle.leftI = leftI;
00832                                 isle.leftJ = leftJ;
00833                                 isle.rightI = rightI;
00834                                 isle.rightJ = rightJ;
00835                                 island_list.push_back(isle);
00836                         }
00837                 }
00838         }
00839 }
00840 
00841 template< class IntervalListType >
00842 void addUnalignedRegions( IntervalListType& iv_list)
00843 {
00844         std::vector< AbstractMatch* > new_ivs;
00845         std::vector< AbstractMatch* > iv_ptrs(iv_list.size());
00846         for( size_t i = 0; i < iv_list.size(); ++i )
00847                 iv_ptrs[i] = &iv_list[i];
00848         for( size_t seqI = 0; seqI < iv_list.seq_table.size(); ++seqI )
00849         {
00850                 SingleStartComparator< AbstractMatch > ssc( seqI );
00851                 std::sort( iv_ptrs.begin(), iv_ptrs.end(), ssc );
00852                 size_t ivI = 0;
00853                 for( ; ivI < iv_ptrs.size(); ++ivI )
00854                         if( iv_ptrs[ivI]->LeftEnd(seqI) != NO_MATCH )
00855                                 break;
00856                 std::list< AbstractMatch* > iv_ptr_list;
00857                 iv_ptr_list.insert( iv_ptr_list.end(), iv_ptrs.begin()+ivI, iv_ptrs.end() );
00858                 AddGapMatches( iv_ptr_list, iv_ptr_list.begin(), iv_ptr_list.end(), seqI, 1, iv_list.seq_table[seqI]->length()+1, AbstractMatch::forward, iv_list.seq_table.size() );
00859                 std::list< AbstractMatch* >::iterator iter = iv_ptr_list.begin();
00860                 while( ivI != iv_ptrs.size() && iter != iv_ptr_list.end() )
00861                 {
00862                         if( iv_ptrs[ivI] == *iter )
00863                                 ivI++;
00864                         else
00865                                 new_ivs.push_back( *iter );
00866                         ++iter;
00867                 }
00868                 while( iter != iv_ptr_list.end() )
00869                 {
00870                         new_ivs.push_back( *iter );
00871                         ++iter;
00872                 }
00873         }
00874         // now add all the new intervals to iv_list
00875         size_t prev_size = iv_list.size();
00876         iv_list.resize( iv_list.size() + new_ivs.size() );
00877         for( size_t newI = 0; newI < new_ivs.size(); ++newI )
00878         {
00879                 Interval iv( new_ivs.begin() + newI, new_ivs.begin() + newI + 1 );
00880                 iv_list[prev_size + newI] = iv;
00881                 new_ivs[newI]->Free();
00882         }
00883 }
00884 
00885 
00886 }
00887 
00888 #endif // __Islands_h__

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6