libMems/Backbone.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: Backbone.cpp,v 1.12 2004/04/19 23:11:19 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * Please see the file called COPYING for licensing, copying, and modification
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012 
00013 #include "libMems/ProgressiveAligner.h"
00014 #include "libMems/Backbone.h"
00015 #include "libMems/Islands.h"
00016 #include "libMems/CompactGappedAlignment.h"
00017 
00018 #include <boost/property_map.hpp>
00019 #include <boost/graph/graph_traits.hpp>
00020 #include <boost/graph/adjacency_list.hpp>
00021 #include <boost/graph/topological_sort.hpp>
00022 #include <boost/graph/johnson_all_pairs_shortest.hpp>
00023 #include <boost/graph/depth_first_search.hpp>
00024 #include <boost/graph/undirected_dfs.hpp>
00025 
00026 using namespace std;
00027 using namespace genome;
00028 namespace mems {
00029 
00030 
00031 template< typename MatchVector >
00032 void getBpList( MatchVector& mvect, uint seq, vector< gnSeqI >& bp_list )
00033 {
00034         bp_list.clear();
00035         for( size_t ivI = 0; ivI < mvect.size(); ivI++ )
00036         {
00037                 if( mvect[ivI]->LeftEnd(seq) == NO_MATCH )
00038                         continue;
00039                 bp_list.push_back( mvect[ivI]->LeftEnd(seq) );
00040                 bp_list.push_back( mvect[ivI]->RightEnd(seq)+1 );
00041         }
00042         std::sort( bp_list.begin(), bp_list.end() );
00043 }
00044 
00045 template< typename MatchVector >
00046 void createMap( const MatchVector& mv_from, const MatchVector& mv_to, vector< size_t >& map )
00047 {
00048         typedef typename MatchVector::value_type MatchPtr;
00049         vector< pair< MatchPtr, size_t > > m1(mv_from.size());
00050         vector< pair< MatchPtr, size_t > > m2(mv_to.size());
00051         for( size_t i = 0; i < mv_from.size(); ++i )
00052                 m1[i] = make_pair( mv_from[i], i );
00053         for( size_t i = 0; i < mv_to.size(); ++i )
00054                 m2[i] = make_pair( mv_to[i], i );
00055         std::sort( m1.begin(), m1.end() );
00056         std::sort( m2.begin(), m2.end() );
00057         map.resize( m1.size() );
00058         for( size_t i = 0; i < m1.size(); ++i )
00059                 map[m1[i].second] = m2[i].second;
00060 }
00061 
00062 typedef pair< size_t, Interval* > iv_tracker_t;
00063 class IvTrackerComp
00064 {
00065 public:
00066         IvTrackerComp( uint seq ) : ssc( seq ) {}
00067         bool operator()( const iv_tracker_t& a, const iv_tracker_t& b )
00068         {
00069                 return ssc(a.second, b.second);
00070         }
00071 private:
00072         SingleStartComparator<Interval> ssc;
00073 };
00074 
00075 const int LEFT_NEIGHBOR = -1;
00076 const int RIGHT_NEIGHBOR = 1;
00077 typedef vector< size_t > neighbor_t;
00078 
00079 neighbor_t& getNeighbor( pair< neighbor_t, neighbor_t >& entry, int direction )
00080 {
00081         if( direction == RIGHT_NEIGHBOR )
00082                 return entry.first;
00083         else
00084                 return entry.second;
00085 }
00086 
00087 
00088 void collapseCollinear( IntervalList& iv_list )
00089 {
00090         if( iv_list.size() == 0 )
00091                 return; // nothing to see here, move along...
00092         const size_t seq_count = iv_list.seq_table.size();
00093         std::vector< Interval* > iv_ptrs(iv_list.size());
00094         size_t lilI = 0;
00095         for( size_t i = 0; i < iv_list.size(); ++i )
00096         {
00097                 // ignore unaligned regions
00098                 if( iv_list[i].Multiplicity() < 2 )
00099                         continue;
00100                 iv_ptrs[lilI++] = &iv_list[i];
00101         }
00102         iv_ptrs.resize(lilI);
00103         const size_t NEIGHBOR_UNKNOWN = (std::numeric_limits<size_t>::max)();
00104         neighbor_t lefties_tmp( seq_count, NEIGHBOR_UNKNOWN );
00105         pair< neighbor_t, neighbor_t > neighbor_pair( lefties_tmp, lefties_tmp );
00106         vector< pair< neighbor_t, neighbor_t > > neighbor_list( iv_ptrs.size(), neighbor_pair );
00107         vector< iv_tracker_t > iv_tracker( iv_ptrs.size() );
00108         for( size_t i = 0; i < iv_ptrs.size(); ++i )
00109         {
00110                 iv_tracker[i] = make_pair( i, iv_ptrs[i] );
00111         }
00112         for( size_t seqI = 0; seqI < seq_count; ++seqI )
00113         {
00114                 IvTrackerComp ivc( seqI );
00115                 sort( iv_tracker.begin(), iv_tracker.end(), ivc );
00116                 size_t prev_i = NEIGHBOR_UNKNOWN;
00117                 size_t cur_i = NEIGHBOR_UNKNOWN;
00118                 for( size_t i = 0; i < iv_tracker.size(); ++i )
00119                 {
00120                         if( iv_tracker[i].second->LeftEnd(seqI) == NO_MATCH )
00121                                 continue;
00122                         if( cur_i != NEIGHBOR_UNKNOWN )
00123                         {
00124                                 neighbor_list[cur_i].first[seqI] = prev_i;
00125                                 neighbor_list[cur_i].second[seqI] = iv_tracker[i].first;
00126                         }
00127                         prev_i = cur_i;
00128                         cur_i = iv_tracker[i].first;
00129                 }
00130                 // get the last one
00131                 if( cur_i != NEIGHBOR_UNKNOWN )
00132                 {
00133                         neighbor_list[cur_i].first[seqI] = prev_i;
00134                         neighbor_list[cur_i].second[seqI] = NEIGHBOR_UNKNOWN;
00135                 }
00136         }
00137 
00138         // now look for neighbor pair entries which can be merged
00139         for( int d = -1; d < 2; d+= 2 ) // iterate over both directions
00140         {
00141                 size_t unknown_count = 0;
00142                 for( size_t nI = 0; nI < neighbor_list.size(); ++nI )
00143                 {
00144                         size_t nayb = NEIGHBOR_UNKNOWN;
00145                         size_t seqI = 0;
00146                         bool parity = false;
00147                         size_t ct = 0;
00148                         for( ; seqI < seq_count; ++seqI )
00149                         {
00150                                 if( iv_ptrs[nI]->Orientation(seqI) == AbstractMatch::undefined )
00151                                         continue;
00152                                 int orient = iv_ptrs[nI]->Orientation(seqI) == AbstractMatch::forward ? 1 : -1;
00153 
00154                                 if( nayb == NEIGHBOR_UNKNOWN )
00155                                 {
00156                                         nayb = getNeighbor( neighbor_list[nI], d * orient * -1 )[seqI];
00157                                         if( nayb != NEIGHBOR_UNKNOWN )
00158                                                 parity = iv_ptrs[nI]->Orientation(seqI) == iv_ptrs[nayb]->Orientation(seqI);
00159                                 }
00160                                 else if( nayb != getNeighbor( neighbor_list[nI], d * orient * -1 )[seqI] )
00161                                         break;
00162                                 else if( parity != (iv_ptrs[nI]->Orientation(seqI) == iv_ptrs[nayb]->Orientation(seqI)) )
00163                                         break;
00164                                 if( nayb != NEIGHBOR_UNKNOWN )
00165                                         ct++;
00166                         }
00167                         if( seqI < seq_count || ct < iv_ptrs[nI]->Multiplicity() )
00168                                 continue;       // not collinear
00169                         if( nayb == NEIGHBOR_UNKNOWN )
00170                                 continue;
00171 
00172                         // merge nI and nayb
00173                         uint fs = iv_ptrs[nI]->FirstStart();
00174                         gnSeqI nI_lend_fs = iv_ptrs[nI]->LeftEnd(fs);
00175                         gnSeqI nayb_lend_fs = iv_ptrs[nayb]->LeftEnd(fs);
00176                         AbstractMatch::orientation o = iv_ptrs[nI]->Orientation(fs);
00177                         vector< AbstractMatch* > nI_matches;
00178                         iv_ptrs[nI]->StealMatches( nI_matches );
00179                         vector< AbstractMatch* > nayb_matches;
00180                         iv_ptrs[nayb]->StealMatches( nayb_matches );
00181                         if( !parity )
00182                         {
00183                                 std::reverse( nI_matches.begin(), nI_matches.end() );
00184                                 for( size_t i = 0; i < nI_matches.size(); ++i )
00185                                         nI_matches[i]->Invert();
00186                                 o = o == AbstractMatch::forward ? AbstractMatch::reverse : AbstractMatch::forward;
00187                         }
00188                         if( (o == AbstractMatch::forward && nI_lend_fs > nayb_lend_fs) ||
00189                                 (o == AbstractMatch::reverse && nI_lend_fs < nayb_lend_fs))
00190                                 nayb_matches.insert( nayb_matches.end(), nI_matches.begin(), nI_matches.end() );
00191                         else
00192                                 nayb_matches.insert( nayb_matches.begin(), nI_matches.begin(), nI_matches.end() );
00193 
00194                         iv_ptrs[nayb]->SetMatches( nayb_matches );
00195 
00196                         // update all pointers to point to nayb
00197                         seqI = 0;
00198                         for( ; seqI < seq_count; ++seqI )
00199                         {
00200                                 if( getNeighbor( neighbor_list[nI], -1 )[seqI] == NEIGHBOR_UNKNOWN &&
00201                                         getNeighbor( neighbor_list[nI], 1 )[seqI] == NEIGHBOR_UNKNOWN )
00202                                         continue;
00203                                 int orient = iv_ptrs[nayb]->Orientation(seqI) == AbstractMatch::forward ? 1 : -1;
00204                                 size_t other_nayb = getNeighbor( neighbor_list[nI], d * orient * (parity ? 1 : -1) )[seqI];
00205                                 if( other_nayb != NEIGHBOR_UNKNOWN )
00206                                 {
00207                                         if( getNeighbor( neighbor_list[other_nayb], 1 )[seqI] == nI )
00208                                                 getNeighbor( neighbor_list[other_nayb], 1 )[seqI] = nayb;
00209                                         else if( getNeighbor( neighbor_list[other_nayb], -1 )[seqI] == nI )
00210                                                 getNeighbor( neighbor_list[other_nayb], -1 )[seqI] = nayb;
00211                                         else
00212                                         {
00213                                                 cerr << "serious programmer error\n";
00214                                                 genome::breakHere();
00215                                         }
00216                                 }
00217                                 if( getNeighbor( neighbor_list[nayb], 1 )[seqI] == nI )
00218                                         getNeighbor( neighbor_list[nayb], 1 )[seqI] = other_nayb;
00219                                 else if( getNeighbor( neighbor_list[nayb], -1 )[seqI] == nI )
00220                                         getNeighbor( neighbor_list[nayb], -1 )[seqI] = other_nayb;
00221                                 else
00222                                 {
00223                                         cerr << "inexcusable programmer error\n";
00224                                         genome::breakHere();
00225                                 }
00226                                 neighbor_list[nI].first[seqI] = NEIGHBOR_UNKNOWN;
00227                                 neighbor_list[nI].second[seqI] = NEIGHBOR_UNKNOWN;
00228                         }
00229                 }
00230         }
00231 
00232         IntervalList new_list;
00233         new_list.seq_filename = iv_list.seq_filename;
00234         new_list.seq_table = iv_list.seq_table;
00235         new_list.resize( iv_ptrs.size() );
00236         size_t newI = 0;
00237         for( size_t ivI = 0; ivI < iv_ptrs.size(); ++ivI )
00238         {
00239                 vector< AbstractMatch* > matches;
00240                 iv_ptrs[ivI]->StealMatches( matches );
00241                 if( matches.size() > 0 )
00242                         new_list[newI++].SetMatches( matches );
00243         }
00244         new_list.resize(newI);
00245         swap( iv_list, new_list );
00246         addUnalignedRegions(iv_list);
00247 }
00248 
00249 
00250 void checkForAllGapColumns( IntervalList& iv_list )
00251 {
00252         // debug: sanity check whether there are all gap columns
00253         for( size_t ivI = 0; ivI < iv_list.size(); ivI++ )
00254         {
00255                 vector< string > aln;
00256                 mems::GetAlignment( iv_list[ivI], iv_list.seq_table, aln );
00257                 for( size_t colI = 0; colI < aln[0].size(); ++colI )
00258                 {
00259                         size_t rowI = 0;
00260                         for( ; rowI < aln.size(); ++rowI )
00261                                 if( aln[rowI][colI] != '-' )
00262                                         break;
00263                         if( rowI == aln.size() )
00264                         {
00265                                 cerr << "ERROR!  IV " << ivI << " COLUMN " << colI << " IS ALL GAPS!\n";
00266                         }
00267                 }
00268         }
00269 }
00270 
00271 
00272 // indexed by seqI, seqJ, ivI, hssI (left col, right col)
00273 typedef boost::multi_array< vector< pair< size_t, size_t > >, 3 > pairwise_genome_hss_t;
00274 
00275 void translateToPairwiseGenomeHSS( const hss_array_t& hss_array, pairwise_genome_hss_t& hss_cols )
00276 {
00277         uint seq_count = hss_array.shape()[0];
00278         uint iv_count = hss_array.shape()[2];
00279         hss_cols.resize( boost::extents[seq_count][seq_count][iv_count] );
00280 
00281         // make pairwise projections of intervals and find LCBs...
00282         for( size_t seqI = 0; seqI < seq_count; ++seqI )
00283         {
00284                 for( size_t seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00285                 {
00286                         for( size_t ivI = 0; ivI < iv_count; ++ivI )
00287                         {
00288                                 const hss_list_t& cur_list = hss_array[seqI][seqJ][ivI];
00289                                 hss_cols[seqI][seqJ][ivI].resize( cur_list.size() );
00290                                 for( size_t hssI = 0; hssI < cur_list.size(); hssI++ )
00291                                 {
00292                                         hss_cols[seqI][seqJ][ivI][hssI].first = cur_list[hssI].left_col;
00293                                         hss_cols[seqI][seqJ][ivI][hssI].second = cur_list[hssI].right_col;
00294                                 }
00295                         }
00296                 }
00297         }
00298 }
00299 
00300 
00301 double computeGC( std::vector< gnSequence* >& seq_table )
00302 {
00303         const uint8* tab = SortedMerList::BasicDNATable();
00304         size_t counts[4];
00305         for( int i = 0; i < 4; i++ )
00306                 counts[i] = 0;
00307         for( size_t seqI = 0; seqI < seq_table.size(); seqI++ )
00308         {
00309                 std::string seq;
00310                 seq_table[seqI]->ToString( seq );
00311                 for( size_t cI = 0; cI < seq.size(); cI++  )
00312                         counts[ tab[ seq[cI] ] ]++;
00313         }
00314         return double(counts[1]+counts[2]) / double(counts[1]+counts[2] + counts[0]+counts[3]);
00315 }
00316 
00317 void makeAllPairwiseGenomeHSS( IntervalList& iv_list, vector< CompactGappedAlignment<>* >& iv_ptrs, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, pairwise_genome_hss_t& hss_cols,  const Params& hmm_params )
00318 {
00319         uint seq_count = iv_list.seq_table.size();
00320         // make pairwise projections of intervals and find LCBs...
00321         for( size_t seqI = 0; seqI < seq_count; ++seqI )
00322         {
00323                 for( size_t seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00324                 {
00325                         vector< uint > projection;
00326                         projection.push_back( seqI );
00327                         projection.push_back( seqJ );
00328                         vector< vector< MatchProjectionAdapter* > > LCB_list;
00329                         vector< LCB > projected_adjs;
00330                         projectIntervalList( iv_list, projection, LCB_list, projected_adjs );
00331                         // make intervals
00332                         IntervalList pair_ivs;
00333                         pair_ivs.seq_table.push_back( iv_list.seq_table[seqI] );
00334                         pair_ivs.seq_table.push_back( iv_list.seq_table[seqJ] );
00335                         pair_ivs.resize( LCB_list.size() );
00336                         for( size_t lcbI = 0; lcbI < LCB_list.size(); ++lcbI )
00337                                 pair_ivs[lcbI].SetMatches( LCB_list[lcbI] );
00338                         LCB_list.clear();
00339 
00340                         vector< CompactGappedAlignment<>* > pair_cgas( pair_ivs.size() );
00341                         for( size_t lcbI = 0; lcbI < pair_ivs.size(); ++lcbI )
00342                         {
00343                                 CompactGappedAlignment<> tmp_cga;
00344                                 pair_cgas[lcbI] = tmp_cga.Copy();
00345                                 new (pair_cgas[lcbI])CompactGappedAlignment<>( pair_ivs[lcbI] );
00346                         }
00347 
00348                         vector< CompactGappedAlignment<>* > hss_list;
00349                         // now find islands
00350                         hss_array_t hss_array;
00351                         findHssHomologyHMM( pair_cgas, pair_ivs.seq_table, hss_array, hmm_params, true, true );
00352                         HssArrayToCga(pair_cgas, pair_ivs.seq_table, hss_array, hss_list);
00353 
00354                         for( size_t cgaI = 0; cgaI < pair_cgas.size(); ++cgaI )
00355                                 pair_cgas[cgaI]->Free();
00356                         pair_cgas.clear();
00357 
00358                         // now split up on iv boundaries
00359                         vector< gnSeqI > bp_list;
00360                         getBpList( iv_ptrs, seqI, bp_list );
00361                         GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm(0);
00362                         SingleStartComparator< CompactGappedAlignment<> > ssc(0);
00363                         std::sort(hss_list.begin(), hss_list.end(), ssc );
00364                         applyBreakpoints( bp_list, hss_list, gmsm );
00365                         // and again on seqJ
00366                         getBpList( iv_ptrs, seqJ, bp_list );
00367                         GenericMatchSeqManipulator< CompactGappedAlignment<> > gmsm1(1);
00368                         SingleStartComparator< CompactGappedAlignment<> > ssc1(1);
00369                         std::sort(hss_list.begin(), hss_list.end(), ssc1 );
00370                         applyBreakpoints( bp_list, hss_list, gmsm1 );
00371 
00372                         // now transform into interval-specific columns
00373                         std::sort(hss_list.begin(), hss_list.end(), ssc );
00374 
00375                         SingleStartComparator< CompactGappedAlignment<> > ivcomp(seqI);
00376                         std::sort( iv_ptrs.begin(), iv_ptrs.end(), ivcomp );
00377                         vector< size_t > iv_map;
00378                         createMap( iv_ptrs, iv_orig_ptrs, iv_map );
00379                         size_t ivI = 0;
00380                         while( ivI < iv_ptrs.size() && iv_ptrs[ivI]->LeftEnd(0) == NO_MATCH )
00381                                 ++ivI;
00382                         for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
00383                         {
00384                                 if( hss_list[hssI]->LeftEnd(0) == NO_MATCH || hss_list[hssI]->Length(0) == 0 )
00385                                         continue;
00386                                 if( ivI == iv_ptrs.size() )
00387                                 {
00388                                         cerr << "huh?\n";
00389                                         cerr << hss_list[hssI]->LeftEnd(0) << endl;
00390                                         cerr << hss_list[hssI]->RightEnd(0) << endl;
00391                                         cerr << iv_ptrs.back()->LeftEnd(seqI) << endl;
00392                                         cerr << iv_ptrs.back()->RightEnd(seqI) << endl;
00393                                 }
00394                                 while( ivI < iv_ptrs.size() && 
00395                                         (iv_ptrs[ivI]->LeftEnd(seqI) == NO_MATCH ||
00396                                         hss_list[hssI]->LeftEnd(0) > iv_ptrs[ivI]->RightEnd(seqI) ) )
00397                                         ++ivI;
00398                                 if( ivI == iv_ptrs.size() )
00399                                 {
00400                                         cerr << "hssI fit!!\n";
00401                                         genome::breakHere();
00402                                 }
00403                                 // check for containment in seqJ
00404                                 if( iv_ptrs[ivI]->LeftEnd(seqJ) == NO_MATCH ||
00405                                         iv_ptrs[ivI]->RightEnd(seqJ) < hss_list[hssI]->LeftEnd(1) ||
00406                                         hss_list[hssI]->RightEnd(1) < iv_ptrs[ivI]->LeftEnd(seqJ) )
00407                                         continue;       // this hss falls to an invalid range in seqJ
00408 
00409                                 if( hss_list[hssI]->RightEnd(0) < iv_ptrs[ivI]->LeftEnd(seqI) )
00410                                 {
00411                                         cerr << "huh 2?\n";
00412                                         cerr << hss_list[hssI]->LeftEnd(0) << endl;
00413                                         cerr << hss_list[hssI]->RightEnd(0) << endl;
00414                                         cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
00415                                         cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
00416                                         hssI++;
00417                                         continue;
00418                                 }
00419 
00420                                 vector< pair< size_t, size_t > >& cur_hss_cols = hss_cols[seqI][seqJ][iv_map[ivI]];
00421 
00422                                 gnSeqI left_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->LeftEnd(0) );
00423                                 gnSeqI right_col = iv_ptrs[ivI]->SeqPosToColumn( seqI, hss_list[hssI]->RightEnd(0) );
00424                                 if(left_col > right_col && iv_ptrs[ivI]->Orientation(seqI) == AbstractMatch::reverse )
00425                                 {
00426                                         swap(left_col, right_col);      // must have been a revcomp seq
00427                                 }
00428                                 else if(left_col > right_col)
00429                                 {
00430                                         cerr << "bad cols\n";
00431                                         cerr << hss_list[hssI]->LeftEnd(0) << endl;
00432                                         cerr << hss_list[hssI]->RightEnd(0) << endl;
00433                                         cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
00434                                         cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
00435                                         genome::breakHere();
00436                                 }
00437 
00438                                 if( left_col > 2000000000 || right_col > 2000000000 )
00439                                 {
00440                                         cerr << "huh 2?\n";
00441                                         cerr << hss_list[hssI]->LeftEnd(0) << endl;
00442                                         cerr << hss_list[hssI]->RightEnd(0) << endl;
00443                                         cerr << iv_ptrs[ivI]->LeftEnd(seqI) << endl;
00444                                         cerr << iv_ptrs[ivI]->RightEnd(seqI) << endl;
00445                                         genome::breakHere();
00446                                 }
00447                                 cur_hss_cols.push_back( make_pair( left_col, right_col ) );
00448                         }
00449                         for( size_t hssI = 0; hssI < hss_list.size(); ++hssI )
00450                                 hss_list[hssI]->Free();
00451                 }
00452         }
00453 }
00454 
00455 void mergePairwiseHomologyPredictions(  vector< CompactGappedAlignment<>* >& iv_orig_ptrs, pairwise_genome_hss_t& hss_cols, vector< vector< ULA* > >& ula_list )
00456 {
00457         uint seq_count = hss_cols.shape()[0];
00458         uint iv_count = hss_cols.shape()[2];
00459         //
00460         // FINALLY!  ready to merge.  how to do it?
00461         // make an empty list of UngappedLocalAlignments
00462         // start with the first seq and create a ULA for every col
00463         // range.  Then continue to the second seq, and when
00464         // a col range overlaps a pre-existing ULA, create a new ULA
00465         // for the intersected region and a smaller ULA for the non-intersected region
00466         ula_list.resize( iv_count );
00467         for( size_t ivI = 0; ivI < iv_count; ++ivI )
00468         {
00469                 vector< ULA* >& iv_ulas = ula_list[ivI];
00470                 for( size_t seqI = 0; seqI < seq_count; ++seqI )
00471                 {
00472                         for( size_t seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00473                         {
00474                                 vector< pair< size_t, size_t > >& cur_hss_cols = hss_cols[seqI][seqJ][ivI];
00475                                 vector< ULA* > cur_ulas( cur_hss_cols.size() );
00476                                 ULA tmp_ula(seq_count);
00477                                 for( size_t hssI = 0; hssI < cur_hss_cols.size(); ++hssI )
00478                                 {
00479                                         cur_ulas[hssI] = tmp_ula.Copy();
00480                                         cur_ulas[hssI]->SetStart(seqI, cur_hss_cols[hssI].first+1);
00481                                         cur_ulas[hssI]->SetStart(seqJ, cur_hss_cols[hssI].first+1);
00482                                         cur_ulas[hssI]->SetLength( cur_hss_cols[hssI].second - cur_hss_cols[hssI].first + 1 );
00483                                 }
00484 
00485                                 vector< gnSeqI > iv_bp_list;
00486                                 vector< gnSeqI > cur_bp_list;
00487                                 SingleStartComparator<ULA> ulacompI(seqI);
00488                                 std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompI );
00489                                 std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompI );
00490                                 getBpList( iv_ulas, seqI, iv_bp_list );
00491                                 getBpList( cur_ulas, seqI, cur_bp_list );
00492                                 GenericMatchSeqManipulator< ULA > gmsm(seqI);
00493                                 applyBreakpoints( iv_bp_list, cur_ulas, gmsm );
00494                                 applyBreakpoints( cur_bp_list, iv_ulas, gmsm );
00495 
00496                                 SingleStartComparator<ULA> ulacompJ(seqJ);
00497                                 std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompJ );
00498                                 std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompJ );
00499                                 getBpList( iv_ulas, seqJ, iv_bp_list );
00500                                 getBpList( cur_ulas, seqJ, cur_bp_list );
00501                                 GenericMatchSeqManipulator< ULA > gmsmJ(seqJ);
00502                                 applyBreakpoints( iv_bp_list, cur_ulas, gmsmJ );
00503                                 applyBreakpoints( cur_bp_list, iv_ulas, gmsmJ );
00504 
00505                                 // do seqI a second time to propagate any breakpoints introduced by seqJ
00506                                 std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompI );
00507                                 std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompI );
00508                                 getBpList( iv_ulas, seqI, iv_bp_list );
00509                                 getBpList( cur_ulas, seqI, cur_bp_list );
00510                                 applyBreakpoints( iv_bp_list, cur_ulas, gmsm );
00511                                 applyBreakpoints( cur_bp_list, iv_ulas, gmsm );
00512 
00513                                 std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompI );
00514                                 std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompI );
00515                                 // now that cur_ulas and iv_ulas are all broken up according to each other's boundaries
00516                                 // we can simply scan along and add
00517                                 size_t iv_ulas_size = iv_ulas.size();
00518                                 size_t ivuI = 0;
00519                                 size_t curuI = 0;
00520                                 vector< ULA* > added_to( cur_ulas.size(), NULL );       // this tracks which of iv_ulas a cur_ula was added to
00521                                 vector< ULA* > to_delete;
00522                                 while( ivuI < iv_ulas_size && curuI < cur_ulas.size() )
00523                                 {
00524                                         if( iv_ulas[ivuI]->LeftEnd(seqI) == cur_ulas[curuI]->LeftEnd(seqI) )
00525                                         {
00526                                                 if( added_to[curuI] == iv_ulas[ivuI] )
00527                                                 {
00528                                                         // do nothing
00529                                                 }else if( added_to[curuI] == NULL )
00530                                                 {
00531                                                         iv_ulas[ivuI]->SetLeftEnd(seqJ, cur_ulas[curuI]->LeftEnd(seqJ));
00532                                                         added_to[curuI] = iv_ulas[ivuI];
00533                                                 }else{
00534                                                         ULA* merge = added_to[curuI];
00535                                                         for( size_t seqK = 0; seqK < seq_count; ++seqK )
00536                                                         {
00537                                                                 if( merge->Start(seqK) == NO_MATCH )
00538                                                                         continue;
00539                                                                 iv_ulas[ivuI]->SetStart( seqK, merge->Start(seqK) );
00540                                                         }
00541                                                         to_delete.push_back( merge );
00542                                                 }
00543                                                 ivuI++;
00544                                         }else if( iv_ulas[ivuI]->LeftEnd(seqI) < cur_ulas[curuI]->LeftEnd(seqI) )
00545                                         {
00546                                                 ivuI++;
00547                                         }else
00548                                                 curuI++;
00549                                 }
00550 
00551                                 // delete to_delete...
00552                                 std::sort( to_delete.begin(), to_delete.end() );
00553                                 vector< ULA* >::iterator last = std::unique( to_delete.begin(), to_delete.end() );
00554                                 to_delete.erase( last, to_delete.end() );
00555                                 vector< ULA* > new_iv_ulas( iv_ulas.size() - to_delete.size() );
00556                                 std::sort( iv_ulas.begin(), iv_ulas.end() );
00557                                 std::set_difference( iv_ulas.begin(), iv_ulas.end(), to_delete.begin(), to_delete.end(), new_iv_ulas.begin() );
00558                                 swap( iv_ulas, new_iv_ulas );
00559                                 for( size_t delI = 0; delI < to_delete.size(); ++delI )
00560                                         to_delete[delI]->Free();
00561 
00562                                 vector< ULA* > orig_ula_order = cur_ulas;
00563                                 // now do something similar for seqJ
00564                                 std::sort( iv_ulas.begin(), iv_ulas.end(), ulacompJ );
00565                                 std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompJ );
00566 
00567                                 vector< size_t > added_map;
00568                                 createMap( cur_ulas, orig_ula_order, added_map );
00569 
00570                                 ivuI = 0;
00571                                 curuI = 0;
00572                                 to_delete.clear();
00573                                 while( ivuI < iv_ulas_size && curuI < cur_ulas.size() )
00574                                 {
00575                                         if( iv_ulas[ivuI]->LeftEnd(seqJ) == cur_ulas[curuI]->LeftEnd(seqJ) )
00576                                         {
00577                                                 if( added_to[added_map[curuI]] == iv_ulas[ivuI] )
00578                                                 {
00579                                                         // do nothing
00580                                                 }else if( added_to[added_map[curuI]] == NULL )
00581                                                 {
00582                                                         iv_ulas[ivuI]->SetLeftEnd(seqI, cur_ulas[curuI]->LeftEnd(seqI));
00583                                                         added_to[added_map[curuI]] = iv_ulas[ivuI];
00584                                                 }else{
00585                                                         ULA* merge = added_to[added_map[curuI]];
00586                                                         for( size_t seqK = 0; seqK < seq_count; ++seqK )
00587                                                         {
00588                                                                 if( merge->Start(seqK) == NO_MATCH )
00589                                                                         continue;
00590                                                                 iv_ulas[ivuI]->SetStart( seqK, merge->Start(seqK) );
00591                                                         }
00592                                                         to_delete.push_back( merge );
00593                                                 }
00594                                                 ivuI++;
00595                                         }else if( iv_ulas[ivuI]->LeftEnd(seqJ) < cur_ulas[curuI]->LeftEnd(seqJ) )
00596                                         {
00597                                                 ivuI++;
00598                                         }else
00599                                         {
00600                                                 curuI++;
00601                                         }
00602                                 }
00603 
00604                                 // anything with a null added_to entry needs to be added to iv_ulas
00605                                 // everything else needs to get freed
00606                                 std::sort( cur_ulas.begin(), cur_ulas.end(), ulacompI );
00607                                 for( curuI = 0; curuI < cur_ulas.size(); ++curuI )
00608                                 {
00609                                         if( added_to[curuI] == NULL )
00610                                                 iv_ulas.push_back( cur_ulas[curuI] );
00611                                         else
00612                                                 cur_ulas[curuI]->Free();
00613                                 }
00614                                 // delete to_delete...
00615                                 std::sort( to_delete.begin(), to_delete.end() );
00616                                 last = std::unique( to_delete.begin(), to_delete.end() );
00617                                 to_delete.erase( last, to_delete.end() );
00618                                 new_iv_ulas = vector< ULA* >( iv_ulas.size() - to_delete.size() );
00619                                 std::sort( iv_ulas.begin(), iv_ulas.end() );
00620                                 std::set_difference( iv_ulas.begin(), iv_ulas.end(), to_delete.begin(), to_delete.end(), new_iv_ulas.begin() );
00621                                 swap( iv_ulas, new_iv_ulas );
00622                                 for( size_t delI = 0; delI < to_delete.size(); ++delI )
00623                                         to_delete[delI]->Free();
00624                         }
00625                 }
00626         }
00627 
00628         // Eliminate segments that have no representation in a genome
00629         for( size_t ivI = 0; ivI < ula_list.size(); ++ivI )
00630         {
00631                 for( size_t mI = 0; mI < ula_list[ivI].size(); ++mI )
00632                 {
00633                         size_t seqI = ula_list[ivI][mI]->FirstStart();
00634                         std::vector<gnSeqI> l_pos;
00635                         std::vector<bool> l_column;
00636                         std::vector<gnSeqI> r_pos;
00637                         std::vector<bool> r_column;
00638                         gnSeqI left_col = ula_list[ivI][mI]->LeftEnd(seqI)-1;
00639                         gnSeqI right_col = ula_list[ivI][mI]->RightEnd(seqI)-1;
00640                         iv_orig_ptrs[ivI]->GetColumn(left_col, l_pos, l_column);
00641                         iv_orig_ptrs[ivI]->GetColumn(right_col, r_pos, r_column);
00642                         for( ; seqI < ula_list[ivI][mI]->SeqCount(); ++seqI )
00643                         {
00644                                 if( ula_list[ivI][mI]->LeftEnd(seqI) == NO_MATCH )
00645                                         continue;
00646                                 if( l_pos[seqI] == r_pos[seqI] && !l_column[seqI] && !r_column[seqI] )
00647                                         ula_list[ivI][mI]->SetStart(seqI, NO_MATCH);    // no match in this col
00648                         }
00649                         if( ula_list[ivI][mI]->Multiplicity() < 2 )
00650                         {
00651                                 ula_list[ivI][mI]->Free();
00652                                 ula_list[ivI][mI] = NULL;
00653                         }
00654                 }
00655                 // clean out any NULL ptrs
00656                 std::vector< ULA* >::iterator last = std::remove( ula_list[ivI].begin(), ula_list[ivI].end(), (ULA*)NULL );
00657                 ula_list[ivI].erase( last, ula_list[ivI].end() );
00658         }
00659 }
00660 
00661 void unalignIslands( IntervalList& iv_list, vector< CompactGappedAlignment<>* >& iv_orig_ptrs, vector< vector< ULA* > >& ula_list )
00662 {
00663         uint seq_count = iv_list.seq_table.size();
00664         // unalign regions in the iv list that aren't contained in backbone
00665         for( size_t ivI = 0; ivI < ula_list.size(); ++ivI )
00666         {
00667                 vector< AbstractMatch* > new_matches(ula_list[ivI].size());
00668                 for( size_t mI = 0; mI < ula_list[ivI].size(); ++mI )
00669                 {
00670                         size_t seqI = ula_list[ivI][mI]->FirstStart();
00671                         gnSeqI left_col = ula_list[ivI][mI]->LeftEnd(seqI)-1;
00672                         CompactGappedAlignment<> tmp_cga;
00673                         CompactGappedAlignment<>* new_cga = tmp_cga.Copy();
00674                         iv_orig_ptrs[ivI]->copyRange(*new_cga, left_col, ula_list[ivI][mI]->Length(seqI));
00675                         for( seqI = 0; seqI < ula_list[ivI][mI]->SeqCount(); ++seqI )
00676                         {
00677                                 if( ula_list[ivI][mI]->LeftEnd(seqI) == NO_MATCH )
00678                                         new_cga->SetLeftEnd(seqI, NO_MATCH);
00679                         }
00680                         new_cga->CondenseGapColumns();
00681                         new_matches[mI] = new_cga;
00682                 }
00683                 if( new_matches.size() > 0 )
00684                 {
00685 
00686                         vector< vector< AbstractMatch* > > disjoint_subsets;
00687                         {
00688                                 // split into multiple intervals if some sequences are completely unaligned
00689                                 // use a union-find structure to quickly figure out how many subgroups there are
00690                                 vector< uint > seq_map( seq_count );
00691                                 for( size_t sI = 0; sI < seq_map.size(); ++sI )
00692                                         seq_map[sI] = sI;
00693                                 for( size_t mI = 0; mI < new_matches.size(); ++mI )
00694                                 {
00695                                         uint sI = new_matches[mI]->FirstStart();
00696                                         uint map_to = seq_map[sI];
00697                                         while( map_to != seq_map[map_to] )
00698                                                 map_to = seq_map[map_to];
00699                                         seq_map[sI] = map_to;
00700                                         for( ++sI; sI < seq_count; ++sI )
00701                                         {
00702                                                 if( new_matches[mI]->LeftEnd(sI) == NO_MATCH )
00703                                                         continue;
00704                                                 uint map_from = seq_map[sI];
00705                                                 while( map_from != seq_map[map_from] )
00706                                                         map_from = seq_map[map_from];
00707                                                 seq_map[map_from] = map_to;
00708                                         }
00709                                 }
00710                                 vector< vector< AbstractMatch* > > mapped_lists( seq_count );
00711                                 for( size_t mI = 0; mI < new_matches.size(); ++mI )
00712                                 {
00713                                         uint sI = new_matches[mI]->FirstStart();
00714                                         uint map_to = seq_map[sI];
00715                                         while( map_to != seq_map[map_to] )
00716                                                 map_to = seq_map[map_to];
00717                                         mapped_lists[map_to].push_back( new_matches[mI] );
00718                                 }
00719                                 for( uint sI = 0; sI < seq_count; ++sI )
00720                                 {
00721                                         if( mapped_lists[sI].size() > 0 )
00722                                                 disjoint_subsets.push_back( mapped_lists[sI] );
00723                                 }
00724                         }
00725 
00726                         for( size_t dI = 0; dI < disjoint_subsets.size(); ++dI )
00727                         {
00728                                 vector< AbstractMatch* >& cur_d_matches = disjoint_subsets[dI];
00729                                 vector< AbstractMatch* > orig_order = cur_d_matches;
00730                                 // need to sort these.  use boost's topological sort.
00731                                 vector< size_t > id_map;
00732                                 typedef boost::adjacency_list< boost::vecS, boost::vecS, boost::directedS, boost::property<boost::vertex_color_t, boost::default_color_type> > Graph;
00733                                 typedef boost::graph_traits<Graph>::vertex_descriptor Vertex;
00734                                 typedef std::pair< int, int > Pair;
00735                                 vector< Pair > edges;
00736                                 for( size_t seqI = 0; seqI < seq_count; ++seqI )
00737                                 {
00738                                         SingleStartComparator<AbstractMatch> ssc(seqI);
00739                                         std::sort( cur_d_matches.begin(), cur_d_matches.end(), ssc );
00740                                         createMap( cur_d_matches, orig_order, id_map );
00741                                         int prev = -1;
00742                                         int first = -1;
00743                                         bool reverse = false;
00744                                         for( int mI = 0; mI < cur_d_matches.size(); ++mI )
00745                                         {
00746                                                 if( cur_d_matches[mI]->LeftEnd(seqI) == NO_MATCH )
00747                                                         continue;
00748                                                 if( prev != -1 )
00749                                                 {
00750                                                         Pair edge( id_map[prev], id_map[mI] );
00751                                                         if( reverse )
00752                                                                 swap( edge.first, edge.second );
00753                                                         edges.push_back(edge);
00754                                                 }else
00755                                                 {
00756                                                         reverse = cur_d_matches[mI]->Start(seqI) < 0;
00757                                                         first = mI;
00758                                                 }
00759                                                 prev = mI;
00760                                         }
00761                                         if( prev != -1 && !reverse )
00762                                                 edges.push_back( Pair( id_map[prev], cur_d_matches.size() ) );
00763                                         else if( prev != -1 && reverse )
00764                                                 edges.push_back( Pair( id_map[first], cur_d_matches.size() ) );
00765                                 }
00766                                 std::sort( edges.begin(), edges.end() );
00767                                 vector< Pair >::iterator ee_iter = std::unique( edges.begin(), edges.end() );
00768                                 edges.erase( ee_iter, edges.end() );
00769                                 Pair* edge_array = new Pair[edges.size()];
00770                                 for( size_t eI = 0; eI < edges.size(); ++eI )
00771                                         edge_array[eI] = edges[eI];
00772                                 typedef boost::graph_traits<Graph>::vertices_size_type v_size_t;
00773                                 Graph G(edge_array, edge_array + edges.size(), v_size_t(edges.size()));
00774                                 typedef std::vector< Vertex > container;
00775                                 container c;
00776                                 topological_sort(G, std::back_inserter(c));
00777                                 cur_d_matches.clear();
00778                                 for ( container::reverse_iterator ii=c.rbegin(); ii!=c.rend(); ++ii)
00779                                 {
00780                                         if( *ii < orig_order.size() )
00781                                                 cur_d_matches.push_back( orig_order[ *ii ] );
00782                                 }
00783                                 if( dI == 0 )
00784                                         iv_list[ivI].SetMatches(cur_d_matches);
00785                                 else
00786                                 {
00787                                         Interval new_iv( cur_d_matches.begin(), cur_d_matches.end() );
00788                                         iv_list.push_back(new_iv);
00789                                 }
00790                                 delete[] edge_array;
00791                         }
00792                 }
00793                 else
00794                 {
00795                         iv_orig_ptrs[ivI]->Free();
00796                         iv_orig_ptrs[ivI] = NULL;
00797                 }
00798         }
00799 
00800 
00801         // update iv_list to match the filtered iv_orig_ptrs
00802         size_t givI = 0;
00803         for( size_t iI = 0; iI < iv_orig_ptrs.size(); ++iI )
00804         {
00805                 if( iv_orig_ptrs[iI] != NULL )
00806                 {
00807                         swap( iv_list[givI], iv_list[iI] );
00808                         iv_orig_ptrs[iI]->Free();       // done with the CompactGappedAlignments
00809                         iv_orig_ptrs[iI] = NULL;
00810                         givI++;
00811                 }
00812         }
00813         // pick up any intervals that were split in half
00814         for( size_t iI = iv_orig_ptrs.size(); iI < iv_list.size(); ++iI )
00815                 swap( iv_list[givI++], iv_list[iI] );
00816         iv_list.erase( iv_list.begin()+givI, iv_list.end() );
00817 
00818         // collapse any intervals that are trivially collinear
00819         collapseCollinear( iv_list );
00820 }
00821 
00822 void createBackboneList( const IntervalList& iv_list, backbone_list_t& ula_list ) 
00823 {
00824         ula_list.resize( iv_list.size() );
00825         for( size_t ivI = 0; ivI < iv_list.size(); ++ivI )
00826         {
00827                 if( iv_list[ivI].Multiplicity() < 2 )
00828                         continue;
00829                 const vector< AbstractMatch* >& matches = iv_list[ivI].GetMatches();
00830                 int64 right_col = 0;
00831                 int64 left_col = 0;
00832                 for( size_t mI = 0; mI < matches.size(); ++mI )
00833                 {
00834                         left_col = right_col;
00835                         right_col += matches[mI]->AlignmentLength();
00836                         if( matches[mI]->Multiplicity() < 2 )
00837                                 continue;
00838                         ULA tmp_ula(matches[mI]->SeqCount());
00839                         ULA* mula = tmp_ula.Copy();
00840                         for( size_t seqI = 0; seqI < matches[mI]->SeqCount(); ++seqI )
00841                                 if( matches[mI]->LeftEnd(seqI) != NO_MATCH )
00842                                         mula->SetLeftEnd( seqI, left_col+1 );
00843                         mula->SetLength( right_col - left_col );
00844                         ula_list[ivI].push_back(mula);
00845                 }
00846                 // merge neighbors that cover identical match components
00847                 for( size_t ulaI = 1; ulaI < ula_list[ivI].size(); ulaI++ )
00848                 {
00849                         size_t seqI = 0;
00850                         for( ; seqI < ula_list[ivI][ulaI]->SeqCount(); ++seqI )
00851                         {
00852                                 int64 s1 = ula_list[ivI][ulaI-1]->Start(seqI);
00853                                 int64 s2 = ula_list[ivI][ulaI]->Start(seqI);
00854                                 if( s1 == mems::NO_MATCH && s2 == mems::NO_MATCH )
00855                                         continue;
00856                                 if( s1 == mems::NO_MATCH && s2 != mems::NO_MATCH )
00857                                         break;
00858                                 if( s1 != mems::NO_MATCH && s2 == mems::NO_MATCH )
00859                                         break;
00860                                 int64 r1 = ula_list[ivI][ulaI-1]->RightEnd(seqI);
00861                                 if( r1 + 1 != s2 )
00862                                         break;  // must be adjacent to each other
00863                         }
00864                         if( seqI == ula_list[ivI][ulaI]->SeqCount() )
00865                         {
00866                                 // ulaI-1 needs to be swallowed up by ulaI
00867                                 ula_list[ivI][ulaI]->ExtendStart( ula_list[ivI][ulaI-1]->Length() );
00868                                 ula_list[ivI][ulaI-1]->SetLength(0);
00869                         }
00870                 }
00871                 // get rid of matches that were swallowed up
00872                 vector< ULA* > condensed_list;
00873                 for( size_t ulaI = 0; ulaI < ula_list[ivI].size(); ulaI++ )
00874                 {
00875                         if( ula_list[ivI][ulaI]->Length() > 0 )
00876                                 condensed_list.push_back(ula_list[ivI][ulaI]);
00877                         else
00878                                 ula_list[ivI][ulaI]->Free();
00879                 }
00880                 swap( ula_list[ivI], condensed_list );
00881         }
00882 }
00883 
00884 void detectAndApplyBackbone( AbstractMatch* m, vector< gnSequence* >& seq_table, CompactGappedAlignment<>*& result, backbone_list_t& bb_list, const Params& hmm_params, boolean left_homologous, boolean right_homologous )
00885 {
00886         vector< AbstractMatch* > mlist( 1, m );
00887         uint seq_count = seq_table.size();
00888 
00889         // indexed by seqI, seqJ, ivI, hssI (left col, right col)
00890         pairwise_genome_hss_t hss_cols(boost::extents[seq_count][seq_count][1]);
00891 
00892         // ugg.  need CompactGappedAlignment for its SeqPosToColumn
00893         vector< CompactGappedAlignment<>* > iv_ptrs(1);
00894         CompactGappedAlignment<> tmp_cga;
00895         iv_ptrs[0] = tmp_cga.Copy();
00896         new (iv_ptrs[0])CompactGappedAlignment<>( *m ); // this will be freed when unalignIslands() gets called
00897 
00898         vector< CompactGappedAlignment<>* > iv_orig_ptrs(iv_ptrs);
00899         hss_array_t island_array, hss_array;
00900 
00901         findHssHomologyHMM( mlist, seq_table, island_array, hmm_params, left_homologous, right_homologous );
00902         translateToPairwiseGenomeHSS( island_array, hss_cols );
00903 
00904 
00905         // merge overlapping pairwise homology predictions into n-way predictions
00906         backbone_list_t ula_list;
00907         mergePairwiseHomologyPredictions( iv_orig_ptrs, hss_cols, ula_list );
00908 
00909         // unalignIslands wants an IntervalList
00910         IntervalList iv_list;
00911         iv_list.seq_table = seq_table;
00912         iv_list.resize(1);
00913         vector<AbstractMatch*> asdf(1, iv_orig_ptrs.front()->Copy() );
00914         iv_list[0].SetMatches( asdf );
00915         // unalign regions found to be non-homologous
00916         unalignIslands( iv_list, iv_orig_ptrs, ula_list );
00917 
00918         // free all ULAs and reconstruct them from the new alignment column coordinates
00919         for( size_t ulaI = 0; ulaI < ula_list.size(); ++ulaI )
00920                 for( size_t i = 0; i < ula_list[ulaI].size(); ++i )
00921                         ula_list[ulaI][i]->Free();
00922         ula_list.clear();
00923 
00924 
00925         createBackboneList( iv_list, ula_list );
00926 
00927         iv_orig_ptrs.clear();
00928 
00929         bb_list.clear();
00930         bb_list = ula_list;
00931 
00932         //checkForAllGapColumns( iv_list );
00933 
00934         result = tmp_cga.Copy();
00935         if( iv_list.size() > 0 )
00936                 new (result)CompactGappedAlignment<>( iv_list[0] );
00937 }
00938 
00939 
00940 void detectAndApplyBackbone( IntervalList& iv_list, backbone_list_t& bb_list, const Params& hmm_params )
00941 {
00942         // collapse any intervals that are trivially collinear
00943         collapseCollinear( iv_list );
00944 
00945         uint seq_count = iv_list.seq_table.size();
00946         boost::multi_array< vector< CompactGappedAlignment<>* >, 2> island_array;
00947 
00948         // indexed by seqI, seqJ, ivI, hssI (left col, right col)
00949         pairwise_genome_hss_t hss_cols(boost::extents[seq_count][seq_count][iv_list.size()]);
00950 
00951         // ugg.  need CompactGappedAlignment for its SeqPosToColumn
00952         vector< CompactGappedAlignment<>* > iv_ptrs(iv_list.size());
00953         for( size_t i = 0; i < iv_list.size(); ++i )
00954         {
00955                 CompactGappedAlignment<> tmp_cga;
00956                 iv_ptrs[i] = tmp_cga.Copy();
00957                 new (iv_ptrs[i])CompactGappedAlignment<>( iv_list[i] );
00958         }
00959         vector< CompactGappedAlignment<>* > iv_orig_ptrs(iv_ptrs);
00960 
00961         makeAllPairwiseGenomeHSS( iv_list, iv_ptrs, iv_orig_ptrs, hss_cols, hmm_params );
00962 
00963         backbone_list_t ula_list;
00964 
00965         // merge overlapping pairwise homology predictions into n-way predictions
00966         mergePairwiseHomologyPredictions( iv_orig_ptrs, hss_cols, ula_list );
00967 
00968         // unalign regions found to be non-homologous
00969         unalignIslands( iv_list, iv_orig_ptrs, ula_list );
00970 
00971         // need to add in all the unaligned regions so the viewer doesn't throw a fit
00972         addUnalignedRegions( iv_list );
00973 
00974         // free all ULAs and reconstruct them from the new alignment column coordinates
00975         for( size_t ulaI = 0; ulaI < ula_list.size(); ++ulaI )
00976                 for( size_t i = 0; i < ula_list[ulaI].size(); ++i )
00977                         ula_list[ulaI][i]->Free();
00978         ula_list.clear();
00979 
00980 
00981         createBackboneList( iv_list, ula_list );
00982 
00983         iv_orig_ptrs.clear();
00984 
00985         bb_list.clear();
00986         bb_list = ula_list;
00987 
00988         checkForAllGapColumns( iv_list );
00989 }
00990 
00991 void writeBackboneColumns( ostream& bb_out, backbone_list_t& bb_list )
00992 {
00993         //
00994         // At last! write out the backbone list
00995         //
00996         for( size_t ivI = 0; ivI < bb_list.size(); ++ivI )
00997         {
00998                 for( size_t mI = 0; mI < bb_list[ivI].size(); ++mI )
00999                 {
01000                         size_t seqI = bb_list[ivI][mI]->FirstStart();
01001                         bb_out << ivI << '\t' << bb_list[ivI][mI]->LeftEnd(seqI) << '\t' << bb_list[ivI][mI]->Length();
01002                         for( ; seqI < bb_list[ivI][mI]->SeqCount(); ++seqI )
01003                         {
01004                                 if( bb_list[ivI][mI]->LeftEnd(seqI) == NO_MATCH )
01005                                         continue;
01006                                 bb_out << '\t' << seqI;
01007                         }
01008                         bb_out << endl;
01009                 }
01010         }
01011 }
01012 
01013 void writeBackboneSeqCoordinates( backbone_list_t& bb_list, IntervalList& iv_list, ostream& bb_out )
01014 {
01015         if( bb_list.size() == 0 )
01016                 return;
01017         // find seq_count
01018         uint seq_count = 0;
01019         for( size_t bbI = 0; bbI < bb_list.size(); ++bbI )
01020                 if( bb_list[bbI].size() > 0 )
01021                 {
01022                         seq_count = bb_list[bbI].front()->SeqCount();
01023                         break;
01024                 }
01025 
01026         // different format -- use real sequence coordinates...
01027         // print a header line first
01028         for( size_t seqI = 0; seqI < seq_count; ++seqI )
01029         {
01030                 if( seqI > 0 )
01031                         bb_out << '\t';
01032                 bb_out << "seq_" << seqI << "_leftend\t";
01033                 bb_out << "seq_" << seqI << "_rightend";
01034         }
01035         bb_out << endl;
01036         for( size_t ivI = 0; ivI < bb_list.size(); ++ivI )
01037         {
01038                 // there seems to be a bug in the backbone creation code that causes the CGA that gets
01039                 // stuffed into the interval to have the wrong coordinates internally, while the interval
01040                 // maintains the correct coordinates.  work around it by converting the whole interval to a cga
01041                 CompactGappedAlignment<> iv_cga( iv_list[ivI] );
01042                 for( size_t mI = 0; mI < bb_list[ivI].size(); ++mI )
01043                 {
01044                         uint fs = bb_list[ivI][mI]->FirstStart();
01045                         // get the sequence positions out of the alignment
01046                         vector< gnSeqI > left_pos;
01047                         vector< bool > left_cols;
01048                         iv_cga.GetColumn( bb_list[ivI][mI]->LeftEnd(fs)-1, left_pos, left_cols );
01049                         vector< gnSeqI > right_pos;
01050                         vector< bool > right_cols;
01051                         iv_cga.GetColumn( bb_list[ivI][mI]->RightEnd(fs)-1, right_pos, right_cols );
01052                         for( size_t seqI = 0; seqI < bb_list[ivI][mI]->SeqCount(); ++seqI )
01053                         {
01054                                 if( seqI > 0 )
01055                                         bb_out << '\t';
01056                                 if( bb_list[ivI][mI]->LeftEnd(seqI) == NO_MATCH )
01057                                 {
01058                                         bb_out << "0\t0";
01059                                         continue;
01060                                 }else{
01061                                         int64 leftI = left_pos[seqI];
01062                                         int64 rightI = right_pos[seqI];
01063                                         if( iv_cga.Orientation(seqI) == AbstractMatch::forward && leftI != 0 && !left_cols[seqI] )
01064                                                 leftI++;
01065                                         if( iv_cga.Orientation(seqI) == AbstractMatch::reverse && rightI != 0 && !right_cols[seqI] )
01066                                                 rightI++;
01067                                         if( iv_cga.Orientation(seqI) == AbstractMatch::reverse )
01068                                         {
01069                                                 swap( leftI, rightI );  // must be reverse complement
01070                                         }
01071                                         if( rightI + 1 == leftI )
01072                                         {
01073                                                 bb_out << "0\t0";
01074                                                 continue;
01075                                         }
01076                                         if( leftI > rightI )
01077                                         {
01078                                                 cerr << "oh crahpey!\n";
01079                                                 cerr << "leftI: " << leftI << endl;
01080                                                 cerr << "rightI: " << rightI << endl;
01081                                                 cerr << "seqI: " << seqI << endl;
01082                                                 cerr << "ivI: " << ivI << endl;
01083                                         }
01084                                         if( leftI == 0 )
01085                                                 leftI = iv_cga.LeftEnd(seqI);
01086                                         if( rightI == iv_cga.RightEnd(seqI)+1 )
01087                                                 rightI--;
01088                                         if( iv_cga.Orientation(seqI) == AbstractMatch::reverse )
01089                                         {
01090                                                 leftI *= -1;
01091                                                 rightI *= -1;
01092                                         }
01093                                         bb_out << leftI << '\t' << rightI;
01094                                 }
01095                         }
01096                         bb_out << endl;
01097                 }
01098         }
01099 }
01100 
01101 
01102 }  // namespace mems
01103 

Generated on Fri Mar 14 06:01:02 2008 for libMems by doxygen 1.3.6