libMems/Interval.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: GenericInterval.h,v 1.4 2004/03/01 02:40:08 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * This file is licensed under the GPL.
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifndef __Interval_h__
00010 #define __Interval_h__
00011 
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015 
00016 #include "libGenome/gnClone.h"
00017 #include "libGenome/gnDebug.h"
00018 #include "libMems/SparseAbstractMatch.h"
00019 #include "libMems/gnAlignedSequences.h"
00020 #include "libMems/AbstractGappedAlignment.h"
00021 #include "libMems/Match.h"
00022 #include "libMems/GappedAlignment.h"
00023 #include <iostream>
00024 #include <vector>
00025 #include "libMems/twister.h"
00026 
00027 //#include "boost/pool/object_pool.hpp"
00028 
00029 namespace mems {
00030 
00031 // adapter function to allow inserts on reverse iterators
00032 template< typename ListType, typename RanIt, typename Ty >
00033 void insert( ListType& the_list, std::reverse_iterator<RanIt>& riter, Ty& val )
00034 {
00035         the_list.insert( riter.base(), val );
00036         ++riter;        // need to shift riter
00037 }
00038 template< typename ListType, typename Ty >
00039 void insert( ListType& the_list, const typename ListType::iterator& iter, Ty& val )
00040 {
00041         the_list.insert( iter, val );
00042 }
00043 
00044 
00045 template< class GappedBaseImpl = AbstractGappedAlignment< SparseAbstractMatch<> > >
00046 class GenericInterval : public GappedBaseImpl
00047 {
00048 public:
00049         GenericInterval(){};
00050 
00051 //      GenericInterval( uint seq_count, gnSeqI aln_length) : GappedBaseImpl( seq_count, aln_length ){};
00052 
00054         template<typename BidIt>
00055         GenericInterval( BidIt it_begin, const BidIt& it_end ) : GappedBaseImpl( (*it_begin)->SeqCount(), 0 )
00056         {
00057                 std::vector<gnSeqI> pos((*it_begin)->SeqCount(), NO_MATCH);
00058                 for( ; it_begin != it_end; ++it_begin )
00059                         this->matches.push_back( (*it_begin)->Copy() );
00060                 CalculateOffset();
00061                 addUnalignedRegions();
00062                 CalculateAlignmentLength();
00063                 ValidateMatches();
00064         }
00065 
00066         GenericInterval( const GenericInterval& iv );
00067         ~GenericInterval();
00068         GenericInterval& operator=( const GenericInterval& iv );
00069         
00070         GenericInterval* Clone() const;
00071         GenericInterval* Copy() const;
00072         virtual void Free();
00073         
00075         template< class MatchVector >
00076         void SetMatches( MatchVector& matches )
00077         {
00078                 // Set the SeqCount and other bits
00079                 Match m( matches[0]->SeqCount() );
00080                 std::vector<AbstractMatch*> tmp(1, &m);
00081                 *this = GenericInterval( tmp.begin(), tmp.end() );
00082 
00083                 // then delete the allocated dummy match
00084                 for( std::size_t mI = 0; mI < this->matches.size(); mI++ )
00085                         this->matches[mI]->Free();
00086                 
00087                 // now set the matches and update the interval data
00088                 this->matches.resize(matches.size());
00089                 std::copy(matches.begin(), matches.end(), this->matches.begin());
00090 //              this->matches.insert( this->matches.end(), matches.begin(), matches.end() );
00091                 CalculateOffset();
00092                 addUnalignedRegions();
00093                 CalculateAlignmentLength();
00094                 ValidateMatches();
00095 
00096                 // finally, clear the user supplied matches to indicate that we own the memory
00097                 matches.clear();
00098         }
00099 
00101         template< class MatchVector >
00102         void SetMatchesTemp( MatchVector& matches )
00103         {
00104                 
00105                 // delete the allocated dummy match
00106                 for( std::size_t mI = 0; mI < this->matches.size(); mI++ )
00107                         this->matches[mI]->Free();
00108                 
00109                 // now set the matches and update the interval data
00110                 this->matches.resize(matches.size());
00111                 std::copy(matches.begin(), matches.end(), this->matches.begin());
00112 //              this->matches.insert( this->matches.end(), matches.begin(), matches.end() );
00113                 CalculateOffset();
00114                 addUnalignedRegions();
00115                 CalculateAlignmentLength();
00116                 ValidateMatches();
00117 
00118                 // finally, clear the user supplied matches to indicate that we own the memory
00119                 matches.clear();
00120         }
00124         template<typename BaseImpl> friend std::ostream& operator<<(std::ostream& os, const GenericInterval<BaseImpl>& iv); //write to source.
00125 
00129         template<typename BaseImpl> friend std::istream& operator>>(std::istream& is, const GenericInterval<BaseImpl>& iv); //read from source
00130 
00131         // Inherited methods from AbstractMatch:
00132         void Invert();
00133         void CropStart(gnSeqI crop_amount);
00134         void CropEnd(gnSeqI crop_amount);
00135         void MoveStart(int64 move_amount);
00136         void MoveEnd(int64 move_amount);
00137 
00138         virtual void CalculateOffset();
00139 
00140         void add( AbstractMatch* am ){ matches.push_back( am->Copy() ); }
00141         
00146         virtual void GetAlignedSequences( gnAlignedSequences& gnas, const std::vector< genome::gnSequence* >& seq_table ) const;
00147 
00148         void GetAlignment( std::vector< bitset_t >& align_matrix ) const;
00149 
00150         void CropLeft( gnSeqI amount, uint seqI );
00151         void CropRight( gnSeqI amount, uint seqI );
00152 
00153         void SetAlignment( const std::vector< std::string >& seq_align );
00154 
00155         // TODO: get rid of code that uses this hack...
00156         const std::vector<AbstractMatch*>& GetMatches() const{ return matches; }
00157         void StealMatches( std::vector<AbstractMatch*>& matches );
00158 
00160         void Marble( gnSeqI size );
00161 
00162         void GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const;
00163         
00164         bool IsGap( uint seq, gnSeqI col ) const;
00165 
00167         void ValidateMatches() const;
00168 
00169         void swap( GenericInterval& other ){ swap(&other); }
00170 
00171 protected:
00172         // for use by derived classes in order to swap contents
00173         void swap( GenericInterval* other ){
00174                 std::swap( matches, other->matches );
00175                 GappedBaseImpl::swap( other );
00176         }
00177         std::vector< AbstractMatch* > matches;
00178 private:
00179         void addUnalignedRegions();
00180         void FindMatchPos( uint seqI, gnSeqI pos, size_t& matchI, gnSeqI& match_pos );
00181         void GetColumnAndMatch( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column, size_t& matchI, gnSeqI& match_col ) const;
00182         void CalculateAlignmentLength();
00183 };
00184 
00185 typedef GenericInterval<> Interval;
00186 
00187 
00188 template<class GappedBaseImpl>
00189 GenericInterval<GappedBaseImpl>* GenericInterval<GappedBaseImpl>::Copy() const
00190 {
00191         return m_allocateAndCopy( *this );
00192 }
00193 template<class GappedBaseImpl>
00194 void GenericInterval<GappedBaseImpl>::Free()
00195 {
00196         m_free(this);
00197 }
00198 
00199 template<class GappedBaseImpl>
00200 GenericInterval<GappedBaseImpl>::~GenericInterval()
00201 {
00202         for( std::size_t mI = 0; mI < matches.size(); mI++ )
00203                 matches[mI]->Free();
00204 }
00205 
00206 template<class GappedBaseImpl>
00207 void GenericInterval<GappedBaseImpl>::StealMatches( std::vector<AbstractMatch*>& matches ){
00208         matches = this->matches;
00209         this->matches.clear();
00210         for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
00211         {
00212                 this->SetLeftEnd( seqI, NO_MATCH );
00213                 this->SetLength( 0, seqI );
00214         }
00215         this->SetAlignmentLength(0);
00216 }
00217 
00218 template<class GappedBaseImpl>
00219 GenericInterval<GappedBaseImpl>::GenericInterval( const GenericInterval<GappedBaseImpl>& iv )
00220 {
00221         *this = iv;
00222 }
00223 
00224 template<class GappedBaseImpl>
00225 GenericInterval<GappedBaseImpl>& GenericInterval<GappedBaseImpl>::operator=( const GenericInterval& iv )
00226 {
00227         GappedBaseImpl::operator=( iv );
00228         for( std::size_t mI = 0; mI < matches.size(); mI++ )
00229                 matches[mI]->Free();
00230         matches.clear();
00231         for( std::size_t mI = 0; mI < iv.matches.size(); mI++ )
00232                 matches.push_back( iv.matches[mI]->Copy() );
00233         return *this;
00234 }
00235 
00236 template<class GappedBaseImpl>
00237 GenericInterval<GappedBaseImpl>* GenericInterval<GappedBaseImpl>::Clone() const 
00238 {
00239         return new GenericInterval( *this );
00240 }
00241 
00242 
00243 static bool debug_interval = false;
00244 
00245 template<class GappedBaseImpl>
00246 void GenericInterval<GappedBaseImpl>::ValidateMatches() const
00247 {
00248         if( !debug_interval )
00249                 return;
00250         if( matches.size() == 0 )
00251         {
00252 //              genome::breakHere();
00253 //              std::cerr << "iv has no matches\n";
00254                 return;
00255         }
00256         for( uint seqI = 0; seqI < matches[0]->SeqCount(); ++seqI )
00257         {
00258                 gnSeqI prev_rend = this->LeftEnd(seqI);
00259                 if( this->Orientation(seqI) == AbstractMatch::forward )
00260                 {
00261                         for( size_t mI = 0; mI < matches.size(); ++mI )
00262                         {
00263                                 if( matches[mI]->LeftEnd(seqI) != NO_MATCH )
00264                                 {
00265                                         if( prev_rend != matches[mI]->LeftEnd(seqI) )
00266                                         {
00267                                                 std::cerr << "iv broken\n";
00268                                                 std::cerr << "seqI: " << seqI << "\t prev_rend: " << prev_rend << std::endl;
00269                                                 std::cerr << "mI: " << mI << "\tlend: " << matches[mI]->LeftEnd(seqI) << std::endl;
00270                                                 genome::breakHere();
00271                                         }
00272                                         prev_rend = matches[mI]->RightEnd(seqI) + 1;
00273                                 }
00274                         }
00275                 }else if( this->Orientation(seqI) == AbstractMatch::reverse )
00276                 {
00277                         for( size_t mI = matches.size(); mI > 0; mI-- )
00278                         {
00279                                 if( matches[mI-1]->LeftEnd(seqI) != NO_MATCH )
00280                                 {
00281                                         if( prev_rend != matches[mI-1]->LeftEnd(seqI) )
00282                                         {
00283                                                 std::cerr << "iv broken 2\n";
00284                                                 genome::breakHere();
00285                                         }
00286                                         prev_rend = matches[mI-1]->RightEnd(seqI) + 1;
00287                                 }
00288                         }
00289                 }
00290 
00291                 if( this->Orientation(seqI) != AbstractMatch::undefined && this->Length(seqI) == 0 )
00292                 {
00293                         genome::breakHere();
00294                         std::cerr << "ERROR: confused interval\n";
00295                 }
00296         }
00297 }
00298 
00299 template<class GappedBaseImpl>
00300 void GenericInterval<GappedBaseImpl>::GetColumnAndMatch( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column, size_t& matchI, gnSeqI& match_col ) const
00301 {
00302         // bail when the appropriate match is found
00303         gnSeqI col_pos = 0;
00304         size_t mI = 0;
00305         pos.clear();
00306         for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
00307         {
00308                 if( this->LeftEnd(seqI) == NO_MATCH )
00309                         pos.push_back(NO_MATCH);
00310                 else if( this->Orientation(seqI) == AbstractMatch::forward )
00311                         pos.push_back(this->LeftEnd(seqI));
00312                 else
00313                         pos.push_back(this->RightEnd(seqI)+1);
00314         }
00315 
00316         column = std::vector<bool>(this->SeqCount(), false);
00317 
00318         for( ; mI < matches.size(); ++mI )
00319         {
00320                 uint seqI = 0;
00321 
00322                 gnSeqI diff = matches[mI]->AlignmentLength();
00323                 diff = col_pos + diff <= col ? diff : col - col_pos;
00324 
00325                 for( seqI = 0; seqI < this->SeqCount(); ++seqI )
00326                         if( this->Orientation(seqI) == AbstractMatch::forward )
00327                                 pos[seqI] += diff;
00328                         else if( this->Orientation(seqI) == AbstractMatch::reverse )
00329                                 pos[seqI] -= diff;
00330 
00331                 col_pos += diff;
00332 
00333                 if( col_pos >= col && diff < matches[mI]->AlignmentLength() )
00334                 {
00335                         std::vector<gnSeqI> m_pos;
00336                         matches[mI]->GetColumn( diff, m_pos, column );
00337                         for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
00338                                 if( m_pos[seqI] != NO_MATCH )
00339                                         pos[seqI] = m_pos[seqI];
00340                         matchI = mI;
00341                         match_col = diff;
00342                         break;
00343                 }
00344         }
00345 }
00346 
00347 template<typename ListType, typename Iter>
00348 void AddGapMatches( ListType& the_list, const Iter& first, const Iter& last, 
00349                                    uint seqI, int64 left_end, int64 right_end, 
00350                                    AbstractMatch::orientation seq_orient, uint seq_count )
00351 {
00352         Iter iter = first;
00353         int64 pos = left_end-1;
00354         for( ; iter != last; ++iter )
00355         {
00356                 if( (*iter)->LeftEnd(seqI) != NO_MATCH )
00357                 {
00358                         gnSeqI len = (*iter)->LeftEnd(seqI)-pos-1;
00359                         if( len > 4000000000u )
00360                         {
00361                                 std::cerr << "triplebogus interval data\n";
00362                                 std::cerr << "(*iter)->LeftEnd(" << seqI << "): " << (*iter)->LeftEnd(seqI) << std::endl;
00363                                 std::cerr << "pos: " << pos << std::endl;
00364                                 genome::breakHere();
00365                         }
00366 
00367                         if( len > 0 )
00368                         {
00369                                 Match tmp(seq_count);
00370                                 Match* new_m = tmp.Copy();
00371                                 new_m->SetLeftEnd(seqI, pos + 1);
00372                                 new_m->SetOrientation(seqI, seq_orient);
00373                                 new_m->SetLength(len);
00374                                 pos = (*iter)->RightEnd(seqI);
00375                                 insert(the_list, iter, new_m);  // this may move iter
00376                         }else
00377                                 pos = (*iter)->RightEnd(seqI);
00378                 }
00379         }
00380 
00381         if( right_end != pos )
00382         {
00383                 Match tmp(seq_count);
00384                 Match* new_m = tmp.Copy();
00385                 new_m->SetLeftEnd(seqI, pos+1);
00386                 new_m->SetLength(right_end-pos-1);
00387                 insert(the_list, iter, new_m);
00388         }
00389 }
00390 
00391 // The best steaks are well marbled
00392 template<class GappedBaseImpl>
00393 void GenericInterval<GappedBaseImpl>::Marble( gnSeqI size )
00394 {
00395         if( this->SeqCount() > 2 )
00396                 throw "I can't handle that many at once\n";
00397         if( this->Multiplicity() < 2 )
00398                 return; // can't marble unless there are at least two seqs
00399 
00400         // first break up all the pieces
00401         std::list<AbstractMatch*> mlist;
00402         mlist.insert( mlist.end(), matches.begin(), matches.end() );
00403         std::list<AbstractMatch*>::iterator m_iter = mlist.begin();
00404         for(; m_iter != mlist.end(); ++m_iter )
00405         {
00406                 if( (*m_iter)->Multiplicity() != 1 || (*m_iter)->AlignmentLength() <= size )
00407                         continue;
00408                 // which seq are we working with?
00409                 uint seqI = 0;
00410                 for( ; seqI < (*m_iter)->SeqCount(); seqI++ )
00411                         if( (*m_iter)->LeftEnd(seqI) != NO_MATCH )
00412                                 break;
00413                 AbstractMatch* left_iv = (*m_iter)->Copy();
00414                 left_iv->CropEnd( left_iv->AlignmentLength() - size );
00415                 (*m_iter)->CropStart( size );
00416                 m_iter = mlist.insert( m_iter, left_iv );
00417         }
00418         matches.clear();
00419         matches.insert( matches.end(), mlist.begin(), mlist.end() );
00420         this->ValidateMatches();
00421 
00422         // now interleave the gaps
00423         std::vector< std::vector<AbstractMatch*>::iterator > seq_iter( this->SeqCount(), matches.begin() );
00424         std::vector< AbstractMatch* > interleaved(matches.size());
00425         std::vector<AbstractMatch*>::iterator anchor = matches.begin();
00426         for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
00427         {
00428                 if( this->LeftEnd(seqI) == NO_MATCH )
00429                         continue;
00430                 for( ; seq_iter[seqI] != matches.end() && (*seq_iter[seqI])->LeftEnd(seqI) == NO_MATCH; ++seq_iter[seqI] );
00431         }
00432         for( ; anchor != matches.end() && (*anchor)->Multiplicity() < this->SeqCount(); ++anchor );
00433         size_t cur = 0;
00434         while(true)
00435         {
00436                 // increment anchor if an iter has caught up to it...
00437                 uint seqI = 0;
00438                 do{
00439                         for( seqI = 0; seqI < this->SeqCount(); seqI++ )
00440                         {
00441                                 if( seq_iter[seqI] == anchor && anchor != matches.end() )
00442                                 {
00443                                         for( uint seqJ = 0; seqJ < this->SeqCount(); seqJ++ )
00444                                         {
00445                                                 // add anything in seq_iter[seqJ]
00446                                                 while( seq_iter[seqJ] != anchor )
00447                                                 {
00448                                                         interleaved[cur++] = *(seq_iter[seqJ]);
00449                                                         for( ++seq_iter[seqJ]; seq_iter[seqJ] != matches.end() && (*seq_iter[seqJ])->LeftEnd(seqJ) == NO_MATCH; ++seq_iter[seqJ] );
00450                                                 }
00451                                                 // don't end on an anchor
00452                                                 for( ++seq_iter[seqJ]; seq_iter[seqJ] != matches.end() && (*seq_iter[seqJ])->LeftEnd(seqJ) == NO_MATCH; ++seq_iter[seqJ] );
00453                                         }
00454                                         // increment anchor
00455                                         interleaved[cur++] = *anchor;
00456                                         for( ++anchor; anchor != matches.end() && (*anchor)->Multiplicity() < this->SeqCount(); ++anchor );
00457 
00458                                         break;
00459                                 }
00460                         }
00461                 }while( seqI < this->SeqCount() );
00462 
00463                 size_t diff1 = anchor - seq_iter[0];
00464                 size_t diff2 = anchor - seq_iter[1];
00465                 if( diff1 == 0 && diff2 == 0 )
00466                         break;
00467                 // sample from a binomial with p(success) = diff1 / diff1+diff2
00468 //              double samp = ((double)rand())/((double)RAND_MAX);
00469                 double samp = RandTwisterDouble();
00470                 // add one of the intervals and move on to the next...
00471                 if( diff2 == 0 || (samp < .5 && diff1 > 0) )
00472                 {
00473                         interleaved[cur++] = *(seq_iter[0]);
00474                         for( ++seq_iter[0]; seq_iter[0] != matches.end() && (*seq_iter[0])->LeftEnd(0) == NO_MATCH; ++seq_iter[0] );
00475                 }else{
00476                         interleaved[cur++] = *(seq_iter[1]);
00477                         for( ++seq_iter[1]; seq_iter[1] != matches.end() && (*seq_iter[1])->LeftEnd(1) == NO_MATCH; ++seq_iter[1] );
00478                 }
00479         }
00480         matches = interleaved;
00481         this->ValidateMatches();
00482 }
00483 
00484 
00485 template<class GappedBaseImpl>
00486 void GenericInterval<GappedBaseImpl>::CropStart(gnSeqI crop_amount)
00487 {
00488         if( crop_amount > this->AlignmentLength() )
00489                 Throw_gnEx( genome::SeqIndexOutOfBounds() );
00490         if( crop_amount == 0 )
00491                 return;
00492         
00493         std::vector<bool> col;
00494         std::vector<gnSeqI> pos;
00495         size_t matchI = 0;
00496         gnSeqI match_col;
00497         this->GetColumnAndMatch( crop_amount, pos, col, matchI, match_col );
00498 
00499         // delete everything before matchI
00500         for( size_t mI = 0; mI < matchI; ++mI )
00501                 matches[mI]->Free();
00502         matches.erase(matches.begin(), matches.begin()+matchI);
00503 
00504         // crop from within matchI
00505         matches[0]->CropStart(match_col);
00506 
00507         this->CalculateOffset();
00508         this->ValidateMatches();
00509 }
00510 
00511 template<class GappedBaseImpl>
00512 void GenericInterval<GappedBaseImpl>::CropEnd(gnSeqI crop_amount)
00513 {
00514         if( crop_amount > this->AlignmentLength() )
00515                 Throw_gnEx( genome::SeqIndexOutOfBounds() );
00516         if( crop_amount == 0 )
00517                 return;
00518         std::vector<bool> col;
00519         std::vector<gnSeqI> pos;
00520         size_t matchI = 0;
00521         gnSeqI match_col;
00522         this->GetColumnAndMatch( this->AlignmentLength()-crop_amount, pos, col, matchI, match_col );
00523 
00524         // delete everything after matchI
00525         size_t plusmatch = match_col == 0 ? 0 : 1;
00526         for( size_t mI = matchI+plusmatch; mI < matches.size(); ++mI )
00527                 matches[mI]->Free();
00528         matches.erase(matches.begin()+matchI+plusmatch, matches.end());
00529 
00530         // crop from within matchI
00531         if( matches.size() > 0 && plusmatch == 1 )
00532                 matches.back()->CropEnd(matches.back()->AlignmentLength() - match_col);
00533 
00534         this->CalculateOffset();
00535         this->ValidateMatches();
00536 }
00537 
00538 template<class GappedBaseImpl>
00539 void GenericInterval<GappedBaseImpl>::GetAlignment( std::vector< bitset_t >& align_matrix ) const
00540 {
00541         gnSeqI cur_col = 0;
00542         align_matrix = std::vector< bitset_t >( this->SeqCount(), bitset_t(this->AlignmentLength(),false) );
00543         for( uint matchI = 0; matchI < matches.size(); ++matchI ){
00544                 std::vector< bitset_t > aln_mat;
00545                 matches[matchI]->GetAlignment( aln_mat );
00546                 for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
00547                 {
00548                         if( matches[matchI]->LeftEnd(seqI) == NO_MATCH || matches[matchI]->Length(seqI) == 0 )
00549                                 continue;
00550 
00551                         size_t ct = 0;
00552                         gnSeqI len = matches[matchI]->Length(seqI);
00553                         for( bitset_t::size_type pos = aln_mat[seqI].find_first(); ct < len; pos = aln_mat[seqI].find_next(pos) )
00554                         {
00555                                 align_matrix[seqI].set( cur_col + pos );
00556                                 ct++;
00557                         }
00558                 }
00559                 cur_col += matches[matchI]->AlignmentLength();
00560         }
00561 }
00562 
00563 template<class GappedBaseImpl>
00564 void GenericInterval<GappedBaseImpl>::CropLeft( gnSeqI amount, uint seqI )
00565 {
00566         if( amount > this->Length(seqI) )
00567                 Throw_gnEx( genome::SeqIndexOutOfBounds() );
00568         if( this->LeftEnd(seqI) == NO_MATCH || amount == 0 )
00569                 return;
00570 
00571         // for debugging
00572         gnSeqI pre_len = this->Length(seqI);
00573         gnSeqI pre_lend = this->LeftEnd(seqI);
00574 
00575         gnSeqI match_pos;
00576         size_t mI;
00577         this->FindMatchPos(seqI, amount, mI, match_pos);
00578         if( matches[mI]->Orientation(seqI) == this->Orientation(seqI) )
00579                 matches[mI]->CropLeft(match_pos, seqI);
00580         else
00581                 matches[mI]->CropRight(match_pos, seqI);
00582 
00583         if( matches[mI]->Length(seqI) == 0 )
00584                 std::cerr << "Big fat zero 1\n";
00585 
00586         // get rid of everything to the left of mI
00587         if( this->Orientation(seqI) == AbstractMatch::forward )
00588         {
00589                 for( size_t m = 0; m < mI; m++ )
00590                         matches[m]->Free();
00591                 matches.erase(matches.begin(), matches.begin()+mI);
00592         }else{
00593                 for( size_t m = mI+1; m < matches.size(); m++ )
00594                         matches[m]->Free();
00595                 matches.erase(matches.begin()+mI+1, matches.end());
00596         }
00597 
00598         this->CalculateOffset();
00599         this->ValidateMatches();
00600 
00601         if( this->Length(seqI) != pre_len - amount )
00602         {
00603                 std::cerr << "Error intercroplef\n";
00604                 std::cerr << "pre len: " << pre_len << std::endl;
00605                 std::cerr << "pre lend: " << pre_lend << std::endl;
00606                 std::cerr << "amount: " << amount << std::endl;
00607                 std::cerr << "LeftEnd(seqI) " << this->LeftEnd(seqI) << std::endl;
00608                 std::cerr << "Length(seqI) " << this->Length(seqI) << std::endl;
00609                 std::cerr << "AlignmentLength() " << this->AlignmentLength() << std::endl;
00610                 genome::breakHere();
00611         }
00612 }
00613 
00614 template<class GappedBaseImpl>
00615 void GenericInterval<GappedBaseImpl>::CropRight( gnSeqI amount, uint seqI )
00616 {
00617         if( amount > this->Length(seqI) )
00618                 Throw_gnEx( genome::SeqIndexOutOfBounds() );
00619 
00620         if( this->LeftEnd(seqI) == NO_MATCH || amount == 0 )
00621                 return;
00622 
00623         // for debugging
00624         gnSeqI pre_len = this->Length(seqI);
00625         gnSeqI pre_lend = this->LeftEnd(seqI);
00626 
00627         gnSeqI left_amount = this->Length(seqI) - amount;
00628         gnSeqI match_pos;
00629         size_t mI;
00630         this->FindMatchPos(seqI, left_amount, mI, match_pos);
00631         if( matches[mI]->Orientation(seqI) == this->Orientation(seqI) )
00632                 matches[mI]->CropRight(matches[mI]->Length(seqI)-match_pos, seqI);
00633         else
00634                 matches[mI]->CropLeft(matches[mI]->Length(seqI)-match_pos, seqI);
00635 
00636         if( matches[mI]->Length(seqI) == 0 )
00637                 mI += this->Orientation(seqI) == AbstractMatch::forward ? -1 : 1;       // delete this match too
00638 
00639         // get rid of everything to the left of mI
00640         if( this->Orientation(seqI) == AbstractMatch::forward )
00641         {
00642                 for( size_t m = mI+1; m < matches.size(); m++ )
00643                         matches[m]->Free();
00644                 matches.erase(matches.begin()+(mI+1), matches.end());
00645         }else{
00646                 for( size_t m = 0; m < mI; m++ )
00647                         matches[m]->Free();
00648                 matches.erase(matches.begin(), matches.begin()+mI);
00649         }
00650 
00651         this->CalculateOffset();
00652         this->ValidateMatches();
00653 
00654         if( this->Length(seqI) != pre_len - amount )
00655         {
00656                 std::cerr << "Error intercropright\n";
00657                 std::cerr << "pre len: " << pre_len << std::endl;
00658                 std::cerr << "pre lend: " << pre_lend << std::endl;
00659                 std::cerr << "amount: " << amount << std::endl;
00660                 std::cerr << "LeftEnd(seqI) " << this->LeftEnd(seqI) << std::endl;
00661                 std::cerr << "Length(seqI) " << this->Length(seqI) << std::endl;
00662                 std::cerr << "AlignmentLength() " << this->AlignmentLength() << std::endl;
00663                 genome::breakHere();
00664         }
00665 }
00666 
00667 template<class GappedBaseImpl>
00668 void GenericInterval<GappedBaseImpl>::MoveStart(int64 move_amount)
00669 {
00670         GappedBaseImpl::MoveStart(move_amount);
00671         for( size_t mI = 0; mI < matches.size(); mI++ )
00672                 matches[mI]->MoveStart(move_amount);
00673 }
00674 
00675 template<class GappedBaseImpl>
00676 void GenericInterval<GappedBaseImpl>::MoveEnd(int64 move_amount)
00677 {
00678         GappedBaseImpl::MoveEnd(move_amount);
00679         for( size_t mI = 0; mI < matches.size(); mI++ )
00680                 matches[mI]->MoveEnd(move_amount);
00681 }
00682 
00683 
00684 template< class MatchVector >
00685 void FindBoundaries( const MatchVector& matches, std::vector<gnSeqI>& left_ends, std::vector<gnSeqI>& lengths, std::vector<bool>& orientations )
00686 {
00687         uint seqI;
00688         boolean zero_exists = false;
00689         uint seq_count = matches.front()->SeqCount();
00690         left_ends = std::vector<gnSeqI>( seq_count, NO_MATCH );
00691         lengths = std::vector<gnSeqI>( seq_count, 0 );
00692         orientations = std::vector<bool>( seq_count, false );
00693 
00694         // find leftend in each forward sequence
00695         uint matchI = 0;
00696         for(; matchI != matches.size(); ++matchI )
00697         {
00698                 zero_exists = false;
00699                 for( seqI = 0; seqI < seq_count; ++seqI )
00700                 {
00701                         if( left_ends[seqI] == NO_MATCH && matches[matchI]->Orientation(seqI) == AbstractMatch::forward )
00702                         {
00703                                 left_ends[seqI] = matches[ matchI ]->LeftEnd(seqI);
00704                                 orientations[seqI] = true;
00705                         }
00706                         else if( left_ends[seqI] == NO_MATCH )
00707                                 zero_exists = true;
00708                 }
00709                 if( !zero_exists )
00710                         break;
00711         }
00712 
00713         // find end in each forward sequence
00714         for( matchI = matches.size(); matchI > 0; matchI-- )
00715         {
00716                 zero_exists = false;
00717                 for( seqI = 0; seqI < seq_count; ++seqI )
00718                 {
00719                         if( lengths[seqI] == 0 &&
00720                                 matches[ matchI - 1 ]->Orientation(seqI) == AbstractMatch::forward )
00721                         {
00722                                         lengths[seqI] = matches[matchI - 1]->LeftEnd(seqI) + matches[matchI - 1]->Length(seqI) - left_ends[seqI];
00723                         }
00724                         if( left_ends[seqI] != NO_MATCH && lengths[seqI] == 0 )
00725                                 zero_exists = true;
00726                         if( lengths[seqI] > 1000000000 )
00727                         {
00728                                 std::cerr << "matches[matchI - 1]->Length(" << seqI << ") = " << matches[matchI - 1]->Length(seqI) << std::endl;
00729                                 std::cerr << "matches[matchI - 1]->LeftEnd(" << seqI << ") = " << matches[matchI - 1]->LeftEnd(seqI) << std::endl;
00730 
00731                                 std::cerr << "oh skeethockey mushrooms\n";
00732                                 genome::breakHere();
00733                         }
00734                 }
00735                 if( !zero_exists )
00736                         break;
00737         }
00738 
00739         // find start in each reverse sequence
00740         for( matchI = matches.size(); matchI > 0; matchI-- )
00741         {
00742                 zero_exists = false;
00743                 for( seqI = 0; seqI < seq_count; ++seqI )
00744                 {
00745                         if( left_ends[seqI] == NO_MATCH && matches[ matchI - 1 ]->Orientation(seqI) == AbstractMatch::reverse )
00746                                 left_ends[seqI] = matches[matchI - 1]->LeftEnd(seqI);
00747                         if( left_ends[seqI] == NO_MATCH )
00748                                 zero_exists = true;
00749                 }
00750                 if( !zero_exists )
00751                         break;
00752         }
00753 
00754         // find end in each reverse sequence
00755         for( matchI = 0; matchI != matches.size(); ++matchI )
00756         {
00757                 zero_exists = false;
00758                 for( seqI = 0; seqI < seq_count; ++seqI )
00759                 {
00760                         if( lengths[seqI] == 0 &&
00761                                 matches[matchI]->Orientation(seqI) == AbstractMatch::reverse )
00762                         {
00763                                         lengths[seqI] = matches[matchI]->Length(seqI)+(matches[matchI]->LeftEnd(seqI) - left_ends[seqI]);
00764                         }
00765                         if( lengths[seqI] == 0 )
00766                                 zero_exists = true;
00767                         if( lengths[seqI] > 1000000000 )
00768                         {
00769                                 std::cerr << "matches[" << matchI << "]->Length(" << seqI << ") = " << matches[matchI]->Length(seqI) << std::endl;
00770                                 std::cerr << "matches[" << matchI << "]->LeftEnd(" << seqI << ") = " << matches[matchI]->LeftEnd(seqI) << std::endl;
00771 
00772                                 std::cerr << "oh skeethockey mushrooms too\n";
00773                                 genome::breakHere();
00774                         }
00775                 }
00776                 if( !zero_exists )
00777                         break;
00778         }
00779 }
00780 
00781 
00782 template<class GappedBaseImpl>
00783 void GenericInterval<GappedBaseImpl>::addUnalignedRegions()
00784 {
00785         std::list<AbstractMatch*> new_matches(matches.begin(), matches.end());
00786 
00787         for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
00788         {
00789                 if( this->LeftEnd(seqI) == NO_MATCH )
00790                         continue;
00791                 if(this->Orientation(seqI) == AbstractMatch::forward)
00792                         AddGapMatches( new_matches, new_matches.begin(), new_matches.end(), seqI, this->LeftEnd(seqI), this->RightEnd(seqI), this->Orientation(seqI), this->SeqCount() );
00793                 else
00794                         AddGapMatches( new_matches, new_matches.rbegin(), new_matches.rend(), seqI, this->LeftEnd(seqI), this->RightEnd(seqI), this->Orientation(seqI), this->SeqCount() );
00795         }
00796         matches.clear();
00797         matches.insert(matches.end(), new_matches.begin(), new_matches.end());
00798 }
00799 
00800 template<class GappedBaseImpl>
00801 void GenericInterval<GappedBaseImpl>::Invert(){
00802         GappedBaseImpl::Invert();
00803         for( uint matchI = 0; matchI < matches.size(); ++matchI )
00804                 matches[ matchI ]->Invert();
00805 
00806         std::reverse( matches.begin(), matches.end() );
00807 }
00808 
00809 template<class GappedBaseImpl>
00810 void GenericInterval<GappedBaseImpl>::GetColumn( gnSeqI col, std::vector<gnSeqI>& pos, std::vector<bool>& column ) const
00811 {
00812         size_t matchI;
00813         gnSeqI match_col;
00814         this->GetColumnAndMatch( col, pos, column, matchI, match_col );
00815 }
00816 
00817 template<class GappedBaseImpl>
00818 void GenericInterval<GappedBaseImpl>::FindMatchPos( uint seqI, gnSeqI pos, size_t& matchI, gnSeqI& match_pos )
00819 {
00820         match_pos = pos;
00821         int diff_amt = 0;
00822         int incr = 1;
00823         matchI = 0;
00824         size_t end_mI = matches.size();
00825         if( this->Orientation(seqI) == AbstractMatch::reverse )
00826         {
00827                 diff_amt = -1;
00828                 incr = -1;
00829                 matchI = matches.size();
00830                 end_mI = 0;
00831         }
00832 
00833         for( ; matchI != end_mI; matchI+=incr )
00834         {
00835                 if( matches[matchI+diff_amt]->LeftEnd(seqI) == NO_MATCH )
00836                         continue;
00837                 if( matches[matchI+diff_amt]->Length(seqI) <= match_pos )
00838                         match_pos -= matches[matchI+diff_amt]->Length(seqI);
00839                 else
00840                         break;
00841         }
00842 
00843         if( this->Orientation(seqI) == AbstractMatch::reverse )
00844                 matchI--;
00845 }
00846 
00847 template<class GappedBaseImpl>
00848 void GenericInterval<GappedBaseImpl>::CalculateOffset(){
00849         std::vector<gnSeqI> left_end( this->SeqCount(), NO_MATCH );
00850         std::vector<gnSeqI> length( this->SeqCount(), 0 );
00851         std::vector<bool> orientation;
00852         if( this->matches.size() > 0 )
00853                 FindBoundaries( this->matches, left_end, length, orientation );
00854         for( uint seqI = 0; seqI < this->SeqCount(); seqI++ )
00855         {
00856                 if( left_end[seqI] != 0 )
00857                 {
00858                         this->SetLeftEnd(seqI, left_end[seqI]);
00859                         this->SetLength(length[seqI], seqI);
00860                         if( orientation[seqI] )
00861                                 this->SetOrientation(seqI, AbstractMatch::forward);
00862                         else
00863                                 this->SetOrientation(seqI, AbstractMatch::reverse);
00864                 }else if( this->LeftEnd(seqI) != NO_MATCH )
00865                 {
00866                         this->SetLength(0, seqI);
00867                         this->SetLeftEnd(seqI, NO_MATCH);
00868                 }
00869 
00870         }
00871 
00872         this->CalculateAlignmentLength();
00873 }
00874 
00875 template<class GappedBaseImpl>
00876 void GenericInterval<GappedBaseImpl>::SetAlignment( const std::vector< std::string >& seq_align )
00877 {
00878         GappedAlignment* ga = new GappedAlignment(seq_align.size(), seq_align[0].size());
00879         matches.clear();
00880         matches.push_back(ga);
00881         ga->SetAlignment(seq_align);
00882         for( uint seqI = 0; seqI < this->SeqCount(); ++seqI )
00883         {
00884                 ga->SetStart(seqI, this->Start(seqI));
00885                 ga->SetLength(this->Length(seqI), seqI);
00886         }
00887 }
00888 
00889 
00893 template<class GappedBaseImpl>
00894 std::ostream& operator<<(std::ostream& os, const GenericInterval<GappedBaseImpl>& cr){
00895         try{
00896         for( uint matchI = 0; matchI < cr.matches.size(); ++matchI ){
00897                 const AbstractMatch* m = cr.matches[ matchI ];
00898                 const GappedAlignment* clust = dynamic_cast< const GappedAlignment* >( m );
00899                 if( clust != NULL )
00900                         os << *clust;
00901                 const Match* match = dynamic_cast< const Match* >( m );
00902                 if( match != NULL )
00903                         os << *match;
00904                 os << std::endl;
00905         }
00906         }catch(...){
00907                 std::cerr << "Exceptional handler\n";
00908         }
00909         return os;
00910 }
00911 
00912 template<class GappedBaseImpl>
00913 void GenericInterval<GappedBaseImpl>::CalculateAlignmentLength()
00914 {
00915         gnSeqI aln_len = 0;
00916         // count each match's alignment length
00917         for( size_t mI = 0; mI < matches.size(); ++mI )
00918                 aln_len += matches[mI]->AlignmentLength();
00919         this->SetAlignmentLength(aln_len);
00920 }
00921 
00922 template<class GappedBaseImpl>
00923 void GenericInterval<GappedBaseImpl>::GetAlignedSequences( gnAlignedSequences& gnas, const std::vector< genome::gnSequence* >& seq_table ) const 
00924 {
00925         gnas.names.clear();
00926         for( uint seqI = 0; seqI < seq_table.size(); ++seqI ){
00927                 std::string name;
00928                 if( seq_table[ seqI ]->contigListSize() > 0 )
00929                         name = seq_table[ seqI ]->contigName( 0 );
00930                 gnas.names.push_back( name );
00931                 gnas.positions.push_back(this->Start(seqI));
00932         }
00933         mems::GetAlignment( *this, seq_table, gnas.sequences );
00934 }
00935 
00936 template<class GappedBaseImpl>
00937 bool GenericInterval<GappedBaseImpl>::IsGap( uint seq, gnSeqI col ) const
00938 {
00939         std::vector<gnSeqI> pos;
00940         std::vector<bool> column;
00941         GetColumn(col, pos, column);
00942         return column[seq];
00943 }
00944 
00945 }
00946 
00947 namespace std {
00948 template<> inline
00949 void swap( mems::Interval& a, mems::Interval& b )
00950 {
00951         a.swap(b);
00952 }
00953 }
00954 
00955 #endif  // __Interval_h__

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6