libMems/MatchFinder.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: MatchFinder.h,v 1.23 2004/03/01 02:40:08 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * This file is licensed under the GPL.
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifndef _MatchFinder_h_
00010 #define _MatchFinder_h_
00011 
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015 
00016 #include "libMems/SortedMerList.h"
00017 #include "libMems/Match.h"
00018 #include "libMems/MatchList.h"
00019 #include <list>
00020 #include <iostream>
00021 #include <boost/pool/pool_alloc.hpp>
00022 
00023 namespace mems {
00024 
00025 struct idmer{
00026         gnSeqI  position;       //starting position of this mer in the genome
00027         uint64  mer;            //the actual sequence
00028         sarID_t id;                     //the sequence identifier.
00029 };
00030 
00031 // typedef std::list<idmer, boost::fast_pool_allocator<idmer> > IdmerList;
00032 // using boost::fast_pool_allocator<idmer> results in a significant speedup
00033 // over std::allocator.  testing on a Salmonella vs. Y. pestis comparison shows
00034 // a 30% speedup
00035 typedef std::list<idmer> IdmerList;
00036 
00037 const unsigned int PROGRESS_GRANULARITY = 100;
00038 
00046 class MatchFinder : public genome::gnClone{
00047 public:
00048         MatchFinder();
00049         ~MatchFinder();
00050         MatchFinder(const MatchFinder& mf);
00051         virtual void Clear();
00057         virtual boolean AddSequence( SortedMerList* sar, genome::gnSequence* seq = NULL );
00063         virtual void GetBreakpoint( uint32 sarI, gnSeqI startI, std::vector<gnSeqI>& breakpoints ) const;
00064         virtual uint32 Multiplicity(void){return seq_count;};
00066         virtual void SetAmbiguityTolerance(uint32 ambiguity_tol){ambiguity_tolerance = ambiguity_tol;}
00068         virtual uint32 AmbiguityTolerance(){return ambiguity_tolerance;}
00070         virtual float GetProgress() const {return m_progress;}
00071 
00073         virtual void FindMatchSeeds();
00075         virtual void FindMatchSeeds( const std::vector<gnSeqI>& start_offsets );
00076 
00080         virtual void LogProgress( std::ostream* os );
00081         void SetOffsetLog( std::ostream* offset_stream ){ this->offset_stream = offset_stream; }
00082 protected:
00088         virtual boolean SearchRange(std::vector<gnSeqI>& start_points, std::vector<gnSeqI>& search_len);
00090         virtual boolean HashMatch(IdmerList& match_list) = 0;
00091         virtual boolean EnumerateMatches(IdmerList& match_list);
00092 
00093         template< class MatchType >
00094         void FindSubsets(const MatchType& mhe, std::vector<MatchType>& subset_matches);
00095 
00096         template< class UngappedMatchType >
00097         void ExtendMatch(UngappedMatchType& mhe, std::vector<UngappedMatchType>& subset_matches, gnSeqI max_backward = GNSEQI_END, gnSeqI max_forward = GNSEQI_END);
00098 
00099         virtual SortedMerList* GetSar(uint32 sarI) const;
00100         std::vector<SortedMerList*> sar_table;
00101         std::vector<genome::gnSequence*> seq_table;
00102         
00103         uint32 mer_size;
00104         uint32 seq_count;
00105         uint32 ambiguity_tolerance;
00106         
00107         // for subset matches
00108         std::vector< std::vector< uint32 > > alpha_map;
00109         uint alpha_map_size;
00110         uint alphabet_bits;
00111         
00112         float m_progress;
00113         std::ostream* log_stream;
00114 
00115         uint64 mers_processed;  
00116         uint64 total_mers;      
00117         std::ostream* offset_stream;    
00118 };
00119 
00123 CREATE_EXCEPTION( InvalidData );
00124 
00125 inline
00126 SortedMerList* MatchFinder::GetSar(uint32 sarI) const{
00127         return sar_table[sarI];
00128 }
00129 
00130 inline
00131 bool idmer_lessthan(idmer& a_v, idmer& m_v){
00132         return (a_v.mer < m_v.mer);// ? true : false;
00133 };
00134 
00135 //id less than function for STL sort functions
00136 inline
00137 bool idmer_id_lessthan(idmer& a_v, idmer& m_v){
00138         return (a_v.id < m_v.id);// ? true : false;
00139 };
00140 
00141 
00142 
00143 // takes as input a fully extended mem and returns the subset matches on the lower side
00144 template< class MatchType >
00145 void MatchFinder::FindSubsets(const MatchType& mhe, std::vector<MatchType>& subset_matches){
00146 
00147         SMLHeader head = GetSar( 0 )->GetHeader();
00148         uint shift_amt = 64 - head.alphabet_bits;
00149         uint rshift_amt = head.alphabet_bits * ( GetSar(0)->SeedLength() - 1 );
00150 
00151         uint seqI, alphaI;
00152 
00153         // initialize subset match data structures
00154         alpha_map_size = 1;
00155         alpha_map_size <<= alphabet_bits;
00156         if( alpha_map.size() != alpha_map_size ){
00157                 alpha_map.clear();
00158                 alpha_map.reserve( alpha_map_size );
00159                 std::vector< uint32 > tmp_list;
00160                 tmp_list.reserve( seq_count );
00161                 for( uint alphaI = 0; alphaI < alpha_map_size; ++alphaI )
00162                         alpha_map.push_back( tmp_list );
00163         }else{
00164                 for( uint alphaI = 0; alphaI < alpha_map_size; ++alphaI )
00165                         alpha_map[ alphaI ].clear();
00166         }
00167         
00168         
00169         for( seqI = 0; seqI < seq_count; ++seqI ){
00170                 //check that all mers at the new position match
00171                 int64 mer_to_get = mhe[ seqI ];
00172                 if( mer_to_get == NO_MATCH )
00173                         continue;
00174                 if(mer_to_get < 0){
00175                         mer_to_get *= -1;
00176                         mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
00177                 }
00178 
00179                 uint64 cur_mer = GetSar( seqI )->GetMer( mer_to_get - 1 );
00180 
00181                 boolean parity;
00182                 if( mhe[ seqI ] < 0 )
00183                         parity = cur_mer & 0x1;
00184                 else
00185                         parity = !(cur_mer & 0x1);
00186 
00187                 if( parity ){
00188                         cur_mer >>= shift_amt;
00189                 }else{
00190                         cur_mer <<= rshift_amt;
00191                         cur_mer = ~cur_mer;
00192                         cur_mer >>= shift_amt;
00193                 }
00194 
00195                 alpha_map[ cur_mer ].push_back( seqI );
00196 
00197         }
00198         
00199         for( alphaI = 0; alphaI < alpha_map_size; ++alphaI ){
00200                 if( alpha_map[ alphaI ].size() < 2 ){
00201                         alpha_map[ alphaI ].clear();
00202                         continue;
00203                 }
00204                 // this is a subset
00205                 MatchType cur_subset = mhe;
00206                 cur_subset.SetLength( mhe.Length() );
00207                 for( uint sqI = 0; sqI < mhe.SeqCount(); ++sqI )
00208                         cur_subset.SetStart( sqI, NO_MATCH );   // init everything to NO_MATCH
00209                 for( uint subI = 0; subI < alpha_map[ alphaI ].size(); ++subI )
00210                         cur_subset.SetStart( alpha_map[ alphaI ][ subI ], mhe[ alpha_map[ alphaI ][ subI ] ] );
00211                 subset_matches.push_back( cur_subset );
00212                 alpha_map[ alphaI ].clear();
00213         }
00214 }
00215 
00216 // BUGS:
00217 // matches which span the end-start of a circular sequence will be hashed a second time
00218 template< class UngappedMatchType >
00219 void MatchFinder::ExtendMatch(UngappedMatchType& mhe, std::vector<UngappedMatchType>& subset_matches, gnSeqI max_backward, gnSeqI max_forward){
00220         uint64 cur_mer;
00221         uint64 mer_mask = GetSar(0)->GetSeedMask();
00222 
00223         //which sequences are used in this match?
00224         uint32* cur_seqs = new uint32[mhe.SeqCount()];
00225         uint32 used_seqs = 0;
00226         for(uint32 seqI = 0; seqI < mhe.SeqCount(); ++seqI){
00227                 if(mhe[seqI] != NO_MATCH){
00228                         cur_seqs[used_seqs] = seqI;
00229                         ++used_seqs;
00230                 }
00231         }
00232         //First extend backwards then extend forwards.  The following loop does them both.
00233         int jump_size = GetSar(0)->SeedLength();
00234         uint extend_limit = 0;  
00235         uint extend_attempts = 0;       
00236         boolean extend_again = false;   
00237         for(uint32 directionI = 0; directionI < 4; ++directionI){
00238                 //how far can we go?    
00239                 //first calculate the maximum amount of traversal
00240                 //then do fewer comparisons.
00241                 int64 maxlen = GNSEQI_END;
00242                 if(directionI == 0)
00243                         maxlen = max_backward;
00244                 else if(directionI == 1)
00245                         maxlen = max_forward;
00246                 else
00247                         maxlen = GetSar(0)->SeedLength();
00248                 for(uint32 maxI = 0; maxI < used_seqs; ++maxI)
00249                         if(GetSar(cur_seqs[maxI])->IsCircular()){
00250                                 if(GetSar(cur_seqs[maxI])->Length() < maxlen)
00251                                         maxlen = GetSar(cur_seqs[maxI])->Length();
00252                         }else if(mhe[cur_seqs[maxI]] < 0){
00253                                 int64 rc_len = GetSar(cur_seqs[maxI])->Length() - mhe.Length() + mhe[cur_seqs[maxI]] + 1;
00254                                 if( rc_len < maxlen)
00255                                         maxlen = rc_len;
00256                         }else if(mhe[cur_seqs[maxI]] - 1 < maxlen)
00257                                 maxlen = mhe[cur_seqs[maxI]] - 1;
00258                 uint32 j=0;
00259                 uint32 i = used_seqs;   // set to used_seqs in case maxlen is already less than jump size.
00260 
00261                 extend_limit = 0;
00262                 extend_attempts = 0;
00263 
00264                 while(maxlen - jump_size >= 0){
00265                         mhe.SetLength(mhe.Length() + jump_size);
00266                         maxlen -= jump_size;
00267                         for(j=0; j < used_seqs; ++j){
00268                                 if(mhe[cur_seqs[j]] > 0){
00269                                         mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] - jump_size);
00270                                         if(mhe[cur_seqs[j]] <= 0)
00271                                                 mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] + GetSar(cur_seqs[j])->Length());
00272                                 }
00273                         }
00274                         //check that all mers at the new position match
00275                         int64 mer_to_get = mhe[cur_seqs[0]];
00276                         if(mer_to_get < 0){
00277                                 mer_to_get *= -1;
00278                                 mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
00279                         }
00280                         cur_mer = GetSar(cur_seqs[0])->GetSeedMer(mer_to_get - 1);
00281                         boolean parity;
00282                         if( mhe[cur_seqs[0]] < 0 )
00283                                 parity = cur_mer & 0x1;
00284                         else
00285                                 parity = !(cur_mer & 0x1);
00286                         cur_mer &= mer_mask;
00287 
00288                         for(i=1; i < used_seqs; ++i){
00289                                 mer_to_get = mhe[cur_seqs[i]];
00290                                 if(mer_to_get < 0){
00291                                         //Convert the cur_seqs[i] entry since negative implies reverse complement
00292                                         mer_to_get *= -1;
00293                                         mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
00294                                 }
00295                                 uint64 comp_mer = GetSar(cur_seqs[i])->GetSeedMer(mer_to_get - 1);
00296                                 boolean comp_parity;                            
00297                                 if( mhe[cur_seqs[i]] < 0 )
00298                                         comp_parity = comp_mer & 0x1;
00299                                 else
00300                                         comp_parity = !(comp_mer & 0x1);
00301                                 comp_mer &= mer_mask;
00302                                 
00303                                 if(cur_mer != comp_mer || parity != comp_parity ){
00304                                         if( directionI < 2 )
00305                                                 maxlen = 0;
00306                                         break;
00307                                 }
00308                         }
00309                         extend_attempts += jump_size;
00310                         if( i == used_seqs )
00311                                 extend_limit = extend_attempts;
00312                         if( directionI > 1 && extend_attempts == GetSar(0)->SeedLength() )
00313                                 break;
00314                 }
00315                 //this stuff cleans up if there was a mismatch
00316                 if(i < used_seqs){
00317                         mhe.SetLength(mhe.Length() - jump_size);
00318                         for(;j > 0; j--){
00319                                 if(mhe[cur_seqs[j - 1]] >= 0)
00320                                         mhe.SetStart(cur_seqs[j - 1], mhe[cur_seqs[j - 1]] + jump_size);
00321                         }
00322                 }
00323                 // check whether any of the overlapping seeds matched.
00324                 // if so, set the match to that length and set the flag to start the search again
00325                 if( directionI > 1 && extend_attempts > 0 ){
00326                         if( extend_limit > 0 )
00327                                 extend_again = true;
00328                         // minus jump_size because the cleanup above already moved the length back a little
00329                         int unmatched_diff = extend_attempts - extend_limit;
00330                         if( i < used_seqs )
00331                                 unmatched_diff -= jump_size;
00332                         if( (unmatched_diff > 2000000000 || unmatched_diff > mhe.Length()) && unmatched_diff >= 0 )
00333                                 std::cerr << "oh sheethockey mushrooms\n";
00334                         mhe.SetLength(mhe.Length() - unmatched_diff);
00335                         for(j=0; j < used_seqs; ++j){
00336                                 if(mhe[cur_seqs[j]] > 0){
00337                                         mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] + unmatched_diff);
00338                                         if(mhe[cur_seqs[j]] > GetSar(cur_seqs[j])->Length() )
00339                                                 mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] - GetSar(cur_seqs[j])->Length() );
00340                                 }
00341                         }
00342                 }
00343                 //Invert the sequence directions so that we extend in the other direction
00344                 //next time through the loop.  The second time we do this we are setting
00345                 //sequence directions back to normal.
00346                 mhe.Invert();
00347 
00348                 //if we've already been through twice then decrease the jump size
00349                 if(directionI >= 1)
00350                         jump_size = 1;
00351                 if( directionI == 3 && extend_again ){
00352                         directionI = -1;        // will become 0 on next iteration
00353                         jump_size = GetSar(0)->SeedLength();
00354                         extend_again = false;
00355                 }
00356         }
00357         // after the match has been fully extended, search for subset matches
00358         // this code only works when using SOLID seeds-- so it's been disabled
00359 /*      if( used_seqs > 2 ){
00360                 FindSubsets( mhe, subset_matches );
00361                 mhe.Invert();
00362                 FindSubsets( mhe, subset_matches );
00363                 mhe.Invert();
00364         }
00365 */
00366         // set the subsets so their reference sequence is always positive
00367         for(uint32 subsetI = 0; subsetI < subset_matches.size(); ++subsetI){
00368                 if( subset_matches[subsetI][subset_matches[subsetI].FirstStart()] < 0 )
00369                         subset_matches[subsetI].Invert();
00370                 subset_matches[subsetI].CalculateOffset();
00371         }
00372 
00373         delete[] cur_seqs;
00374 }
00375 
00376 
00377 
00378 }
00379 
00380 #endif  //_MatchFinder_h_

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6