00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef _MatchFinder_h_
00010 #define _MatchFinder_h_
00011
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015
00016 #include "libMems/SortedMerList.h"
00017 #include "libMems/Match.h"
00018 #include "libMems/MatchList.h"
00019 #include <list>
00020 #include <iostream>
00021 #include <boost/pool/pool_alloc.hpp>
00022
00023 namespace mems {
00024
00025 struct idmer{
00026 gnSeqI position;
00027 uint64 mer;
00028 sarID_t id;
00029 };
00030
00031
00032
00033
00034
00035 typedef std::list<idmer> IdmerList;
00036
00037 const unsigned int PROGRESS_GRANULARITY = 100;
00038
00046 class MatchFinder : public genome::gnClone{
00047 public:
00048 MatchFinder();
00049 ~MatchFinder();
00050 MatchFinder(const MatchFinder& mf);
00051 virtual void Clear();
00057 virtual boolean AddSequence( SortedMerList* sar, genome::gnSequence* seq = NULL );
00063 virtual void GetBreakpoint( uint32 sarI, gnSeqI startI, std::vector<gnSeqI>& breakpoints ) const;
00064 virtual uint32 Multiplicity(void){return seq_count;};
00066 virtual void SetAmbiguityTolerance(uint32 ambiguity_tol){ambiguity_tolerance = ambiguity_tol;}
00068 virtual uint32 AmbiguityTolerance(){return ambiguity_tolerance;}
00070 virtual float GetProgress() const {return m_progress;}
00071
00073 virtual void FindMatchSeeds();
00075 virtual void FindMatchSeeds( const std::vector<gnSeqI>& start_offsets );
00076
00080 virtual void LogProgress( std::ostream* os );
00081 void SetOffsetLog( std::ostream* offset_stream ){ this->offset_stream = offset_stream; }
00082 protected:
00088 virtual boolean SearchRange(std::vector<gnSeqI>& start_points, std::vector<gnSeqI>& search_len);
00090 virtual boolean HashMatch(IdmerList& match_list) = 0;
00091 virtual boolean EnumerateMatches(IdmerList& match_list);
00092
00093 template< class MatchType >
00094 void FindSubsets(const MatchType& mhe, std::vector<MatchType>& subset_matches);
00095
00096 template< class UngappedMatchType >
00097 void ExtendMatch(UngappedMatchType& mhe, std::vector<UngappedMatchType>& subset_matches, gnSeqI max_backward = GNSEQI_END, gnSeqI max_forward = GNSEQI_END);
00098
00099 virtual SortedMerList* GetSar(uint32 sarI) const;
00100 std::vector<SortedMerList*> sar_table;
00101 std::vector<genome::gnSequence*> seq_table;
00102
00103 uint32 mer_size;
00104 uint32 seq_count;
00105 uint32 ambiguity_tolerance;
00106
00107
00108 std::vector< std::vector< uint32 > > alpha_map;
00109 uint alpha_map_size;
00110 uint alphabet_bits;
00111
00112 float m_progress;
00113 std::ostream* log_stream;
00114
00115 uint64 mers_processed;
00116 uint64 total_mers;
00117 std::ostream* offset_stream;
00118 };
00119
00123 CREATE_EXCEPTION( InvalidData );
00124
00125 inline
00126 SortedMerList* MatchFinder::GetSar(uint32 sarI) const{
00127 return sar_table[sarI];
00128 }
00129
00130 inline
00131 bool idmer_lessthan(idmer& a_v, idmer& m_v){
00132 return (a_v.mer < m_v.mer);
00133 };
00134
00135
00136 inline
00137 bool idmer_id_lessthan(idmer& a_v, idmer& m_v){
00138 return (a_v.id < m_v.id);
00139 };
00140
00141
00142
00143
00144 template< class MatchType >
00145 void MatchFinder::FindSubsets(const MatchType& mhe, std::vector<MatchType>& subset_matches){
00146
00147 SMLHeader head = GetSar( 0 )->GetHeader();
00148 uint shift_amt = 64 - head.alphabet_bits;
00149 uint rshift_amt = head.alphabet_bits * ( GetSar(0)->SeedLength() - 1 );
00150
00151 uint seqI, alphaI;
00152
00153
00154 alpha_map_size = 1;
00155 alpha_map_size <<= alphabet_bits;
00156 if( alpha_map.size() != alpha_map_size ){
00157 alpha_map.clear();
00158 alpha_map.reserve( alpha_map_size );
00159 std::vector< uint32 > tmp_list;
00160 tmp_list.reserve( seq_count );
00161 for( uint alphaI = 0; alphaI < alpha_map_size; ++alphaI )
00162 alpha_map.push_back( tmp_list );
00163 }else{
00164 for( uint alphaI = 0; alphaI < alpha_map_size; ++alphaI )
00165 alpha_map[ alphaI ].clear();
00166 }
00167
00168
00169 for( seqI = 0; seqI < seq_count; ++seqI ){
00170
00171 int64 mer_to_get = mhe[ seqI ];
00172 if( mer_to_get == NO_MATCH )
00173 continue;
00174 if(mer_to_get < 0){
00175 mer_to_get *= -1;
00176 mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
00177 }
00178
00179 uint64 cur_mer = GetSar( seqI )->GetMer( mer_to_get - 1 );
00180
00181 boolean parity;
00182 if( mhe[ seqI ] < 0 )
00183 parity = cur_mer & 0x1;
00184 else
00185 parity = !(cur_mer & 0x1);
00186
00187 if( parity ){
00188 cur_mer >>= shift_amt;
00189 }else{
00190 cur_mer <<= rshift_amt;
00191 cur_mer = ~cur_mer;
00192 cur_mer >>= shift_amt;
00193 }
00194
00195 alpha_map[ cur_mer ].push_back( seqI );
00196
00197 }
00198
00199 for( alphaI = 0; alphaI < alpha_map_size; ++alphaI ){
00200 if( alpha_map[ alphaI ].size() < 2 ){
00201 alpha_map[ alphaI ].clear();
00202 continue;
00203 }
00204
00205 MatchType cur_subset = mhe;
00206 cur_subset.SetLength( mhe.Length() );
00207 for( uint sqI = 0; sqI < mhe.SeqCount(); ++sqI )
00208 cur_subset.SetStart( sqI, NO_MATCH );
00209 for( uint subI = 0; subI < alpha_map[ alphaI ].size(); ++subI )
00210 cur_subset.SetStart( alpha_map[ alphaI ][ subI ], mhe[ alpha_map[ alphaI ][ subI ] ] );
00211 subset_matches.push_back( cur_subset );
00212 alpha_map[ alphaI ].clear();
00213 }
00214 }
00215
00216
00217
00218 template< class UngappedMatchType >
00219 void MatchFinder::ExtendMatch(UngappedMatchType& mhe, std::vector<UngappedMatchType>& subset_matches, gnSeqI max_backward, gnSeqI max_forward){
00220 uint64 cur_mer;
00221 uint64 mer_mask = GetSar(0)->GetSeedMask();
00222
00223
00224 uint32* cur_seqs = new uint32[mhe.SeqCount()];
00225 uint32 used_seqs = 0;
00226 for(uint32 seqI = 0; seqI < mhe.SeqCount(); ++seqI){
00227 if(mhe[seqI] != NO_MATCH){
00228 cur_seqs[used_seqs] = seqI;
00229 ++used_seqs;
00230 }
00231 }
00232
00233 int jump_size = GetSar(0)->SeedLength();
00234 uint extend_limit = 0;
00235 uint extend_attempts = 0;
00236 boolean extend_again = false;
00237 for(uint32 directionI = 0; directionI < 4; ++directionI){
00238
00239
00240
00241 int64 maxlen = GNSEQI_END;
00242 if(directionI == 0)
00243 maxlen = max_backward;
00244 else if(directionI == 1)
00245 maxlen = max_forward;
00246 else
00247 maxlen = GetSar(0)->SeedLength();
00248 for(uint32 maxI = 0; maxI < used_seqs; ++maxI)
00249 if(GetSar(cur_seqs[maxI])->IsCircular()){
00250 if(GetSar(cur_seqs[maxI])->Length() < maxlen)
00251 maxlen = GetSar(cur_seqs[maxI])->Length();
00252 }else if(mhe[cur_seqs[maxI]] < 0){
00253 int64 rc_len = GetSar(cur_seqs[maxI])->Length() - mhe.Length() + mhe[cur_seqs[maxI]] + 1;
00254 if( rc_len < maxlen)
00255 maxlen = rc_len;
00256 }else if(mhe[cur_seqs[maxI]] - 1 < maxlen)
00257 maxlen = mhe[cur_seqs[maxI]] - 1;
00258 uint32 j=0;
00259 uint32 i = used_seqs;
00260
00261 extend_limit = 0;
00262 extend_attempts = 0;
00263
00264 while(maxlen - jump_size >= 0){
00265 mhe.SetLength(mhe.Length() + jump_size);
00266 maxlen -= jump_size;
00267 for(j=0; j < used_seqs; ++j){
00268 if(mhe[cur_seqs[j]] > 0){
00269 mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] - jump_size);
00270 if(mhe[cur_seqs[j]] <= 0)
00271 mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] + GetSar(cur_seqs[j])->Length());
00272 }
00273 }
00274
00275 int64 mer_to_get = mhe[cur_seqs[0]];
00276 if(mer_to_get < 0){
00277 mer_to_get *= -1;
00278 mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
00279 }
00280 cur_mer = GetSar(cur_seqs[0])->GetSeedMer(mer_to_get - 1);
00281 boolean parity;
00282 if( mhe[cur_seqs[0]] < 0 )
00283 parity = cur_mer & 0x1;
00284 else
00285 parity = !(cur_mer & 0x1);
00286 cur_mer &= mer_mask;
00287
00288 for(i=1; i < used_seqs; ++i){
00289 mer_to_get = mhe[cur_seqs[i]];
00290 if(mer_to_get < 0){
00291
00292 mer_to_get *= -1;
00293 mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
00294 }
00295 uint64 comp_mer = GetSar(cur_seqs[i])->GetSeedMer(mer_to_get - 1);
00296 boolean comp_parity;
00297 if( mhe[cur_seqs[i]] < 0 )
00298 comp_parity = comp_mer & 0x1;
00299 else
00300 comp_parity = !(comp_mer & 0x1);
00301 comp_mer &= mer_mask;
00302
00303 if(cur_mer != comp_mer || parity != comp_parity ){
00304 if( directionI < 2 )
00305 maxlen = 0;
00306 break;
00307 }
00308 }
00309 extend_attempts += jump_size;
00310 if( i == used_seqs )
00311 extend_limit = extend_attempts;
00312 if( directionI > 1 && extend_attempts == GetSar(0)->SeedLength() )
00313 break;
00314 }
00315
00316 if(i < used_seqs){
00317 mhe.SetLength(mhe.Length() - jump_size);
00318 for(;j > 0; j--){
00319 if(mhe[cur_seqs[j - 1]] >= 0)
00320 mhe.SetStart(cur_seqs[j - 1], mhe[cur_seqs[j - 1]] + jump_size);
00321 }
00322 }
00323
00324
00325 if( directionI > 1 && extend_attempts > 0 ){
00326 if( extend_limit > 0 )
00327 extend_again = true;
00328
00329 int unmatched_diff = extend_attempts - extend_limit;
00330 if( i < used_seqs )
00331 unmatched_diff -= jump_size;
00332 if( (unmatched_diff > 2000000000 || unmatched_diff > mhe.Length()) && unmatched_diff >= 0 )
00333 std::cerr << "oh sheethockey mushrooms\n";
00334 mhe.SetLength(mhe.Length() - unmatched_diff);
00335 for(j=0; j < used_seqs; ++j){
00336 if(mhe[cur_seqs[j]] > 0){
00337 mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] + unmatched_diff);
00338 if(mhe[cur_seqs[j]] > GetSar(cur_seqs[j])->Length() )
00339 mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] - GetSar(cur_seqs[j])->Length() );
00340 }
00341 }
00342 }
00343
00344
00345
00346 mhe.Invert();
00347
00348
00349 if(directionI >= 1)
00350 jump_size = 1;
00351 if( directionI == 3 && extend_again ){
00352 directionI = -1;
00353 jump_size = GetSar(0)->SeedLength();
00354 extend_again = false;
00355 }
00356 }
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367 for(uint32 subsetI = 0; subsetI < subset_matches.size(); ++subsetI){
00368 if( subset_matches[subsetI][subset_matches[subsetI].FirstStart()] < 0 )
00369 subset_matches[subsetI].Invert();
00370 subset_matches[subsetI].CalculateOffset();
00371 }
00372
00373 delete[] cur_seqs;
00374 }
00375
00376
00377
00378 }
00379
00380 #endif //_MatchFinder_h_