libMems/MatchFinder.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: MatchFinder.cpp,v 1.39 2004/03/01 02:40:08 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * Please see the file called COPYING for licensing, copying, and modification
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012 
00013 #include "libMems/MatchFinder.h"
00014 
00015 using namespace std;
00016 using namespace genome;
00017 namespace mems {
00018 
00019 MatchFinder::MatchFinder(){
00020         mer_size = DNA_MER_SIZE;
00021         seq_count = 0;
00022         ambiguity_tolerance = 0;
00023         m_progress = -1;
00024         log_stream = NULL;
00025         offset_stream = NULL;
00026 }
00027 
00028 //make sure this calls the destructor on each element
00029 MatchFinder::~MatchFinder(){
00030 }
00031 
00032 MatchFinder::MatchFinder(const MatchFinder& mf){
00033         mer_size = mf.mer_size;
00034         seq_count = mf.seq_count;
00035         ambiguity_tolerance = mf.ambiguity_tolerance;
00036 
00037         m_progress = mf.m_progress;
00038         sar_table = mf.sar_table;
00039         seq_table = mf.seq_table;
00040         log_stream = mf.log_stream;
00041         offset_stream = mf.offset_stream;
00042 }
00043 
00044 void MatchFinder::Clear(){
00045         mer_size = DNA_MER_SIZE;
00046         seq_count = 0;
00047         ambiguity_tolerance = 0;
00048         m_progress = -1;
00049         sar_table.clear();
00050         seq_table.clear();
00051         log_stream = NULL;
00052         offset_stream = NULL;
00053 }
00054 
00055 void MatchFinder::LogProgress( ostream* os ){
00056         log_stream = os;
00057 }
00058 
00059 boolean MatchFinder::AddSequence( SortedMerList* sar, gnSequence* seq ){
00060         if(sar == NULL){
00061                 Throw_gnExMsg( NullPointer(), "Null SortedMerList pointer" );
00062         }
00063         if(sar == NULL){
00064                 Throw_gnExMsg( NullPointer(), "Null gnSequence pointer" );
00065         }
00066         
00067         //check for consistency between sequence length and sorted mer list lengths
00068 /*      if(seq != NULL && seq->length() != sar->Length()){
00069                 cerr << "MatchFinder::AddSequence: Error mismatched sml and sequence length.\n";
00070                 cerr << "Seq length: " << seq->length() << "\tSML length: " << sar->Length() << endl;
00071                 DebugMsg("MatchFinder::AddSequence: Error mismatched sml and sequence length.");
00072                 return false;
00073         }
00074 */      
00075         //passed checks, add it to the data structures
00076         sar_table.push_back(sar);
00077         ++seq_count;
00078         if(seq != NULL){
00079                 seq_table.push_back(seq);
00080         }
00081         
00082         SMLHeader header = sar->GetHeader();
00083         alphabet_bits = header.alphabet_bits;
00084         
00085         return true;
00086 
00087 }
00088 
00089 void MatchFinder::GetBreakpoint( uint32 sarI, gnSeqI startI, vector<gnSeqI>& breakpoints ) const{
00090         breakpoints.clear();
00091         
00092         //put the mer to break on in break_mer
00093         bmer break_mer  = (*GetSar(sarI))[startI];
00094         uint64 mer_mask = GetSar(sarI)->GetSeedMask();
00095         bmer prev_mer = break_mer;
00096         //search backwards for the first index of this mer
00097         while((prev_mer.mer & mer_mask) == (break_mer.mer & mer_mask)){
00098                 if(startI == 0){
00099                         startI--;
00100                         break;
00101                 }
00102                 startI--;
00103                 prev_mer = (*GetSar(sarI))[startI];
00104         }
00105         ++startI;
00106 
00107         //find the mer's location in the other sorted mer lists
00108         for(uint32 i=0; i < seq_count; ++i){
00109                 if(i == sarI){
00110                         breakpoints.push_back(startI);
00111                 }else{
00112                         gnSeqI cur_start;
00113                         if(GetSar(i)->FindMer(break_mer.mer, cur_start)){
00114                                 //we found a match, see how far backwards we can go.
00115                                 int64 cur_matchI = cur_start;
00116                                 bmer matchmer = (*GetSar(i))[cur_start];
00117                                 while(cur_matchI >= 0 && ((matchmer.mer & mer_mask) == (break_mer.mer && mer_mask))){
00118                                         cur_matchI--;
00119                                         matchmer = (*GetSar(i))[cur_start];
00120                                 }
00121                                 cur_start = cur_matchI+1;
00122                         }
00123                         breakpoints.push_back(cur_start);
00124                 }
00125         }
00126 }
00127 
00128 void MatchFinder::FindMatchSeeds(){
00129         vector<gnSeqI> start_points;
00130 
00131         for(uint32 i=0; i < sar_table.size(); ++i){
00132                 start_points.push_back(0);
00133         }
00134         FindMatchSeeds( start_points );
00135 }
00136 
00137 void MatchFinder::FindMatchSeeds( const vector<gnSeqI>& start_offsets ){
00138         vector<gnSeqI> start_points = start_offsets;
00139         vector<gnSeqI> search_len;
00140         // keep track of the number of mers processed and the total for progress reporting
00141         mers_processed = 0;
00142         total_mers = 0;
00143         m_progress = -1;
00144         for(uint32 i=0; i < sar_table.size(); ++i){
00145                 search_len.push_back(GNSEQI_END);
00146                 total_mers += search_len[i] == GNSEQI_END ? sar_table[i]->Length() : search_len[i];
00147                 mers_processed += start_points[ i ];
00148         }
00149         while( !SearchRange(start_points, search_len) ){
00150                 mers_processed = 0;
00151                 for( uint32 seqI = 0; seqI < sar_table.size(); ++seqI ){
00152                         if( offset_stream != NULL ){
00153                                 if( seqI > 0 )
00154                                         *offset_stream << '\t';
00155                                 *offset_stream << start_points[ seqI ];
00156                         }
00157                         mers_processed += start_points[ seqI ];
00158                 }
00159                 if( offset_stream != NULL ){
00160                         *offset_stream << endl;
00161                         offset_stream->flush();
00162                 }
00163         }
00164 }
00165 
00166 #define MER_REPEAT_LIMIT 1000 // The maximum number of matching mers before they are completely
00167                                                                 // ignored.
00168 
00169 boolean print_sp = false;
00170 //startI must be 0
00171 //At most search_length mers in any one genome will be checked.
00172 boolean MatchFinder::SearchRange(vector<gnSeqI>& start_points, vector<gnSeqI>& search_len){
00173         //picked a semi-arbitrary number for buffer size.
00174         uint32 MER_BUFFER_SIZE = 10000;
00175         vector<uint32> mer_index;   // stores the indexes of the current mers in mer_vector
00176         vector<uint32> mer_baseindex;   // stores the index in the SortedMerList of each of the first mers in mer_vector
00177         IdmerList cur_mers;     // stores the current mers.
00178         IdmerList cur_match;    // stores the current matching mers.
00179         list<uint32> sar_hitlist;       // list of sars to replace
00180         uint32 read_size;
00181         
00182         //make sure there is at least one sequence
00183         if(sar_table.size() < 1)
00184                 return true;
00185         
00186         //check for consistency in seed patterns.
00187         uint64 mer_mask = sar_table[0]->GetSeedMask();
00188         uint64 seed = sar_table[0]->Seed();
00189         mer_size = sar_table[0]->SeedWeight();
00190         for(uint32 maskI = 0; maskI < sar_table.size(); ++maskI){
00191                 if(seed != sar_table[maskI]->Seed()){
00192                         Throw_gnExMsg(InvalidData(), "Different seed patterns.");
00193                 }
00194         }
00195         
00196         //check that start_points and end_points are ok.
00197         if((start_points.size() != sar_table.size()) || (search_len.size() != sar_table.size())){
00198                 Throw_gnExMsg(InvalidData(), "Inconsistent search range specification.");
00199         }
00200         
00201         //allocate buffer space
00202         // stores arrays of bmers for each sml.
00203 
00204         vector< vector< bmer > > mer_vector;
00205         for( uint vecI = 0; vecI < sar_table.size(); ++vecI ){
00206                 vector< bmer > vec;
00207                 mer_vector.push_back( vec );
00208         }
00209 
00210         //initialize the data structures
00211         idmer newmer;
00212         for(uint32 n = 0; n < sar_table.size(); ++n){
00213                 read_size = MER_BUFFER_SIZE < search_len[n] ? MER_BUFFER_SIZE : search_len[n]; 
00214                 mer_vector[n].reserve(read_size);
00215                 sar_table[n]->Read(mer_vector[n], read_size, start_points[n]);
00216                 mer_index.push_back(0);
00217                 mer_baseindex.push_back(0);
00218                 if( mer_vector[n].size() > 0 ){
00219                         newmer.position = mer_vector[n][0].position;
00220                         newmer.mer = mer_vector[n][0].mer & mer_mask;
00221                         newmer.id = n;
00222                         cur_mers.push_back(newmer);  //cur_mers gets the first mer from each sorted mer list
00223                 }
00224         }
00225         
00226         if( print_sp ){
00227         cerr << "First mers are: " << mer_vector[0][0].mer << endl;
00228         cerr << "First mers are: " << mer_vector[1][0].mer << endl;
00229         cerr << "First mers are: " << mer_vector[2][0].mer << endl;
00230         print_sp = false;
00231         }       
00232         //nobody reads these fucking things.  why am i writing this.because my fucking 
00233         //roomate needs a goddamn roadmap......   ohhh ecstasy.... haptic pumpkins
00234 
00235         //loop while there is data to hash.
00236         cur_mers.sort(&idmer_lessthan);
00237         while(cur_mers.size() > 0){
00238                 IdmerList::iterator mer_iter = cur_mers.begin();
00239                 sarID_t cur_id = mer_iter->id;
00240                 //first check for matches across genomes.
00241                 if(cur_match.size() > 0){
00242                         if(mer_iter->mer > cur_match.begin()->mer){
00243                                 //we are done with this matching.  hash it.
00244                                 if(cur_match.size() > 1)
00245                                         EnumerateMatches(cur_match);
00246                                 cur_match.clear();
00247                         }else if(mer_iter->mer < cur_match.begin()->mer){
00248                                 //error checking stuff.
00249                                 ErrorMsg("Horrible error occurred!!\n");
00250                         }
00251                 }
00252 
00253                 if( cur_match.size() > MER_REPEAT_LIMIT ){
00254                         // scan past the repetitive mers
00255                         // create the lexicographically next mer
00256                         uint64 next_mer = cur_match.begin()->mer;
00257                         next_mer += ~mer_mask + 1;
00258 //                      cerr << "Searching to: " << next_mer << endl;
00259                         gnSeqI next_pos = 0;
00260                         uint seqI = 0;
00261                         for( ; seqI < sar_table.size(); ++seqI ){
00262                                 if( !sar_table[ seqI ]->FindMer( next_mer, next_pos ))
00263                                         ++next_pos;
00264                                 if( next_pos < sar_table[ seqI ]->SMLLength() )
00265                                         break;
00266                         }
00267                         vector< gnSeqI > old_starts = start_points;
00268                         if( seqI < sar_table.size() )
00269                                 GetBreakpoint( seqI, next_pos, start_points );
00270                         for( int spI = 0; spI < start_points.size(); ++spI ){
00271                                 // don't allow it to move backwards!
00272                                 start_points[ spI ] = start_points[ spI ] < mer_index[ spI ] + mer_baseindex[ spI ] + old_starts[ spI ] ? old_starts[ spI ] + mer_index[ spI ] + mer_baseindex[ spI ] : start_points[ spI ];
00273                                 if( spI < seqI )
00274                                         start_points[ spI ] = sar_table[ spI ]->SMLLength();
00275                         } 
00276                         return false;
00277                 }
00278                 //check for matches within the same genome
00279                 gnSeqI merI = mer_index[cur_id];
00280                 boolean buffer_exhausted = merI < mer_vector[cur_id].size() ? false : true;
00281                 while(!buffer_exhausted && (mer_iter->mer == (mer_vector[cur_id][merI].mer & mer_mask))){
00282                         newmer.position = mer_vector[cur_id][merI].position;
00283                         newmer.mer = mer_vector[cur_id][merI].mer & mer_mask;
00284                         newmer.id = cur_id;
00285                         cur_match.push_back(newmer);
00286                         ++merI;
00287                         ++mer_index[cur_id];
00288                         //check if we've exhausted our buffer
00289                         if(merI == mer_vector[cur_id].size())
00290                                 buffer_exhausted = true;
00291                 }
00292 
00293                 if(buffer_exhausted){
00294                         //if we've exhausted our buffer then refill it
00295                         mer_baseindex[cur_id] += mer_vector[cur_id].size();
00296                         
00297                         // update the mers processed
00298                         mers_processed += mer_vector[cur_id].size();
00299 #pragma omp critical
00300 {
00301                         float64 m_oldprogress = m_progress;
00302                         m_progress = ((float64)mers_processed / (float64)total_mers) * PROGRESS_GRANULARITY;
00303                         if( log_stream != NULL ){
00304                                 if((int)m_oldprogress != (int)m_progress){
00305                                         (*log_stream) << (int)((m_progress / PROGRESS_GRANULARITY) * 100) << "%..";
00306                                         log_stream->flush();
00307                                 }
00308                                 if(((int)m_oldprogress / 10) != ((int)m_progress / 10))
00309                                         (*log_stream) << std::endl;
00310                         }
00311 }
00312                         uint32 read_size = MER_BUFFER_SIZE;
00313                         if(MER_BUFFER_SIZE + mer_baseindex[cur_id] > search_len[cur_id])
00314                                 read_size = search_len[cur_id] - mer_baseindex[cur_id];
00315 
00316                         sar_table[cur_id]->Read(mer_vector[cur_id], read_size, start_points[cur_id] + mer_baseindex[cur_id]);
00317                         mer_index[cur_id] = 0;
00318                         if(mer_vector[cur_id].size() == 0){
00319                                 //remove mer_iter so that this sar is forgotten
00320                                 cur_mers.erase(mer_iter);
00321                         }
00322                 }else{
00323                         //if we haven't exhausted our buffer then we must have
00324                         //run out of matching mers.
00325                         //remove mer_iter and put in a new idmer with the same id
00326                         cur_mers.erase(mer_iter);
00327                         newmer.position = mer_vector[cur_id][merI].position;
00328                         newmer.mer = mer_vector[cur_id][merI].mer & mer_mask;
00329                         newmer.id = cur_id;
00330                         mer_iter = cur_mers.begin();
00331                         while(mer_iter != cur_mers.end() && mer_iter->mer < newmer.mer )
00332                                 ++mer_iter;
00333                         cur_mers.insert(mer_iter, newmer);
00334                 }
00335                 
00336         }
00337         return true;
00338 }
00339 
00340 boolean MatchFinder::EnumerateMatches( IdmerList& match_list ){
00341         //this must call HashMatch on every possible combination of matches in the list.
00342         if(match_list.size() == 2){
00343                 //this is the smallest possible match.  simply hash it.
00344                 return HashMatch(match_list);
00345         }
00346         
00347         match_list.sort(&idmer_id_lessthan);
00348         vector<uint32> id_start;
00349         vector<IdmerList::iterator> id_pos;
00350         vector<IdmerList::iterator> id_end;
00351         IdmerList::iterator iter = match_list.begin();
00352         IdmerList::iterator iter2 = match_list.begin();
00353         ++iter2;
00354         id_start.push_back(0);
00355         id_pos.push_back(iter);
00356         for(uint32 i=0; iter2 != match_list.end(); ++i){
00357                 if(iter->id != iter2->id){
00358                         id_start.push_back(i);
00359                         id_pos.push_back(iter2);
00360                 }
00361                 ++iter;
00362                 ++iter2;
00363         }
00364         //the following loop iterates through all possible combinations of idmers with
00365         //different id's and hashes them.
00366         id_end = id_pos;
00367         id_end.push_back(match_list.end());
00368         while(true){
00369                 IdmerList cur_match;
00370                 for(uint32 k = 0; k < id_pos.size(); ++k){
00371                         cur_match.push_back(*id_pos[k]);
00372                 }
00373                 HashMatch(cur_match);
00374                 cur_match.clear();
00375 
00376                 //increment the iterators (like an odometer)
00377                 uint32 m = id_pos.size() - 1;
00378                 while(true){
00379                         ++id_pos[m];
00380                         if(id_pos[m] == id_end[m+1]){
00381                                 if(m == 0)
00382                                         return true;
00383                                 id_pos[m] = id_end[m];
00384                                 m--;
00385                         }else
00386                                 break;
00387                 }
00388         }
00389 
00390         return true;
00391 }
00392 /*
00393 boolean MatchFinder::MatchAmbiguities(MatchHashEntry& mhe, uint32 match_size){
00394         if(ambiguity_tolerance == 0)
00395                 return false;
00396                         //check that all mers at the new position match
00397         //which sequences are used in this match?
00398         uint32* cur_seqs = new uint32[mhe.SeqCount()];
00399         uint32 used_seqs = 0;
00400         for(uint32 seqI = 0; seqI < mhe.SeqCount(); ++seqI){
00401                 if(mhe[seqI] != NO_MATCH){
00402                         cur_seqs[used_seqs] = seqI;
00403                         ++used_seqs;
00404                 }
00405         }
00406         string cur_mer, mer_i;
00407         gnSequence mer_seq;
00408         int64 mer_to_get = mhe[cur_seqs[0]];
00409         if(mer_to_get < 0){
00410                 mer_to_get *= -1;
00411                 mer_to_get += mhe.Length() - mer_size;
00412         }
00413         cur_mer = seq_table[cur_seqs[0]]->subseq(mer_to_get, match_size).ToString();
00414         
00415         for(uint32 i=1; i < used_seqs; ++i){
00416                 mer_to_get = mhe[cur_seqs[i]];
00417                 if(mer_to_get < 0){
00418                         //Convert the cur_seqs[i] entry since negative implies reverse complement
00419                         mer_to_get *= -1;
00420                         mer_to_get += mhe.Length() - mer_size;
00421                 }
00422                 mer_seq = seq_table[cur_seqs[i]]->subseq(mer_to_get, match_size);
00423                 if(mer_seq.compare(cur_mer) != 0){
00424                         delete[] cur_seqs;
00425                         return false;
00426                 }
00427                 mer_i = mer_seq.ToString();
00428                 uint32 ambiguity_count = 0;
00429                 for(uint32 baseI = 0; baseI < match_size; ++baseI)
00430                         if(cur_mer[baseI] != mer_i[baseI])
00431                                 ++ambiguity_count;
00432                 if(ambiguity_count > ambiguity_tolerance){
00433                         delete[] cur_seqs;
00434                         return false;
00435                 }
00436         }
00437         delete[] cur_seqs;
00438         return true;
00439 }
00440 */
00441 
00442 } // namespace mems

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6