00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012
00013 #include "libMems/MatchFinder.h"
00014
00015 using namespace std;
00016 using namespace genome;
00017 namespace mems {
00018
00019 MatchFinder::MatchFinder(){
00020 mer_size = DNA_MER_SIZE;
00021 seq_count = 0;
00022 ambiguity_tolerance = 0;
00023 m_progress = -1;
00024 log_stream = NULL;
00025 offset_stream = NULL;
00026 }
00027
00028
00029 MatchFinder::~MatchFinder(){
00030 }
00031
00032 MatchFinder::MatchFinder(const MatchFinder& mf){
00033 mer_size = mf.mer_size;
00034 seq_count = mf.seq_count;
00035 ambiguity_tolerance = mf.ambiguity_tolerance;
00036
00037 m_progress = mf.m_progress;
00038 sar_table = mf.sar_table;
00039 seq_table = mf.seq_table;
00040 log_stream = mf.log_stream;
00041 offset_stream = mf.offset_stream;
00042 }
00043
00044 void MatchFinder::Clear(){
00045 mer_size = DNA_MER_SIZE;
00046 seq_count = 0;
00047 ambiguity_tolerance = 0;
00048 m_progress = -1;
00049 sar_table.clear();
00050 seq_table.clear();
00051 log_stream = NULL;
00052 offset_stream = NULL;
00053 }
00054
00055 void MatchFinder::LogProgress( ostream* os ){
00056 log_stream = os;
00057 }
00058
00059 boolean MatchFinder::AddSequence( SortedMerList* sar, gnSequence* seq ){
00060 if(sar == NULL){
00061 Throw_gnExMsg( NullPointer(), "Null SortedMerList pointer" );
00062 }
00063 if(sar == NULL){
00064 Throw_gnExMsg( NullPointer(), "Null gnSequence pointer" );
00065 }
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076 sar_table.push_back(sar);
00077 ++seq_count;
00078 if(seq != NULL){
00079 seq_table.push_back(seq);
00080 }
00081
00082 SMLHeader header = sar->GetHeader();
00083 alphabet_bits = header.alphabet_bits;
00084
00085 return true;
00086
00087 }
00088
00089 void MatchFinder::GetBreakpoint( uint32 sarI, gnSeqI startI, vector<gnSeqI>& breakpoints ) const{
00090 breakpoints.clear();
00091
00092
00093 bmer break_mer = (*GetSar(sarI))[startI];
00094 uint64 mer_mask = GetSar(sarI)->GetSeedMask();
00095 bmer prev_mer = break_mer;
00096
00097 while((prev_mer.mer & mer_mask) == (break_mer.mer & mer_mask)){
00098 if(startI == 0){
00099 startI--;
00100 break;
00101 }
00102 startI--;
00103 prev_mer = (*GetSar(sarI))[startI];
00104 }
00105 ++startI;
00106
00107
00108 for(uint32 i=0; i < seq_count; ++i){
00109 if(i == sarI){
00110 breakpoints.push_back(startI);
00111 }else{
00112 gnSeqI cur_start;
00113 if(GetSar(i)->FindMer(break_mer.mer, cur_start)){
00114
00115 int64 cur_matchI = cur_start;
00116 bmer matchmer = (*GetSar(i))[cur_start];
00117 while(cur_matchI >= 0 && ((matchmer.mer & mer_mask) == (break_mer.mer && mer_mask))){
00118 cur_matchI--;
00119 matchmer = (*GetSar(i))[cur_start];
00120 }
00121 cur_start = cur_matchI+1;
00122 }
00123 breakpoints.push_back(cur_start);
00124 }
00125 }
00126 }
00127
00128 void MatchFinder::FindMatchSeeds(){
00129 vector<gnSeqI> start_points;
00130
00131 for(uint32 i=0; i < sar_table.size(); ++i){
00132 start_points.push_back(0);
00133 }
00134 FindMatchSeeds( start_points );
00135 }
00136
00137 void MatchFinder::FindMatchSeeds( const vector<gnSeqI>& start_offsets ){
00138 vector<gnSeqI> start_points = start_offsets;
00139 vector<gnSeqI> search_len;
00140
00141 mers_processed = 0;
00142 total_mers = 0;
00143 m_progress = -1;
00144 for(uint32 i=0; i < sar_table.size(); ++i){
00145 search_len.push_back(GNSEQI_END);
00146 total_mers += search_len[i] == GNSEQI_END ? sar_table[i]->Length() : search_len[i];
00147 mers_processed += start_points[ i ];
00148 }
00149 while( !SearchRange(start_points, search_len) ){
00150 mers_processed = 0;
00151 for( uint32 seqI = 0; seqI < sar_table.size(); ++seqI ){
00152 if( offset_stream != NULL ){
00153 if( seqI > 0 )
00154 *offset_stream << '\t';
00155 *offset_stream << start_points[ seqI ];
00156 }
00157 mers_processed += start_points[ seqI ];
00158 }
00159 if( offset_stream != NULL ){
00160 *offset_stream << endl;
00161 offset_stream->flush();
00162 }
00163 }
00164 }
00165
00166 #define MER_REPEAT_LIMIT 1000 // The maximum number of matching mers before they are completely
00167
00168
00169 boolean print_sp = false;
00170
00171
00172 boolean MatchFinder::SearchRange(vector<gnSeqI>& start_points, vector<gnSeqI>& search_len){
00173
00174 uint32 MER_BUFFER_SIZE = 10000;
00175 vector<uint32> mer_index;
00176 vector<uint32> mer_baseindex;
00177 IdmerList cur_mers;
00178 IdmerList cur_match;
00179 list<uint32> sar_hitlist;
00180 uint32 read_size;
00181
00182
00183 if(sar_table.size() < 1)
00184 return true;
00185
00186
00187 uint64 mer_mask = sar_table[0]->GetSeedMask();
00188 uint64 seed = sar_table[0]->Seed();
00189 mer_size = sar_table[0]->SeedWeight();
00190 for(uint32 maskI = 0; maskI < sar_table.size(); ++maskI){
00191 if(seed != sar_table[maskI]->Seed()){
00192 Throw_gnExMsg(InvalidData(), "Different seed patterns.");
00193 }
00194 }
00195
00196
00197 if((start_points.size() != sar_table.size()) || (search_len.size() != sar_table.size())){
00198 Throw_gnExMsg(InvalidData(), "Inconsistent search range specification.");
00199 }
00200
00201
00202
00203
00204 vector< vector< bmer > > mer_vector;
00205 for( uint vecI = 0; vecI < sar_table.size(); ++vecI ){
00206 vector< bmer > vec;
00207 mer_vector.push_back( vec );
00208 }
00209
00210
00211 idmer newmer;
00212 for(uint32 n = 0; n < sar_table.size(); ++n){
00213 read_size = MER_BUFFER_SIZE < search_len[n] ? MER_BUFFER_SIZE : search_len[n];
00214 mer_vector[n].reserve(read_size);
00215 sar_table[n]->Read(mer_vector[n], read_size, start_points[n]);
00216 mer_index.push_back(0);
00217 mer_baseindex.push_back(0);
00218 if( mer_vector[n].size() > 0 ){
00219 newmer.position = mer_vector[n][0].position;
00220 newmer.mer = mer_vector[n][0].mer & mer_mask;
00221 newmer.id = n;
00222 cur_mers.push_back(newmer);
00223 }
00224 }
00225
00226 if( print_sp ){
00227 cerr << "First mers are: " << mer_vector[0][0].mer << endl;
00228 cerr << "First mers are: " << mer_vector[1][0].mer << endl;
00229 cerr << "First mers are: " << mer_vector[2][0].mer << endl;
00230 print_sp = false;
00231 }
00232
00233
00234
00235
00236 cur_mers.sort(&idmer_lessthan);
00237 while(cur_mers.size() > 0){
00238 IdmerList::iterator mer_iter = cur_mers.begin();
00239 sarID_t cur_id = mer_iter->id;
00240
00241 if(cur_match.size() > 0){
00242 if(mer_iter->mer > cur_match.begin()->mer){
00243
00244 if(cur_match.size() > 1)
00245 EnumerateMatches(cur_match);
00246 cur_match.clear();
00247 }else if(mer_iter->mer < cur_match.begin()->mer){
00248
00249 ErrorMsg("Horrible error occurred!!\n");
00250 }
00251 }
00252
00253 if( cur_match.size() > MER_REPEAT_LIMIT ){
00254
00255
00256 uint64 next_mer = cur_match.begin()->mer;
00257 next_mer += ~mer_mask + 1;
00258
00259 gnSeqI next_pos = 0;
00260 uint seqI = 0;
00261 for( ; seqI < sar_table.size(); ++seqI ){
00262 if( !sar_table[ seqI ]->FindMer( next_mer, next_pos ))
00263 ++next_pos;
00264 if( next_pos < sar_table[ seqI ]->SMLLength() )
00265 break;
00266 }
00267 vector< gnSeqI > old_starts = start_points;
00268 if( seqI < sar_table.size() )
00269 GetBreakpoint( seqI, next_pos, start_points );
00270 for( int spI = 0; spI < start_points.size(); ++spI ){
00271
00272 start_points[ spI ] = start_points[ spI ] < mer_index[ spI ] + mer_baseindex[ spI ] + old_starts[ spI ] ? old_starts[ spI ] + mer_index[ spI ] + mer_baseindex[ spI ] : start_points[ spI ];
00273 if( spI < seqI )
00274 start_points[ spI ] = sar_table[ spI ]->SMLLength();
00275 }
00276 return false;
00277 }
00278
00279 gnSeqI merI = mer_index[cur_id];
00280 boolean buffer_exhausted = merI < mer_vector[cur_id].size() ? false : true;
00281 while(!buffer_exhausted && (mer_iter->mer == (mer_vector[cur_id][merI].mer & mer_mask))){
00282 newmer.position = mer_vector[cur_id][merI].position;
00283 newmer.mer = mer_vector[cur_id][merI].mer & mer_mask;
00284 newmer.id = cur_id;
00285 cur_match.push_back(newmer);
00286 ++merI;
00287 ++mer_index[cur_id];
00288
00289 if(merI == mer_vector[cur_id].size())
00290 buffer_exhausted = true;
00291 }
00292
00293 if(buffer_exhausted){
00294
00295 mer_baseindex[cur_id] += mer_vector[cur_id].size();
00296
00297
00298 mers_processed += mer_vector[cur_id].size();
00299 #pragma omp critical
00300 {
00301 float64 m_oldprogress = m_progress;
00302 m_progress = ((float64)mers_processed / (float64)total_mers) * PROGRESS_GRANULARITY;
00303 if( log_stream != NULL ){
00304 if((int)m_oldprogress != (int)m_progress){
00305 (*log_stream) << (int)((m_progress / PROGRESS_GRANULARITY) * 100) << "%..";
00306 log_stream->flush();
00307 }
00308 if(((int)m_oldprogress / 10) != ((int)m_progress / 10))
00309 (*log_stream) << std::endl;
00310 }
00311 }
00312 uint32 read_size = MER_BUFFER_SIZE;
00313 if(MER_BUFFER_SIZE + mer_baseindex[cur_id] > search_len[cur_id])
00314 read_size = search_len[cur_id] - mer_baseindex[cur_id];
00315
00316 sar_table[cur_id]->Read(mer_vector[cur_id], read_size, start_points[cur_id] + mer_baseindex[cur_id]);
00317 mer_index[cur_id] = 0;
00318 if(mer_vector[cur_id].size() == 0){
00319
00320 cur_mers.erase(mer_iter);
00321 }
00322 }else{
00323
00324
00325
00326 cur_mers.erase(mer_iter);
00327 newmer.position = mer_vector[cur_id][merI].position;
00328 newmer.mer = mer_vector[cur_id][merI].mer & mer_mask;
00329 newmer.id = cur_id;
00330 mer_iter = cur_mers.begin();
00331 while(mer_iter != cur_mers.end() && mer_iter->mer < newmer.mer )
00332 ++mer_iter;
00333 cur_mers.insert(mer_iter, newmer);
00334 }
00335
00336 }
00337 return true;
00338 }
00339
00340 boolean MatchFinder::EnumerateMatches( IdmerList& match_list ){
00341
00342 if(match_list.size() == 2){
00343
00344 return HashMatch(match_list);
00345 }
00346
00347 match_list.sort(&idmer_id_lessthan);
00348 vector<uint32> id_start;
00349 vector<IdmerList::iterator> id_pos;
00350 vector<IdmerList::iterator> id_end;
00351 IdmerList::iterator iter = match_list.begin();
00352 IdmerList::iterator iter2 = match_list.begin();
00353 ++iter2;
00354 id_start.push_back(0);
00355 id_pos.push_back(iter);
00356 for(uint32 i=0; iter2 != match_list.end(); ++i){
00357 if(iter->id != iter2->id){
00358 id_start.push_back(i);
00359 id_pos.push_back(iter2);
00360 }
00361 ++iter;
00362 ++iter2;
00363 }
00364
00365
00366 id_end = id_pos;
00367 id_end.push_back(match_list.end());
00368 while(true){
00369 IdmerList cur_match;
00370 for(uint32 k = 0; k < id_pos.size(); ++k){
00371 cur_match.push_back(*id_pos[k]);
00372 }
00373 HashMatch(cur_match);
00374 cur_match.clear();
00375
00376
00377 uint32 m = id_pos.size() - 1;
00378 while(true){
00379 ++id_pos[m];
00380 if(id_pos[m] == id_end[m+1]){
00381 if(m == 0)
00382 return true;
00383 id_pos[m] = id_end[m];
00384 m--;
00385 }else
00386 break;
00387 }
00388 }
00389
00390 return true;
00391 }
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442 }