00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012
00013 #include "libMems/SortedMerList.h"
00014
00015 using namespace std;
00016 using namespace genome;
00017 namespace mems {
00018
00019 const uint8* SortedMerList::BasicDNATable(){
00020 static const uint8* const bdt = SortedMerList::CreateBasicDNATable();
00021 return bdt;
00022 }
00023
00024 const uint8* SortedMerList::ProteinTable(){
00025 static const uint8* const bdt = SortedMerList::CreateProteinTable();
00026 return bdt;
00027 }
00028
00029 const uint8* SortedMerList::CreateBasicDNATable(){
00030 uint8* bdt = new uint8[UINT8_MAX];
00031 memset(bdt, 0, UINT8_MAX);
00032 bdt['c'] = 1;
00033 bdt['C'] = 1;
00034 bdt['b'] = 1;
00035 bdt['B'] = 1;
00036 bdt['y'] = 1;
00037 bdt['Y'] = 1;
00038 bdt['g'] = 2;
00039 bdt['G'] = 2;
00040 bdt['s'] = 2;
00041 bdt['S'] = 2;
00042 bdt['k'] = 2;
00043 bdt['K'] = 2;
00044 bdt['t'] = 3;
00045 bdt['T'] = 3;
00046 return bdt;
00047 }
00048
00049 const uint8* SortedMerList::CreateProteinTable(){
00050 uint8* pt = new uint8[UINT8_MAX];
00051 memset(pt, 0, UINT8_MAX);
00052 pt['A'] = 0;
00053 pt['R'] = 1;
00054 pt['N'] = 2;
00055 pt['D'] = 3;
00056 pt['C'] = 4;
00057 pt['Q'] = 5;
00058 pt['E'] = 6;
00059 pt['G'] = 7;
00060 pt['H'] = 8;
00061 pt['I'] = 9;
00062 pt['L'] = 10;
00063 pt['K'] = 11;
00064 pt['M'] = 12;
00065 pt['F'] = 13;
00066 pt['P'] = 14;
00067 pt['S'] = 15;
00068 pt['T'] = 16;
00069 pt['W'] = 17;
00070 pt['Y'] = 18;
00071 pt['V'] = 19;
00072
00073 pt['a'] = 0;
00074 pt['r'] = 1;
00075 pt['n'] = 2;
00076 pt['d'] = 3;
00077 pt['c'] = 4;
00078 pt['q'] = 5;
00079 pt['e'] = 6;
00080 pt['g'] = 7;
00081 pt['h'] = 8;
00082 pt['i'] = 9;
00083 pt['l'] = 10;
00084 pt['k'] = 11;
00085 pt['m'] = 12;
00086 pt['f'] = 13;
00087 pt['p'] = 14;
00088 pt['s'] = 15;
00089 pt['t'] = 16;
00090 pt['w'] = 17;
00091 pt['y'] = 18;
00092 pt['v'] = 19;
00093 return pt;
00094 }
00095
00096 SortedMerList::SortedMerList(){
00097
00098 header.length = 0;
00099 header.alphabet_bits = 2;
00100 header.unique_mers = NO_UNIQUE_COUNT;
00101 memcpy(header.translation_table, BasicDNATable(), UINT8_MAX);
00102 header.description[0] = 0;
00103 header.seed_length = DNA_MER_SIZE;
00104 header.id = 0;
00105 header.circular = false;
00106 mask_size = DNA_MER_SIZE;
00107 mer_mask = 0;
00108 seed_mask = 0;
00109
00110 sequence = NULL;
00111 binary_seq_len = 0;
00112 }
00113
00114 SortedMerList::SortedMerList( const SortedMerList& sa ){
00115 sequence = NULL;
00116 *this = sa;
00117 }
00118
00119 SortedMerList& SortedMerList::operator=(const SortedMerList& sa)
00120 {
00121 header = sa.header;
00122 mer_mask = sa.mer_mask;
00123 seed_mask = sa.seed_mask;
00124 mask_size = sa.mask_size;
00125 binary_seq_len = sa.binary_seq_len;
00126
00127
00128 if( sa.sequence != NULL ){
00129 if( sequence != NULL )
00130 delete[] sequence;
00131 sequence = new uint32[binary_seq_len];
00132 memcpy(sequence, sa.sequence, sizeof(uint32) * binary_seq_len);
00133 }else
00134 sequence = NULL;
00135
00136 return *this;
00137 }
00138
00139 SortedMerList::~SortedMerList(){
00140 if( sequence != NULL )
00141 delete[] sequence;
00142 }
00143
00144 void SortedMerList::Clear(){
00145
00146 header.length = 0;
00147 header.alphabet_bits = 2;
00148 header.unique_mers = NO_UNIQUE_COUNT;
00149 memcpy(header.translation_table, BasicDNATable(), UINT8_MAX);
00150 header.description[0] = 0;
00151 header.seed_length = DNA_MER_SIZE;
00152 header.id = 0;
00153 header.circular = false;
00154 mask_size = DNA_MER_SIZE;
00155 mer_mask = 0;
00156 seed_mask = 0;
00157
00158 if( sequence != NULL ){
00159 delete[] sequence;
00160 sequence = NULL;
00161 }
00162 binary_seq_len = 0;
00163 }
00164
00165 uint32 SortedMerList::CalculateMaxMerSize() const{
00166 bmer tmp;
00167 return (sizeof(tmp.mer) * 8) / header.alphabet_bits;
00168 }
00169
00170 boolean SortedMerList::FindMer(const uint64 query_mer, gnSeqI& result){
00171 bmer merle;
00172 merle.mer = query_mer;
00173 gnSeqI last_pos = Length();
00174 if( last_pos == 0 || (last_pos < header.seed_length && !header.circular) )
00175 return false;
00176 last_pos -= header.circular ? 1 : header.seed_length;
00177 result = bsearch(merle, 0, last_pos );
00178 return ((*this)[result].mer == merle.mer);
00179 }
00180
00181 boolean SortedMerList::Find(const string& query_seq, gnSeqI& result) {
00182 struct bmer merle;
00183 merle.mer = 0;
00184
00185
00186 gnSeqI len = query_seq.length() * header.alphabet_bits < 64 ?
00187 query_seq.length() : 64 / header.alphabet_bits;
00188
00189 translate((uint8*)&merle.mer, query_seq.c_str(), len);
00190 return FindMer( merle.mer, result );
00191 }
00192
00193 void SortedMerList::FindAll(const string& query_seq, vector<gnSeqI> result) {
00194 struct bmer merle;
00195 merle.mer = 0;
00196
00197
00198 gnSeqI len = query_seq.length() * header.alphabet_bits < 64 ?
00199 query_seq.length() : 64 / header.alphabet_bits;
00200
00201 translate((uint8*)&merle.mer, query_seq.c_str(), len);
00202
00203
00204 gnSeqI matchI = 0;
00205 gnSeqI last_pos = Length();
00206 last_pos -= header.circular ? 1 : header.seed_length;
00207 bmer matchmer;
00208 matchI = bsearch(merle, 0, last_pos);
00209
00210
00211 int64 cur_matchI = matchI;
00212 matchmer = (*this)[matchI];
00213 while(cur_matchI >= 0 && matchmer.mer == merle.mer){
00214 cur_matchI--;
00215 matchmer = (*this)[cur_matchI];
00216 }
00217 int64 first_matchI = cur_matchI+1;
00218
00219
00220 cur_matchI = matchI+1;
00221 matchmer = (*this)[cur_matchI];
00222 while(cur_matchI < GNSEQI_END && matchmer.mer == merle.mer){
00223 cur_matchI++;
00224 matchmer = (*this)[cur_matchI];
00225 }
00226
00227 for(matchI = first_matchI; matchI < cur_matchI; matchI++)
00228 result.push_back(matchI);
00229 }
00230
00231 string SortedMerList::Description() const{
00232 return header.description;
00233 }
00234
00235 void SortedMerList::SetDescription(const string& d){
00236 strncpy(header.description, d.c_str(), DESCRIPTION_SIZE-1);
00237 }
00238
00239 uint SortedMerList::SeedLength() const{
00240 return header.seed_length;
00241 }
00245 uint SortedMerList::SeedWeight() const{
00246 return header.seed_weight;
00247 }
00251 uint64 SortedMerList::Seed() const{
00252 return header.seed;
00253 }
00254
00255 boolean SortedMerList::IsCircular() const{
00256 return header.circular;
00257 }
00258
00259 uint64 SortedMerList::GetMerMask() const{
00260 return mer_mask;
00261 }
00262
00263 uint64 SortedMerList::GetSeedMask() const{
00264 return seed_mask;
00265 }
00266
00267 uint32 SortedMerList::GetMerMaskSize() const{
00268 return mask_size;
00269 }
00270
00271 void SortedMerList::SetMerMaskSize(uint32 mer_size){
00272 if(mer_size > header.seed_length)
00273 mask_size = header.seed_length;
00274 else
00275 mask_size = mer_size;
00276
00277
00278 mer_mask = UINT32_MAX;
00279 mer_mask <<= 32;
00280 mer_mask |= UINT32_MAX;
00281 mer_mask <<= (64 - header.alphabet_bits * mer_size);
00282 }
00283
00284 gnSeqI SortedMerList::Length() const{
00285 return header.length;
00286 }
00287
00288 gnSeqI SortedMerList::SMLLength() const{
00289
00290 if( header.length < header.seed_length )
00291 return 0;
00292 if( !header.circular )
00293 return header.length - header.seed_length + 1;
00294 return header.length;
00295 }
00296
00297 sarID_t SortedMerList::GetID() const{
00298 return header.id;
00299 }
00300 void SortedMerList::SetID(const sarID_t d){
00301 header.id = d;
00302 }
00303
00304 #define OPT_HEADER_ALPHABET_BITS DNA_ALPHA_BITS
00305
00306 void SortedMerList::SetSequence(gnSeqC* seq_buf, gnSeqI seq_len){
00307 binary_seq_len = (seq_len * header.alphabet_bits) / 32;
00308 if((seq_len * header.alphabet_bits) % 32 != 0)
00309 binary_seq_len++;
00310
00311 binary_seq_len+=2;
00312
00313 if( sequence != NULL )
00314 delete[] sequence;
00315 sequence = new uint32[binary_seq_len];
00316 translate32(sequence, seq_buf, seq_len);
00317 }
00318
00319
00320
00321 uint64 SortedMerList::GetMer(gnSeqI position) const
00322 {
00323
00324 uint64 mer_a;
00325 gnSeqI mer_word, mer_bit;
00326 uint32 merle;
00327
00328 mer_a = 0;
00329 mer_word = (position * (gnSeqI)OPT_HEADER_ALPHABET_BITS) / (gnSeqI)32;
00330 mer_bit = (position * (gnSeqI)OPT_HEADER_ALPHABET_BITS) % (gnSeqI)32;
00331 mer_a |= sequence[mer_word++];
00332 mer_a <<= 32;
00333 mer_a |= sequence[mer_word++];
00334 if(mer_bit > 0){
00335 merle = sequence[mer_word];
00336 merle >>= 32 - mer_bit;
00337 mer_a <<= mer_bit;
00338 mer_a |= merle;
00339 }
00340 mer_a &= mer_mask;
00341 return mer_a;
00342 }
00343
00344
00345 void SortedMerList::GetBSequence(uint32* dest, const gnSeqI len, const gnSeqI offset){
00346
00347 if(offset >= header.length){
00348 Throw_gnEx( IndexOutOfBounds() );
00349 }
00350 uint64 startpos = (offset * OPT_HEADER_ALPHABET_BITS) / 32;
00351 int begin_remainder = (offset * OPT_HEADER_ALPHABET_BITS) % 32;
00352 uint64 readlen = offset + len < header.length ? len : header.length - offset;
00353
00354 gnSeqI word_read_len = (readlen * OPT_HEADER_ALPHABET_BITS) / 32;
00355 int end_remainder = (readlen * OPT_HEADER_ALPHABET_BITS) % 32;
00356 if(begin_remainder + (readlen * OPT_HEADER_ALPHABET_BITS) > 32
00357 && end_remainder > 0)
00358 word_read_len++;
00359 if(begin_remainder > 0)
00360 word_read_len++;
00361
00362
00363 memcpy((char*)dest, (char*)sequence + (startpos * 4), word_read_len * 4);
00364
00365
00366 ShiftWords(dest, word_read_len, -begin_remainder);
00367
00368
00369 if(end_remainder > begin_remainder){
00370 uint32 mask = 0xFFFFFFFF;
00371 mask <<= 32 - (end_remainder - begin_remainder);
00372 dest[word_read_len-1] &= mask;
00373 }else if(end_remainder < begin_remainder){
00374 uint32 mask = 0xFFFFFFFF;
00375 mask <<= (begin_remainder - end_remainder);
00376 dest[word_read_len-2] &= mask;
00377 }
00378 }
00379
00380 gnSeqI SortedMerList::bsearch(const struct bmer& query_mer, const gnSeqI start, const gnSeqI end) {
00381
00382 gnSeqI middle = (start + end) / 2;
00383 struct bmer midmer = (*this)[middle];
00384 if(midmer.mer == query_mer.mer)
00385 return middle;
00386 else if((midmer.mer < query_mer.mer) && (middle < end))
00387 return bsearch(query_mer, middle + 1, end);
00388 else if((midmer.mer > query_mer.mer) && (start < middle))
00389 return bsearch(query_mer, start, middle - 1);
00390
00391
00392
00393 return middle;
00394 }
00395
00396
00397
00398 void SortedMerList::translate(uint8* dest, const gnSeqC* src, const gnSeqI len) const{
00399 uint8 start_bit = 0;
00400 gnSeqI cur_byte = 0;
00401 const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
00402 dest[cur_byte] = 0;
00403 for(uint32 i=0; i < len; i++){
00404 uint8 tmp = header.translation_table[src[i]];
00405 if(start_bit + alpha_bits <= 8){
00406 tmp <<= 8 - start_bit - alpha_bits;
00407 dest[cur_byte] |= tmp;
00408 }else{
00409 uint8 over_bits = (start_bit + alpha_bits) % 8;
00410 uint8 tmp2 = tmp;
00411 tmp2 <<= 8 - over_bits;
00412 tmp >>= over_bits;
00413 dest[cur_byte] |= tmp;
00414 dest[cur_byte+1] |= tmp2;
00415 }
00416 start_bit += alpha_bits;
00417 if(start_bit >= 8){
00418 start_bit %= 8;
00419 cur_byte++;
00420 dest[cur_byte] = 0;
00421 }
00422 }
00423 }
00424
00425 void SortedMerList::translate32(uint32* dest, const gnSeqC* src, const gnSeqI len) const{
00426 if( len == 0 )
00427 return;
00428 uint8 start_bit = 0;
00429 gnSeqI cur_word = 0;
00430 const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
00431 dest[cur_word] = 0;
00432 for(uint32 i=0; i < len; i++){
00433 uint32 tmp = header.translation_table[src[i]];
00434 if(start_bit + alpha_bits <= 32){
00435 tmp <<= 32 - start_bit - alpha_bits;
00436 dest[cur_word] |= tmp;
00437 start_bit += alpha_bits;
00438 if(start_bit >= 32 && i < len - 1){
00439 start_bit %= 32;
00440 cur_word++;
00441 dest[cur_word] = 0;
00442 }
00443 }else{
00444 uint8 over_bits = (start_bit + alpha_bits) % 32;
00445 uint32 tmp2 = tmp;
00446 tmp2 <<= 32 - over_bits;
00447 tmp >>= over_bits;
00448 dest[cur_word] |= tmp;
00449 cur_word++;
00450 dest[cur_word] = 0;
00451 dest[cur_word] |= tmp2;
00452 start_bit = over_bits;
00453 }
00454 }
00455 }
00456 SMLHeader SortedMerList::GetHeader() const{
00457 return header;
00458 }
00459
00460 gnSeqI SortedMerList::UniqueMerCount(){
00461 if(header.unique_mers != NO_UNIQUE_COUNT)
00462 return header.unique_mers;
00463
00464 uint32 MER_BUFFER_SIZE = 16384;
00465 gnSeqI cur_pos = 0;
00466 vector<bmer> mer_vector;
00467 bmer prev_mer;
00468 gnSeqI m_unique = 0;
00469 gnSeqI report_interval = MER_BUFFER_SIZE * 212;
00470 while(cur_pos < header.length){
00471 if(!Read(mer_vector, MER_BUFFER_SIZE, cur_pos)){
00472 break;
00473
00474
00475 }
00476 uint32 mer_count = mer_vector.size();
00477 if(mer_count == 0)
00478 break;
00479 if(cur_pos > 0 && prev_mer.mer != mer_vector[0].mer)
00480 m_unique++;
00481
00482
00483 uint32 i = 0;
00484 for(uint32 j = 1; j < mer_count; j++){
00485 if((mer_vector[i].mer & mer_mask) != (mer_vector[j].mer & mer_mask) )
00486 m_unique++;
00487 i++;
00488 }
00489 prev_mer = mer_vector[i];
00490 cur_pos += mer_count;
00491 if( cur_pos % report_interval == 0 ){
00492
00493 cout << m_unique << "/" << cur_pos << endl;
00494 }
00495 }
00496 cout << endl;
00497 m_unique++;
00498 header.unique_mers = m_unique;
00499 return header.unique_mers;
00500 }
00501
00502
00503 void SortedMerList::ShiftWords(unsigned int* data, uint32 length, int32 bits)
00504 {
00505 int32 word_bits = 8 * sizeof(unsigned int);
00506 if(bits > 0 && bits < word_bits){
00507
00508 data[length - 1] >>= bits;
00509 for(int i=length-2; i >= 0; i--){
00510 uint32 tmp = data[i];
00511 tmp <<= word_bits - bits;
00512 data[i+1] |= tmp;
00513 data[i] >>= bits;
00514 }
00515 }else if(bits < 0 && bits > (-1)*word_bits){
00516 bits *= -1;
00517
00518 data[0] <<= bits;
00519 for(uint32 i=0; i < length; i++){
00520 uint32 tmp = data[i+1];
00521 tmp >>= word_bits - bits;
00522 data[i] |= tmp;
00523 data[i+1] <<= bits;
00524 }
00525 }
00526 }
00527
00528 void SortedMerList::FillSML(gnSeqC* seq_buf, gnSeqI seq_len, boolean circular, vector<bmer>& sml_array){
00529 const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
00530 const uint32 mer_size = header.seed_length;
00531 gnSeqI sar_len = seq_len;
00532 if(!circular)
00533 sar_len -= header.seed_length - 1;
00534 sml_array.reserve(sar_len);
00535
00536 bmer cur_suffix;
00537 cur_suffix.mer = 0;
00538 cur_suffix.position = 0;
00539
00540
00541 for(gnSeqI i=0; i < mer_size; i++){
00542 cur_suffix.mer <<= alpha_bits;
00543 cur_suffix.mer |= header.translation_table[seq_buf[i]];
00544 }
00545 uint8 dead_bits = 64 - (mer_size * alpha_bits);
00546 cur_suffix.mer <<= dead_bits;
00547
00548 sml_array.push_back(cur_suffix);
00549
00550
00551 for(gnSeqI seqI = 1; seqI < sar_len; seqI++){
00552
00553 cur_suffix.position++;
00554 cur_suffix.mer <<= alpha_bits;
00555 uint64 new_mer = header.translation_table[seq_buf[seqI+(mer_size-1)]];
00556 new_mer <<= dead_bits;
00557 cur_suffix.mer |= new_mer;
00558 sml_array.push_back(cur_suffix);
00559 }
00560 }
00561
00562 void SortedMerList::FillSML(const gnSequence& seq, vector<bmer>& sml_array){
00563 gnSeqI seq_len = seq.length();
00564 Array<gnSeqC> seq_buf( seq_len );
00565 seq.ToArray(seq_buf.data, seq_len);
00566 FillSML(seq_buf.data, seq_len, seq.isCircular(), sml_array);
00567 }
00568
00569 void SortedMerList::FillSML(gnSeqI seq_len, vector<gnSeqI>& pos_array){
00570 pos_array.clear();
00571 pos_array.reserve( seq_len );
00572 for(gnSeqI seqI = 0; seqI < seq_len; seqI++ )
00573 pos_array.push_back(seqI);
00574 }
00575
00576 uint64 SortedMerList::GetDnaMer(gnSeqI offset) const
00577 {
00578
00579 uint64 mer_a = SortedMerList::GetMer( offset );
00580
00581
00582 uint64 mer_c = RevCompMer( mer_a, header.seed_length );
00583
00584
00585
00586
00587 return mer_a < mer_c ? mer_a : mer_c;
00588 }
00589
00590 #define OPT_ALPHA_MASQ 0x00000003
00591
00592 uint64 SortedMerList::RevCompMer( uint64 mer_a, int mer_length ) const
00593 {
00594
00595
00596 uint64 mer_b, mer_c = 0;
00597 mer_b = ~mer_a;
00598
00599
00600 for(uint32 i = 0; i < 64; i += OPT_HEADER_ALPHABET_BITS){
00601 mer_c |= mer_b & OPT_ALPHA_MASQ;
00602
00603 mer_b >>= OPT_HEADER_ALPHABET_BITS;
00604 mer_c <<= OPT_HEADER_ALPHABET_BITS;
00605 }
00606 mer_c <<= 64 - (OPT_HEADER_ALPHABET_BITS * (mer_length+1));
00607 mer_c |= 1;
00608 return mer_c;
00609 }
00610
00611
00612 void SortedMerList::FillDnaSML(const gnSequence& seq, vector<bmer>& sml_array){
00613
00614 const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
00615 const uint32 mer_size = header.seed_length;
00616 gnSeqI sar_len = seq.length();
00617 if( sar_len < header.seed_length )
00618 return;
00619 if( !seq.isCircular() )
00620 sar_len -= ( header.seed_length - 1);
00621 sml_array.reserve(sar_len);
00622
00623 uint32 dead_bits = 64 - (mer_size * alpha_bits);
00624 uint64 create_mask = UINT32_MAX;
00625 create_mask <<= 32;
00626 create_mask |= UINT32_MAX;
00627 create_mask <<= dead_bits;
00628
00629 bmer cur_suffix, rcur_suffix;
00630 cur_suffix.mer = sequence[0];
00631 cur_suffix.mer <<= 32;
00632 cur_suffix.mer |= sequence[1];
00633 cur_suffix.mer &= create_mask;
00634 cur_suffix.position = 0;
00635 rcur_suffix.mer = 0;
00636 rcur_suffix.position = 0;
00637
00638
00639
00640 uint64 mer_b = 0;
00641 mer_b = ~cur_suffix.mer;
00642
00643
00644 for(uint32 i = 0; i < 64; i += alpha_bits){
00645
00646 rcur_suffix.mer |= mer_b & OPT_ALPHA_MASQ;
00647 mer_b >>= alpha_bits;
00648 rcur_suffix.mer <<= alpha_bits;
00649 }
00650 rcur_suffix.mer <<= dead_bits - alpha_bits;
00651 rcur_suffix.mer |= 1;
00652
00653
00654 if(cur_suffix.mer < rcur_suffix.mer)
00655 sml_array.push_back(cur_suffix);
00656 else
00657 sml_array.push_back(rcur_suffix);
00658
00659
00660 gnSeqI endI = sar_len + mer_size;
00661 if(seq.isCircular())
00662 endI += mer_size;
00663
00664 uint32 rdead_bits = 64 - alpha_bits - dead_bits;
00665 uint64 tmp_rseq = 0;
00666 uint32 seqI = (mer_size * alpha_bits) / 32;
00667 int32 cur_bit = 32 - alpha_bits - ((mer_size * alpha_bits) % 32);
00668 uint32 cur_seq = sequence[seqI];
00669 uint64 tmp_seq;
00670
00671
00672 uint64 revalpha_mask = OPT_ALPHA_MASQ;
00673 revalpha_mask <<= dead_bits;
00674
00675
00676
00677 for(gnSeqI cur_pos = mer_size + 1; cur_pos < endI; cur_pos++){
00678
00679
00680 cur_suffix.position++;
00681 rcur_suffix.position++;
00682
00683 tmp_seq = cur_seq;
00684 tmp_seq >>= cur_bit;
00685 tmp_seq &= OPT_ALPHA_MASQ;
00686 tmp_seq <<= dead_bits;
00687
00688
00689 cur_suffix.mer <<= alpha_bits;
00690 cur_suffix.mer |= tmp_seq;
00691
00692
00693 tmp_seq = ~tmp_seq;
00694 tmp_seq &= revalpha_mask;
00695 tmp_rseq = tmp_seq;
00696 tmp_rseq <<= rdead_bits;
00697 rcur_suffix.mer >>= alpha_bits;
00698 rcur_suffix.mer |= tmp_rseq;
00699 rcur_suffix.mer &= create_mask;
00700 rcur_suffix.mer |= 1;
00701 if(cur_suffix.mer < rcur_suffix.mer)
00702 sml_array.push_back(cur_suffix);
00703 else
00704 sml_array.push_back(rcur_suffix);
00705
00706 cur_bit -= alpha_bits;
00707 if(cur_bit < 0){
00708 cur_bit += alpha_bits;
00709 cur_seq <<= 16;
00710 cur_seq <<= 16 - (cur_bit);
00711 seqI++;
00712 tmp_seq = sequence[seqI];
00713 tmp_seq >>= cur_bit;
00714 cur_seq |= tmp_seq;
00715 cur_bit += 32 - alpha_bits;
00716 }
00717 }
00718 }
00719
00720
00721 uint64 SortedMerList::GetSeedMer( gnSeqI offset ) const
00722 {
00723
00724 uint64 mer_a = SortedMerList::GetMer( offset );
00725 uint64 mer_b = SortedMerList::GetMer( offset + 1 );
00726 uint64 seed_mer = 0;
00727 uint64 alpha_mask = 1;
00728 alpha_mask <<= OPT_HEADER_ALPHABET_BITS;
00729 alpha_mask--;
00730 alpha_mask <<= 62;
00731 uint64 cur_alpha_mask = alpha_mask;
00732 uint64 char_mask = 1;
00733 char_mask <<= header.seed_length - 1;
00734 uint64 cur_mer = mer_a;
00735 const int mer_transition = 64 / OPT_HEADER_ALPHABET_BITS;
00736 int patternI = 0;
00737 int rshift_amt = 64 - OPT_HEADER_ALPHABET_BITS;
00738 for( ; patternI < header.seed_length; patternI++ ){
00739 if( patternI == mer_transition ){
00740 cur_mer = mer_b;
00741 cur_alpha_mask = alpha_mask;
00742 rshift_amt = 64 - OPT_HEADER_ALPHABET_BITS;
00743 }
00744 if( (header.seed & char_mask) != 0 ){
00745 uint64 char_tmp = cur_mer & cur_alpha_mask;
00746 char_tmp >>= rshift_amt;
00747 seed_mer <<= OPT_HEADER_ALPHABET_BITS;
00748 seed_mer |= char_tmp;
00749 }
00750 cur_alpha_mask >>= OPT_HEADER_ALPHABET_BITS;
00751 char_mask >>= 1;
00752 rshift_amt -= OPT_HEADER_ALPHABET_BITS;
00753 }
00754
00755 seed_mer <<= 64 - (OPT_HEADER_ALPHABET_BITS * header.seed_weight);
00756 return seed_mer;
00757 }
00758
00759 uint64 SortedMerList::GetDnaSeedMer( gnSeqI offset ) const
00760 {
00761 uint64 seed_mer = SortedMerList::GetSeedMer( offset );
00762 uint64 rev_mer = RevCompMer( seed_mer, header.seed_weight );
00763 return seed_mer < rev_mer ? seed_mer : rev_mer;
00764 }
00765
00766 void SortedMerList::FillDnaSeedSML(const gnSequence& seq, vector<bmer>& sml_array){
00767
00768 gnSeqI sar_len = SMLLength();
00769 if( sar_len < header.seed_length )
00770 return;
00771 sml_array.resize(sar_len);
00772
00773
00774 for( gnSeqI seedI = 0; seedI < sar_len; seedI++ ){
00775 sml_array[seedI].mer = GetDnaSeedMer( seedI );
00776 sml_array[seedI].position = seedI;
00777 }
00778 }
00779
00780
00781 void SortedMerList::Create(const gnSequence& seq, const uint64 seed){
00782
00783 if(CalculateMaxMerSize() == 0)
00784 Throw_gnExMsg( SMLCreateError(), "Alphabet size is too large" );
00785
00786 int seed_length = getSeedLength( seed );
00787 int seed_weight = getSeedWeight( seed );
00788
00789 if(seed_length > CalculateMaxMerSize())
00790 Throw_gnExMsg( SMLCreateError(), "Mer size is too large" );
00791
00792 if(seed_length == 0)
00793 Throw_gnExMsg( SMLCreateError(), "Can't have 0 seed length" );
00794
00795
00796 gnSeqI seq_len = seq.length();
00797 if(!seq.isCircular()){
00798 header.circular = false;
00799 }else
00800 header.circular = true;
00801
00802 gnSeqI buf_len = seq.isCircular() ? seq_len + seed_length : seq_len;
00803 Array<gnSeqC> seq_buf( buf_len );
00804 seq.ToArray(seq_buf.data, seq_len);
00805 if( seq.isCircular() )
00806 seq.ToArray(seq_buf.data + seq_len, seed_length-1);
00807
00808
00809 header.length = seq_len;
00810 header.seed_length = seed_length;
00811 header.seed_weight = seed_weight;
00812 header.seed = seed;
00813
00814 SetMerMaskSize( seed_weight );
00815 seed_mask = mer_mask;
00816 SetMerMaskSize( seed_length );
00817
00818 SetSequence( seq_buf.data, buf_len );
00819 }
00820
00821 }