libMems/SeedMasks.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: SortedMerList.h,v 1.13 2004/02/27 23:08:55 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * This file is licensed under the GPL.
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifndef _SeedMasks_h_
00010 #define _SeedMasks_h_
00011 
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015 
00016 #ifdef __cplusplus
00017 #include <cmath>
00018 #else
00019 #include <math.h>
00020 #endif
00021 #include "libGenome/gnDefs.h"
00022 
00023 /* Seed patterns taken from: AE Darling, T Treangen, L Zhang, C Kuiken, X Messeguer, NT Perna
00024  * "Procrastination leads to efficient match filtration for local multiple alignment" 
00025  * Lecture Notes in Bioinformatics 4175:126-137 Springer-Verlag 2006
00026  */
00027 
00032 #ifdef __cplusplus
00033 static
00034 #endif
00035 uint32** seedMasks();
00036 
00041 #ifdef __cplusplus
00042 inline static
00043 #endif
00044 uint32** seedMasks(){
00045         static uint32 seed_masks_3[] = 
00046         {
00047                 0,0xb, //0b1011
00048                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00049         static uint32 seed_masks_4[] = 
00050         {
00051                 0,0x3b, //0b101011,
00052                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00053         static uint32 seed_masks_5[] = 
00054         {
00055                 0,0x6b,         //0b1101011,
00056                 0,0x139,        //0b100111001,
00057                 0,0x193,        //0b110010011,
00058                 0,0x6b,         //0b1101011,
00059                 0,0,            0,0,    };
00060         static uint32 seed_masks_6[] = 
00061         {
00062                 0,0x58D, //0b10110001101,
00063                 0,0x653, //0b11001010011,
00064                 0,0x1AB, //0b110101011,
00065                 0,0xdb, //0b11011011,
00066                 0,0,            0,0,    };
00067         static uint32 seed_masks_7[] = 
00068         {
00069                 0,0x1953,       //0b1100101010011
00070                 0,0x588d,       //0b101100010001101
00071                 0,0x688b,       //0b110100010001011
00072                 0,0x17d,        //0b101111101,
00073                 0,0x164d,       //0b1011001001101,
00074                 0,0,            0,0,    };
00075         static uint32 seed_masks_8[] = 
00076         {
00077                 0,0x3927, //0b11100100100111,
00078                 0,0x1CA7, //0b1110010100111,
00079                 0,0x6553, //0b110010101010011,
00080                 0,0xb6d,        //0b101101101101,
00081                 0,0,            0,0,    };
00082         static uint32 seed_masks_9[] = 
00083         {
00084                 0,0x7497,       //0b111010010010111,
00085                 0,0x1c927,      //0b11100100100100111,
00086                 0,0x72a7,       //0b111001010100111,
00087                 0,0x6fb,        //0b11011111011,
00088                 0,0x16ed,       //0b1011011101101,
00089                 0,0,    };
00090         static uint32 seed_masks_10[] = 
00091         {
00092                 0,0x1d297,      //              0,0b11101001010010111,
00093                 0,0x3A497,  //          0,0b111010010010010111,
00094                 0,0xE997,  //           0,0b1110100110010111,
00095                 0,0x6D5B,  //           0,0b110110101011011,
00096                 0,0,            0,0,    };
00097         static uint32 seed_masks_11[] = 
00098         {
00099                 0,0x7954f,      //0b11110010101001111,
00100                 0,0x75257,      //0b1110101001001010111,
00101                 0,0x1c9527,     //0b111001001010100100111,
00102                 0,0x5bed,       //0b101101111101101,  // third b.p. coding pattern
00103                 0,0x5b26d,      //0b1011011001001101101,
00104                 0,0,    };
00105         static uint32 seed_masks_12[] = 
00106         {
00107                 0,0x7954f,      //              0,0b1111001010101001111,
00108                 0,0x3D32F,  //          0,0b111101001100101111,
00109                 0,0x768B7,  //          0,0b1110110100010110111,
00110                 0,0x5B56D,  //          0,0b1011011010101101101,
00111                 0,0,            0,0,    };
00112         static uint32 seed_masks_13[] = 
00113         {
00114                 0,0x792a4f,     //0b11110010010101001001111,
00115                 0,0x1d64d7,     //0b111010110010011010111,
00116                 0,0x1d3597,     //0b111010011010110010111,
00117                 0,0x1b7db,      //0b11011011111011011,  // third b.p. coding pattern
00118                 0,0x75ad7,      //0b1110101101011010111,
00119                 0,0,    };
00120         static uint32 seed_masks_14[] = 
00121         {
00122                 0,0x1e6acf,  //         0,0b111100110101011001111,
00123                 0,0xF59AF,   //         0,0b11110101100110101111,
00124                 0,0x3D4CAF,  //         0,0b1111010100110010101111,
00125                 0,0x35AD6B,  //         0,0b1101011010110101101011,
00126                 0,0,            0,0,    };
00127         static uint32 seed_masks_15[] = 
00128         {
00129                 0,0x7ac9af,     //0b11110101100100110101111,
00130                 0,0x7b2a6f,     //0b11110110010101001101111,
00131                 0,0x79aacf,     //0b11110011010101011001111,
00132                 0,0x16df6d,     //0b101101101111101101101,      // third b.p. coding pattern
00133                 0,0x6b5d6b,     //0b11010110101110101101011,
00134                 0,0,    };
00135         static uint32 seed_masks_16[] = 
00136         {
00137                 0,0xf599af,  //         0,0b111101011001100110101111,
00138                 0,0xEE5A77,  //         0,0b111011100101101001110111,
00139                 0,0x7CD59F,  //         0,0b11111001101010110011111,
00140                 0,0xEB5AD7,  //         0,0b111010110101101011010111,
00141                 0,0,            0,0,    };
00142         static uint32 seed_masks_17[] =
00143         {
00144                 0,0x6dbedb,     //0b11011011011111011011011,    // third b.p. coding pattern
00145                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00146         static uint32 seed_masks_18[] =
00147         {
00148                 0,0x3E6B59F,//          0,0b11111001101011010110011111,
00149                 0,0x3EB335F,//          0,0b11111010110011001101011111,
00150                 0,0x7B3566F,//          0,0b111101100110101011001101111,
00151                 0,0,            0,0,            0,0,    };
00152 
00153         static uint32 seed_masks_19[] =
00154         {
00155                 0,0x7b974ef,    //0b111101110010111010011101111
00156                 0,0x7d6735f,    //0b111110101100111001101011111
00157                 0,0x1edd74f,    //0b1111011011101011101101111
00158                 0,0,            0,0,            0,0,    };
00159         static uint32 seed_masks_20[] =
00160         {
00161                 0,0x1F59B35F,   //0b11111010110011011001101011111,
00162                 0,0x3EDCEDF,    //0b11111011011100111011011111,
00163                 0,0xFAE675F,    //0b1111101011100110011101011111,
00164                 0,0,            0,0,            0,0,    };
00165         static uint32 seed_masks_21[] =
00166         {
00167                 0,0x7ddaddf,    //0b111110111011010110111011111,
00168                 0,0xaeb3f,              //0b11111100110101110101100111111,
00169                 0,0x7eb76bf,    //0b111111010110111011010111111,
00170                 0,0,            0,0,            0,0,    };
00171         // default to solid seeds for weight 22+
00172         static uint32 seed_masks_22[] =
00173         {
00174                 0,0x003fffff,
00175                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00176         static uint32 seed_masks_23[] =
00177         {
00178                 0,0x007fffff,
00179                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00180         static uint32 seed_masks_24[] =
00181         {
00182                 0,0x00ffffff,
00183                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00184         static uint32 seed_masks_25[] =
00185         {
00186                 0,0x01ffffff,
00187                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00188         static uint32 seed_masks_26[] =
00189         {
00190                 0,0x03ffffff,
00191                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00192         static uint32 seed_masks_27[] =
00193         {
00194                 0,0x07ffffff,
00195                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00196         static uint32 seed_masks_28[] =
00197         {
00198                 0,0x0fffffff,
00199                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00200         static uint32 seed_masks_29[] =
00201         {
00202                 0,0x1fffffff,
00203                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00204         static uint32 seed_masks_30[] =
00205         {
00206                 0,0x3fffffff,
00207                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00208         static uint32 seed_masks_31[] =
00209         {
00210                 0,0x7fffffff,
00211                 0,0,            0,0,            0,0,            0,0,            0,0,    };
00212 
00213         static uint32 no_seeds[] = 
00214         {
00215                 0,0,    
00216                 0,0,    
00217                 0,0,    
00218                 0,0,    
00219                 0,0,    
00220                 0,0,    
00221         };
00222         
00223         static uint32* seed_masks[] = 
00224         {
00225         no_seeds,
00226         no_seeds,
00227         no_seeds,
00228         seed_masks_3,
00229         seed_masks_4,
00230         seed_masks_5,
00231         seed_masks_6,
00232         seed_masks_7,
00233         seed_masks_8,
00234         seed_masks_9,
00235         seed_masks_10,
00236         seed_masks_11,
00237         seed_masks_12,
00238         seed_masks_13,
00239         seed_masks_14,
00240         seed_masks_15,
00241         seed_masks_16,
00242         seed_masks_17,
00243         seed_masks_18,
00244         seed_masks_19,
00245         seed_masks_20,
00246         seed_masks_21,
00247         seed_masks_22,
00248         seed_masks_23,
00249         seed_masks_24,
00250         seed_masks_25,
00251         seed_masks_26,
00252         seed_masks_27,
00253         seed_masks_28,
00254         seed_masks_29,
00255         seed_masks_30,
00256         seed_masks_31,
00257         };
00258         
00259         return seed_masks;
00260 }
00261 
00262 static const int CODING_SEED = 3;
00263 static const int SOLID_SEED = INT_MAX;
00264 
00268 #ifdef __cplusplus
00269 static
00270 #endif
00271 int64 getSolidSeed( int weight );
00272 
00273 #ifdef __cplusplus
00274 inline static
00275 #endif
00276 int64 getSolidSeed( int weight ){
00277         int64 seed = 1;
00278         seed <<= weight;
00279         seed--;
00280         return seed;
00281 };
00282 
00283 
00284 
00289 #ifdef __cplusplus
00290 static int64 getSeed( int weight, int seed_rank = 0 );
00291 #else
00292 int64 getSeed( int weight, int seed_rank );
00293 #endif
00294 
00295 #ifdef __cplusplus
00296 inline static
00297 #endif
00298 int64 getSeed( int weight, int seed_rank ){
00299         uint32** masks;
00300         int high;
00301         int low;
00302         int i = 1;
00303         int64 seed = 0;
00304         if( seed_rank == SOLID_SEED )
00305                 return getSolidSeed( weight );
00306 
00307         masks = seedMasks();
00308         if(weight > 31)
00309                 return getSolidSeed(32);
00310         if( seed_rank > 5 )
00311                 return getSolidSeed(weight);
00312         if( masks[weight][seed_rank*2+1] == 0 )
00313                 return getSolidSeed(weight);
00314         high = masks[ weight ][ seed_rank*2 ];
00315         low = masks[ weight ][ seed_rank*2 + 1 ];
00316         
00317         seed |= high;
00318         seed <<= 32;
00319         seed |= low;
00320         return seed;
00321 };
00322 
00323 
00327 #ifdef __cplusplus
00328 static
00329 #endif
00330 int getSeedLength( int64 seed );
00331 
00332 #ifdef __cplusplus
00333 inline static
00334 #endif
00335 int getSeedLength( int64 seed ){
00336         int right_bit = -1;
00337         int left_bit = -1;
00338         uint bitI = 0;
00339         for( ; bitI < 64; ++bitI ){
00340                 if( (seed & 1) == 1 ){
00341                         left_bit = bitI;
00342                         if( right_bit == -1 )
00343                                 right_bit = bitI;
00344                 }
00345                 seed >>= 1;
00346         }
00347         if( left_bit != -1 )
00348                 return left_bit - right_bit + 1;
00349         return 0;
00350 }
00351 
00355 #ifdef __cplusplus
00356 static
00357 #endif
00358 int getSeedWeight( int64 seed );
00359 
00360 #ifdef __cplusplus
00361 inline static
00362 #endif
00363 int getSeedWeight( int64 seed ){
00364         int weight = 0;
00365         uint bitI = 0;
00366         for( ; bitI < 64; ++bitI ){
00367                 if( (seed & 1) == 1 ){
00368                         ++weight;
00369                 }
00370                 seed >>= 1;
00371         }
00372         return weight;
00373 }
00374 
00375 const uint MIN_DNA_SEED_WEIGHT = 5;
00376 const uint MAX_DNA_SEED_WEIGHT = 31;
00377 
00381 #ifdef __cplusplus
00382 static
00383 #endif
00384 uint getDefaultSeedWeight( gnSeqI avg_sequence_length );
00385 
00386 #ifdef __cplusplus
00387 inline static
00388 #endif
00389 uint getDefaultSeedWeight( gnSeqI avg_sequence_length ){
00390         uint mer_size = (uint)((log( (double)avg_sequence_length ) / log( 2.0 ))/1.5);
00391         // don't allow even weights-- they can be palindromic
00392         if( !(mer_size & 0x1 ) )
00393                 ++mer_size;
00394         mer_size = mer_size < MIN_DNA_SEED_WEIGHT ? 0 : mer_size;
00395         if( avg_sequence_length == 0 )
00396                 mer_size = 0;
00397 
00398         // 31 is the maximum DNA seed weight
00399         mer_size = mer_size > MAX_DNA_SEED_WEIGHT ? MAX_DNA_SEED_WEIGHT : mer_size;
00400         return mer_size;
00401 }
00402 
00403 
00404 #endif // _SeedMasks_h_

Generated on Fri Mar 14 06:01:04 2008 for libMems by doxygen 1.3.6