libMems/MemHash.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: MemHash.cpp,v 1.32 2004/03/01 02:40:08 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * Please see the file called COPYING for licensing, copying, and modification
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012 
00013 #include "libMems/MemHash.h"
00014 #include "libGenome/gnFilter.h"
00015 #include <list>
00016 #include <map>
00017 #include <sstream>
00018 
00019 using namespace std;
00020 using namespace genome;
00021 namespace mems {
00022 
00023         MemHash::MemHash() : MatchFinder(), allocator( SlotAllocator<MatchHashEntry>::GetSlotAllocator() )
00024 
00025 {
00026         table_size = DEFAULT_MEM_TABLE_SIZE;
00027         seq_count = 0;
00028         m_mem_count = 0;
00029         m_collision_count = 0;
00030         m_repeat_tolerance = DEFAULT_REPEAT_TOLERANCE;
00031         m_enumeration_tolerance = DEFAULT_ENUMERATION_TOLERANCE;
00032         //allocate the hash table
00033         mem_table.resize(table_size);
00034         mem_table_count.reserve( table_size );
00035         for(uint32 i=0; i < table_size; ++i)
00036                 mem_table_count.push_back(0);
00037         match_log = NULL;
00038 }
00039 
00040 //make sure this calls the destructor on each element
00041 MemHash::~MemHash(){
00042 //      allocator.Free(allocated);
00043 }
00044 
00045 MemHash::MemHash(const MemHash& mh) : MatchFinder(mh), allocator( SlotAllocator<MatchHashEntry>::GetSlotAllocator() )
00046 {
00047         *this = mh;
00048 }
00049 
00050 MemHash& MemHash::operator=( const MemHash& mh ){
00051         table_size = mh.table_size;
00052         mer_size = mh.mer_size;
00053         seq_count = mh.seq_count;
00054         m_mem_count = mh.m_mem_count;
00055         m_collision_count = mh.m_collision_count;
00056         m_repeat_tolerance = mh.m_repeat_tolerance;
00057         m_enumeration_tolerance = mh.m_enumeration_tolerance;
00058         mem_table.resize(table_size);
00059         for(uint32 i=0; i < table_size; ++i){
00060                 mem_table_count.push_back(mh.mem_table_count[i]);
00061                 mem_table[i] = mh.mem_table[i];
00062         }
00063         match_log = mh.match_log;
00064         return *this;
00065 }
00066 
00067 MemHash* MemHash::Clone() const{
00068         return new MemHash(*this);
00069 }
00070 
00071 void MemHash::ClearSequences()
00072 {
00073         MatchFinder::Clear();
00074 }
00075 
00076 void MemHash::Clear()
00077 {
00078         MatchFinder::Clear();
00079         m_mem_count = 0;
00080         m_collision_count = 0;
00081         m_repeat_tolerance = DEFAULT_REPEAT_TOLERANCE;
00082         m_enumeration_tolerance = DEFAULT_ENUMERATION_TOLERANCE;
00083         //clear the hash table
00084         for(uint32 listI = 0; listI < table_size; ++listI){
00085                 mem_table[listI].clear();
00086                 mem_table_count[ listI ] = 0;
00087         }
00088         match_log = NULL;
00089 
00090         allocator.Free(allocated);
00091         // WARNING! WARNING! WARNING! this will destroy ALL objects since the allocator has static lifetime!!
00092 //      allocator.Purge();
00093 }
00094 
00095 void MemHash::SetTableSize(uint32 new_table_size){
00096         //allocate the hash table
00097         table_size = new_table_size;
00098         mem_table.clear();
00099         mem_table.resize(table_size);
00100         mem_table_count.clear();
00101         mem_table_count.resize(table_size,0);
00102 }
00103 
00104 boolean MemHash::CreateMatches(){
00105         MatchFinder::FindMatchSeeds();
00106         return true;
00107 }
00108 
00109 void MemHash::FindMatches( MatchList& ml ) {
00110         vector<gnSeqI> start_points;
00111         for( uint32 seqI = 0; seqI < ml.seq_table.size(); ++seqI ){
00112                 start_points.push_back( 0 );
00113         }
00114         FindMatchesFromPosition( ml, start_points );
00115 }
00116 
00117 void MemHash::FindMatchesFromPosition( MatchList& ml, const vector<gnSeqI>& start_points ){
00118         for( uint32 seqI = 0; seqI < ml.seq_table.size(); ++seqI ){
00119                 if( !AddSequence( ml.sml_table[ seqI ], ml.seq_table[ seqI ] ) ){
00120                         ErrorMsg( "Error adding " + ml.seq_filename[seqI] + "\n");
00121                         return;
00122                 }
00123         }
00124         MatchFinder::FindMatchSeeds( start_points );
00125 
00126         GetMatchList( ml );
00127 }
00128 
00129 MatchList MemHash::GetMatchList() const{
00130         MatchList ml;
00131         GetMatchList( ml );
00132         ml.seq_table = seq_table;
00133         ml.sml_table = sar_table;
00134 
00135         return ml;
00136 }
00137 
00138 // an attempt to do this without sorting, which appears to be very slow...
00139 boolean MemHash::EnumerateMatches( IdmerList& match_list )
00140 {
00141         vector< uint > enum_tally(seq_count, 0);
00142         IdmerList::iterator iter = match_list.begin();
00143         IdmerList hash_list;
00144         for(; iter != match_list.end(); ++iter)
00145         {
00146                 if( enum_tally[iter->id] < m_enumeration_tolerance )
00147                 {
00148                         hash_list.push_back(*iter);
00149                 }
00150                 if(enum_tally[iter->id] > m_repeat_tolerance)
00151                         return true;
00152                 ++enum_tally[iter->id];
00153         }
00154 
00155         if(hash_list.size() > 1){
00156                 if(m_enumeration_tolerance == 1)
00157                         return HashMatch(hash_list);
00158                 else
00159                         return MatchFinder::EnumerateMatches( hash_list );
00160         }
00161         return true;
00162 }
00163 
00164 //why have separate hash tables? dunno.  no reason.  what was i thinking
00165 // at that coffeehouse in portland when i wrote this crappy code?
00166 // MemHashEntries use GENETICIST coordinates.  They start at 1, not 0.
00167 boolean MemHash::HashMatch(IdmerList& match_list){
00168         //check that there is at least one forward component
00169 //      match_list.sort(&idmer_id_lessthan);
00170         // initialize the hash entry
00171         MatchHashEntry mhe = MatchHashEntry(seq_count, GetSar(0)->SeedLength());
00172         mhe.SetLength( GetSar(0)->SeedLength() );
00173         
00174         //Fill in the new Match and set direction parity if needed.
00175         IdmerList::iterator iter = match_list.begin();
00176         for(; iter != match_list.end(); ++iter)
00177                 mhe.SetStart(iter->id, iter->position + 1);
00178         SetDirection(mhe);
00179         mhe.CalculateOffset();
00180         if(mhe.Multiplicity() < 2){
00181                 cout << "red flag " << mhe << "\n";
00182                 cout << "match_list.size(): " << match_list.size() << endl;
00183         }else 
00184                 AddHashEntry(mhe);
00185 
00186         return true;
00187 }
00188 
00189 void MemHash::SetDirection(MatchHashEntry& mhe){
00190         //get the reference direction
00191         boolean ref_forward = false;
00192         uint32 seqI=0;
00193         for(; seqI < mhe.SeqCount(); ++seqI)
00194                 if(mhe[seqI] != NO_MATCH){
00195                         ref_forward = !(GetSar(seqI)->GetMer(mhe[seqI] - 1) & 0x1);
00196                         break;
00197                 }
00198         //set directional parity for the rest
00199         for(++seqI; seqI < mhe.SeqCount(); ++seqI)
00200                 if(mhe[seqI] != NO_MATCH)
00201                         if(ref_forward == (GetSar(seqI)->GetMer(mhe[seqI] - 1) & 0x1))
00202                                 mhe.SetStart(seqI, -mhe[seqI]);
00203 }
00204 
00205 // Tries to add a new mem to the mem hash table
00206 // If the mem already exists in the table, a pointer to it
00207 // is returned.  Otherwise mhe is added and a pointer to
00208 // it is returned.
00209 MatchHashEntry* MemHash::AddHashEntry(MatchHashEntry& mhe){
00210         //first compute which hash table bucket this is going into
00211         int64 offset = mhe.Offset();
00212 
00213         uint32 bucketI = ((offset % table_size) + table_size) % table_size;
00214         vector<MatchHashEntry*>::iterator insert_he;
00215         insert_he = std::lower_bound(mem_table[bucketI].begin(), mem_table[bucketI].end(), &mhe, mhecomp);
00216 //      insert_he = mem_table[bucketI].find(&mhe);
00217         if( insert_he != mem_table[bucketI].end() && (!mhecomp(*insert_he, &mhe) && !mhecomp(&mhe, *insert_he)) ){
00218                 ++m_collision_count;
00219                 return *insert_he;
00220         }
00221         
00222         //if we made it this far there were no collisions
00223         //extend the mem into the surrounding region.
00224         vector<MatchHashEntry> subset_matches;
00225         if( !mhe.Extended() )
00226                 ExtendMatch(mhe, subset_matches);
00227 
00228         MatchHashEntry* new_mhe = allocator.Allocate();
00229         new_mhe = new(new_mhe) MatchHashEntry(mhe); 
00230 //      *new_mhe = mhe;
00231         allocated.push_back(new_mhe);
00232         
00233         // can't insert until after the extend!!
00234         insert_he = std::lower_bound(mem_table[bucketI].begin(), mem_table[bucketI].end(), new_mhe, mhecomp);
00235         mem_table[bucketI].insert(insert_he, new_mhe);
00236 
00237         // log it.
00238         if( match_log != NULL ){
00239                 (*match_log) << *new_mhe << endl;
00240                 match_log->flush();
00241         }
00242         
00243         // link up the subset matches
00244         for(uint32 subsetI = 0; subsetI < subset_matches.size(); ++subsetI){
00245                 MatchHashEntry* submem = AddHashEntry( subset_matches[ subsetI ] );
00246         }
00247         
00248         ++mem_table_count[bucketI];
00249         ++m_mem_count;
00250         return new_mhe;
00251 }
00252 
00253 void MemHash::PrintDistribution(ostream& os) const{
00254     vector<MatchHashEntry*>::const_iterator mem_iter;
00255         gnSeqI base_count;
00256         for(uint32 i=0; i < mem_table_count.size(); ++i){
00257                 mem_iter = mem_table[i].begin();
00258                 base_count = 0;
00259                 for(; mem_iter != mem_table[i].end(); ++mem_iter){
00260                         base_count += (*mem_iter)->Length();
00261                 }
00262                 os << i << '\t' << mem_table_count[i] << '\t' << base_count << '\n';
00263         }
00264 }
00265 
00266 void MemHash::LoadFile(istream& mem_file){
00267         string tag;
00268         gnSeqI len;
00269         int64 start;
00270         MatchHashEntry mhe;
00271         getline( mem_file, tag );
00272         stringstream first_mum( tag );
00273         seq_count = 0;
00274         first_mum >> len;
00275         while( first_mum >> start ){
00276                 seq_count++;
00277         }
00278         mhe = MatchHashEntry(seq_count, mer_size, MatchHashEntry::seed);
00279         first_mum.str( tag );
00280         first_mum.clear();
00281         for(uint32 seqI = 0; seqI < seq_count; seqI++){
00282                 first_mum >> start;
00283                 mhe.SetStart(seqI, start);
00284         }
00285         mhe.SetLength( len );
00286         mhe.CalculateOffset();
00287         AddHashEntry(mhe);
00288         
00289         while(mem_file.good()){
00290                 mem_file >> len;
00291                 if(!mem_file.good())
00292                         break;
00293                 mhe.SetLength(len);
00294                 for(uint32 seqI = 0; seqI < seq_count; seqI++){
00295                         mem_file >> start;
00296                         mhe.SetStart(seqI, start);
00297                 }
00298                 //break if the stream ended
00299                 if(!mem_file.good())
00300                         break;
00301                 mhe.CalculateOffset();
00302                 AddHashEntry(mhe);
00303         }
00304 }
00305 
00306 void MemHash::WriteFile(ostream& mem_file) const{
00307         mem_file << "FormatVersion" << '\t' << 1 << "\n";
00308         mem_file << "SequenceCount" << '\t' << sar_table.size() << "\n";
00309         for(unsigned int seqI = 0; seqI < seq_count; seqI++){
00310                 mem_file << "Sequence" << seqI << "File";
00311                 gnGenomeSpec* specker = seq_table[seqI]->GetSpec();
00312                 string sourcename = specker->GetName();
00313                 if( sourcename == "" )
00314                         sourcename = "null";
00315                 mem_file << '\t' << sourcename << "\n";
00316                 mem_file << "Sequence" << seqI << "Length";
00317                 mem_file << '\t' << seq_table[seqI]->length() << "\n";
00318         }
00319         mem_file << "MatchCount" << '\t' << m_mem_count << endl;
00320         //get all the mems out of the hash table and write them out
00321     vector<MatchHashEntry*>::const_iterator mem_table_iter;
00322         for(uint32 i=0; i < table_size; i++){
00323                 mem_table_iter = mem_table[i].begin();
00324                 for(; mem_table_iter != mem_table[i].end(); mem_table_iter++)
00325                         mem_file << **mem_table_iter << "\n";
00326         }
00327 }
00328 
00329 
00330 } // namespace mems

Generated on Fri Mar 14 06:01:03 2008 for libMems by doxygen 1.3.6