00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012
00013 #include "libMems/MemHash.h"
00014 #include "libGenome/gnFilter.h"
00015 #include <list>
00016 #include <map>
00017 #include <sstream>
00018
00019 using namespace std;
00020 using namespace genome;
00021 namespace mems {
00022
00023 MemHash::MemHash() : MatchFinder(), allocator( SlotAllocator<MatchHashEntry>::GetSlotAllocator() )
00024
00025 {
00026 table_size = DEFAULT_MEM_TABLE_SIZE;
00027 seq_count = 0;
00028 m_mem_count = 0;
00029 m_collision_count = 0;
00030 m_repeat_tolerance = DEFAULT_REPEAT_TOLERANCE;
00031 m_enumeration_tolerance = DEFAULT_ENUMERATION_TOLERANCE;
00032
00033 mem_table.resize(table_size);
00034 mem_table_count.reserve( table_size );
00035 for(uint32 i=0; i < table_size; ++i)
00036 mem_table_count.push_back(0);
00037 match_log = NULL;
00038 }
00039
00040
00041 MemHash::~MemHash(){
00042
00043 }
00044
00045 MemHash::MemHash(const MemHash& mh) : MatchFinder(mh), allocator( SlotAllocator<MatchHashEntry>::GetSlotAllocator() )
00046 {
00047 *this = mh;
00048 }
00049
00050 MemHash& MemHash::operator=( const MemHash& mh ){
00051 table_size = mh.table_size;
00052 mer_size = mh.mer_size;
00053 seq_count = mh.seq_count;
00054 m_mem_count = mh.m_mem_count;
00055 m_collision_count = mh.m_collision_count;
00056 m_repeat_tolerance = mh.m_repeat_tolerance;
00057 m_enumeration_tolerance = mh.m_enumeration_tolerance;
00058 mem_table.resize(table_size);
00059 for(uint32 i=0; i < table_size; ++i){
00060 mem_table_count.push_back(mh.mem_table_count[i]);
00061 mem_table[i] = mh.mem_table[i];
00062 }
00063 match_log = mh.match_log;
00064 return *this;
00065 }
00066
00067 MemHash* MemHash::Clone() const{
00068 return new MemHash(*this);
00069 }
00070
00071 void MemHash::ClearSequences()
00072 {
00073 MatchFinder::Clear();
00074 }
00075
00076 void MemHash::Clear()
00077 {
00078 MatchFinder::Clear();
00079 m_mem_count = 0;
00080 m_collision_count = 0;
00081 m_repeat_tolerance = DEFAULT_REPEAT_TOLERANCE;
00082 m_enumeration_tolerance = DEFAULT_ENUMERATION_TOLERANCE;
00083
00084 for(uint32 listI = 0; listI < table_size; ++listI){
00085 mem_table[listI].clear();
00086 mem_table_count[ listI ] = 0;
00087 }
00088 match_log = NULL;
00089
00090 allocator.Free(allocated);
00091
00092
00093 }
00094
00095 void MemHash::SetTableSize(uint32 new_table_size){
00096
00097 table_size = new_table_size;
00098 mem_table.clear();
00099 mem_table.resize(table_size);
00100 mem_table_count.clear();
00101 mem_table_count.resize(table_size,0);
00102 }
00103
00104 boolean MemHash::CreateMatches(){
00105 MatchFinder::FindMatchSeeds();
00106 return true;
00107 }
00108
00109 void MemHash::FindMatches( MatchList& ml ) {
00110 vector<gnSeqI> start_points;
00111 for( uint32 seqI = 0; seqI < ml.seq_table.size(); ++seqI ){
00112 start_points.push_back( 0 );
00113 }
00114 FindMatchesFromPosition( ml, start_points );
00115 }
00116
00117 void MemHash::FindMatchesFromPosition( MatchList& ml, const vector<gnSeqI>& start_points ){
00118 for( uint32 seqI = 0; seqI < ml.seq_table.size(); ++seqI ){
00119 if( !AddSequence( ml.sml_table[ seqI ], ml.seq_table[ seqI ] ) ){
00120 ErrorMsg( "Error adding " + ml.seq_filename[seqI] + "\n");
00121 return;
00122 }
00123 }
00124 MatchFinder::FindMatchSeeds( start_points );
00125
00126 GetMatchList( ml );
00127 }
00128
00129 MatchList MemHash::GetMatchList() const{
00130 MatchList ml;
00131 GetMatchList( ml );
00132 ml.seq_table = seq_table;
00133 ml.sml_table = sar_table;
00134
00135 return ml;
00136 }
00137
00138
00139 boolean MemHash::EnumerateMatches( IdmerList& match_list )
00140 {
00141 vector< uint > enum_tally(seq_count, 0);
00142 IdmerList::iterator iter = match_list.begin();
00143 IdmerList hash_list;
00144 for(; iter != match_list.end(); ++iter)
00145 {
00146 if( enum_tally[iter->id] < m_enumeration_tolerance )
00147 {
00148 hash_list.push_back(*iter);
00149 }
00150 if(enum_tally[iter->id] > m_repeat_tolerance)
00151 return true;
00152 ++enum_tally[iter->id];
00153 }
00154
00155 if(hash_list.size() > 1){
00156 if(m_enumeration_tolerance == 1)
00157 return HashMatch(hash_list);
00158 else
00159 return MatchFinder::EnumerateMatches( hash_list );
00160 }
00161 return true;
00162 }
00163
00164
00165
00166
00167 boolean MemHash::HashMatch(IdmerList& match_list){
00168
00169
00170
00171 MatchHashEntry mhe = MatchHashEntry(seq_count, GetSar(0)->SeedLength());
00172 mhe.SetLength( GetSar(0)->SeedLength() );
00173
00174
00175 IdmerList::iterator iter = match_list.begin();
00176 for(; iter != match_list.end(); ++iter)
00177 mhe.SetStart(iter->id, iter->position + 1);
00178 SetDirection(mhe);
00179 mhe.CalculateOffset();
00180 if(mhe.Multiplicity() < 2){
00181 cout << "red flag " << mhe << "\n";
00182 cout << "match_list.size(): " << match_list.size() << endl;
00183 }else
00184 AddHashEntry(mhe);
00185
00186 return true;
00187 }
00188
00189 void MemHash::SetDirection(MatchHashEntry& mhe){
00190
00191 boolean ref_forward = false;
00192 uint32 seqI=0;
00193 for(; seqI < mhe.SeqCount(); ++seqI)
00194 if(mhe[seqI] != NO_MATCH){
00195 ref_forward = !(GetSar(seqI)->GetMer(mhe[seqI] - 1) & 0x1);
00196 break;
00197 }
00198
00199 for(++seqI; seqI < mhe.SeqCount(); ++seqI)
00200 if(mhe[seqI] != NO_MATCH)
00201 if(ref_forward == (GetSar(seqI)->GetMer(mhe[seqI] - 1) & 0x1))
00202 mhe.SetStart(seqI, -mhe[seqI]);
00203 }
00204
00205
00206
00207
00208
00209 MatchHashEntry* MemHash::AddHashEntry(MatchHashEntry& mhe){
00210
00211 int64 offset = mhe.Offset();
00212
00213 uint32 bucketI = ((offset % table_size) + table_size) % table_size;
00214 vector<MatchHashEntry*>::iterator insert_he;
00215 insert_he = std::lower_bound(mem_table[bucketI].begin(), mem_table[bucketI].end(), &mhe, mhecomp);
00216
00217 if( insert_he != mem_table[bucketI].end() && (!mhecomp(*insert_he, &mhe) && !mhecomp(&mhe, *insert_he)) ){
00218 ++m_collision_count;
00219 return *insert_he;
00220 }
00221
00222
00223
00224 vector<MatchHashEntry> subset_matches;
00225 if( !mhe.Extended() )
00226 ExtendMatch(mhe, subset_matches);
00227
00228 MatchHashEntry* new_mhe = allocator.Allocate();
00229 new_mhe = new(new_mhe) MatchHashEntry(mhe);
00230
00231 allocated.push_back(new_mhe);
00232
00233
00234 insert_he = std::lower_bound(mem_table[bucketI].begin(), mem_table[bucketI].end(), new_mhe, mhecomp);
00235 mem_table[bucketI].insert(insert_he, new_mhe);
00236
00237
00238 if( match_log != NULL ){
00239 (*match_log) << *new_mhe << endl;
00240 match_log->flush();
00241 }
00242
00243
00244 for(uint32 subsetI = 0; subsetI < subset_matches.size(); ++subsetI){
00245 MatchHashEntry* submem = AddHashEntry( subset_matches[ subsetI ] );
00246 }
00247
00248 ++mem_table_count[bucketI];
00249 ++m_mem_count;
00250 return new_mhe;
00251 }
00252
00253 void MemHash::PrintDistribution(ostream& os) const{
00254 vector<MatchHashEntry*>::const_iterator mem_iter;
00255 gnSeqI base_count;
00256 for(uint32 i=0; i < mem_table_count.size(); ++i){
00257 mem_iter = mem_table[i].begin();
00258 base_count = 0;
00259 for(; mem_iter != mem_table[i].end(); ++mem_iter){
00260 base_count += (*mem_iter)->Length();
00261 }
00262 os << i << '\t' << mem_table_count[i] << '\t' << base_count << '\n';
00263 }
00264 }
00265
00266 void MemHash::LoadFile(istream& mem_file){
00267 string tag;
00268 gnSeqI len;
00269 int64 start;
00270 MatchHashEntry mhe;
00271 getline( mem_file, tag );
00272 stringstream first_mum( tag );
00273 seq_count = 0;
00274 first_mum >> len;
00275 while( first_mum >> start ){
00276 seq_count++;
00277 }
00278 mhe = MatchHashEntry(seq_count, mer_size, MatchHashEntry::seed);
00279 first_mum.str( tag );
00280 first_mum.clear();
00281 for(uint32 seqI = 0; seqI < seq_count; seqI++){
00282 first_mum >> start;
00283 mhe.SetStart(seqI, start);
00284 }
00285 mhe.SetLength( len );
00286 mhe.CalculateOffset();
00287 AddHashEntry(mhe);
00288
00289 while(mem_file.good()){
00290 mem_file >> len;
00291 if(!mem_file.good())
00292 break;
00293 mhe.SetLength(len);
00294 for(uint32 seqI = 0; seqI < seq_count; seqI++){
00295 mem_file >> start;
00296 mhe.SetStart(seqI, start);
00297 }
00298
00299 if(!mem_file.good())
00300 break;
00301 mhe.CalculateOffset();
00302 AddHashEntry(mhe);
00303 }
00304 }
00305
00306 void MemHash::WriteFile(ostream& mem_file) const{
00307 mem_file << "FormatVersion" << '\t' << 1 << "\n";
00308 mem_file << "SequenceCount" << '\t' << sar_table.size() << "\n";
00309 for(unsigned int seqI = 0; seqI < seq_count; seqI++){
00310 mem_file << "Sequence" << seqI << "File";
00311 gnGenomeSpec* specker = seq_table[seqI]->GetSpec();
00312 string sourcename = specker->GetName();
00313 if( sourcename == "" )
00314 sourcename = "null";
00315 mem_file << '\t' << sourcename << "\n";
00316 mem_file << "Sequence" << seqI << "Length";
00317 mem_file << '\t' << seq_table[seqI]->length() << "\n";
00318 }
00319 mem_file << "MatchCount" << '\t' << m_mem_count << endl;
00320
00321 vector<MatchHashEntry*>::const_iterator mem_table_iter;
00322 for(uint32 i=0; i < table_size; i++){
00323 mem_table_iter = mem_table[i].begin();
00324 for(; mem_table_iter != mem_table[i].end(); mem_table_iter++)
00325 mem_file << **mem_table_iter << "\n";
00326 }
00327 }
00328
00329
00330 }