00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef _Aligner_h_
00010 #define _Aligner_h_
00011
00012 #ifdef HAVE_CONFIG_H
00013 #include "config.h"
00014 #endif
00015
00016 #include "libGenome/gnSequence.h"
00017 #include "libMems/DNAMemorySML.h"
00018 #include "libMems/GappedAligner.h"
00019 #include "libMems/MatchList.h"
00020 #include "libMems/Interval.h"
00021 #include "libMems/IntervalList.h"
00022 #include "libMems/MemHash.h"
00023 #include "libMems/MaskedMemHash.h"
00024 #include <map>
00025 #include "libMems/NumericMatrix.h"
00026 #include "libMems/GreedyBreakpointElimination.h"
00027 #include <list>
00028 #include "libMems/LCB.h"
00029 #include "libMUSCLE/threadstorage.h"
00030
00031 namespace mems {
00032
00037 class LabeledMem{
00038 public:
00039 Match* mem;
00040 uint label;
00041 };
00042
00047 class LabeledMemComparator {
00048 public:
00049 LabeledMemComparator( uint seq ){
00050 m_seq = seq;
00051 }
00052 LabeledMemComparator( LabeledMemComparator& lmc ){
00053 m_seq = lmc.m_seq;
00054 }
00055 boolean operator()(const LabeledMem& a, const LabeledMem& b) const{
00056
00057 int64 a_start = a.mem->Start( m_seq ), b_start = b.mem->Start( m_seq );
00058 if( a_start == NO_MATCH || b_start == NO_MATCH ){
00059 if( b_start != NO_MATCH )
00060 return true;
00061 return false;
00062 }
00063 if(a_start < 0)
00064 a_start = -a_start;
00065
00066 if(b_start < 0)
00067 b_start = -b_start;
00068
00069 int64 diff = a_start - b_start;
00070 return diff < 0;
00071 }
00072 ~LabeledMemComparator() {
00073 }
00074 protected:
00075 uint m_seq;
00076 private:
00077 LabeledMemComparator();
00078 };
00079
00084 class PlacementMatch{
00085 public:
00086 Match* mem;
00087 std::list< LabeledMem >::iterator iter;
00088 };
00089
00094 class PlacementMatchComparator {
00095 public:
00096 PlacementMatchComparator( uint seq ){
00097 m_seq = seq;
00098 }
00099 PlacementMatchComparator( PlacementMatchComparator& lmc ){
00100 m_seq = lmc.m_seq;
00101 }
00102 boolean operator()(const PlacementMatch& a, const PlacementMatch& b) const{
00103
00104 int64 a_start = a.mem->Start( m_seq ), b_start = b.mem->Start( m_seq );
00105 if( a_start == NO_MATCH || b_start == NO_MATCH ){
00106 if( b_start != NO_MATCH )
00107 return true;
00108 return false;
00109 }
00110 if(a_start < 0)
00111 a_start = -a_start;
00112
00113 if(b_start < 0)
00114 b_start = -b_start;
00115
00116
00117 int64 diff = a_start - b_start;
00118 return diff < 0;
00119 }
00120 ~PlacementMatchComparator() {
00121 }
00122 protected:
00123 uint m_seq;
00124 private:
00125 PlacementMatchComparator();
00126 };
00127
00128
00130 typedef std::pair< mems::Match*, mems::Match* > search_cache_t;
00131
00132
00142 class Aligner {
00143 public:
00148 Aligner( uint seq_count );
00149 Aligner( const Aligner& al );
00150 Aligner& operator=( const Aligner& al );
00151
00172 void align( MatchList& mlist, IntervalList& interval_list, double LCB_minimum_density, double LCB_minimum_range, boolean recursive, boolean extend_lcbs, boolean gapped_alignment, std::string tree_filename = "" );
00173
00174 void Recursion( MatchList& r_list, Match* r_begin, Match* r_end, boolean nway_only = false );
00175 void GetBestLCB( MatchList& r_list, MatchList& best_lcb );
00176 void DoSomethingCool( MatchList& mlist, Interval& iv );
00177
00184 void SetMinRecursionGapLength( gnSeqI min_r_gap );
00185
00186 void SetMaxExtensionIterations( uint ext_iters ){ this->max_extension_iters = ext_iters; }
00187
00188 void SearchWithinLCB( MatchList& mlist, std::vector< search_cache_t >& new_cache, bool leftmost = false, bool rightmost = false );
00189 void RecursiveAnchorSearch( MatchList& mlist, gnSeqI minimum_weight, std::vector< MatchList >& LCB_list, boolean entire_genome, std::ostream* status_out = NULL );
00190
00191 void AlignLCB( MatchList& mlist, Interval& iv );
00192 void SetGappedAligner( GappedAligner& gal );
00194 void SetMaxGappedAlignmentLength( gnSeqI len );
00195
00197 void SetPermutationOutput( std::string& permutation_filename, int64 permutation_weight );
00198 void WritePermutation( std::vector< LCB >& adjacencies, std::string out_filename );
00199
00200 void SetRecursive( bool value ){ this->recursive = value; }
00201 protected:
00202 TLS<MemHash> gap_mh;
00203 MaskedMemHash nway_mh;
00204 uint32 seq_count;
00205 boolean debug;
00207 double LCB_minimum_density;
00208 double LCB_minimum_range;
00209
00210 uint max_extension_iters;
00212 int64 cur_min_coverage;
00214 gnSeqI min_recursive_gap_length;
00216 void consistencyCheck( uint lcb_count, std::vector< LCB >& adjacencies, std::vector< MatchList >& lcb_list, std::vector< int64 >& weights );
00217
00218 boolean recursive;
00219 boolean extend_lcbs;
00220 boolean gapped_alignment;
00221 boolean currently_recursing;
00222 boolean collinear_genomes;
00224 GappedAligner* gal;
00225
00226 std::string permutation_filename;
00227 int64 permutation_weight;
00228
00229 std::vector< search_cache_t > search_cache;
00230 };
00231
00235 CREATE_EXCEPTION( AlignerError );
00236
00237 void transposeMatches( MatchList& mlist, uint seqI, const std::vector< int64 >& seq_regions );
00238
00243 void EliminateOverlaps( MatchList& ml );
00244
00254 void AaronsLCB( MatchList& mlist, std::set<uint>& breakpoints );
00255
00256
00257 void ComputeLCBs( MatchList& meml, std::set<uint>& breakpoints, std::vector<MatchList>& lcb_list, std::vector<int64>& weights );
00258 void computeLCBAdjacencies_v2( std::vector<MatchList>& lcb_list, std::vector< int64 >& weights, std::vector< LCB >& adjacencies );
00259 void computeLCBAdjacencies_v2( IntervalList& iv_list, std::vector< int64 >& weights, std::vector< LCB >& adjacencies );
00260 void scanLeft( int& left_recurseI, std::vector< LCB >& adjacencies, int min_weight, int seqI );
00261 void scanRight( int& right_recurseI, std::vector< LCB >& adjacencies, int min_weight, int seqI );
00262 void GetLCBCoverage( MatchList& lcb, uint64& coverage );
00263
00264 int64 greedyBreakpointElimination( gnSeqI minimum_weight, std::vector< LCB >& adjacencies, std::vector< int64 >& weights, std::ostream* status_out = NULL );
00265 void filterMatches( std::vector< LCB >& adjacencies, std::vector< MatchList >& lcb_list, std::vector< int64 >& weights );
00266
00267 void CreateGapSearchList( std::vector< LCB >& adjacencies, const std::vector< genome::gnSequence* >& seq_table, std::vector< std::vector< int64 > >& iv_regions, boolean entire_genome );
00268 void SearchLCBGaps( MatchList& new_matches, const std::vector< std::vector< int64 > >& iv_regions, MaskedMemHash& nway_mh );
00269
00270 static const uint MIN_ANCHOR_LENGTH = 9;
00271
00272
00274 class SearchCacheComparator
00275 {
00276 public:
00277 SearchCacheComparator() : msc(0){};
00278 bool operator()( const search_cache_t& a, const search_cache_t& b ) const
00279 {
00280 bool lt = true;
00281 if( a.first == NULL )
00282 {
00283 if( b.first == NULL )
00284 lt = false;
00285 }else if( b.first == NULL )
00286 {
00287 lt = false;
00288 }else if( !msc( a.first, b.first ) )
00289 lt = false;
00290 else if( a.second == NULL )
00291 {
00292 if( b.second == NULL )
00293 lt = false;
00294 }else if( b.second == NULL )
00295 {
00296 lt = false;
00297 }else if( !msc( a.second, b.second ) )
00298 lt = false;
00299
00300 return lt;
00301 }
00302 protected:
00303 mems::MatchStartComparator<mems::Match> msc;
00304 };
00305
00306 static SearchCacheComparator cache_comparator;
00307
00308
00309 }
00310
00311 #endif // _Aligner_h_