00001
00002
00003
00004
00005
00006
00007
00008 #ifndef __DistanceMatrix_h__
00009 #define __DistanceMatrix_h__
00010
00011 #ifdef HAVE_CONFIG_H
00012 #include "config.h"
00013 #endif
00014
00015 #include "libGenome/gnSequence.h"
00016 #include "libMems/SubstitutionMatrix.h"
00017 #include "libMems/IntervalList.h"
00018 #include "libMems/MatchList.h"
00019 #include "libMems/GappedAlignment.h"
00020 #include "libMems/CompactGappedAlignment.h"
00021
00022
00023 namespace mems {
00024
00025
00026 void TransformDistanceIdentity( NumericMatrix<double>& identity );
00027
00028 void DistanceMatrix( const MatchList& mlist, NumericMatrix<double>& identity );
00029
00030
00031 template< class AbstractMatchVectorType >
00032 void IdentityMatrix( const AbstractMatchVectorType& matches, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity );
00033 template<class AbstractMatchType>
00034 void MatchIdentityMatrix( const AbstractMatchType& amt, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity);
00035
00036 void DistanceMatrix( uint seq_count, const std::vector< std::pair< uint64, uint64 > >& detail_list, NumericMatrix<double>& distance );
00037
00038 void IdentityMatrix( const IntervalList& iv_list, NumericMatrix<double>& identity );
00039 inline
00040 void IdentityMatrix( const IntervalList& iv_list, NumericMatrix<double>& identity )
00041 {
00042 std::vector< const AbstractMatch* > am_list;
00043 for( size_t ivI = 0; ivI < iv_list.size(); ivI++ )
00044 am_list.push_back( &iv_list[ivI] );
00045 IdentityMatrix( am_list, iv_list.seq_table, identity );
00046 }
00047
00048 template< class AbstractMatchVectorType >
00049 void IdentityMatrix( const AbstractMatchVectorType& matches, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity ){
00050 if( matches.size() == 0 )
00051 return;
00052
00053 uint seq_count = seq_table.size();
00054 identity = NumericMatrix<double>( seq_count, seq_count );
00055 identity.init( 0 );
00056 NumericMatrix<double> possible( seq_count, seq_count );
00057 possible.init( 0 );
00058
00059 for( uint ivI = 0; ivI < matches.size(); ivI++ ){
00060 AddToMatchIdentityMatrix( *matches[ ivI ], seq_table, identity );
00061 }
00062 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00063 for( uint seqJ = 0; seqJ < seq_count; seqJ++ ){
00064 gnSeqI shorter_len = seq_table[seqI]->length() < seq_table[seqJ]->length() ? seq_table[seqI]->length() : seq_table[seqJ]->length();
00065 possible( seqI, seqJ ) += shorter_len;
00066 }
00067 }
00068 identity /= possible;
00069 }
00070
00071
00072 template< class AbstractMatchVectorType >
00073 void BackboneIdentityMatrix( const AbstractMatchVectorType& matches, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity ){
00074 if( matches.size() == 0 )
00075 return;
00076
00077 size_t seq_count = seq_table.size();
00078 identity = NumericMatrix<double>( seq_count, seq_count );
00079 identity.init( 0 );
00080
00081 for( uint ivI = 0; ivI < matches.size(); ivI++ ){
00082 AddToMatchIdentityMatrix( *matches[ ivI ], seq_table, identity );
00083 }
00084
00085 NumericMatrix<double> possible( seq_count, seq_count );
00086 possible.init( 0 );
00087
00088 for( size_t mI = 0; mI < matches.size(); ++mI ){
00089 std::vector< std::string > alignment;
00090 GetAlignment( *(matches[mI]), seq_table, alignment );
00091 for( gnSeqI charI = 0; charI < matches[mI]->AlignmentLength(); charI++ ){
00092 for( size_t seqI = 0; seqI < seq_count; seqI++ ){
00093 for( size_t seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00094 if( alignment[ seqI ][ charI ] != '-' &&
00095 alignment[ seqJ ][ charI ] != '-' ){
00096 possible( seqI, seqJ ) += 1;
00097 }
00098 }
00099 }
00100 }
00101 }
00102
00103 identity /= possible;
00104 }
00105
00106
00107 template<class AbstractMatchType>
00108 void MatchIdentityMatrix( const AbstractMatchType& amt, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity)
00109 {
00110 if( amt.SeqCount() == 0 )
00111 return;
00112 uint seq_count = amt.SeqCount();
00113 identity = NumericMatrix<double>( seq_count, seq_count );
00114 identity.init( 0 );
00115 uint seqI;
00116 uint seqJ;
00117
00118 std::vector< std::string > alignment;
00119 GetAlignment( amt, seq_table, alignment );
00120 for( gnSeqI charI = 0; charI < amt.AlignmentLength(); charI++ ){
00121 for( seqI = 0; seqI < seq_count; seqI++ ){
00122 for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00123 if( ( toupper( alignment[ seqI ][ charI ] ) ==
00124 toupper( alignment[ seqJ ][ charI ] ) ) &&
00125 alignment[ seqI ][ charI ] != '-' ){
00126
00127 identity( seqI, seqJ ) += 1;
00128 }
00129 }
00130 }
00131 }
00132
00133 for( seqI = 0; seqI < seq_count; seqI++ ){
00134 for( seqJ = seq_count; seqJ > 0; seqJ-- ){
00135 if( seqI == seqJ - 1 )
00136
00137 identity( seqI, seqJ - 1 ) = 1;
00138 else if( seqI < seqJ - 1 ){
00139
00140 gnSeqI shorter_len = amt.Length( seqI ) < amt.Length( seqJ - 1 ) ? amt.Length( seqI ) : amt.Length( seqJ - 1 );
00141
00142 identity( seqI, seqJ - 1 ) /= (double)shorter_len;
00143
00144 if( identity( seqI, seqJ - 1 ) > 1 )
00145 identity( seqI, seqJ - 1 ) = 1;
00146 }else
00147 identity( seqI, seqJ - 1 ) = identity( seqJ - 1, seqI );
00148 }
00149 }
00150 }
00151
00152
00153
00154 template<class AbstractMatchType>
00155 void AddToMatchIdentityMatrix( const AbstractMatchType& amt, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity)
00156 {
00157 if( amt.SeqCount() == 0 )
00158 return;
00159 uint seq_count = amt.SeqCount();
00160 uint seqI;
00161 uint seqJ;
00162
00163 std::vector< std::string > alignment;
00164 GetAlignment( amt, seq_table, alignment );
00165 for( gnSeqI charI = 0; charI < amt.AlignmentLength(); charI++ ){
00166 for( seqI = 0; seqI < seq_count; seqI++ ){
00167 for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00168 if( ( toupper( alignment[ seqI ][ charI ] ) ==
00169 toupper( alignment[ seqJ ][ charI ] ) ) &&
00170 alignment[ seqI ][ charI ] != '-' ){
00171
00172 identity( seqI, seqJ ) += 1;
00173 }
00174 }
00175 }
00176 }
00177 }
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194 template< typename MatchVector >
00195 void SingleCopyDistanceMatrix( MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& distance )
00196 {
00197 uint seq_count = seq_table.size();
00198 distance = NumericMatrix<double>( seq_count, seq_count );
00199 distance.init( 0 );
00200 uint seqI;
00201 uint seqJ;
00202 std::vector< std::pair< bitset_t, bitset_t > > tmp_comp( seq_count );
00203 std::vector< std::vector< std::pair< bitset_t, bitset_t > > > pair_comp( seq_count, tmp_comp );
00204 for( uint seqI = 0; seqI < seq_count; ++seqI )
00205 {
00206 for( uint seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00207 {
00208 pair_comp[seqI][seqJ].first.resize( seq_table[seqI]->length(), false );
00209 pair_comp[seqI][seqJ].second.resize( seq_table[seqJ]->length(), false );
00210 }
00211 }
00212 #pragma omp parallel for
00213 for( int ivI = 0; ivI < iv_list.size(); ++ivI )
00214 {
00215 std::vector< bitset_t > aln_table;
00216 #pragma omp critical
00217 {
00218 iv_list[ivI]->GetAlignment(aln_table);
00219 }
00220 for( uint seqI = 0; seqI < seq_count; ++seqI )
00221 {
00222 for( uint seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00223 {
00224 gnSeqI seqI_pos = iv_list[ivI]->LeftEnd(seqI);
00225 gnSeqI seqJ_pos = iv_list[ivI]->LeftEnd(seqJ);
00226 AbstractMatch::orientation o_i = iv_list[ivI]->Orientation(seqI);
00227 AbstractMatch::orientation o_j = iv_list[ivI]->Orientation(seqJ);
00228 if( o_i == AbstractMatch::reverse )
00229 seqI_pos = iv_list[ivI]->RightEnd(seqI);
00230 if( o_j == AbstractMatch::reverse )
00231 seqJ_pos = iv_list[ivI]->RightEnd(seqJ);
00232 if( seqI_pos == NO_MATCH || seqJ_pos == NO_MATCH )
00233 continue;
00234 for( size_t colI = 0; colI < aln_table[seqI].size(); ++colI )
00235 {
00236 if( aln_table[seqI].test(colI) && aln_table[seqJ].test(colI) )
00237 {
00238 pair_comp[seqI][seqJ].first.set(seqI_pos-1,true);
00239 pair_comp[seqI][seqJ].second.set(seqJ_pos-1,true);
00240 }
00241 if( aln_table[seqI].test(colI) )
00242 if( o_i == AbstractMatch::forward )
00243 seqI_pos++;
00244 else
00245 seqI_pos--;
00246 if( aln_table[seqJ].test(colI) )
00247 if( o_j == AbstractMatch::forward )
00248 seqJ_pos++;
00249 else
00250 seqJ_pos--;
00251 }
00252 }
00253 }
00254 }
00255 for( uint seqI = 0; seqI < seq_count; ++seqI )
00256 {
00257 distance(seqI,seqI) = 1;
00258 for( uint seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00259 {
00260 double pI = ((double)pair_comp[seqI][seqJ].first.count())/((double)pair_comp[seqI][seqJ].first.size());
00261 double pJ = ((double)pair_comp[seqI][seqJ].second.count())/((double)pair_comp[seqI][seqJ].second.size());
00262 distance(seqI,seqJ) = (pI + pJ) / 2.0;
00263 distance(seqJ,seqI) = (pI + pJ) / 2.0;
00264 }
00265 }
00266 TransformDistanceIdentity(distance);
00267 }
00268
00269 inline
00270 void DistanceMatrix( const MatchList& mlist, NumericMatrix<double>& distance ){
00271 IdentityMatrix(mlist, mlist.seq_table, distance );
00272 TransformDistanceIdentity( distance );
00273 }
00274
00275 inline
00276 void TransformDistanceIdentity( NumericMatrix<double>& identity ){
00277 for( int i = 0; i < identity.cols(); i++ ){
00278 for( int j = 0; j < identity.rows(); j++ ){
00279 identity( i, j ) = 1 - identity( i, j );
00280 }
00281 }
00282 }
00283
00284 inline
00285 void DistanceMatrix( uint seq_count, const std::vector< std::pair< uint64, uint64 > >& detail_list, NumericMatrix<double>& distance ){
00286 distance = NumericMatrix<double>( seq_count, seq_count );
00287 distance.init( 0 );
00288 uint seqI;
00289 uint seqJ;
00290 for( seqI = 0; seqI < seq_count; seqI++ ){
00291 uint64 seqI_mask = 1;
00292 seqI_mask <<= seq_count - seqI - 1;
00293 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00294 uint64 seqJ_mask = 1;
00295 seqJ_mask <<= seq_count - seqJ - 1;
00296 for( uint pairI = 0; pairI < detail_list.size(); pairI++ ){
00297 if( (detail_list[ pairI ].first & seqI_mask) != 0 &&
00298 (detail_list[ pairI ].first & seqJ_mask) != 0 ){
00299 distance( seqI, seqJ ) += detail_list[ pairI ].second;
00300 }
00301 }
00302 }
00303 }
00304
00305 for( seqI = 0; seqI < seq_count; seqI++ ){
00306 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00307 if( seqI == seqJ )
00308 continue;
00309 double avg_length = ( distance( seqI, seqI ) + distance( seqJ, seqJ ) ) / 2;
00310 distance( seqI, seqJ ) = 1.0 - ( distance( seqI, seqJ ) / avg_length );
00311 if( !(distance( seqI, seqJ ) == distance( seqI, seqJ )) ){
00312 distance( seqI, seqJ ) = 1.0;
00313 }
00314 }
00315 }
00316
00317
00318 for( seqI = 0; seqI < seq_count; seqI++ )
00319 distance( seqI, seqI ) = 0;
00320 }
00321
00322
00323 }
00324
00325
00326 #endif // __DistanceMatrix_h__
00327