libMems/DistanceMatrix.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00003  * This file is licensed under the GPL.
00004  * Please see the file called COPYING for licensing details.
00005  * **************
00006  ******************************************************************************/
00007 
00008 #ifndef __DistanceMatrix_h__
00009 #define __DistanceMatrix_h__
00010 
00011 #ifdef HAVE_CONFIG_H
00012 #include "config.h"
00013 #endif
00014 
00015 #include "libGenome/gnSequence.h"
00016 #include "libMems/SubstitutionMatrix.h"
00017 #include "libMems/IntervalList.h"
00018 #include "libMems/MatchList.h"
00019 #include "libMems/GappedAlignment.h"
00020 #include "libMems/CompactGappedAlignment.h"
00021 
00022 
00023 namespace mems {
00024 
00025 
00026 void TransformDistanceIdentity( NumericMatrix<double>& identity );
00027 
00028 void DistanceMatrix( const MatchList& mlist, NumericMatrix<double>& identity );
00029 
00030 
00031 template< class AbstractMatchVectorType >
00032 void IdentityMatrix( const AbstractMatchVectorType& matches, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity );
00033 template<class AbstractMatchType>
00034 void MatchIdentityMatrix( const AbstractMatchType& amt, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity);
00035 
00036 void DistanceMatrix( uint seq_count, const std::vector< std::pair< uint64, uint64 > >& detail_list, NumericMatrix<double>& distance );
00037 
00038 void IdentityMatrix( const IntervalList& iv_list, NumericMatrix<double>& identity );
00039 inline
00040 void IdentityMatrix( const IntervalList& iv_list, NumericMatrix<double>& identity )
00041 {
00042         std::vector< const AbstractMatch* > am_list;
00043         for( size_t ivI = 0; ivI < iv_list.size(); ivI++ )
00044                 am_list.push_back( &iv_list[ivI] );
00045         IdentityMatrix( am_list, iv_list.seq_table, identity );
00046 }
00047 
00048 template< class AbstractMatchVectorType >
00049 void IdentityMatrix( const AbstractMatchVectorType& matches, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity ){
00050         if( matches.size() == 0 )
00051                 return;
00052 
00053         uint seq_count = seq_table.size();
00054         identity = NumericMatrix<double>( seq_count, seq_count );
00055         identity.init( 0 );
00056         NumericMatrix<double> possible( seq_count, seq_count );
00057         possible.init( 0 );
00058         
00059         for( uint ivI = 0; ivI < matches.size(); ivI++ ){
00060                 AddToMatchIdentityMatrix( *matches[ ivI ], seq_table, identity );
00061         }
00062         for( uint seqI = 0; seqI < seq_count; seqI++ ){
00063                 for( uint seqJ = 0; seqJ < seq_count; seqJ++ ){
00064                         gnSeqI shorter_len = seq_table[seqI]->length() < seq_table[seqJ]->length() ? seq_table[seqI]->length() : seq_table[seqJ]->length();
00065                         possible( seqI, seqJ ) += shorter_len;
00066                 }
00067         }
00068         identity /= possible;
00069 }
00070 
00071 
00072 template< class AbstractMatchVectorType >
00073 void BackboneIdentityMatrix( const AbstractMatchVectorType& matches, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity ){
00074         if( matches.size() == 0 )
00075                 return;
00076 
00077         size_t seq_count = seq_table.size();
00078         identity = NumericMatrix<double>( seq_count, seq_count );
00079         identity.init( 0 );
00080         
00081         for( uint ivI = 0; ivI < matches.size(); ivI++ ){
00082                 AddToMatchIdentityMatrix( *matches[ ivI ], seq_table, identity );
00083         }
00084 
00085         NumericMatrix<double> possible( seq_count, seq_count );
00086         possible.init( 0 );
00087 
00088         for( size_t mI = 0; mI < matches.size(); ++mI ){
00089                 std::vector< std::string > alignment;
00090                 GetAlignment( *(matches[mI]), seq_table, alignment );
00091                 for( gnSeqI charI = 0; charI < matches[mI]->AlignmentLength(); charI++ ){
00092                         for( size_t seqI = 0; seqI < seq_count; seqI++ ){
00093                                 for( size_t seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00094                                         if( alignment[ seqI ][ charI ] != '-' &&
00095                                                 alignment[ seqJ ][ charI ] != '-' ){
00096                                                         possible( seqI, seqJ ) += 1;
00097                                         }
00098                                 }
00099                         }
00100                 }
00101         }
00102 
00103         identity /= possible;
00104 }
00105 
00106 
00107 template<class AbstractMatchType>
00108 void MatchIdentityMatrix( const AbstractMatchType& amt, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity)
00109 {
00110         if( amt.SeqCount() == 0 )
00111                 return;
00112         uint seq_count = amt.SeqCount();
00113         identity = NumericMatrix<double>( seq_count, seq_count );
00114         identity.init( 0 );
00115         uint seqI;
00116         uint seqJ;
00117 
00118         std::vector< std::string > alignment;
00119         GetAlignment( amt, seq_table, alignment );
00120         for( gnSeqI charI = 0; charI < amt.AlignmentLength(); charI++ ){
00121                 for( seqI = 0; seqI < seq_count; seqI++ ){
00122                         for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00123                                 if( ( toupper( alignment[ seqI ][ charI ] ) == 
00124                                         toupper( alignment[ seqJ ][ charI ] ) ) &&
00125                                         alignment[ seqI ][ charI ] != '-' ){
00126                                         
00127                                                 identity( seqI, seqJ ) += 1;
00128                                 }
00129                         }
00130                 }
00131         }
00132 
00133         for( seqI = 0; seqI < seq_count; seqI++ ){
00134                 for( seqJ = seq_count; seqJ > 0; seqJ-- ){
00135                         if( seqI == seqJ - 1 )
00136                                 // set the diagonal to identical
00137                                 identity( seqI, seqJ - 1 ) = 1;
00138                         else if( seqI < seqJ - 1 ){
00139                                 // determine the length of the shorter sequence
00140                                 gnSeqI shorter_len = amt.Length( seqI ) < amt.Length( seqJ - 1 ) ? amt.Length( seqI ) : amt.Length( seqJ - 1 );
00141                                 // divide through
00142                                 identity( seqI, seqJ - 1 ) /= (double)shorter_len;
00143                                 // maxes out at 1
00144                                 if( identity( seqI, seqJ - 1 ) > 1 )
00145                                         identity( seqI, seqJ - 1 ) = 1;
00146                         }else   // copy the other one
00147                                 identity( seqI, seqJ - 1 ) = identity( seqJ - 1, seqI );
00148                 }
00149         }
00150 }
00151 
00152 
00153 
00154 template<class AbstractMatchType>
00155 void AddToMatchIdentityMatrix( const AbstractMatchType& amt, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity)
00156 {
00157         if( amt.SeqCount() == 0 )
00158                 return;
00159         uint seq_count = amt.SeqCount();
00160         uint seqI;
00161         uint seqJ;
00162 
00163         std::vector< std::string > alignment;
00164         GetAlignment( amt, seq_table, alignment );
00165         for( gnSeqI charI = 0; charI < amt.AlignmentLength(); charI++ ){
00166                 for( seqI = 0; seqI < seq_count; seqI++ ){
00167                         for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00168                                 if( ( toupper( alignment[ seqI ][ charI ] ) == 
00169                                         toupper( alignment[ seqJ ][ charI ] ) ) &&
00170                                         alignment[ seqI ][ charI ] != '-' ){
00171                                         
00172                                                 identity( seqI, seqJ ) += 1;
00173                                 }
00174                         }
00175                 }
00176         }
00177 }
00178 
00179 /*
00180 // template specialization for (exact) matches
00181 inline
00182 void AddToMatchIdentityMatrix( const Match& m, const std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& identity)
00183 {
00184         if( m.SeqCount() == 0 )
00185                 return;
00186         for( uint seqI = 0; seqI < m.SeqCount(); seqI++ )
00187                 if( m.LeftEnd(seqI) != NO_MATCH )
00188                         for( uint seqJ = seqI + 1; seqJ < m.SeqCount(); seqJ++ )
00189                                 if( m.LeftEnd(seqJ) != NO_MATCH )
00190                                         identity(seqI,seqJ) += m.Length();
00191 }
00192 */
00193 
00194 template< typename MatchVector >
00195 void SingleCopyDistanceMatrix( MatchVector& iv_list, std::vector< genome::gnSequence* >& seq_table, NumericMatrix<double>& distance )
00196 {
00197         uint seq_count = seq_table.size();
00198         distance = NumericMatrix<double>( seq_count, seq_count );
00199         distance.init( 0 );
00200         uint seqI;
00201         uint seqJ;
00202         std::vector< std::pair< bitset_t, bitset_t > > tmp_comp( seq_count );
00203         std::vector< std::vector< std::pair< bitset_t, bitset_t > > > pair_comp( seq_count, tmp_comp );
00204         for( uint seqI = 0; seqI < seq_count; ++seqI )
00205         {
00206                 for( uint seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00207                 {
00208                         pair_comp[seqI][seqJ].first.resize( seq_table[seqI]->length(), false );
00209                         pair_comp[seqI][seqJ].second.resize( seq_table[seqJ]->length(), false );
00210                 }
00211         }
00212 #pragma omp parallel for
00213         for( int ivI = 0; ivI < iv_list.size(); ++ivI )
00214         {
00215                 std::vector< bitset_t > aln_table;
00216 #pragma omp critical
00217 {
00218                 iv_list[ivI]->GetAlignment(aln_table);
00219 }
00220                 for( uint seqI = 0; seqI < seq_count; ++seqI )
00221                 {
00222                         for( uint seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00223                         {
00224                                 gnSeqI seqI_pos = iv_list[ivI]->LeftEnd(seqI);
00225                                 gnSeqI seqJ_pos = iv_list[ivI]->LeftEnd(seqJ);
00226                                 AbstractMatch::orientation o_i = iv_list[ivI]->Orientation(seqI);
00227                                 AbstractMatch::orientation o_j = iv_list[ivI]->Orientation(seqJ);
00228                                 if( o_i == AbstractMatch::reverse )
00229                                         seqI_pos = iv_list[ivI]->RightEnd(seqI);
00230                                 if( o_j == AbstractMatch::reverse )
00231                                         seqJ_pos = iv_list[ivI]->RightEnd(seqJ);
00232                                 if( seqI_pos == NO_MATCH || seqJ_pos == NO_MATCH )
00233                                         continue;
00234                                 for( size_t colI = 0; colI < aln_table[seqI].size(); ++colI )
00235                                 {
00236                                         if( aln_table[seqI].test(colI) && aln_table[seqJ].test(colI) )
00237                                         {
00238                                                 pair_comp[seqI][seqJ].first.set(seqI_pos-1,true);
00239                                                 pair_comp[seqI][seqJ].second.set(seqJ_pos-1,true);
00240                                         }
00241                                         if( aln_table[seqI].test(colI) )
00242                                                 if( o_i == AbstractMatch::forward )
00243                                                         seqI_pos++;
00244                                                 else
00245                                                         seqI_pos--;
00246                                         if( aln_table[seqJ].test(colI) )
00247                                                 if( o_j == AbstractMatch::forward )
00248                                                         seqJ_pos++;
00249                                                 else
00250                                                         seqJ_pos--;
00251                                 }
00252                         }
00253                 }
00254         }
00255         for( uint seqI = 0; seqI < seq_count; ++seqI )
00256         {
00257                 distance(seqI,seqI) = 1;
00258                 for( uint seqJ = seqI+1; seqJ < seq_count; ++seqJ )
00259                 {
00260                         double pI = ((double)pair_comp[seqI][seqJ].first.count())/((double)pair_comp[seqI][seqJ].first.size());
00261                         double pJ = ((double)pair_comp[seqI][seqJ].second.count())/((double)pair_comp[seqI][seqJ].second.size());
00262                         distance(seqI,seqJ) = (pI + pJ) / 2.0;
00263                         distance(seqJ,seqI) = (pI + pJ) / 2.0;
00264                 }
00265         }
00266         TransformDistanceIdentity(distance);
00267 }
00268 
00269 inline
00270 void DistanceMatrix( const MatchList& mlist, NumericMatrix<double>& distance ){
00271         IdentityMatrix(mlist, mlist.seq_table, distance );
00272         TransformDistanceIdentity( distance );
00273 }
00274 
00275 inline
00276 void TransformDistanceIdentity( NumericMatrix<double>& identity ){
00277         for( int i = 0; i < identity.cols(); i++ ){
00278                 for( int j = 0; j < identity.rows(); j++ ){
00279                         identity( i, j ) = 1 - identity( i, j );
00280                 }
00281         }
00282 }
00283 
00284 inline
00285 void DistanceMatrix( uint seq_count, const std::vector< std::pair< uint64, uint64 > >& detail_list, NumericMatrix<double>& distance ){
00286         distance = NumericMatrix<double>( seq_count, seq_count );
00287         distance.init( 0 );
00288         uint seqI;
00289         uint seqJ;
00290         for( seqI = 0; seqI < seq_count; seqI++ ){
00291                 uint64 seqI_mask = 1;
00292                 seqI_mask <<= seq_count - seqI - 1;
00293                 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00294                         uint64 seqJ_mask = 1;
00295                         seqJ_mask <<= seq_count - seqJ - 1;
00296                         for( uint pairI = 0; pairI < detail_list.size(); pairI++ ){
00297                                 if( (detail_list[ pairI ].first & seqI_mask) != 0 &&
00298                                         (detail_list[ pairI ].first & seqJ_mask) != 0 ){
00299                                         distance( seqI, seqJ ) += detail_list[ pairI ].second;
00300                                 }
00301                         }
00302                 }
00303         }
00304         
00305         for( seqI = 0; seqI < seq_count; seqI++ ){
00306                 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00307                         if( seqI == seqJ )
00308                                 continue;
00309                         double avg_length = ( distance( seqI, seqI ) + distance( seqJ, seqJ ) ) / 2;
00310                         distance( seqI, seqJ ) = 1.0 - ( distance( seqI, seqJ ) / avg_length );
00311                         if( !(distance( seqI, seqJ ) == distance( seqI, seqJ )) ){
00312                                 distance( seqI, seqJ ) = 1.0;
00313                         }
00314                 }
00315         }
00316 
00317         // set the diagonal identical to itself
00318         for( seqI = 0; seqI < seq_count; seqI++ )
00319                 distance( seqI, seqI ) = 0;
00320 }
00321 
00322 
00323 }       // namespace mems
00324 
00325 
00326 #endif  // __DistanceMatrix_h__
00327 

Generated on Fri Mar 14 06:01:02 2008 for libMems by doxygen 1.3.6