00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012
00013 #include "libMems/ClustalInterface.h"
00014 #include <sstream>
00015 #include "libGenome/gnFilter.h"
00016
00017 #include <fstream>
00018
00019 extern "C" {
00020 #include "libClustalW/clustalw.h"
00021
00022 extern sint max_names;
00023 extern Boolean usemenu, dnaflag, explicit_dnaflag;
00024 extern Boolean interactive;
00025 extern char *seqname;
00026 extern sint nseqs;
00027 extern sint *seqlen_array;
00028 extern char **names,**titles;
00029 extern char **seq_array;
00030
00031 extern sint max_aln_length;
00032
00033
00034 extern float gap_open, gap_extend;
00035 extern float dna_gap_open, dna_gap_extend;
00036
00037
00038
00039
00040 extern float pw_go_penalty, pw_ge_penalty;
00041 extern float dna_pw_go_penalty, dna_pw_ge_penalty;
00042
00043
00044
00045 extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus;
00046 extern FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile, *nexus_outfile;
00047
00048 extern char* amino_acid_codes;
00049 extern sint max_aa;
00050
00051
00052
00053 extern sint gap_pos1;
00054 extern double** tmat;
00055
00056 extern Boolean use_endgaps;
00057 extern Boolean endgappenalties;
00058
00059 extern sint output_order;
00060 extern Boolean no_weights;
00061
00062 }
00063
00064 using namespace std;
00065 using namespace genome;
00066 namespace mems {
00067
00074 #define MISALIGNMENT_WORKAROUND
00075
00076 lint get_aln_score(void);
00077
00078
00079 ClustalInterface& ClustalInterface::getClustalInterface()
00080 {
00081 static ClustalInterface m_ci;
00082
00083 return m_ci;
00084 }
00085
00086 ClustalInterface::ClustalInterface(){
00087
00088 max_alignment_length = 10000;
00089 min_flank_size = 3;
00090 clustal_score_cutoff = 0;
00091
00092
00093 use_endgaps = FALSE;
00094
00095 endgappenalties = TRUE;
00096
00097 output_order = INPUT;
00098 no_weights = FALSE;
00099
00100
00101 init_amenu();
00102 init_interface();
00103 init_matrix();
00104
00105 fill_chartab();
00106 allocated_aln = false;
00107
00108
00109 use_endgaps = FALSE;
00110
00111 endgappenalties = TRUE;
00112 }
00113
00114 ClustalInterface& ClustalInterface::operator=( const ClustalInterface& ci )
00115 {
00116 GappedAligner::operator=( ci );
00117 min_flank_size = ci.min_flank_size;
00118 clustal_score_cutoff = ci.clustal_score_cutoff;
00119 distance_matrix = ci.distance_matrix;
00120 allocated_aln = ci.allocated_aln;
00121 return *this;
00122 }
00123
00124 void ClustalInterface::SetDistanceMatrix( NumericMatrix< double >& distance_matrix, string& tree_filename ){
00125 SetDistanceMatrix( distance_matrix, tree_filename, false );
00126 }
00127
00128 void ClustalInterface::SetDistanceMatrix( NumericMatrix< double >& distance_matrix, string& tree_filename, boolean reread_tree ){
00129 char* phylip_name;
00130 uint seqI, seqJ;
00131 #ifdef MISALIGNMENT_WORKAROUND
00132 if( reread_tree == false ){
00133 NumericMatrix< double > dist_plus_matrix( distance_matrix.cols() + 1, distance_matrix.cols() + 1 );
00134 for( seqI = 0; seqI < dist_plus_matrix.cols(); seqI++ ){
00135 for( seqJ = 0; seqJ < dist_plus_matrix.cols(); seqJ++ ){
00136 double new_val = 0;
00137 if( seqI == 0 ){
00138 if( seqJ == 0 )
00139 new_val = 0;
00140 else
00141 new_val = distance_matrix( seqI, seqJ - 1 );
00142 }else{
00143 if( seqJ == 0 )
00144 new_val = distance_matrix( seqI - 1, seqJ );
00145 else
00146 new_val = distance_matrix( seqI - 1, seqJ - 1 );
00147 }
00148 dist_plus_matrix( seqI, seqJ ) = new_val;
00149 }
00150 }
00151 SetDistanceMatrix( dist_plus_matrix, tree_filename, true );
00152 }
00153 #else
00154 reread_tree = true;
00155 #endif
00156 if( reread_tree )
00157 this->distance_matrix = distance_matrix;
00158 free_aln( nseqs );
00159 nseqs = distance_matrix.cols();
00160 alloc_aln( nseqs );
00161 allocated_aln = true;
00162
00163 for( seqI = 1; seqI <= distance_matrix.cols(); seqI++ ){
00164 ostringstream ss;
00165 ss << "seq" << seqI;
00166 int namelen = MAXNAMES < ss.str().size() ? MAXNAMES : ss.str().size();
00167 strncpy( names[ seqI ], ss.str().c_str(), namelen);
00168 strncpy( titles[ seqI ], ss.str().c_str(), namelen);
00169
00170 alloc_seq( seqI, 1 );
00171
00172 if( strlen( names[ seqI ] ) > max_names )
00173 max_names = strlen( names[ seqI ] );
00174 }
00175
00176 phylip_name = (char * ) ckalloc( tree_filename.length() + 1);
00177 strcpy( phylip_name, tree_filename.c_str() );
00178
00179
00180 for( seqI = 0; seqI < nseqs; seqI++ )
00181 for( uint seqJ = 0; seqJ < nseqs; seqJ++ )
00182 tmat[ seqI + 1][ seqJ + 1 ] = distance_matrix( seqI, seqJ );
00183
00184 FILE* tree;
00185 if((tree = open_explicit_file( phylip_name ))==NULL) return;
00186 if (nseqs >= 2) {
00187 guide_tree(tree,1,nseqs);
00188 }
00189
00190
00191
00192 if( reread_tree )
00193 int status = read_tree(phylip_name, (sint)0, nseqs);
00194 phylip_name = (char*)ckfree( phylip_name );
00195 allocated_aln = false;
00196
00197 }
00198
00199
00200
00201 void ClustalInterface::setGuideTree( string& tree_filename, NumericMatrix< double >& dist_mat, uint seq_count ){
00202 #ifdef MISALIGNMENT_WORKAROUND
00203 seq_count++;
00204 #endif
00205 distance_matrix = dist_mat;
00206
00207 ifstream guide_file( tree_filename.c_str() );
00208 if( guide_file.is_open() )
00209 guide_file.close();
00210 else
00211 throw( "Unable to open guide tree file" );
00212
00213 char* phylip_name;
00214 uint seqI;
00215
00216 free_aln( nseqs );
00217 nseqs = seq_count;
00218 alloc_aln( nseqs );
00219 allocated_aln = true;
00220
00221 for( seqI = 1; seqI <= seq_count; seqI++ ){
00222 ostringstream ss;
00223 ss << "seq" << seqI;
00224 int namelen = MAXNAMES < ss.str().size() ? MAXNAMES : ss.str().size();
00225 strncpy( names[ seqI ], ss.str().c_str(), namelen);
00226 strncpy( titles[ seqI ], ss.str().c_str(), namelen);
00227
00228 alloc_seq( seqI, 1 );
00229
00230 if( strlen( names[ seqI ] ) > max_names )
00231 max_names = strlen( names[ seqI ] );
00232 }
00233
00234
00235 for( seqI = 0; seqI < nseqs; seqI++ )
00236 for( uint seqJ = 0; seqJ < nseqs; seqJ++ )
00237 tmat[ seqI + 1][ seqJ + 1 ] = 1 - distance_matrix( seqI, seqJ );
00238
00239
00240 phylip_name = (char * ) ckalloc( tree_filename.length() + 1);
00241 strcpy( phylip_name, tree_filename.c_str() );
00242 int success = read_tree(phylip_name, (sint)0, nseqs);
00243 phylip_name = (char*)ckfree( phylip_name );
00244 allocated_aln = false;
00245 if( !success )
00246 throw "Error loading guide tree\n";
00247 }
00248
00249 boolean ClustalInterface::Align( GappedAlignment& cr, Match* r_begin, Match* r_end, vector< gnSequence* >& seq_table ){
00250 boolean flank = false;
00251 gnSeqI gap_size = 0;
00252 boolean create_ok = true;
00253 uint seq_count = seq_table.size();
00254 uint seqI;
00255 uint align_seqs = 0;
00256
00257
00258
00259
00260 try{
00261 for( seqI = 0; seqI < seq_count; seqI++ ){
00262 int64 gap_start = 0;
00263 int64 gap_end = 0;
00264 create_ok = getInterveningCoordinates( seq_table, r_begin, r_end, seqI, gap_start, gap_end );
00265
00266 if( gap_start == NO_MATCH || gap_end == NO_MATCH )
00267 continue;
00268 if( !create_ok )
00269 break;
00270
00271 int64 diff = gap_end - gap_start;
00272 if( diff <= 0 ){
00273 continue;
00274 }
00275 if( diff > max_alignment_length ){
00276 cout << "gap from " << gap_start << " to " << gap_end << " is too big for ClustalW\n";
00277 continue;
00278 }
00279 gap_size = diff < gap_size ? gap_size : diff;
00280 align_seqs++;
00281 }
00282
00283 if( align_seqs <= 1 )
00284 create_ok = false;
00285
00286
00287
00288
00289 vector< string > seq_data;
00290 vector< int64 > starts;
00291 gnSeqI left_flank, right_flank;
00292 const gnFilter* rc_filter = gnFilter::DNAComplementFilter();
00293
00294 if( create_ok ){
00295
00296
00297 left_flank = 0;
00298 right_flank = 0;
00299
00300 for( seqI = 0; seqI < seq_count; seqI++ ){
00301
00302 #ifdef MISALIGNMENT_WORKAROUND
00303 if( seqI == 1 )
00304 seq_data.push_back( seq_data[ 0 ] );
00305 #endif
00306
00307 if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH ) ||
00308 (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
00309 starts.push_back( NO_MATCH );
00310 seq_data.push_back( "" );
00311 continue;
00312 }
00313
00314
00315 int64 gap_start = 0;
00316 int64 gap_end = 0;
00317 getInterveningCoordinates( seq_table, r_begin, r_end, seqI, gap_start, gap_end );
00318
00319 int64 diff = gap_end - gap_start;
00320 if( diff <= 0 || diff > max_alignment_length ){
00321 starts.push_back( NO_MATCH );
00322 seq_data.push_back( "" );
00323 continue;
00324 }
00325
00326 if( r_end == NULL || r_end->Start( seqI ) > 0 ){
00327 starts.push_back( gap_start );
00328 seq_data.push_back( seq_table[ seqI ]->ToString( left_flank + diff + right_flank, gap_start - left_flank ) );
00329 }else{
00330
00331 starts.push_back( -gap_start );
00332 string cur_seq_data = seq_table[ seqI ]->ToString( left_flank + diff + right_flank, gap_start - right_flank );
00333 rc_filter->ReverseFilter( cur_seq_data );
00334 seq_data.push_back( cur_seq_data );
00335 }
00336 }
00337 }
00338
00339 if( create_ok ){
00340 if( !CallClustal( seq_data ) ){
00341 cout << "Clustal was unable to align:\n";
00342 cout << "Left match: " << *r_begin << endl;
00343 cout << "Right match: " << *r_end << endl;
00344 return false;
00345 }
00346
00347
00348 boolean good_alignment = true;
00349 gnSeqI flankI = 0;
00350 gnSeqI align_length=0;
00351 for( seqI = 1; seqI <= seq_count; seqI++ )
00352 if( align_length < ( seqlen_array[seqI] < 0 ? 0 : (gnSeqI)seqlen_array[seqI] ))
00353 align_length = seqlen_array[seqI];
00354
00355 if( !good_alignment ){
00356
00357 return false;
00358 }else{
00359
00360 cr = GappedAlignment( seq_count, align_length );
00361 vector< string > align_array;
00362 int64 last_residue = -1;
00363 int64 first_residue = align_length + 2;
00364 #ifdef MISALIGNMENT_WORKAROUND
00365 for( seqI = 2; seqI <= seq_count + 1; seqI++ ){
00366 #else
00367 for( seqI = 1; seqI <= seq_count; seqI++ ){
00368 #endif
00369 string new_seq = string( seqlen_array[ seqI ] - left_flank - right_flank, '-' );
00370 uint new_seq_charI = 0;
00371 uint cur_seq_len = 0;
00372 for( uint charJ = left_flank + 1; charJ <= seqlen_array[ seqI ] - right_flank; charJ++ ){
00373 char val = seq_array[ seqI ][ charJ ];
00374 if( val >= 0 && val <= max_aa ){
00375 if( charJ > last_residue )
00376 last_residue = charJ;
00377 if( charJ < first_residue )
00378 first_residue = charJ;
00379 new_seq[ new_seq_charI ]= amino_acid_codes[ val ];
00380 cur_seq_len++;
00381 }
00382 new_seq_charI++;
00383 }
00384 align_array.push_back( new_seq );
00385
00386 #ifdef MISALIGNMENT_WORKAROUND
00387 cr.SetStart( seqI - 2, starts[ seqI - 2 ] );
00388 cr.SetLength( cur_seq_len, seqI - 2 );
00389 #else
00390 cr.SetStart( seqI - 1, starts[ seqI - 1 ] );
00391 cr.SetLength( cur_seq_len, seqI - 1 );
00392 #endif
00393 }
00394 int64 end_gap_count = align_array[ 0 ].size() - (last_residue - left_flank);
00395 if( last_residue != -1 && end_gap_count > 0 ){
00396 for( seqI = 0; seqI < align_array.size(); seqI++ ){
00397 align_array[ seqI ] = align_array[ seqI ].substr( 0, align_array[ seqI ].size() - end_gap_count );
00398 }
00399 }
00400 int64 start_gap_count = left_flank + 1 - first_residue;
00401 if( first_residue != align_length && start_gap_count > 0 ){
00402 for( seqI = 0; seqI < align_array.size(); seqI++ ){
00403 align_array[ seqI ] = align_array[ seqI ].substr( start_gap_count, align_array[ seqI ].size() - start_gap_count );
00404 }
00405 }
00406 cr.SetAlignment( align_array );
00407 }
00408 return true;
00409 }
00410 }catch(exception& e){
00411 cerr << "At: " << __FILE__ << ":" << __LINE__ << endl;
00412 cerr << e.what();
00413 }
00414 return false;
00415 }
00416
00417
00418 boolean ClustalInterface::CallClustal( vector< string >& seq_table ){
00419 char* phylip_name;
00420
00421
00422 free_aln( nseqs );
00423 alloc_aln( seq_table.size() );
00424 allocated_aln = true;
00425
00426 if( distance_matrix.cols() == seq_table.size() ){
00427
00428 for( uint seqI = 0; seqI < nseqs; seqI++ )
00429 for( uint seqJ = 0; seqJ < nseqs; seqJ++ )
00430 tmat[ seqI + 1][ seqJ + 1 ] = 1 - distance_matrix( seqI, seqJ );
00431 }else{
00432
00433 phylip_name = (char * ) ckalloc( strlen( "tmp_tree.txt" ) + 1);
00434 strcpy( phylip_name, "tmp_tree.txt" );
00435 }
00436
00437 uint seqI;
00438 max_aln_length = 0;
00439 max_names = 0;
00440 for( seqI = 1; seqI <= seq_table.size(); seqI++ ){
00441 seqlen_array[ seqI ] = seq_table[ seqI - 1 ].length();
00442 ostringstream ss;
00443 ss << "seq" << seqI;
00444 int namelen = ss.str().size();
00445 names[ seqI ] = (char * ) ckalloc( namelen + 1 );
00446 titles[ seqI ] = (char * ) ckalloc( namelen + 1 );
00447 strcpy( names[ seqI ], ss.str().c_str());
00448 strcpy( titles[ seqI ], ss.str().c_str());
00449
00450
00451 if( (int)strlen( names[ seqI ] ) > max_names )
00452 max_names = strlen( names[ seqI ] );
00453 if( seqlen_array[ seqI ] > max_aln_length )
00454 max_aln_length = seqlen_array[ seqI ];
00455 }
00456
00457 for( seqI = 1; seqI <= seq_table.size(); seqI++ ){
00458
00459 alloc_seq( seqI, max_aln_length );
00460 char* seq_char_array = new char[ seq_table[ seqI - 1 ].length() + 2];
00461 uint copyI = 0;
00462 string& dna_seq = seq_table[ seqI - 1 ];
00463 for( ; copyI < dna_seq.length(); copyI++ )
00464 seq_char_array[ copyI + 1 ] = toupper( dna_seq[ copyI ] );
00465 seq_char_array[ 0 ] = '-';
00466 seq_char_array[ copyI + 1 ] = 0;
00467 n_encode( seq_char_array, seq_array[ seqI ], dna_seq.length() );
00468 delete[] seq_char_array;
00469 }
00470 max_aln_length *= 2;
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480 nseqs = seq_table.size();
00481 gap_open = dna_gap_open;
00482 gap_extend = dna_gap_extend;
00483 pw_go_penalty = dna_pw_go_penalty;
00484 pw_ge_penalty = dna_pw_ge_penalty;
00485
00486
00487
00488
00489 dnaflag = TRUE;
00490 output_clustal = FALSE;
00491
00492 int retval = 0;
00493 if( distance_matrix.cols() == seq_table.size() ){
00494
00495
00496
00497
00498 retval = malign_nofiles( 0, false );
00499
00500
00501
00502 }else{
00503 pairalign((sint)0,nseqs,(sint)0,nseqs);
00504
00505 FILE* tree;
00506 if((tree = open_explicit_file( phylip_name ))==NULL) return false;
00507 if (nseqs >= 2) {
00508 guide_tree(tree,1,nseqs);
00509 }
00510
00511
00512
00513
00514 retval = malign( 0, phylip_name );
00515
00516 phylip_name = (char*)ckfree( phylip_name );
00517
00518 }
00519
00520 if( retval <= 0 )
00521 return false;
00522 return true;
00523
00524 }
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574
00575
00576 }