00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012
00013 #include "libMems/MuscleInterface.h"
00014
00015 #include "libGenome/gnFilter.h"
00016 #include "libGenome/gnFASSource.h"
00017 #include "libGenome/gnStringTools.h"
00018 #include "libMUSCLE/muscle.h"
00019 #include "libMUSCLE/params.h"
00020 #include "libMUSCLE/msa.h"
00021 #include "libMUSCLE/seq.h"
00022 #include "libMUSCLE/seqvect.h"
00023 #include "libMUSCLE/tree.h"
00024 #include "libMUSCLE/clust.h"
00025 #include "libMUSCLE/profile.h"
00026 #include "libMUSCLE/distfunc.h"
00027 #include "libMUSCLE/clustsetdf.h"
00028 #include "libMUSCLE/textfile.h"
00029 #include "libMUSCLE/types.h"
00030 #include "boost/algorithm/string/erase.hpp"
00031 #include "boost/algorithm/string/case_conv.hpp"
00032
00033 #include <sstream>
00034 #include <fstream>
00035
00036
00037 extern void MUSCLE(SeqVect &v, MSA &msaOut);
00038 extern void RefineW(const MSA &msaIn, MSA &msaOut);
00039
00040 using namespace std;
00041 using namespace genome;
00042 namespace mems {
00043
00044 bool debug_muscle = false;
00045
00046 bool pipeExec( char** cmd_argv, const string& command, const string& input, string& output, string& error );
00047
00048 char** parseCommand( const string& cmd );
00049 char** parseCommand( const string& cmd ){
00050
00051
00052
00053 stringstream qs( cmd );
00054 string cur_str;
00055 boolean in_quote = true;
00056 int token_count = 0;
00057 vector< string > cmd_tokens;
00058 while( getline( qs, cur_str, '"' ) ){
00059
00060 in_quote = !in_quote;
00061 if( cur_str.length() == 0 )
00062 continue;
00063 if( in_quote ){
00064 cmd_tokens.push_back( cur_str );
00065 }else{
00066 stringstream ss( cur_str );
00067 string asdf;
00068 while( ss >> asdf )
00069 cmd_tokens.push_back( asdf );
00070 }
00071 }
00072 char ** cmd_array = new char*[ cmd_tokens.size() + 1 ];
00073 for( int tokI = 0; tokI < cmd_tokens.size(); tokI++ ){
00074 cmd_array[ tokI ] = new char[ cmd_tokens[ tokI ].length() + 1 ];
00075 strcpy( cmd_array[ tokI ], cmd_tokens[ tokI ].c_str() );
00076 }
00077 cmd_array[ cmd_tokens.size() ] = NULL;
00078 return cmd_array;
00079 }
00080
00081 #if !defined(WIN32)
00082 #include <unistd.h>
00083 #include <fcntl.h>
00084 #include <errno.h>
00085 #include <sys/types.h>
00086 #include <sys/wait.h>
00087
00088
00089 bool pipeExec( char** cmd_argv, const string& command, const string& input, string& output, string& error ){
00090 int stdin_pipe[2], stdout_pipe[2], stderr_pipe[2];
00091 boolean success = false;
00092 pid_t sid;
00093 pid_t pid1;
00094 const char* fail;
00095 char buf[1024];
00096 ssize_t bread = 0;
00097 int rval = 0;
00098
00099 if((sid = setsid()) < 0)sid = getpgrp();
00100
00101 if((sid < 0 && (fail = "sid"))
00102 || (pipe(stdin_pipe) < 0 && (fail = "stdin"))
00103 || (pipe(stdout_pipe) < 0 && (fail = "stdout"))
00104
00105 )
00106 {
00107 fprintf(stderr, "Ouch, the world just collapsed (%s).\n", fail);
00108 perror("muscle:");
00109 goto cleanup;
00110 }
00111
00112 fcntl(stdin_pipe[0], F_SETFL, fcntl(stdin_pipe[0], F_GETFL) & ~O_NONBLOCK);
00113 fcntl(stdin_pipe[1], F_SETFL, fcntl(stdin_pipe[1], F_GETFL) & ~O_NONBLOCK);
00114 fcntl(stdout_pipe[0], F_SETFL, fcntl(stdout_pipe[0], F_GETFL) & ~O_NONBLOCK);
00115 fcntl(stdout_pipe[1], F_SETFL, fcntl(stdout_pipe[1], F_GETFL) & ~O_NONBLOCK);
00116
00117
00118
00119 if((pid1 = fork()) < 0)goto cleanup;
00120 if(pid1)
00121 setpgid(pid1, sid);
00122 else
00123 {
00124 dup2(stdin_pipe[0], 0);
00125 dup2(stdout_pipe[1], 1);
00126
00127 close( stdin_pipe[0] );
00128 close( stdin_pipe[1] );
00129 close( stdout_pipe[0] );
00130 close( stdout_pipe[1] );
00131
00132
00133 execvp(cmd_argv[0], cmd_argv);
00134 _exit(errno);
00135 }
00136 rval = write( stdin_pipe[1], input.c_str(), input.size() );
00137 if( rval == -1 )
00138 perror( "write: " );
00139 if( close( stdin_pipe[1] ) )
00140 perror( "close stdin_w: " );
00141 if( close( stdin_pipe[0] ) )
00142 perror( "close stdin_r: " );
00143
00144 close( stdout_pipe[1] );
00145
00146 while(true){
00147 bzero( buf, sizeof(buf) );
00148 bread = read( stdout_pipe[0], buf, 1023 );
00149 if( bread == 0 )
00150 break;
00151 if( bread == -1 ){
00152 perror("muscle read: " );
00153 }
00154 output += buf;
00155 }
00156 wait( NULL );
00157 success = true;
00158
00159 cleanup:
00160 close(stdin_pipe[0]);
00161 close(stdin_pipe[1]);
00162 close(stdout_pipe[0]);
00163 close(stdout_pipe[1]);
00164
00165
00166 return success;
00167 };
00168
00169
00170 #else
00171
00172
00173 #include <windows.h>
00174 #define bzero(a) memset(a,0,sizeof(a)) //easier -- shortcut
00175
00176 bool IsWinNT()
00177 {
00178 OSVERSIONINFO osv;
00179 osv.dwOSVersionInfoSize = sizeof(osv);
00180 GetVersionEx(&osv);
00181 return (osv.dwPlatformId == VER_PLATFORM_WIN32_NT);
00182 }
00183
00184 void ErrorMessage(char *str)
00185 {
00186 LPVOID msg;
00187 FormatMessage(
00188 FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
00189 NULL,
00190 GetLastError(),
00191 MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
00192 (LPTSTR) &msg,
00193 0,
00194 NULL
00195 );
00196 printf("%s: %s\n",str,msg);
00197 LocalFree(msg);
00198 }
00199
00200 bool pipeExec( char** cmd_argv, const string& command, const string& input, string& output, string& error ){
00201
00202 char buf[1024];
00203
00204 STARTUPINFO si;
00205 SECURITY_ATTRIBUTES sa;
00206 SECURITY_DESCRIPTOR sd;
00207 PROCESS_INFORMATION pi;
00208 HANDLE newstdin_w,newstdout_w,newstderr_w,newstdin_r,newstdout_r,newstderr_r;
00209 HANDLE read_stdout,read_stderr,write_stdin;
00210 boolean success = false;
00211
00212 if (IsWinNT())
00213 {
00214 InitializeSecurityDescriptor(&sd,SECURITY_DESCRIPTOR_REVISION);
00215 SetSecurityDescriptorDacl(&sd, true, NULL, false);
00216 sa.lpSecurityDescriptor = &sd;
00217 }
00218 else sa.lpSecurityDescriptor = NULL;
00219
00220 sa.nLength = sizeof(SECURITY_ATTRIBUTES);
00221 sa.bInheritHandle = true;
00222
00223 if (!CreatePipe(&newstdin_r,&newstdin_w,&sa,0))
00224 {
00225 ErrorMessage("CreatePipe");
00226 goto finito;
00227 }
00228 if (!CreatePipe(&newstdout_r,&newstdout_w,&sa,0))
00229 {
00230 ErrorMessage("CreatePipe");
00231 goto finito;
00232 }
00233 if (!CreatePipe(&newstderr_r,&newstderr_w,&sa,0))
00234 {
00235 ErrorMessage("CreatePipe");
00236 goto finito;
00237 }
00238
00239 boolean fSuccess = DuplicateHandle(GetCurrentProcess(), newstdin_w,
00240 GetCurrentProcess(), &write_stdin, 0,
00241 FALSE,
00242 DUPLICATE_SAME_ACCESS);
00243 if (! fSuccess){
00244 ErrorMessage("DuplicateHandle failed");
00245 goto finito;
00246 }
00247 CloseHandle(newstdin_w);
00248 newstdin_w = INVALID_HANDLE_VALUE;
00249
00250
00251 fSuccess = DuplicateHandle(GetCurrentProcess(), newstdout_r,
00252 GetCurrentProcess(), &read_stdout, 0,
00253 FALSE,
00254 DUPLICATE_SAME_ACCESS);
00255 if (! fSuccess){
00256 ErrorMessage("DuplicateHandle failed");
00257 goto finito;
00258 }
00259 CloseHandle(newstdout_r);
00260 newstdout_r = INVALID_HANDLE_VALUE;
00261
00262
00263 fSuccess = DuplicateHandle(GetCurrentProcess(), newstderr_r,
00264 GetCurrentProcess(), &read_stderr, 0,
00265 FALSE,
00266 DUPLICATE_SAME_ACCESS);
00267 if (! fSuccess){
00268 ErrorMessage("DuplicateHandle failed");
00269 goto finito;
00270 }
00271 CloseHandle(newstderr_r);
00272 newstderr_r = INVALID_HANDLE_VALUE;
00273
00274 GetStartupInfo(&si);
00275
00276
00277
00278
00279
00280 si.dwFlags = STARTF_USESTDHANDLES|STARTF_USESHOWWINDOW;
00281 si.wShowWindow = SW_HIDE;
00282 si.hStdOutput = newstdout_w;
00283 si.hStdError = newstderr_w;
00284 si.hStdInput = newstdin_r;
00285
00286
00287 char* cmd = new char[ command.length() + 1 ];
00288 strcpy( cmd, command.c_str() );
00289 if (!CreateProcess(NULL,cmd,NULL,NULL,TRUE,0,
00290 NULL,NULL,&si,&pi))
00291 {
00292 delete cmd;
00293 ErrorMessage("CreateProcess");
00294 goto finito;
00295 }
00296 delete cmd;
00297
00298 unsigned long exit=0;
00299 unsigned long bread;
00300 unsigned long avail;
00301
00302 WriteFile(write_stdin, input.c_str(), input.size(), &bread, NULL);
00303 CloseHandle(write_stdin);
00304 write_stdin = INVALID_HANDLE_VALUE;
00305
00306
00307 while( true ){
00308 GetExitCodeProcess( pi.hProcess, &exit );
00309 if( exit != STILL_ACTIVE )
00310 WaitForSingleObject( pi.hProcess, INFINITE );
00311
00312
00313 PeekNamedPipe(read_stdout,buf,1023,&bread,&avail,NULL);
00314 if( avail == 0 )
00315 Sleep(5);
00316 while( avail > 0 ){
00317 bzero(buf);
00318 int read_size = 1023 < avail ? 1023 : avail;
00319 ReadFile(read_stdout,buf,read_size,&bread,NULL);
00320 avail -= bread;
00321 output += buf;
00322 }
00323
00324
00325 PeekNamedPipe(read_stderr,buf,1023,&bread,&avail,NULL);
00326 while( avail > 0 ){
00327 bzero(buf);
00328 int read_size = 1023 < avail ? 1023 : avail;
00329 ReadFile(read_stderr,buf,read_size,&bread,NULL);
00330 avail -= bread;
00331 error += buf;
00332 }
00333
00334 if( exit != STILL_ACTIVE )
00335 break;
00336 }
00337
00338 WaitForSingleObject( pi.hProcess, INFINITE );
00339 success = true;
00340
00341
00342 finito:
00343 if( pi.hThread != INVALID_HANDLE_VALUE )
00344 CloseHandle(pi.hThread);
00345 if( pi.hProcess != INVALID_HANDLE_VALUE )
00346 CloseHandle(pi.hProcess);
00347 if( newstdin_r != INVALID_HANDLE_VALUE )
00348 CloseHandle(newstdin_r);
00349 if( newstdout_w != INVALID_HANDLE_VALUE )
00350 CloseHandle(newstdout_w);
00351 if( newstderr_w != INVALID_HANDLE_VALUE )
00352 CloseHandle(newstderr_w);
00353 if( read_stdout != INVALID_HANDLE_VALUE )
00354 CloseHandle(read_stdout);
00355 if( read_stderr != INVALID_HANDLE_VALUE )
00356 CloseHandle(read_stderr);
00357 if( write_stdin != INVALID_HANDLE_VALUE )
00358 CloseHandle(write_stdin);
00359 return success;
00360 }
00361
00362 #endif
00363
00364
00365
00366 MuscleInterface& MuscleInterface::getMuscleInterface()
00367 {
00368 static MuscleInterface m_ci;
00369
00370 return m_ci;
00371 }
00372
00373 MuscleInterface::MuscleInterface() : GappedAligner() {
00374 muscle_path = "muscle_aed";
00375 muscle_arguments = "-stable -quiet -seqtype DNA";
00376 muscle_cmdline = parseCommand( muscle_path + " " + muscle_arguments );
00377 max_alignment_length = 12500;
00378 }
00379
00380 void MuscleInterface::ParseMusclePath( const char* argv0 ){
00381
00382 string path_str = argv0;
00383
00384 if( path_str[0] == '"' )
00385 path_str = path_str.substr( 1, path_str.size() - 2 );
00386 standardizePathString( path_str );
00387 string::size_type i = path_str.rfind('/');
00388 if( i != string::npos )
00389 path_str.erase(i+1, path_str.length() - (i+1));
00390 else
00391 path_str.clear();
00392 SetMusclePath( '"' + path_str + "muscle_aed\"");
00393 }
00394
00395 void MuscleInterface::SetMusclePath( const string& path ){
00396 muscle_path = path;
00397 ClearCommandLine();
00398 muscle_cmdline = parseCommand( muscle_path + " " + muscle_arguments );
00399 }
00400
00401 void MuscleInterface::SetExtraMuscleArguments( const string& args )
00402 {
00403 extra_muscle_arguments = args;
00404 }
00405
00406 void MuscleInterface::SetMuscleArguments( const string& args )
00407 {
00408 ClearCommandLine();
00409 muscle_arguments = args + " " + extra_muscle_arguments;
00410 muscle_cmdline = parseCommand( muscle_path + " " + args + " " + extra_muscle_arguments );
00411 }
00412
00413 MuscleInterface& MuscleInterface::operator=( const MuscleInterface& ci ){
00414 GappedAligner::operator =( ci );
00415 return *this;
00416 }
00417
00418
00419
00420
00421
00422
00423 boolean MuscleInterface::Align( GappedAlignment& cr, Match* r_begin, Match* r_end, vector< gnSequence* >& seq_table ){
00424 gnSeqI gap_size = 0;
00425 boolean create_ok = true;
00426 uint seq_count = seq_table.size();
00427
00428 uint seqI;
00429 uint align_seqs = 0;
00430 vector< string > tmp_mat = vector< string >( seq_count );
00431 try{
00432
00433
00434
00435
00436 vector< string > seq_data;
00437 vector< int64 > starts;
00438 vector< uint > seqs;
00439 const gnFilter* rc_filter = gnFilter::DNAComplementFilter();
00440
00441 for( seqI = 0; seqI < seq_count; seqI++ ){
00442
00443
00444 if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH ) ||
00445 (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
00446 starts.push_back( NO_MATCH );
00447 continue;
00448 }
00449
00450
00451 int64 gap_start = 0;
00452 int64 gap_end = 0;
00453 getInterveningCoordinates( seq_table, r_begin, r_end, seqI, gap_start, gap_end );
00454
00455 int64 diff = gap_end - gap_start;
00456 if( diff <= 0 || diff > max_alignment_length ){
00457 starts.push_back( NO_MATCH );
00458 continue;
00459 }
00460 seqs.push_back( seqI );
00461
00462
00463 if( r_end == NULL || r_end->Start( seqI ) > 0 ){
00464 starts.push_back( gap_start );
00465 seq_data.push_back( seq_table[ seqI ]->ToString( diff , gap_start ) );
00466 }else{
00467
00468 starts.push_back( -gap_start );
00469 string cur_seq_data = seq_table[ seqI ]->ToString( diff , gap_start );
00470 rc_filter->ReverseFilter( cur_seq_data );
00471 seq_data.push_back( cur_seq_data );
00472 }
00473 }
00474
00475 if( seqs.size() <= 1 )
00476 create_ok = false;
00477
00478 if( create_ok ){
00479
00480 vector< string > aln_matrix;
00481 if( !CallMuscleFast( aln_matrix, seq_data ) ){
00482 cout << "Muscle was unable to align:\n";
00483 if( r_begin )
00484 cout << "Left match: " << *r_begin << endl;
00485 if( r_end )
00486 cout << "Right match: " << *r_end << endl;
00487 return false;
00488 }
00489
00490 gnSeqI aln_length = aln_matrix.size() == 0 ? 0 : aln_matrix[0].length();
00491 cr = GappedAlignment( seq_count, aln_length );
00492 vector< string > aln_mat = vector< string >( seq_count );
00493
00494
00495 for( uint seqI = 0; seqI < seqs.size(); seqI++ ){
00496 cr.SetLength( seq_data[ seqI ].size(), seqs[ seqI ] );
00497 aln_mat[ seqs[ seqI ] ] = aln_matrix[ seqI ];
00498 }
00499 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00500 cr.SetStart( seqI, starts[ seqI ] );
00501 if( aln_mat[ seqI ].length() != aln_length )
00502 aln_mat[ seqI ] = string( aln_length, '-' );
00503 }
00504
00505 cr.SetAlignment( aln_mat );
00506
00507 return true;
00508 }
00509 }catch(exception& e){
00510 cerr << "At: " << __FILE__ << ":" << __LINE__ << endl;
00511 cerr << e.what();
00512 }
00513 return false;
00514 }
00515
00516 static int failure_count = 0;
00517
00518 boolean MuscleInterface::Align( GappedAlignment& cr, AbstractMatch* r_begin, AbstractMatch* r_end, vector< gnSequence* >& seq_table ){
00519 gnSeqI gap_size = 0;
00520 boolean create_ok = true;
00521
00522
00523
00524
00525
00526
00527 uint seq_count = r_begin->Multiplicity();
00528 uint seqI;
00529 uint align_seqs = 0;
00530 vector< string > tmp_mat = vector< string >( seq_count );
00531 try{
00532
00533
00534
00535
00536 vector< string > seq_data;
00537 vector< int64 > starts;
00538 vector< uint > seqs;
00539 const gnFilter* rc_filter = gnFilter::DNAComplementFilter();
00540
00541
00542 for( seqI = 0; seqI < seq_count; seqI++ ){
00543
00544
00545 if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH ) ||
00546 (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
00547 starts.push_back( NO_MATCH );
00548 continue;
00549 }
00550
00551
00552 int64 gap_start = 0;
00553 int64 gap_end = 0;
00554
00555 if( (r_end != NULL && r_end->Start( seqI ) == NO_MATCH) ||
00556 (r_begin != NULL && r_begin->Start( seqI ) == NO_MATCH) ){
00557 gap_start = 0;
00558 gap_end = 0;
00559 return true;
00560 }
00561
00562
00563 gap_end = r_end != NULL ? r_end->Start( seqI ) : seq_table[ seqI ]->length() + 1;
00564 gap_start = r_begin != NULL ? r_begin->End( seqI ) + 1 : 1;
00565 if( gap_end < 0 || gap_start < 0 ){
00566 gap_end = r_begin != NULL ? -r_begin->Start( seqI ) : seq_table[ 0 ]->length() + 1;
00567 gap_start = r_end != NULL ? -r_end->Start( seqI ) + r_end->Length( seqI ) : 1;
00568 }
00569 if( gap_end <= 0 || gap_start <= 0 ){
00570
00571 genome::ErrorMsg( "Error constructing intervening coordinates" );
00572 }
00573
00574 int64 diff = gap_end - gap_start;
00575
00576
00577 if( diff <= 0 || diff > max_alignment_length ){
00578 starts.push_back( NO_MATCH );
00579 continue;
00580 }
00581 seqs.push_back( seqI );
00582
00583
00584 if (0 )
00585 {
00586 starts.push_back( gap_start );
00587 seq_data.push_back( "A" );
00588 std::cout << "A" << std::endl;
00589 diff = 1;
00590 }
00591
00592 if( r_end == NULL || r_end->Start( seqI ) > 0 ){
00593 starts.push_back( gap_start );
00594
00595
00596
00597 seq_data.push_back( seq_table[ 0 ]->ToString( diff , gap_start ) );
00598 }else{
00599
00600 starts.push_back( -gap_start );
00601
00602
00603 string cur_seq_data = seq_table[ 0 ]->ToString( diff , gap_start );
00604 rc_filter->ReverseFilter( cur_seq_data );
00605 seq_data.push_back( cur_seq_data );
00606
00607 }
00608 }
00609
00610 if( seqs.size() <= 1 )
00611 create_ok = false;
00612
00613 if( create_ok ){
00614
00615 vector< string > aln_matrix;
00616 if( !CallMuscleFast( aln_matrix, seq_data ) ){
00617 cout << "Muscle was unable to align:\n";
00618
00619
00620
00621
00622 return false;
00623 }
00624
00625 gnSeqI aln_length = aln_matrix.size() == 0 ? 0 : aln_matrix[0].length();
00626 cr = GappedAlignment( seq_count, aln_length );
00627 vector< string > aln_mat = vector< string >( seq_count );
00628
00629
00630 for( uint seqI = 0; seqI < seqs.size(); seqI++ ){
00631 cr.SetLength( seq_data[ seqI ].size(), seqs[ seqI ] );
00632 aln_mat[ seqs[ seqI ] ] = aln_matrix[ seqI ];
00633 }
00634 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00635 cr.SetStart( seqI, starts[ seqI ] );
00636 if( aln_mat[ seqI ].length() != aln_length )
00637 aln_mat[ seqI ] = string( aln_length, '-' );
00638 }
00639
00640 cr.SetAlignment( aln_mat );
00641
00642 return true;
00643 }
00644 }catch(exception& e){
00645 cerr << "At: " << __FILE__ << ":" << __LINE__ << endl;
00646 cerr << e.what();
00647 }
00648 return false;
00649 }
00650
00651 boolean MuscleInterface::CallMuscle( vector< string >& aln_matrix, const vector< string >& seq_table )
00652 {
00653 gnSequence seq;
00654
00655 try{
00656 ostringstream input_seq_stream;
00657
00658 for( uint seqI = 0; seqI < seq_table.size(); seqI++ ){
00659 seq += seq_table[ seqI ];
00660 seq.setContigName( seqI, "seq" );
00661 }
00662 gnFASSource::Write( seq, input_seq_stream, false, true );
00663
00664 string muscle_cmd = muscle_path + " " + muscle_arguments;
00665 string output;
00666 string error;
00667 boolean success = pipeExec( muscle_cmdline, muscle_cmd, input_seq_stream.str(), output, error );
00668 if( !success || output.size() == 0 )
00669 {
00670 throw "b0rk3d";
00671 }
00672
00673 istringstream output_aln_stream( output );
00674 string cur_line;
00675
00676
00677 while( getline( output_aln_stream, cur_line ) ){
00678 if( cur_line[0] == '>' ){
00679 aln_matrix.push_back( "" );
00680 continue;
00681 }
00682 gnSeqI len = cur_line.size();
00683 len = cur_line[ len - 1 ] == '\r' ? len - 1 : len;
00684 uint seqI = aln_matrix.size() - 1;
00685 aln_matrix[ seqI ] += cur_line.substr( 0, len );
00686 }
00687
00688 return true;
00689 }catch( gnException& gne ){
00690 }catch( exception& e ){
00691 }catch(...){
00692 }
00693 cerr << "muscle failed! saving failed input data to muscle_failure_" << failure_count << ".txt\n";
00694 cerr << "Please contact the Mauve developers about this problem\n";
00695 stringstream debug_fname;
00696 debug_fname << "muscle_failure_" << failure_count++ << ".txt";
00697 ofstream debug_file( debug_fname.str().c_str() );
00698 gnFASSource::Write(seq, debug_file, false);
00699 debug_file.close();
00700 return false;
00701 }
00702
00703
00704 boolean MuscleInterface::CallMuscleFast( vector< string >& aln_matrix, const vector< string >& seq_table )
00705 {
00706 g_SeqType.get() = SEQTYPE_DNA;
00707 g_uMaxIters.get() = 1;
00708 g_bStable.get() = true;
00709 g_bQuiet.get() = true;
00710 g_SeqWeight1.get() = SEQWEIGHT_ClustalW;
00711
00712 SetMaxIters(g_uMaxIters.get());
00713 SetSeqWeightMethod(g_SeqWeight1.get());
00714
00715
00716 SeqVect sv;
00717 const char* seqname = "seq00000";
00718 for( size_t seqI = 0; seqI < seq_table.size(); seqI++ )
00719 {
00720 Seq curseq;
00721 curseq.SetId(seqI);
00722 curseq.SetName(seqname);
00723 curseq.resize(seq_table[seqI].size());
00724 std::copy(seq_table[seqI].begin(), seq_table[seqI].end(), curseq.begin());
00725 sv.AppendSeq(curseq);
00726 }
00727
00728 MSA msaTmp;
00729 MUSCLE(sv,msaTmp);
00730
00731
00732 aln_matrix.clear();
00733 aln_matrix.resize(msaTmp.GetSeqCount());
00734 for( size_t seqI = 0; seqI < msaTmp.GetSeqCount(); seqI++ )
00735 {
00736 unsigned indie = msaTmp.GetSeqIndex(seqI);
00737 const char* buf = msaTmp.GetSeqBuffer(indie);
00738 string curseq(buf, msaTmp.GetColCount());
00739 swap(aln_matrix[seqI],curseq);
00740 }
00741 return true;
00742 }
00743
00744 bool MuscleInterface::Refine( GappedAlignment& ga, size_t windowsize )
00745 {
00746 const vector< string >& seq_table = GetAlignment( ga, vector< gnSequence* >() );
00747 vector< string > aln_table;
00748 for( uint seqI = 0; seqI < ga.SeqCount(); seqI++ )
00749 {
00750 if( ga.LeftEnd(seqI) != NO_MATCH )
00751 {
00752 aln_table.push_back( seq_table[seqI] );
00753 }
00754 }
00755 vector< string > aln_matrix;
00756 if( windowsize == 0 )
00757 SetMuscleArguments( " -quiet -refine -seqtype DNA " );
00758 else
00759 {
00760 stringstream sstr;
00761 sstr << " -quiet -seqtype DNA -refinew -refinewindow " << windowsize << " ";
00762 SetMuscleArguments( sstr.str() );
00763 }
00764 bool success = CallMuscle( aln_matrix, aln_table );
00765 if( success )
00766 {
00767 aln_table.clear();
00768 uint alnI = 0;
00769 for( uint seqI = 0; seqI < ga.SeqCount(); seqI++ )
00770 {
00771 if( ga.LeftEnd(seqI) != NO_MATCH )
00772 aln_table.push_back( aln_matrix[alnI++] );
00773 else
00774 aln_table.push_back( string( aln_matrix[0].size(), '-' ) );
00775 }
00776 ga.SetAlignment( aln_table );
00777 }
00778 return success;
00779 }
00780
00781 void msaFromSeqTable(MSA& msa, const vector< string >& seq_table, unsigned id_base = 0)
00782 {
00783 msa.SetSize(seq_table.size(), seq_table[0].size());
00784 for( uint seqI = 0; seqI < seq_table.size(); seqI++ )
00785 {
00786 stringstream ss;
00787 ss << "seq" << seqI;
00788 msa.SetSeqName(seqI, ss.str().c_str());
00789 msa.SetSeqId(seqI,seqI+id_base);
00790 for(size_t i = 0; i < seq_table[seqI].size(); i++)
00791 msa.SetChar(seqI, i, seq_table[seqI][i]);
00792 }
00793 }
00794
00795
00796 bool MuscleInterface::RefineFast( GappedAlignment& ga, size_t windowsize )
00797 {
00798 const vector< string >& seq_table = GetAlignment( ga, vector< gnSequence* >() );
00799 vector< string > aln_table;
00800 for( uint seqI = 0; seqI < ga.SeqCount(); seqI++ )
00801 {
00802 if( ga.LeftEnd(seqI) != NO_MATCH )
00803 {
00804 aln_table.push_back( seq_table[seqI] );
00805 }
00806 }
00807
00808 g_SeqType.get() = SEQTYPE_DNA;
00809 g_uMaxIters.get() = 1;
00810 g_bStable.get() = true;
00811 g_bQuiet.get() = true;
00812 g_SeqWeight1.get() = SEQWEIGHT_ClustalW;
00813
00814 g_uRefineWindow.get() = windowsize;
00815 g_uWindowTo.get() = 0;
00816
00817 SetMaxIters(g_uMaxIters.get());
00818 SetSeqWeightMethod(g_SeqWeight1.get());
00819
00820 MSA::SetIdCount(seq_table.size());
00821
00822
00823 MSA msa;
00824 msaFromSeqTable(msa, seq_table);
00825
00826 SetAlpha(ALPHA_DNA);
00827 msa.FixAlpha();
00828 SetPPScore(PPSCORE_SPN);
00829 SetMuscleInputMSA(msa);
00830
00831 Tree GuideTree;
00832 TreeFromMSA(msa, GuideTree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get());
00833 SetMuscleTree(GuideTree);
00834
00835 MSA msaOut;
00836 MSA* finalMsa;
00837
00838 if(windowsize == 0)
00839 {
00840 if (g_bAnchors.get())
00841 RefineVert(msa, GuideTree, g_uMaxIters.get());
00842 else
00843 RefineHoriz(msa, GuideTree, g_uMaxIters.get(), false, false);
00844 finalMsa = &msa;
00845 }else{
00846 RefineW(msa, msaOut);
00847 finalMsa = &msaOut;
00848 }
00849
00850
00851 ValidateMuscleIds(*finalMsa);
00852 ValidateMuscleIds(GuideTree);
00853
00854
00855 vector< string > aln_matrix;
00856 aln_matrix.resize(finalMsa->GetSeqCount());
00857 for( size_t seqI = 0; seqI < finalMsa->GetSeqCount(); seqI++ )
00858 {
00859 unsigned indie = finalMsa->GetSeqIndex(seqI);
00860 const char* buf = finalMsa->GetSeqBuffer(indie);
00861 string curseq(buf, finalMsa->GetColCount());
00862 swap(aln_matrix[seqI],curseq);
00863 }
00864
00865 ga.SetAlignment( aln_matrix );
00866 return true;
00867 }
00868
00869
00870 void stripGapColumns( std::vector< std::string >& aln )
00871 {
00872 size_t cur_col = 0;
00873 size_t gap_seq = 0;
00874 for( size_t colI = 0; colI < aln[0].size(); colI++ )
00875 {
00876 gap_seq = 0;
00877 for( ; gap_seq < aln.size(); gap_seq++ )
00878 if( aln[gap_seq][colI] != '-' )
00879 break;
00880 if( gap_seq != aln.size() )
00881 {
00882 for( gap_seq = 0; gap_seq < aln.size(); gap_seq++ )
00883 aln[gap_seq][cur_col] = aln[gap_seq][colI];
00884 cur_col++;
00885 }
00886 }
00887 for( gap_seq = 0; gap_seq < aln.size(); gap_seq++ )
00888 aln[gap_seq].resize(cur_col);
00889 }
00890
00891 void stripGaps( std::string& str )
00892 {
00893 std::string::iterator striter = std::remove(str.begin(), str.end(), '-');
00894 str.resize(striter - str.begin());
00895 }
00896
00897 bool MuscleInterface::ProfileAlign( const GappedAlignment& ga1, const GappedAlignment& ga2, GappedAlignment& aln, bool anchored )
00898 {
00899 try{
00900 const vector< string >& aln1 = GetAlignment( ga1, vector< gnSequence* >() );
00901 const vector< string >& aln2 = GetAlignment( ga2, vector< gnSequence* >() );
00902 vector< uint > order;
00903 ostringstream input_seq_stream;
00904 gnSequence seq;
00905 vector< string > aln11( ga1.Multiplicity() );
00906 vector< string > aln22( ga2.Multiplicity() );
00907 size_t curI = 0;
00908 for( uint seqI = 0; seqI < aln1.size(); seqI++ )
00909 {
00910 if( ga1.LeftEnd(seqI) != NO_MATCH )
00911 {
00912 aln11[curI++] = aln1[seqI];
00913 order.push_back(seqI);
00914 }
00915 }
00916 curI = 0;
00917 for( uint seqI = 0; seqI < aln2.size(); seqI++ )
00918 {
00919 if( ga2.LeftEnd(seqI) != NO_MATCH )
00920 {
00921 aln22[curI++] = aln2[seqI];
00922 order.push_back(seqI);
00923 }
00924 }
00925
00926 if( !anchored )
00927 {
00928 stripGapColumns(aln11);
00929 stripGapColumns(aln22);
00930 }
00931 for( uint seqI = 0; seqI < aln11.size(); seqI++ )
00932 {
00933 seq += aln11[ seqI ];
00934 seq.setContigName( seq.contigListLength()-1, "seq" );
00935 }
00936
00937 gnFASSource::Write( seq, input_seq_stream, false, true );
00938 input_seq_stream << "=\n";
00939
00940 gnSequence seq2;
00941 for( uint seqI = 0; seqI < aln22.size(); seqI++ )
00942 {
00943 seq2 += aln22[ seqI ];
00944 seq2.setContigName( seq2.contigListLength()-1, "seq" );
00945 }
00946
00947 gnFASSource::Write( seq2, input_seq_stream, false, true );
00948 input_seq_stream << "=\n";
00949
00950 if( debug_muscle )
00951 {
00952
00953 stringstream debug_fname;
00954 debug_fname << "muscle_debug_" << failure_count++ << ".txt";
00955 ofstream debug_file( debug_fname.str().c_str() );
00956 debug_file << input_seq_stream.str();
00957 debug_file.close();
00958 }
00959
00960
00961 string musc_args = "-quiet -seqtype DNA -profile -ProfileOnStdIn ";
00962 if( anchored )
00963 musc_args += "-AnchoredPP ";
00964 SetMuscleArguments( musc_args );
00965 string output;
00966 string error;
00967 string muscle_cmd = muscle_path + " " + muscle_arguments;
00968 if( debug_muscle )
00969 {
00970 cerr << "Running " << muscle_cmd << endl;
00971 }
00972 boolean success = pipeExec( muscle_cmdline, muscle_cmd, input_seq_stream.str(), output, error );
00973 if( !success || output.size() == 0 )
00974 {
00975 if( output.size() == 0 )
00976 cerr << "\nmuscle nothing\n";
00977 else
00978 cerr << "\nunsuccessful muscle\n";
00979 return false;
00980 }
00981
00982 istringstream output_aln_stream( output );
00983 string cur_line;
00984
00985
00986 vector< string > aln_matrix( ga1.SeqCount() );
00987 int ordI = -1;
00988 while( getline( output_aln_stream, cur_line ) ){
00989 if( cur_line[0] == '>' ){
00990 ordI++;
00991 continue;
00992 }
00993 gnSeqI len = cur_line.size();
00994 len = cur_line[ len - 1 ] == '\r' ? len - 1 : len;
00995 uint seqI = aln_matrix.size() - 1;
00996 aln_matrix[ order[ordI] ] += cur_line.substr( 0, len );
00997 }
00998 for( size_t i = 0; i < aln_matrix.size(); i++ )
00999 {
01000 if( aln_matrix[i].size() == 0 )
01001 aln_matrix[i].resize( aln_matrix[order[0]].size(), '-' );
01002 }
01003
01004 aln.SetAlignment( aln_matrix );
01005 for( uint seqI = 0; seqI < ga1.SeqCount(); seqI++ )
01006 if( ga1.LeftEnd(seqI) != NO_MATCH )
01007 {
01008 aln.SetLeftEnd(seqI, ga1.LeftEnd(seqI));
01009 aln.SetLength(ga1.Length(seqI), seqI);
01010 }
01011 for( uint seqI = 0; seqI < ga2.SeqCount(); seqI++ )
01012 if( ga2.LeftEnd(seqI) != NO_MATCH )
01013 {
01014 aln.SetLeftEnd(seqI, ga2.LeftEnd(seqI));
01015 aln.SetLength(ga2.Length(seqI), seqI);
01016 }
01017 return true;
01018 }catch( gnException& gne ){
01019 }catch( exception& e ){
01020 }catch(...){
01021 }
01022 return false;
01023 }
01024
01025
01026 bool MuscleInterface::ProfileAlignFast( const GappedAlignment& ga1, const GappedAlignment& ga2, GappedAlignment& aln, bool anchored )
01027 {
01028 try{
01029 const vector< string >& aln1 = GetAlignment( ga1, vector< gnSequence* >() );
01030 const vector< string >& aln2 = GetAlignment( ga2, vector< gnSequence* >() );
01031 vector< uint > order;
01032 vector< string > aln11( ga1.Multiplicity() );
01033 vector< string > aln22( ga2.Multiplicity() );
01034 size_t curI = 0;
01035 for( uint seqI = 0; seqI < aln1.size(); seqI++ )
01036 {
01037 if( ga1.LeftEnd(seqI) != NO_MATCH )
01038 {
01039 aln11[curI++] = aln1[seqI];
01040 order.push_back(seqI);
01041 }
01042 }
01043 curI = 0;
01044 for( uint seqI = 0; seqI < aln2.size(); seqI++ )
01045 {
01046 if( ga2.LeftEnd(seqI) != NO_MATCH )
01047 {
01048 aln22[curI++] = aln2[seqI];
01049 order.push_back(seqI);
01050 }
01051 }
01052
01053 if( !anchored )
01054 {
01055 stripGapColumns(aln11);
01056 stripGapColumns(aln22);
01057 }
01058
01059 g_SeqType.get() = SEQTYPE_DNA;
01060 g_uMaxIters.get() = 1;
01061 g_bStable.get() = true;
01062 g_bQuiet.get() = true;
01063 g_SeqWeight1.get() = SEQWEIGHT_ClustalW;
01064
01065 SetMaxIters(g_uMaxIters.get());
01066 SetSeqWeightMethod(g_SeqWeight1.get());
01067
01068 MSA::SetIdCount(order.size());
01069
01070 MSA msa1;
01071 MSA msa2;
01072 MSA msaOut;
01073 msaFromSeqTable(msa1, aln11);
01074 msaFromSeqTable(msa2, aln22, msa1.GetSeqCount());
01075
01076 SetAlpha(ALPHA_DNA);
01077 msa1.FixAlpha();
01078 msa2.FixAlpha();
01079 SetPPScore(PPSCORE_SPN);
01080
01081 if(anchored)
01082 {
01083 AnchoredProfileProfile(msa1, msa2, msaOut);
01084 }else{
01085 ProfileProfile(msa1, msa2, msaOut);
01086 }
01087
01088
01089 vector< string > aln_matrix( aln1.size() );
01090 for( size_t seqI = 0; seqI < msaOut.GetSeqCount(); seqI++ )
01091 {
01092 unsigned indie = msaOut.GetSeqIndex(seqI);
01093 const char* buf = msaOut.GetSeqBuffer(indie);
01094 string curseq(buf, msaOut.GetColCount());
01095 swap(aln_matrix[order[indie]],curseq);
01096
01097
01098
01099
01100
01101
01102
01103
01104
01105
01106
01107
01108
01109 }
01110
01111 for( size_t seqI = 0; seqI < aln_matrix.size(); seqI++ )
01112 if(aln_matrix[seqI].size() == 0)
01113 aln_matrix[seqI].resize(msaOut.GetColCount(), '-');
01114
01115 aln.SetAlignment( aln_matrix );
01116 for( uint seqI = 0; seqI < ga1.SeqCount(); seqI++ )
01117 if( ga1.LeftEnd(seqI) != NO_MATCH )
01118 {
01119 aln.SetLeftEnd(seqI, ga1.LeftEnd(seqI));
01120 aln.SetLength(ga1.Length(seqI), seqI);
01121 }
01122 for( uint seqI = 0; seqI < ga2.SeqCount(); seqI++ )
01123 if( ga2.LeftEnd(seqI) != NO_MATCH )
01124 {
01125 aln.SetLeftEnd(seqI, ga2.LeftEnd(seqI));
01126 aln.SetLength(ga2.Length(seqI), seqI);
01127 }
01128 return true;
01129
01130 }catch( gnException& gne ){
01131 }catch( exception& e ){
01132 }catch(...){
01133 }
01134 return false;
01135 }
01136
01137
01138 void MuscleInterface::CreateTree( const NumericMatrix<double>& distances, const std::string& tree_filename )
01139 {
01140 DistFunc df;
01141 df.SetCount( distances.rows() );
01142 for( size_t i = 0; i < distances.rows(); i++ )
01143 for( size_t j = 0; j < distances.rows(); j++ )
01144 df.SetDist( i, j, distances(i,j) );
01145
01146 for( size_t i = 0; i < distances.rows(); i++ )
01147 {
01148 stringstream ss;
01149 ss << "seq";
01150 ss << i + 1;
01151 df.SetName( i, ss.str().c_str() );
01152 df.SetId( i, i );
01153 }
01154 ClustSetDF csdf( df );
01155 Clust crusty;
01156 crusty.Create( csdf, CLUSTER_NeighborJoining );
01157 Tree tt;
01158 tt.FromClust( crusty );
01159 TextFile tf( tree_filename.c_str(), true );
01160 tt.ToFile( tf );
01161 }
01162
01163
01164 }