00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012
00013 #include "libMems/gnAlignedSequences.h"
00014 #include <sstream>
00015
00016 using namespace std;
00017 using namespace genome;
00018 namespace mems {
00019
00020 gnAlignedSequences::gnAlignedSequences()
00021 {
00022 alignedSequenceFileName = "";
00023
00024
00025 }
00026
00027
00028 gnAlignedSequences::gnAlignedSequences(const gnAlignedSequences &toCopy)
00029 {
00030 alignedSequenceFileName = toCopy.alignedSequenceFileName;
00031 consensus = toCopy.consensus;
00032
00033 names = toCopy.names;
00034 sequences = toCopy.sequences;
00035 positions = toCopy.positions;
00036 }
00037
00038
00039 void gnAlignedSequences::constructFromClustalW(string alignedFileName)
00040 {
00041 alignedSequenceFileName = alignedFileName;
00042
00043 readClustalWAlignment();
00044 buildConsensus();
00045
00046 indexPositions.resize(consensus.size());
00047 for (int i=0; i<consensus.size(); i++)
00048 indexPositions[i] = i+1;
00049 }
00050
00051
00052 void gnAlignedSequences::constructFromPhylip(string alignedFileName)
00053 {
00054 alignedSequenceFileName = alignedFileName;
00055
00056 readPhylipAlignment();
00057 buildConsensus();
00058
00059 indexPositions.resize(consensus.size());
00060 for (int i=0; i<consensus.size(); i++)
00061 indexPositions[i] = i+1;
00062 }
00063
00064
00065 void gnAlignedSequences::constructFromMSF(string alignedFileName)
00066 {
00067 alignedSequenceFileName = alignedFileName;
00068
00069 readMSFAlignment();
00070 buildConsensus();
00071
00072 indexPositions.resize(consensus.size());
00073 for (int i=0; i<consensus.size(); i++)
00074 indexPositions[i] = i+1;
00075 }
00076
00077
00078 void gnAlignedSequences::constructFromRelaxedNexus( istream& align_stream ){
00079 readRelaxedNexusAlignment( align_stream );
00080
00081
00082
00083
00084
00085 }
00086
00087 void gnAlignedSequences::constructFromNexus(string alignedFileName)
00088 {
00089 alignedSequenceFileName = alignedFileName;
00090
00091 readNexusAlignment();
00092 buildConsensus();
00093
00094 indexPositions.resize(consensus.size());
00095 for (int i=0; i<consensus.size(); i++)
00096 indexPositions[i] = i+1;
00097 }
00098
00099
00100 void gnAlignedSequences::constructFromMega(string alignedFileName)
00101 {
00102 alignedSequenceFileName = alignedFileName;
00103
00104 readMegaAlignment();
00105 buildConsensus();
00106
00107 indexPositions.resize(consensus.size());
00108 for (int i=0; i<consensus.size(); i++)
00109 indexPositions[i] = i+1;
00110 }
00111
00112 const vector< string >& gnAlignedSequences::getSupportedFormats()
00113 {
00114 static vector< string > formats;
00115 if( formats.size() == 0 ){
00116 formats.push_back( "phylip" );
00117 formats.push_back( "clustal" );
00118 formats.push_back( "msf" );
00119 formats.push_back( "nexus" );
00120 formats.push_back( "mega" );
00121 formats.push_back( "codon" );
00122 }
00123 return formats;
00124 }
00125
00126 boolean gnAlignedSequences::isSupportedFormat( const string& format_name )
00127 {
00128 const vector< string >& formats = getSupportedFormats();
00129 for( int formatI = 0; formatI < formats.size(); formatI++ ){
00130 if( formats[ formatI ] == format_name )
00131 return true;
00132 }
00133 return false;
00134 }
00135 void gnAlignedSequences::output( const string& format_name, ostream& os ) const
00136 {
00137 bool rval = false;
00138
00139 if( format_name == "phylip" )
00140 rval = outputPhylip( os );
00141
00142 if( format_name == "clustal" )
00143 rval = outputClustalW( os );
00144
00145 if( format_name == "msf" )
00146 rval = outputMSF( os );
00147
00148 if( format_name == "nexus" )
00149 rval = outputNexus( os );
00150
00151 if( format_name == "mega" )
00152 rval = outputMega( os );
00153
00154 if( format_name == "codon" )
00155 rval = outputCodon( os );
00156
00157 if( !rval )
00158 throw "Error writing alignment\n";
00159
00160 }
00161
00162 bool gnAlignedSequences::outputPhylip(ostream& os) const
00163 {
00164
00165 os << "Sequences in Alignment: " << sequences.size()
00166 << " Bases in Each Aligned Sequence: " << sequences[0].length() << endl;
00167
00168 int offset = 10;
00169 uint seqI;
00170 for( seqI = 0; seqI < sequences.size(); seqI++ )
00171 {
00172 int position = 0;
00173 const string& seq = sequences[ seqI ];
00174 string seqName = names[ seqI ].substr( 0, offset );
00175 seqName.append( offset - seqName.length() + 1, ' ' );
00176
00177 os << seqName;
00178
00179 for ( position=0; position + offset < seq.size(); position += offset){
00180 if ( position % 50 == 0)
00181 os << endl;
00182 os.write( seq.data() + position, offset );
00183 os << ' ';
00184 }
00185
00186 if ( position % 50 == 0)
00187 os << endl;
00188
00189 os.write( seq.data() + position, seq.size() - position );
00190 os << endl;
00191 }
00192
00193 return true;
00194 }
00195
00196 uint64 countGaps( string& seq );
00197 uint64 countGaps( string& seq ){
00198 uint gap_count = 0;
00199 for( uint charI = 0; charI < seq.length(); charI++ )
00200 if( seq[ charI ] == '-' )
00201 gap_count++;
00202 return gap_count;
00203 }
00204
00205 bool gnAlignedSequences::outputClustalW(ostream& os) const
00206 {
00207 boolean output_positions = true;
00208
00209 os << "Clustal W multiple sequence alignment" << endl;
00210
00211 vector< int64 > seq_pos( sequences.size(), 0 );
00212 if( positions.size() == sequences.size() )
00213 seq_pos = positions;
00214 vector< string > seq_names;
00215 int pos;
00216 uint seqI = 0;
00217 int longestNameSize = 0;
00218 for( ; seqI < sequences.size(); seqI++ )
00219 {
00220 seq_names.push_back( names[ seqI ].substr( 0, 30 ) );
00221 if ( seq_names[ seq_names.size() - 1 ].length() > longestNameSize)
00222 longestNameSize=seq_names[ seq_names.size() - 1 ].length();
00223 }
00224
00225 for( seqI = 0; seqI < seq_names.size(); seqI++ )
00226 seq_names[ seqI ] += string( (longestNameSize - seq_names[ seqI ].length()) + 6, ' ' );
00227 for (pos=0; pos+60 < alignedSeqsSize(); pos+=60)
00228 {
00229 os << endl
00230 << endl;
00231 for( seqI = 0; seqI < sequences.size(); seqI++ )
00232 {
00233 os << seq_names[ seqI ];
00234 const string& seq = sequences[ seqI ];
00235 string cur_seq = seq.substr( pos, 60 );
00236 os << cur_seq;
00237 if( output_positions ){
00238 seq_pos[ seqI ] += 60 - countGaps( cur_seq );
00239 os << " " << seq_pos[ seqI ];
00240 }
00241 os << endl;
00242 }
00243 }
00244
00245 if (pos<alignedSeqsSize())
00246 {
00247 os << endl
00248 << endl;
00249
00250 for( seqI = 0; seqI < sequences.size(); seqI++ )
00251 {
00252 os << seq_names[ seqI ];
00253 const string& seq = sequences[ seqI ];
00254 string cur_seq = seq.substr( pos, 60 );
00255 os << cur_seq;
00256 if( output_positions ){
00257 seq_pos[ seqI ] += 60 - countGaps( cur_seq );
00258 os << " " << seq_pos[ seqI ];
00259 }
00260 os << endl;
00261 }
00262 }
00263 return true;
00264 }
00265
00266
00267 bool gnAlignedSequences::outputMSF(ostream& os) const
00268 {
00269 os << "//" << endl;
00270
00271 list <pair <string*, string*> >::const_iterator sequenceItr = alignedSequences.begin();
00272 int longestSeqNameLength = 0;
00273 for ( ; sequenceItr!=alignedSequences.end(); sequenceItr++)
00274 {
00275 if ((*(*sequenceItr).first).length() > longestSeqNameLength)
00276 longestSeqNameLength = (*(*sequenceItr).first).length();
00277 }
00278
00279 int pos = 0;
00280 for ( ; pos+60<(*(*alignedSequences.begin()).second).size(); pos+=60)
00281 {
00282
00283 for (int i=0; i<longestSeqNameLength+2; i++)
00284 os << " ";
00285
00286 os << pos+1;
00287 for (int i=0; i<54; i++)
00288 os << " ";
00289 os << pos+60 << endl;
00290
00291 for (sequenceItr=alignedSequences.begin(); sequenceItr!=alignedSequences.end(); sequenceItr++)
00292 {
00293 int spaces = longestSeqNameLength-(*(*sequenceItr).first).length();
00294 for (int i=0; i<spaces; i++)
00295 os << " ";
00296
00297 os << (*(*sequenceItr).first) << " ";
00298
00299 string seq = (*(*sequenceItr).second).substr(pos, 60);
00300 for (int i=0; i<60; i++)
00301 {
00302 if (seq[i]=='-')
00303 os << ".";
00304 else
00305 os << seq[i];
00306 }
00307 os << endl;
00308 }
00309
00310 os << endl;
00311 }
00312
00313 if (pos<(*(*alignedSequences.begin()).second).size())
00314 {
00315
00316 for (int i=0; i<longestSeqNameLength+2; i++)
00317 os << " ";
00318
00319 os << pos+1;
00320 for (int i=0; i<(*(*alignedSequences.begin()).second).size()-pos; i++)
00321 os << " ";
00322 os << (*(*alignedSequences.begin()).second).size() << endl;
00323
00324 for (sequenceItr=alignedSequences.begin(); sequenceItr!=alignedSequences.end(); sequenceItr++)
00325 {
00326 int spaces = longestSeqNameLength-(*(*sequenceItr).first).length();
00327 for (int i=0; i<spaces; i++)
00328 os << " ";
00329
00330 os << (*(*sequenceItr).first) << " ";
00331
00332 string seq = (*(*sequenceItr).second).substr(pos, (*(*alignedSequences.begin()).second).size()-pos );
00333 for (int i=0; i<seq.length(); i++)
00334 {
00335 if (seq[i]=='-')
00336 os << ".";
00337 else
00338 os << seq[i];
00339 }
00340 os << endl;
00341 }
00342
00343 os << endl;
00344 }
00345
00346 return false;
00347 }
00348
00349
00350
00351 bool gnAlignedSequences::outputNexus(ostream& os) const
00352 {
00353 os << "begin data;" << endl
00354 << " dimensions ntax=" << sequences.size();
00355 if( sequences.size() == 0 )
00356 return true;
00357 os << " nchar="
00358 << sequences[0].length() << ";" << endl
00359 << " ;" << endl
00360 << " matrix" << endl;
00361
00362 list <pair <string*, string*> >::const_iterator sequenceItr = alignedSequences.begin();
00363 int i;
00364 int seqI;
00365 int longestSeqNameLength = 0;
00366 for( seqI = 0; seqI < sequences.size(); seqI++ ){
00367 if( names[ seqI ].length() > longestSeqNameLength )
00368 longestSeqNameLength = names[ seqI ].length();
00369 }
00370
00371 int pos = 1;
00372 for ( ; pos+59 < sequences[0].size(); pos+=60)
00373 {
00374 os << "[";
00375
00376 for (i = 0; i < longestSeqNameLength+2; i++)
00377 os << " ";
00378
00379 os << pos;
00380 for (i = 0; i < 54; i++)
00381 os << " ";
00382 os << pos+59 << "]" << endl;
00383
00384 for( seqI = 0; seqI < sequences.size(); seqI++ )
00385 {
00386 os << names[ seqI ];
00387
00388 int spaces = longestSeqNameLength - names[ seqI ].length();
00389 for (i = 0; i < spaces + 2; i++)
00390 os << " ";
00391
00392 string seq = sequences[ seqI ].substr( pos, 60 );
00393 os << seq << endl;
00394 }
00395
00396 os << endl;
00397 }
00398
00399
00400 if (pos - 1 < sequences[0].size())
00401 {
00402
00403 os << "[";
00404
00405 for (i = 0; i < longestSeqNameLength + 2; i++)
00406 os << " ";
00407
00408 os << pos;
00409 for (i=0; i < sequences[0].size() - pos + 1; i++)
00410 os << " ";
00411 os << pos+59 << "]" << endl;
00412
00413 for (sequenceItr = alignedSequences.begin(); sequenceItr != alignedSequences.end(); sequenceItr++)
00414 for( seqI = 0; seqI < sequences.size(); seqI++ )
00415 {
00416 os << names[ seqI ];
00417
00418 int spaces = longestSeqNameLength - names[ seqI ].length();
00419 for (i=0; i<spaces+2; i++)
00420 os << " ";
00421
00422 string seq = sequences[seqI].substr( pos, sequences[seqI].size()-pos+1 );
00423 os << seq << endl;
00424 }
00425
00426 os << endl;
00427 }
00428
00429 return true;
00430 }
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512 bool gnAlignedSequences::outputMega(ostream& os) const
00513 {
00514 os << "#MEGA" << endl
00515 << "TITLE:" << endl;
00516
00517 list <pair <string*, string*> >::const_iterator sequenceItr = alignedSequences.begin();
00518 int longestSeqNameLength = 0;
00519
00520 for ( ; sequenceItr!=alignedSequences.end(); sequenceItr++){
00521 if (sequenceItr->first->length() > longestSeqNameLength)
00522 longestSeqNameLength = sequenceItr->first->length();
00523 }
00524
00525 gnSeqI pos = 1;
00526 gnSeqI remaining_len = alignedSequences.begin()->second->size();
00527
00528 while(remaining_len > 0){
00529 os << endl;
00530 gnSeqI write_chars = MEGA_ALIGN_COLUMNS < remaining_len ? MEGA_ALIGN_COLUMNS : remaining_len;
00531
00532
00533 for (sequenceItr = alignedSequences.begin(); sequenceItr != alignedSequences.end(); sequenceItr++){
00534 os << "#" << *(sequenceItr->first);
00535
00536 int spaces = longestSeqNameLength - sequenceItr->first->length();
00537 for (int i = 0; i < spaces + 5; i++)
00538 os << " ";
00539
00540 string seq = sequenceItr->second->substr( pos, write_chars );
00541 for (int i = 0; i < write_chars; i++)
00542 os << seq[i];
00543 os << endl;
00544 }
00545 os << endl;
00546
00547 pos += write_chars;
00548 remaining_len -= write_chars;
00549 }
00550 return true;
00551 }
00552
00553
00554 bool gnAlignedSequences::outputCodon(ostream& os) const
00555 {
00556 list <pair <string*, string*> >::const_iterator sequenceItr = alignedSequences.begin();
00557
00558 os << '\t' << alignedSequences.size() << '\t' << (*(*sequenceItr).second).size() << endl;
00559
00560 int offset = 10;
00561 for ( ; sequenceItr != alignedSequences.end(); sequenceItr++)
00562 {
00563 int position = 0;
00564 string seq = (*(*sequenceItr).second);
00565 string seqName = (*(*sequenceItr).first);
00566 if (seqName.size() <= offset)
00567 {
00568 for (int i=seqName.size(); i<offset; i++)
00569 seqName += " ";
00570 }
00571
00572 else
00573 {
00574 string temp = seqName;
00575 seqName = "";
00576 for (int i=0; i<offset; i++)
00577 seqName += temp[i];
00578 }
00579
00580 os << seqName;
00581 int count = 0;
00582 for ( ; position+3<seq.size(); position+=3)
00583 {
00584 if (count == 20)
00585 {
00586 count = 0;
00587 os << endl;
00588 }
00589 for (int i=position; i<position+3; i++)
00590 os << seq[i];
00591
00592 os << ' ';
00593 count++;
00594 }
00595
00596 for ( ; position < seq.size(); position++)
00597 os << seq[position];
00598
00599 os << endl;
00600 }
00601
00602 return false;
00603 }
00604
00605
00606 bool gnAlignedSequences::outputWithConsensus(ostream& os)
00607 {
00608 list <pair <string*, string*> >::iterator sequenceItr = alignedSequences.begin();
00609
00610 os << '\t' << alignedSequences.size() << '\t' << (*(*sequenceItr).second).size() << endl;
00611
00612 int offset = 10;
00613 for ( ; sequenceItr != alignedSequences.end(); sequenceItr++)
00614 {
00615 int position = 0;
00616 string seq = (*(*sequenceItr).second);
00617 string seqName = (*(*sequenceItr).first);
00618 if (seqName.size() <= offset)
00619 {
00620 for (int i=seqName.size(); i<offset; i++)
00621 seqName += " ";
00622 }
00623
00624 else
00625 {
00626 string temp = seqName;
00627 seqName = "";
00628 for (int i=0; i<offset; i++)
00629 seqName += temp[i];
00630 }
00631
00632 os << seqName;
00633 int count = 0;
00634 for ( ; position+10<seq.size(); position+=10)
00635 {
00636 if (count == 5)
00637 {
00638 count = 0;
00639 os << endl;
00640 }
00641 for (int i=position; i<position+10; i++)
00642 os << seq[i];
00643
00644 os << ' ';
00645 count++;
00646 }
00647
00648 for ( ; position < seq.size(); position++)
00649 os << seq[position];
00650
00651 os << endl;
00652 }
00653
00654 int position = 0;
00655 int count = 0;
00656 os << "Consensus:";
00657 for ( ; position+10<consensus.size(); position+=10)
00658 {
00659 if (count == 5)
00660 {
00661 count = 0;
00662 os << endl;
00663 }
00664 for (int i=position; i<position+10; i++)
00665 os << consensus[i];
00666
00667 os << ' ';
00668 count++;
00669 }
00670
00671 for ( ; position < consensus.size(); position++)
00672 os << consensus[position];
00673
00674 os << endl;
00675
00676 return false;
00677 }
00678
00679
00680 gnAlignedSequences gnAlignedSequences::getAlignedSegment(unsigned start, unsigned stop)
00681 {
00682 gnAlignedSequences newAlignment;
00683
00684 addAllSegments(newAlignment, start, stop);
00685 newAlignment.buildConsensus();
00686
00687 return newAlignment;
00688 }
00689
00690
00691 gnAlignedSequences gnAlignedSequences::getCodons(int readingFrame, int startCodon, int codonMultiple)
00692 {
00693 gnAlignedSequences toReturn;
00694 int startBase = ((startCodon*3)-2)+(readingFrame-1);
00695
00696 for (int index=startBase; (index+2)<(*(*alignedSequences.begin()).second).size(); index+=(codonMultiple*3))
00697 addAllSegmentsReplaceGaps(toReturn, index, index+2);
00698
00699 toReturn.buildConsensus();
00700
00701 return toReturn;
00702 }
00703
00704
00705 gnSeqI gnAlignedSequences::alignedSeqsSize() const
00706 {
00707 if( sequences.size() > 0 )
00708 return sequences[ 0 ].size();
00709 return 0;
00710 }
00711
00712
00713 bool gnAlignedSequences::removeAlignedSeq(string seqName)
00714 {
00715 list <pair <string*, string*> >::iterator sequenceItr = alignedSequences.begin();
00716
00717 for ( ; sequenceItr != alignedSequences.end(); sequenceItr++)
00718 {
00719 if ((*(*sequenceItr).first) == seqName)
00720 {
00721 alignedSequences.erase(sequenceItr);
00722 return true;
00723 }
00724 }
00725
00726 return false;
00727 }
00728
00729
00730 bool gnAlignedSequences::removeAlignedSeq(unsigned index)
00731 {
00732 list <pair <string*, string*> >::iterator sequenceItr = alignedSequences.begin();
00733 int i = 0;
00734
00735 for ( ; sequenceItr != alignedSequences.end(); sequenceItr++)
00736 {
00737 if (i == index)
00738 {
00739 alignedSequences.erase(sequenceItr);
00740 return true;
00741 }
00742
00743 i++;
00744 }
00745
00746 return false;
00747 }
00748
00749
00750 void gnAlignedSequences::concatenateAlignedSequences(gnAlignedSequences toConcat)
00751 {
00752 list <pair <string*, string*> >::iterator toConcatItr = toConcat.alignedSequences.begin();
00753 list <pair <string*, string*> >::iterator originalItr;
00754
00755 unsigned largestSeqSize = 0;
00756
00757 for ( ; toConcatItr != toConcat.alignedSequences.end(); toConcatItr++)
00758 {
00759 for (originalItr = alignedSequences.begin(); originalItr != alignedSequences.end(); originalItr++)
00760 {
00761 if ((*(*originalItr).second).size() > largestSeqSize)
00762 largestSeqSize = (*(*originalItr).second).size();
00763
00764 if ((*(*toConcatItr).first) == (*(*originalItr).first)) {
00765 string seq = (*(*originalItr).second);
00766 seq += (*(*toConcatItr).second);
00767 (*(*originalItr).second) = seq;
00768 break;
00769 }
00770 }
00771 }
00772
00773 for (originalItr = alignedSequences.begin(); originalItr != alignedSequences.end(); originalItr++)
00774 {
00775 while ((*(*originalItr).second).size() < largestSeqSize)
00776 (*(*originalItr).second).append("-");
00777 }
00778
00779 buildConsensus();
00780 }
00781
00782
00783 void gnAlignedSequences::extractVariableSites(gnAlignedSequences &variableSites, bool countGapsAsMismatches)
00784 {
00785 list <pair <string*, string*> >::iterator originalItr = alignedSequences.begin();
00786
00787 int alignedSeqSize = (*((*originalItr).second)).size();
00788
00789 char positionBase;
00790 int matchStart = alignedSeqSize,
00791 matchStop = alignedSeqSize;
00792
00793 bool mismatch = false;
00794
00795 indexPositions.resize(0);
00796
00797 for (int position=alignedSeqSize; position > 0; position--)
00798 {
00799 originalItr = alignedSequences.begin();
00800 positionBase = (*((*originalItr).second))[position-1];
00801 while (!countGapsAsMismatches && (*((*originalItr).second))[position-1] == '-')
00802 {
00803 originalItr++;
00804 positionBase = (*((*originalItr).second))[position-1];
00805 if (originalItr == alignedSequences.end()) break;
00806 }
00807
00808 if (originalItr == alignedSequences.end()) break;
00809
00810 for ( ; originalItr != alignedSequences.end(); originalItr++)
00811 {
00812
00813
00814 if (positionBase != (*((*originalItr).second))[position-1])
00815 {
00816 if (!(!countGapsAsMismatches && (*((*originalItr).second))[position-1] == '-'))
00817 {
00818 mismatch = true;
00819 break;
00820 }
00821 }
00822 }
00823
00824 if (!mismatch)
00825 matchStart--;
00826
00827 else
00828 {
00829 matchStart--;
00830 matchStop = matchStart;
00831
00832
00833 variableSites.indexPositions.push_back(position);
00834 }
00835
00836 mismatch = false;
00837 }
00838
00839 for (int i=variableSites.indexPositions.size()-1; i>=0; i--)
00840 addAllSegments(variableSites, variableSites.indexPositions[i], variableSites.indexPositions[i]);
00841
00842 variableSites.buildConsensus();
00843 }
00844
00845
00846 bool gnAlignedSequences::collapseIdenticalSequences()
00847 {
00848 list <pair <string*, string*> >::iterator itr1 = alignedSequences.begin();
00849 list <pair <string*, string*> >::iterator itr2;
00850 bool toReturn = false;
00851
00852 for ( ; itr1!=alignedSequences.end(); itr1++)
00853 {
00854 itr2=alignedSequences.begin();
00855 for (itr2++; itr2!=alignedSequences.end(); itr2++)
00856 {
00857 if (((*(*itr1).second)==(*(*itr2).second)) && itr1!=itr2)
00858 {
00859 list <pair <string*, string*> >::iterator itrTemp = itr2;
00860 itr2--;
00861 alignedSequences.erase(itrTemp);
00862 toReturn = true;
00863 }
00864 }
00865 }
00866
00867 return toReturn;
00868 }
00869
00870
00871 vector <char> gnAlignedSequences::operator[]( const int offset )
00872 {
00873 vector <char> toReturn;
00874 list <pair <string*, string*> >::iterator itr;
00875
00876 for (itr=alignedSequences.begin(); itr!=alignedSequences.end(); itr++)
00877 toReturn.push_back((*(*itr).second)[offset]);
00878
00879 return toReturn;
00880 }
00881
00882
00883 bool gnAlignedSequences::readClustalWAlignment()
00884 {
00885 ifstream alignmentFile;
00886
00887 alignmentFile.open(alignedSequenceFileName.c_str(), ios::in | ios::binary);
00888
00889 if (!(alignmentFile.is_open()))
00890 {
00891 cout << "Unable to open " << alignedSequenceFileName << ".\n"
00892 << "Exiting.\n";
00893
00894 exit(-1);
00895 }
00896
00897 string line;
00898
00899
00900 getline(alignmentFile, line);
00901 getline(alignmentFile, line);
00902 getline(alignmentFile, line);
00903
00904 bool constructSuccess = constructClustalWAlignedSequenceList(alignmentFile);
00905
00906 alignmentFile.close();
00907
00908 if (constructSuccess) return true;
00909
00910 return false;
00911 }
00912
00913
00914 bool gnAlignedSequences::readPhylipAlignment()
00915 {
00916 ifstream alignmentFile;
00917
00918 alignmentFile.open(alignedSequenceFileName.c_str(), ios::in | ios::binary);
00919
00920 if (!(alignmentFile.is_open()))
00921 {
00922 cout << "Unable to open " << alignedSequenceFileName << ".\n"
00923 << "Exiting.\n";
00924
00925 exit(-1);
00926 }
00927
00928 string line;
00929
00930
00931 getline(alignmentFile, line);
00932
00933 bool constructSuccess = constructPhylipAlignedSequenceList(alignmentFile);
00934
00935 alignmentFile.close();
00936
00937 if (constructSuccess) return true;
00938
00939 return false;
00940 }
00941
00942
00943 bool gnAlignedSequences::readMSFAlignment()
00944 {
00945 ifstream alignmentFile;
00946
00947 alignmentFile.open(alignedSequenceFileName.c_str(), ios::in | ios::binary);
00948
00949 if (!(alignmentFile.is_open()))
00950 {
00951 cout << "Unable to open " << alignedSequenceFileName << ".\n"
00952 << "Exiting.\n";
00953
00954 exit(-1);
00955 }
00956
00957 string line;
00958 getline(alignmentFile, line);
00959
00960
00961 while (line.find("//")<0 || line.find("//")>line.size())
00962 getline(alignmentFile, line);
00963
00964 bool constructSuccess = constructMSFAlignedSequenceList(alignmentFile);
00965
00966 alignmentFile.close();
00967
00968 if (constructSuccess) return true;
00969
00970 return false;
00971 }
00972
00973
00978 bool gnAlignedSequences::readRelaxedNexusAlignment( istream& align_stream ){
00979
00980 string line;
00981 string comments;
00982 getline( align_stream, line );
00983 if( line == "#NEXUS" ){
00984 getline( align_stream, line );
00985 }
00986 if( line[0] == '[' ){
00987 getline( align_stream, line );
00988 while( line[0] != ']' ){
00989 comments += line + "\n";
00990 getline( align_stream, line );
00991 }
00992 getline( align_stream, line );
00993 if( line.size() == 0 )
00994 getline( align_stream, line );
00995 }
00996 while( line.length() == 0 )
00997 getline( align_stream, line );
00998
00999 stringstream align_info( line );
01000 uint seq_count;
01001 gnSeqI align_len;
01002 align_info >> seq_count;
01003 align_info >> align_len;
01004 sequences = vector< string >( seq_count );
01005
01006 for( uint seqI = 0; seqI < seq_count; seqI++ ){
01007 align_stream >> line;
01008 names.push_back( line );
01009
01010 align_stream >> sequences[ seqI ];
01011
01012
01013
01014 }
01015
01016
01017 getline( align_stream, line );
01018 return true;
01019 }
01020
01021
01022 bool gnAlignedSequences::readNexusAlignment()
01023 {
01024 ifstream alignmentFile;
01025
01026 alignmentFile.open(alignedSequenceFileName.c_str(), ios::in | ios::binary);
01027
01028 if (!(alignmentFile.is_open()))
01029 {
01030 cout << "Unable to open " << alignedSequenceFileName << ".\n"
01031 << "Exiting.\n";
01032
01033 exit(-1);
01034 }
01035
01036 string line;
01037 getline(alignmentFile, line);
01038
01039
01040 while (line.find("begin data;")<0 || line.find("begin data;")>line.length())
01041 getline(alignmentFile, line);
01042
01043 bool constructSuccess = constructNexusAlignedSequenceList(alignmentFile);
01044
01045 alignmentFile.close();
01046
01047 if (constructSuccess) return true;
01048
01049 return false;
01050 }
01051
01052
01053 bool gnAlignedSequences::readMegaAlignment()
01054 {
01055 ifstream alignmentFile;
01056
01057 alignmentFile.open(alignedSequenceFileName.c_str(), ios::in | ios::binary);
01058
01059 if (!(alignmentFile.is_open()))
01060 {
01061 cout << "Unable to open " << alignedSequenceFileName << ".\n"
01062 << "Exiting.\n";
01063
01064 exit(-1);
01065 }
01066
01067 string line;
01068
01069 getline(alignmentFile, line);
01070 getline(alignmentFile, line);
01071 getline(alignmentFile, line);
01072
01073 bool constructSuccess = constructMegaAlignedSequenceList(alignmentFile);
01074
01075 alignmentFile.close();
01076
01077 if (constructSuccess) return true;
01078
01079 return false;
01080 }
01081
01082
01083 bool gnAlignedSequences::constructClustalWAlignedSequenceList(ifstream& alignmentFile)
01084 {
01085 string line;
01086
01087
01088 getline(alignmentFile, line);
01089
01090 while (alignmentFile.good())
01091 {
01092 while (line[0] != ' ' && line[0] != '\0')
01093 {
01094 string sequenceName;
01095 int i;
01096 for (i=0; line[i] != ' '; i++)
01097 sequenceName += line[i];
01098
01099 const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
01100 string sequenceBases;
01101 for(int i=sequenceName.size(); i < line.length(); i++){
01102 if ((*newFilter).IsValid(line[i]))
01103 sequenceBases += line[i];
01104 }
01105
01106 list <pair <string*, string*> >::iterator sequenceItr;
01107 if (!(sequenceNameInList(sequenceName, sequenceItr)))
01108 {
01109 pair <string*, string*> sequence;
01110 sequence.first = new string(sequenceName);
01111
01112 sequence.second = new string( sequenceBases );
01113
01114 alignedSequences.push_back(sequence);
01115 }
01116
01117 else
01118 (*(*sequenceItr).second).append(sequenceBases);
01119
01120 getline(alignmentFile, line);
01121 }
01122
01123 getline(alignmentFile, line);
01124 getline(alignmentFile, line);
01125 }
01126
01127
01128 return false;
01129 }
01130
01131
01132 bool gnAlignedSequences::constructPhylipAlignedSequenceList(ifstream& alignmentFile)
01133 {
01134 string line;
01135
01136
01137 getline(alignmentFile, line);
01138
01139 while (alignmentFile.good())
01140 {
01141 if (line[10]!=' ')
01142 {
01143 string sequenceName = line.substr(0,10);
01144 cout << sequenceName << endl;
01145 const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
01146 string sequenceBases;
01147 for(int i=10; i < line.length(); i++)
01148 {
01149 if ((*newFilter).IsValid(line[i]))
01150 sequenceBases += line[i];
01151 }
01152
01153 pair <string*, string*> sequence;
01154 sequence.first = new string(sequenceName);
01155
01156 sequence.second = new string( sequenceBases );
01157
01158 alignedSequences.push_back(sequence);
01159
01160 getline(alignmentFile, line);
01161 }
01162
01163
01164 else
01165 {
01166 string sequenceBases;
01167 while (line[10]==' ' && line[0]!='\0' && line.length()>0)
01168 {
01169 const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
01170 for(int i=0; i < line.length(); i++)
01171 {
01172 if ((*newFilter).IsValid(line[i]))
01173 sequenceBases += line[i];
01174 }
01175
01176 getline(alignmentFile, line);
01177 }
01178
01179 list <pair <string*, string*> >::iterator sequenceItr = alignedSequences.end();
01180 sequenceItr--;
01181 (*(*sequenceItr).second) += sequenceBases;
01182 }
01183 }
01184
01185 return false;
01186 }
01187
01188
01189 bool gnAlignedSequences::constructMSFAlignedSequenceList(ifstream& alignmentFile)
01190 {
01191 string line;
01192
01193
01194 getline(alignmentFile, line);
01195
01196 while (alignmentFile.good())
01197 {
01198 getline(alignmentFile, line);
01199 while (!coordinates(line))
01200 {
01201 string sequenceName;
01202 int i;
01203
01204 for (i=0; line[i] == ' '; i++) {}
01205
01206 for (; line[i] != ' '; i++)
01207 sequenceName += line[i];
01208
01209 const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
01210 string sequenceBases;
01211 for( ; i < line.length(); i++){
01212 if ((*newFilter).IsValid(line[i]))
01213 sequenceBases += line[i];
01214 else if (line[i] == '.' || line[i]=='~')
01215 sequenceBases += '-';
01216 }
01217
01218 list <pair <string*, string*> >::iterator sequenceItr;
01219 if (!(sequenceNameInList(sequenceName, sequenceItr)))
01220 {
01221 pair <string*, string*> sequence;
01222 sequence.first = new string(sequenceName);
01223
01224 sequence.second = new string( sequenceBases );
01225
01226 alignedSequences.push_back(sequence);
01227 }
01228
01229 else
01230 (*(*sequenceItr).second).append(sequenceBases);
01231
01232 getline(alignmentFile, line);
01233 }
01234 }
01235
01236 return false;
01237 }
01238
01239
01240 bool gnAlignedSequences::constructNexusAlignedSequenceList(ifstream& alignmentFile)
01241 {
01242 string line;
01243
01244
01245 getline(alignmentFile, line);
01246
01247
01248 while (alignmentFile.good() && (line.find("endblock;")<0 || line.find("endblock;")>line.length()))
01249 {
01250 while (line[0]!='[' && line[0]!=' ' && line[0]!='\n' && line[0]!='\r' && alignmentFile.good())
01251 {
01252 string sequenceName;
01253 int i;
01254 for (i=0; line[i] != ' '; i++)
01255 sequenceName += line[i];
01256
01257 const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
01258 string sequenceBases;
01259 for(int i=sequenceName.size(); i < line.length(); i++){
01260 if ( line[i] != '\r' && line[i] != '\n' && line[i] != ' ' )
01261 sequenceBases += line[i];
01262 }
01263
01264 list <pair <string*, string*> >::iterator sequenceItr;
01265 if (!(sequenceNameInList(sequenceName, sequenceItr)))
01266 {
01267 pair <string*, string*> sequence;
01268 sequence.first = new string(sequenceName);
01269
01270 sequence.second = new string( sequenceBases );
01271
01272 alignedSequences.push_back(sequence);
01273 }
01274
01275 else
01276 (*(*sequenceItr).second).append(sequenceBases);
01277
01278 getline(alignmentFile, line);
01279 }
01280
01281 getline(alignmentFile, line);
01282 }
01283
01284
01285 return false;
01286 }
01287
01288
01289 bool gnAlignedSequences::constructMegaAlignedSequenceList(ifstream& alignmentFile)
01290 {
01291 string line;
01292 string consensusSequenceBases;
01293 list <pair <string*, string*> >::iterator alignedSequencesItr;
01294
01295
01296 getline(alignmentFile, line);
01297
01298 int previousLineLength = 0;
01299
01300
01301 while (alignmentFile.good())
01302 {
01303 while (line.length()>0 && line[0]=='#')
01304 {
01305 string sequenceName;
01306 for (int i=1; line[i] != ' '; i++)
01307 sequenceName += line[i];
01308
01309 const gnFilter* newFilter = gnFilter::fullDNASeqFilter();
01310 string sequenceBases;
01311 bool isInSeqName = true;
01312 if (alignedSequences.size()>0)
01313 consensusSequenceBases = (*(*alignedSequencesItr).second);
01314
01315 for(int i=sequenceName.size(); i < line.length(); i++)
01316 {
01317
01318
01319 if ((*newFilter).IsValid(line[i]) && !isInSeqName)
01320 sequenceBases += line[i];
01321
01322 else if (line[i] == ' ') isInSeqName=false;
01323
01324 else if (line[i]=='.' && alignedSequences.size()>0 && !isInSeqName)
01325 sequenceBases += consensusSequenceBases[sequenceBases.size()+previousLineLength];
01326 }
01327
01328 list <pair <string*, string*> >::iterator sequenceItr;
01329 if (!(sequenceNameInList(sequenceName, sequenceItr)))
01330 {
01331 pair <string*, string*> sequence;
01332 sequence.first = new string(sequenceName);
01333
01334 sequence.second = new string( sequenceBases );
01335
01336 alignedSequences.push_back(sequence);
01337 alignedSequencesItr = alignedSequences.begin();
01338 }
01339
01340 else
01341 (*(*sequenceItr).second).append(sequenceBases);
01342
01343 getline(alignmentFile, line);
01344 }
01345
01346 if (alignedSequences.size() > 0)
01347 previousLineLength = (*(*alignedSequences.begin()).second).size();
01348
01349 getline(alignmentFile, line);
01350 }
01351
01352
01353 return false;
01354 }
01355
01356
01357 int gnAlignedSequences::sequenceNameInList( string& sequenceName ){
01358 for( uint nameI = 0; nameI < names.size(); nameI++ ){
01359 if( sequenceName == names[ nameI ] )
01360 return nameI;
01361 }
01362 return -1;
01363 }
01364
01365 bool gnAlignedSequences::sequenceNameInList(string sequenceName, list <pair <string*, string*> >::iterator &sequenceItr)
01366 {
01367 for (sequenceItr = alignedSequences.begin(); sequenceItr != alignedSequences.end(); sequenceItr++)
01368 {
01369 if (sequenceName == (*(*sequenceItr).first))
01370 return true;
01371 }
01372
01373 return false;
01374 }
01375
01376
01377 bool gnAlignedSequences::buildConsensus()
01378 {
01379 char consensusBase = '-';
01380
01381 consensus = "";
01382
01383 vector <char> crossAlignmentBases;
01384 for (int index=0; index<(*(*alignedSequences.begin()).second).size(); index++)
01385 {
01386 vector <int> baseCounts(26, 0);
01387 crossAlignmentBases = (*this)[index];
01388
01389
01390 for (int i=0; i<crossAlignmentBases.size(); i++)
01391 {
01392
01393
01394 if (i == 0)
01395 consensusBase=crossAlignmentBases[i];
01396
01397
01398 if (i>0 && crossAlignmentBases[i]=='.')
01399 break;
01400
01401 else if (crossAlignmentBases[i] != '-')
01402 {
01403 int baseIndex = determineBaseIndex(crossAlignmentBases[i]);
01404 baseCounts[baseIndex]++;
01405 }
01406 }
01407
01408 int toAppendToConsensus = 0;
01409 for (int i=1; i<baseCounts.size(); i++)
01410 {
01411
01412 if (baseCounts[i] > baseCounts[toAppendToConsensus])
01413 toAppendToConsensus = i;
01414
01415
01416
01417
01418
01419
01420
01421
01422
01423
01424
01425 }
01426
01427 consensus += (toAppendToConsensus+65);
01428 }
01429
01430 return false;
01431 }
01432
01433
01434 void gnAlignedSequences::addSequence(string& seqToAdd, string& seqName)
01435 {
01436 sequences.push_back( seqToAdd );
01437 names.push_back( seqName );
01438 }
01439
01440
01441 void gnAlignedSequences::addSequence(gnSequence& seqToAdd, string& seqName)
01442 {
01443
01444 ErrorMsg( "Fix gnAlignedSequences::addSequence()" );
01445 sequences.push_back( seqToAdd.ToString() );
01446 names.push_back( seqName );
01447
01448
01449
01450
01451
01452
01453
01454
01455
01456
01457
01458
01459
01460
01461
01462
01463 }
01464
01465
01466 void gnAlignedSequences::addSequence(gnSequence seqToAdd, string seqName, int consensusStart, string originalConsensus)
01467 {
01468 list <pair <string*, string*> >::iterator itr;
01469 if (!sequenceNameInList(seqName, itr))
01470 {
01471 pair <string*, string*> toAdd;
01472 toAdd.first = new string(seqName);
01473 string seq = seqToAdd.ToString();
01474 toAdd.second = new string( seq );
01475 (*toAdd.second).erase();
01476
01477 for (int i=0; i<(*toAdd.second).size(); i++)
01478 {
01479 if (seq[i] == '-')
01480 seq[i] = originalConsensus[consensusStart+i-1];
01481 }
01482
01483 (*toAdd.second) = seq;
01484
01485 alignedSequences.push_back(toAdd);
01486 }
01487
01488 else
01489 {
01490 string seq = (*((*itr).second));
01491 seq += seqToAdd.ToString();
01492 for (int i=0; i<seq.size(); i++)
01493 {
01494 if (seq[i+(*((*itr).second)).size()]=='-' && originalConsensus.size()>0)
01495 seq[i+(*((*itr).second)).size()] = originalConsensus[consensusStart+i-1];
01496 }
01497
01498 (*((*itr).second)) = seq;
01499 }
01500 }
01501
01502
01503 void gnAlignedSequences::addAllSegments(gnAlignedSequences &alignment, unsigned start, unsigned stop)
01504 {
01505 for ( uint seqI = 0; seqI < alignment.sequences.size(); seqI++ ){
01506 if (stop == 0 || stop == alignment.sequences[ seqI ].size()-1)
01507 stop = alignment.sequences[ seqI ].size();
01508 string seq = alignment.sequences[ seqI ].substr(start, stop-start+1);
01509 alignment.addSequence( seq, alignment.names[ seqI ] );
01510
01511 }
01512 }
01513
01514
01515 void gnAlignedSequences::addAllSegmentsReplaceGaps(gnAlignedSequences &alignment, unsigned start, unsigned stop)
01516 {
01517 list <pair <string*, string*> >::iterator alignedItr = alignedSequences.begin();
01518 for ( ; alignedItr != alignedSequences.end(); alignedItr++)
01519 {
01520 if (stop == 0 || stop == (*(*alignedItr).second).size()-1)
01521 stop = (*(*alignedItr).second).size();
01522
01523 alignment.addSequence(((*(*alignedItr).second).substr(start, stop-start+1)),
01524 ((*(*alignedItr).first)), start, consensus);
01525 }
01526 }
01527
01528
01529 void gnAlignedSequences::removeAllSegments(unsigned start, unsigned stop)
01530 {
01531 list <pair <string*, string*> >::iterator alignedItr = alignedSequences.begin();
01532 for ( ; alignedItr != alignedSequences.end(); alignedItr++)
01533 {
01534 if (stop == 0)
01535 stop = (*(*alignedItr).second).size();
01536
01537 (alignedItr->second)->erase(start, stop-start+1);
01538 }
01539
01540 cout << start << " " << stop << ": " << stop-start+1 << endl;
01541 }
01542
01543
01544 int gnAlignedSequences::determineBaseIndex(char base)
01545 {
01546 if (base < 91)
01547 return (base-65);
01548
01549
01550 return (base-97);
01551 }
01552
01553
01554 bool gnAlignedSequences::coordinates(string line)
01555 {
01556 bool toReturn = true;
01557
01558 for (int i=0; i<line.length(); i++)
01559 {
01560 if (line[i]!=' ' && line[i]!='\r' && line[i]!='\n' && (line[i]<48 || line[i]>57))
01561 {
01562 toReturn = false;
01563 break;
01564 }
01565 }
01566
01567 return toReturn;
01568 }
01569
01570 }