00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012
00013 #include "libMems/Islands.h"
00014 #include "libMems/Aligner.h"
00015 #include "libMems/GappedAlignment.h"
00016
00017 using namespace std;
00018 using namespace genome;
00019 namespace mems {
00020
00025 void simpleFindIslands( IntervalList& iv_list, uint island_size, ostream& island_out ){
00026 vector< Island > island_list;
00027 simpleFindIslands( iv_list, island_size, island_list );
00028 for( size_t isleI = 0; isleI < island_list.size(); isleI++ )
00029 {
00030 Island& i = island_list[isleI];
00031 island_out << i.seqI << '\t' << i.leftI << '\t' << i.rightI << '\t'
00032 << i.seqJ << '\t' << i.leftJ << '\t' << i.rightJ << endl;
00033 }
00034 }
00035
00036
00037 void simpleFindIslands( IntervalList& iv_list, uint island_size, vector< Island >& island_list ){
00038 if( iv_list.size() == 0 )
00039 return;
00040 for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00041 Interval& iv = iv_list[ iv_listI ];
00042 gnAlignedSequences gnas;
00043 iv.GetAlignedSequences( gnas, iv_list.seq_table );
00044 uint seq_count = iv_list.seq_table.size();
00045
00046 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00047 uint seqJ;
00048 for( seqJ = seqI + 1; seqJ < seq_count; seqJ++ ){
00049 uint columnI = 0;
00050 gnSeqI curI = 0;
00051 gnSeqI curJ = 0;
00052 gnSeqI lastI = 0;
00053 gnSeqI lastJ = 0;
00054 for( columnI = 0; columnI < gnas.alignedSeqsSize(); columnI++ ){
00055 if( gnas.sequences[ seqI ][ columnI ] != '-' )
00056 curI++;
00057 if( gnas.sequences[ seqJ ][ columnI ] != '-' )
00058 curJ++;
00059 if( toupper( gnas.sequences[ seqI ][ columnI ] ) ==
00060 toupper( gnas.sequences[ seqJ ][ columnI ] ) &&
00061 gnas.sequences[ seqJ ][ columnI ] != '-' ){
00062
00063 if( curI - lastI > island_size ||
00064 curJ - lastJ > island_size ){
00065 int64 leftI = iv.Start( seqI );
00066 int64 rightI = leftI < 0 ? leftI - curI : leftI + curI;
00067 leftI = leftI < 0 ? leftI - lastI : leftI + lastI;
00068 int64 leftJ = iv.Start( seqJ );
00069 int64 rightJ = leftJ < 0 ? leftJ - curJ : leftJ + curJ;
00070 leftJ = leftJ < 0 ? leftJ - lastJ : leftJ + lastJ;
00071 Island isle;
00072 isle.seqI = seqI;
00073 isle.seqJ = seqJ;
00074 isle.leftI = leftI;
00075 isle.leftJ = leftJ;
00076 isle.rightI = rightI;
00077 isle.rightJ = rightJ;
00078 island_list.push_back(isle);
00079 }
00080
00081 lastI = curI;
00082 lastJ = curJ;
00083 }
00084 }
00085 }
00086 }
00087 }
00088 }
00089
00090
00096 void simpleFindBackbone( IntervalList& iv_list, uint backbone_size, uint max_gap_size, vector< GappedAlignment >& backbone_regions ){
00097 if( iv_list.size() == 0 )
00098 return;
00099 for( uint iv_listI = 0; iv_listI < iv_list.size(); iv_listI++ ){
00100 Interval& iv = iv_list[ iv_listI ];
00101 gnAlignedSequences gnas;
00102 uint seqI;
00103 uint seq_count = iv_list.seq_table.size();
00104 vector< int64 > positions( seq_count );
00105 vector< int64 > starts( seq_count );
00106 vector< int64 > ends( seq_count );
00107 vector< uint > gap_size( seq_count, 0 );
00108 uint seqJ;
00109 gnSeqI bb_start_col = 0;
00110 gnSeqI bb_end_col = 0;
00111 GappedAlignment cur_backbone( seq_count, 0 );
00112
00113
00114 for( seqI = 0; seqI < seq_count; seqI++ ){
00115 positions[ seqI ] = iv_list[ iv_listI ].Start( seqI );
00116 if( positions[ seqI ] < 0 )
00117 positions[ seqI ] -= iv_list[ iv_listI ].Length( seqI ) + 1;
00118 }
00119 starts = positions;
00120 ends = positions;
00121
00122 iv.GetAlignedSequences( gnas, iv_list.seq_table );
00123 bool backbone = true;
00124 uint columnI = 0;
00125 vector< int64 > prev_positions;
00126 for( ; columnI < gnas.alignedSeqsSize(); columnI++ ){
00127 bool no_gaps = true;
00128 prev_positions = positions;
00129 for( seqI = 0; seqI < seq_count; seqI++ ){
00130 char cur_char = gnas.sequences[ seqI ][ columnI ];
00131 if( cur_char != '-' && toupper(cur_char) != 'N' ){
00132 if( gap_size[ seqI ] > max_gap_size && backbone ){
00133
00134
00135
00136 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00137 if( ends[ seqJ ] - starts[ seqJ ] < backbone_size ){
00138 break;
00139 }
00140 }
00141 if( seqJ == seq_count ) {
00142
00143 backbone_regions.push_back( cur_backbone );
00144 uint bbI = backbone_regions.size() - 1;
00145 vector< string > aln_mat( seq_count );
00146 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00147 if( starts[ seqJ ] < 0 )
00148 backbone_regions[ bbI ].SetStart( seqJ, ends[ seqJ ] + 1);
00149 else
00150 backbone_regions[ bbI ].SetStart( seqJ, starts[ seqJ ] );
00151 backbone_regions[ bbI ].SetLength( ends[ seqJ ] - starts[ seqJ ], seqJ );
00152 aln_mat[ seqJ ] = gnas.sequences[ seqJ ].substr( bb_start_col, bb_end_col - bb_start_col + 1);
00153 }
00154 backbone_regions[ bbI ].SetAlignment(aln_mat);
00155
00156 }
00157
00158
00159
00160 backbone = false;
00161 }
00162 positions[ seqI ]++;
00163 gap_size[ seqI ] = 0;
00164 }else{
00165 gap_size[ seqI ]++;
00166 no_gaps = false;
00167 }
00168 }
00169 if( no_gaps ){
00170 bb_end_col = columnI;
00171 ends = positions;
00172 if( !backbone ){
00173 starts = prev_positions;
00174 bb_start_col = columnI;
00175 backbone = true;
00176 }
00177 }
00178 }
00179
00180
00181 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00182 if( ends[ seqJ ] - starts[ seqJ ] < backbone_size ){
00183 break;
00184 }
00185 }
00186 if( seqJ == seq_count ) {
00187
00188 backbone_regions.push_back( cur_backbone );
00189 uint bbI = backbone_regions.size() - 1;
00190 vector< string > aln_mat( seq_count );
00191 for( seqJ = 0; seqJ < seq_count; seqJ++ ){
00192 if( starts[ seqJ ] < 0 )
00193 backbone_regions[ bbI ].SetStart( seqJ, ends[ seqJ ] + 1);
00194 else
00195 backbone_regions[ bbI ].SetStart( seqJ, starts[ seqJ ] );
00196 backbone_regions[ bbI ].SetLength( ends[ seqJ ] - starts[ seqJ ], seqJ );
00197 aln_mat[ seqJ ] = gnas.sequences[ seqJ ].substr( bb_start_col, bb_end_col - bb_start_col + 1);
00198 }
00199 backbone_regions[ bbI ].SetAlignment( aln_mat );
00200 }
00201 }
00202 }
00203
00204
00205 void outputBackbone( const vector< GappedAlignment >& backbone_regions, ostream& backbone_out ){
00206 for( uint bbI = 0; bbI < backbone_regions.size(); bbI++ ){
00207 for( uint seqJ = 0; seqJ < backbone_regions[ bbI ].SeqCount(); seqJ++ ){
00208 if( seqJ > 0 )
00209 backbone_out << '\t';
00210 int64 bb_rend = backbone_regions[ bbI ].Start( seqJ );
00211 if( backbone_regions[ bbI ].Start( seqJ ) < 0 )
00212 bb_rend -= (int64)backbone_regions[ bbI ].Length( seqJ );
00213 else
00214 bb_rend += (int64)backbone_regions[ bbI ].Length( seqJ );
00215 backbone_out << backbone_regions[ bbI ].Start( seqJ ) << '\t' << bb_rend;
00216 }
00217 backbone_out << endl;
00218 }
00219 }
00220
00221
00222
00223
00224 void getGapBounds( vector<gnSeqI>& seq_lengths, vector< LCB >& adjacencies, uint seqJ, int leftI, int rightI, int64& left_start, int64& right_start ){
00225 if( rightI != -1 )
00226 right_start = absolut( adjacencies[ rightI ].left_end[ seqJ ] );
00227 else
00228 right_start = seq_lengths[seqJ] + 1;
00229
00230 if( leftI != -1 )
00231 left_start = absolut( adjacencies[ leftI ].right_end[ seqJ ] );
00232 else
00233 left_start = 1;
00234 }
00235
00236
00237 void addUnalignedIntervals( IntervalList& iv_list, set< uint > seq_set, vector<gnSeqI> seq_lengths ){
00238 vector< LCB > adjacencies;
00239 vector< int64 > weights;
00240 uint lcbI;
00241 uint seqI;
00242 if( seq_lengths.size() == 0 )
00243 for( seqI = 0; seqI < iv_list.seq_table.size(); seqI++ )
00244 seq_lengths.push_back(iv_list.seq_table[seqI]->length());
00245
00246 uint seq_count = seq_lengths.size();
00247
00248
00249 if( seq_set.size() == 0 )
00250 {
00251
00252
00253 for( seqI = 0; seqI < seq_count; seqI++ )
00254 seq_set.insert( seqI );
00255 }
00256
00257 weights = vector< int64 >( iv_list.size(), 0 );
00258 computeLCBAdjacencies_v2( iv_list, weights, adjacencies );
00259
00260 vector< int > rightmost;
00261 for( seqI = 0; seqI < seq_count; seqI++ ){
00262 rightmost.push_back( -1 );
00263 }
00264 for( lcbI = 0; lcbI <= adjacencies.size(); lcbI++ ){
00265 set< uint >::iterator seq_set_iterator = seq_set.begin();
00266 for( ; seq_set_iterator != seq_set.end(); seq_set_iterator++ ){
00267 seqI = *seq_set_iterator;
00268
00269 int leftI;
00270 if( lcbI < adjacencies.size() ){
00271
00272 leftI = adjacencies[ lcbI ].left_adjacency[ seqI ];
00273 }else
00274 leftI = rightmost[ seqI ];
00275
00276 int rightI = lcbI < adjacencies.size() ? lcbI : -1;
00277
00278 if( lcbI < adjacencies.size() )
00279 if( adjacencies[ lcbI ].right_adjacency[ seqI ] == -1 )
00280 rightmost[ seqI ] = lcbI;
00281
00282 int64 left_start, right_start;
00283 getGapBounds( seq_lengths, adjacencies, seqI, leftI, rightI, left_start, right_start );
00284 int64 gap_len = absolut( right_start ) - absolut( left_start );
00285 if( gap_len > 0 ){
00286 Match mm( seq_count );
00287 Match* m = mm.Copy();
00288 for( uint seqJ = 0; seqJ < seq_count; seqJ++ ){
00289 m->SetStart( seqJ, 0 );
00290 }
00291 m->SetStart( seqI, left_start );
00292 m->SetLength( gap_len );
00293 vector<AbstractMatch*> tmp(1, m);
00294 iv_list.push_back( Interval(tmp.begin(), tmp.end()) );
00295 m->Free();
00296 }
00297 }
00298 }
00299 }
00300
00301
00302 void findIslandsBetweenLCBs( IntervalList& iv_list, uint island_size, ostream& island_out ){
00303 IntervalList iv_list_tmp = iv_list;
00304 addUnalignedIntervals( iv_list_tmp );
00305 uint seq_count = iv_list.seq_table.size();
00306
00307 for( int ivI = iv_list.size(); ivI < iv_list_tmp.size(); ivI++ ){
00308 for( uint seqI = 0; seqI < seq_count; seqI++ ){
00309 if( iv_list_tmp[ ivI ].Length( seqI ) < island_size )
00310 continue;
00311
00312
00313 gnSeqI left_end = absolut( iv_list_tmp[ ivI ].Start( seqI ) );
00314 gnSeqI right_end = left_end + iv_list_tmp[ ivI ].Length( seqI ) - 1;
00315 island_out << "LCB island:\t" << seqI << '\t' << left_end << '\t' << right_end << endl;
00316 }
00317 }
00318 }
00319
00320 }