[Bio] / FigKernelPackages / PinnedRegions.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/PinnedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : dsouza 1.1 # -*- perl -*-
2 :     ########################################################################
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     ########################################################################
18 :    
19 :    
20 :     package PinnedRegions;
21 :    
22 :     use strict;
23 :     use warnings;
24 :    
25 :     use FIG;
26 :     use FIG_Config;
27 :    
28 :     use Time::HiRes qw( usleep ualarm gettimeofday tv_interval );
29 :    
30 :     sub pinned_regions {
31 :     my($fig, $pin_desc, $fast_color, $sims_from, $map_sz) = @_;
32 :    
33 :     # Get list of pegs required by the description in $pin_desc
34 :     my $pinned_pegs = &expand_peg_list($fig, $pin_desc);
35 :    
36 :     # Get regions around pinned pegs -- boundaries, features, etc.
37 :     my $regions = &define_regions($fig, $map_sz, $pinned_pegs);
38 :    
39 :     # Filter out overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
40 :     # are pinned (by similarity or PCH) to the input PEG
41 :     $regions = &filter_regions_1($pin_desc, $regions);
42 :    
43 :     # Get information for each feature -- location, function and the set (of PEGs) it belongs to.
44 :     # Assigning set numbers requires BLASTing and use similarity scores to do transitive closure
45 :     my $feature_data = &feature_data($fig, $regions);
46 :    
47 : dsouza 1.3 &add_functional_coupling($fig, $pin_desc, $regions, $feature_data);
48 :     &add_subsystem_data($fig, $pin_desc, $feature_data);
49 :    
50 : dsouza 1.1 # Assign a set number to some PEGs based on similarity from blast scores
51 :     &color_pegs($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from);
52 :    
53 :     # Filter out regions which have only a single PEG (the pinned one) colored.
54 : dsouza 1.3 # $regions = &filter_regions_2($pin_desc, $regions, $feature_data);
55 : dsouza 1.1
56 :     # Add feature data to the regions to make the final maps
57 :     my $maps = &make_maps($fig, $regions, $feature_data);
58 :    
59 :     return $maps;
60 :     }
61 :    
62 :     sub expand_peg_list {
63 :     my($fig, $pin_desc) = @_;
64 :     my @pegs = ();
65 :    
66 :     # Filter for legitimate genome IDS -- should handle deleted organisms and (in NMPDR) non-NMPDR orgs
67 :     my %ok_genome_id = map {$_ => 1} $fig->genomes;
68 :    
69 :     if ( @{ $pin_desc->{'pegs'} } > 1 )
70 :     {
71 :     # Input already contains list of pegs, no need for expansion
72 :     @pegs = grep {$ok_genome_id{$fig->genome_of($_)}} @{ $pin_desc->{'pegs'} };
73 :     }
74 :     else
75 :     {
76 :     # Get PCH pinned pegs
77 :     my $pch_pinned_pegs = &pch_pinned_pegs($fig, $pin_desc, \%ok_genome_id);
78 :    
79 : dsouza 1.3 $pch_pinned_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_pch_pins'}, $pch_pinned_pegs);
80 : dsouza 1.1
81 :     # Get similarity pinned pegs
82 :     my $sim_pegs = &sim_pegs($fig, $pin_desc, \%ok_genome_id);
83 :    
84 : dsouza 1.3 # Remove PCH PEGs (if any) from similarity pinned PEG list
85 : dsouza 1.1 my %seen = map {$_ => 1} @$pch_pinned_pegs;
86 :     $sim_pegs = [grep {! $seen{$_}} @$sim_pegs];
87 : dsouza 1.3
88 : dsouza 1.1 $sim_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_sims'}, $sim_pegs);
89 :    
90 :     @pegs = ();
91 :    
92 :     # Get input peg
93 :     my $peg = $pin_desc->{'pegs'}[0];
94 :     if ( $pin_desc->{'sort_by'} eq 'phylogeny' )
95 :     {
96 :     # First add input peg, then sort by phylogeny
97 :     @pegs = $fig->sort_fids_by_taxonomy($peg, @$sim_pegs, @$pch_pinned_pegs);
98 :     }
99 :     else
100 :     {
101 :     # Sort by phylogenetic distance or organism name
102 :     my $g1 = $fig->genome_of($peg);
103 :     @pegs = map {$_->[0] }
104 :     sort {$a->[1] <=> $b->[1] or $a->[2] cmp $b->[2]}
105 :     map {[$_, $fig->crude_estimate_of_distance($g1,$fig->genome_of($_)), $fig->org_of($_)]} @$sim_pegs, @$pch_pinned_pegs;
106 :     # Add input peg at front of list
107 :     unshift @pegs, $peg;
108 :     }
109 :     }
110 :    
111 :     return \@pegs
112 :     }
113 :    
114 :     sub pch_pinned_pegs {
115 :     my($fig, $pin_desc, $ok_genome_id) = @_;
116 :    
117 :     my @pegs = ();
118 :    
119 :     if ( $pin_desc->{'n_pch_pins'} > 0 )
120 :     {
121 :     # Get input peg
122 :     my $peg = $pin_desc->{'pegs'}[0];
123 :    
124 :     # Get pegs in pch pin with input peg, and filter out deleted or non-NMPDR organisms
125 :     @pegs = grep {$ok_genome_id->{$fig->genome_of($_)}}
126 :     grep {$_ ne $peg}
127 :     map {$_->[0]} $fig->in_pch_pin_with_and_evidence($peg);
128 :     }
129 :    
130 :     return \@pegs;
131 :     }
132 :    
133 :     sub sim_pegs {
134 :     my($fig, $pin_desc, $ok_genome_id) = @_;
135 :    
136 :     # Return a list of PEGS similar to the input PEG, with evalue less than or equal to
137 :     # the user defined similarity cutoff.
138 :     # If the number of sims requested is 0, return the empty list
139 :    
140 :     my $n_sims = $pin_desc->{'n_sims'};
141 :     my $sim_cutoff = $pin_desc->{'sim_cutoff'};
142 :     my @pegs = ();
143 :    
144 :     if ( $n_sims > 0 )
145 :     {
146 :     my $peg = $pin_desc->{'pegs'}[0];
147 :     my %seen;
148 :     @pegs = grep {$ok_genome_id->{$fig->genome_of($_)}}
149 :     grep {! $seen{$_}++}
150 :     map {$_->[1]} $fig->sims($peg, 10000, $sim_cutoff, "fig");
151 :    
152 :     my $n_pegs = @pegs;
153 :     }
154 :    
155 :     return \@pegs;
156 :     }
157 :    
158 :     sub select_pegs {
159 :     my($fig, $pin_desc, $n_pegs, $pegs) = @_;
160 :    
161 :     if ( $n_pegs == 0 )
162 :     {
163 :     return [];
164 :     }
165 :    
166 : dsouza 1.3 if ( $pin_desc->{'collapse_close_genomes'} == 1 )
167 : dsouza 1.1 {
168 : dsouza 1.3 # The ordering of the PEGs done here is in order to =select= the correct set, the ordering for the
169 :     # display is done later.
170 : dsouza 1.2
171 : dsouza 1.3 # To collapse the PEG list, we want to:
172 :     # a. Return at most $n_pegs PEGs,
173 :     # b. include a representative PEG from every genus, subject to a, and
174 :     # c. include a representative PEG from each organism (genus-species), subject to a and b.
175 : dsouza 1.1
176 : dsouza 1.3 my(%seen, @unique_genus, @unique_org);
177 : dsouza 1.1
178 : dsouza 1.3 foreach my $peg ( @$pegs )
179 :     {
180 :     my $org = $fig->org_of($peg);
181 :     # 'org_of' returns undef for deleted PEGs, these need to be omitted from the peg list returned
182 :    
183 :     if ( $org )
184 :     {
185 :     my($genus) = split(/\s+/, $org);
186 :    
187 :     if ( not $seen{$genus}++ ) {
188 :     push @unique_genus, $peg;
189 :     } elsif ( not $seen{$org}++ ) {
190 :     push @unique_org, $peg;
191 :     }
192 :     }
193 :     }
194 : dsouza 1.1
195 : dsouza 1.3 $pegs = [@unique_genus, @unique_org];
196 : dsouza 1.1 }
197 :    
198 :     # Truncate list if necessary
199 : dsouza 1.3 if ( $n_pegs < @$pegs )
200 : dsouza 1.1 {
201 : dsouza 1.3 $#{ $pegs } = $n_pegs - 1;
202 : dsouza 1.1 }
203 :    
204 :     return $pegs;
205 :     }
206 :    
207 :     sub filter_regions_1 {
208 :     my($pin_desc, $regions) = @_;
209 :     my $new_regions;
210 :    
211 :     # Filter out some overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
212 :     # are pinned (by similarity or PCH) to the input PEG
213 :     # Note that all overlaps are NOT eliminated
214 :    
215 :     if ( @{ $pin_desc->{'pegs'} } == 1 )
216 :     {
217 :     # Input pin description is for a single input peg
218 :     my %seen;
219 :    
220 :     foreach my $region ( @$regions )
221 :     {
222 :     my $pinned_peg = $region->{'pinned_peg'};
223 :    
224 :     # If the pinned peg from a region has already been 'seen', don't
225 :     # add the region into the @$new_regions array
226 :    
227 :     if ( not $seen{$pinned_peg} )
228 :     {
229 :     push @$new_regions, $region;
230 :    
231 :     my $fids = $region->{'features'};
232 :     foreach my $fid ( @$fids )
233 :     {
234 :     $seen{$fid} = 1;
235 :     }
236 :     }
237 :     }
238 :     }
239 :     else
240 :     {
241 :     # input PEGs is a list -- keep all of them
242 :     $new_regions = $regions;
243 :     }
244 :    
245 :     return $new_regions;
246 :     }
247 :    
248 :     sub filter_regions_2 {
249 :     my($pin_desc, $regions, $feature_data) = @_;
250 :     my $new_regions;
251 :    
252 :     if ( @{ $pin_desc->{'pegs'} } == 1 )
253 :     {
254 :     # Input is single peg
255 :     my $input_peg = $pin_desc->{'pegs'}[0];
256 :    
257 :     foreach my $region ( @$regions )
258 :     {
259 :     my $fids = $region->{'features'};
260 :     my $n_in_sets = grep {$feature_data->{$_}{'set_number'}} @$fids;
261 :    
262 :     if ( $n_in_sets > 1 or $region->{'pinned_peg'} eq $input_peg )
263 :     {
264 :     # Regions should be displayed only if:
265 :     # a. more than one PEG is in a 'colored set' OR
266 :     # b. the pinned PEG is the input PEG
267 :     push @$new_regions, $region;
268 :     }
269 :     }
270 :     }
271 :     else
272 :     {
273 :     # Input is a list of pegs -- keep all of the regions
274 :     $new_regions = $regions
275 :     }
276 :    
277 :     return $new_regions;
278 :     }
279 :    
280 :     sub make_maps {
281 :     my($fig, $regions, $feature_data) = @_;
282 :    
283 :     foreach my $region ( @$regions )
284 :     {
285 :     my $features = [];
286 :     my $fids = $region->{'features'};
287 :    
288 :     foreach my $fid ( @$fids )
289 :     {
290 :     push @$features, $feature_data->{$fid};
291 :     }
292 :    
293 :     $region->{'features'} = $features;
294 :     }
295 :    
296 :     return $regions;
297 :     }
298 :    
299 :     sub define_regions {
300 :     my($fig, $map_sz, $pinned_pegs) = @_;
301 :    
302 :     # Define the 'region' around the pinned peg, get features, and store some region information
303 :     my $regions = [];
304 :     my $half_map_sz = int($map_sz/2);
305 :    
306 :     foreach my $peg ( @$pinned_pegs )
307 :     {
308 :     my $genome = $fig->genome_of($peg);
309 :     my $loc = $fig->feature_location($peg);
310 :     my($contig, $beg, $end) = $fig->boundaries_of($loc);
311 :    
312 :     my $region_mid = int(($beg + $end)/2);
313 :     my $region_beg = $region_mid - $half_map_sz;
314 :     my $region_end = $region_mid + $half_map_sz;
315 :    
316 :     my($fids) = $fig->genes_in_region($genome, $contig, $region_beg, $region_end);
317 :    
318 :     my $region = {};
319 :     $region->{'genome_id'} = $fig->genome_of($peg);
320 :     $region->{'org_name'} = $fig->genus_species($region->{'genome_id'});
321 :     $region->{'contig'} = $contig;
322 :     $region->{'beg'} = $region_beg;
323 :     $region->{'mid'} = $region_mid;
324 :     $region->{'end'} = $region_end;
325 :     $region->{'features'} = $fids;
326 :     $region->{'pinned_peg_strand'} = ($beg <= $end)? '+' : '-';
327 :     $region->{'pinned_peg'} = $peg;
328 :    
329 :     push @$regions, $region;
330 :     }
331 :    
332 :     return $regions;
333 :     }
334 :    
335 :     sub feature_data {
336 :     my($fig, $regions) = @_;
337 :     my %feature_data;
338 :    
339 :     foreach my $region ( @$regions )
340 :     {
341 :     my $region_mid = $region->{'mid'};
342 :     my $fids = $region->{'features'};
343 :    
344 :     foreach my $fid ( @$fids )
345 :     {
346 :     # Get feature data if this feature has not been seen before, this avoids repeating
347 :     # this step when pegs occur in multiple regions
348 :     if ( not exists $feature_data{$fid} )
349 :     {
350 :     $feature_data{$fid} = &feature_entry($fig, $fid, $region_mid);
351 :     }
352 :     }
353 :     }
354 :    
355 :     return \%feature_data;
356 :     }
357 :    
358 :     sub feature_entry {
359 :     my($fig, $fid, $region_mid) = @_;
360 :    
361 :     my $type = $fig->ftype($fid);
362 :     my $loc = $fig->feature_location($fid);
363 :     my($contig, $beg, $end) = $fig->boundaries_of($loc);
364 :     my($left, $right) = sort {$a <=> $b} ($beg, $end);
365 : dsouza 1.3 my $size = $right - $left + 1;
366 : dsouza 1.1 my $strand = ($beg <= $end)? '+' : '-';
367 :     my $offset = int(($left + $right)/2) - $region_mid;
368 :     my $offset_beg = $left - $region_mid;
369 :     my $offset_end = $right - $region_mid;
370 :     my $func = scalar $fig->function_of($fid) || '';
371 :    
372 :     return {
373 :     'fid' => $fid,
374 :     'type' => $type,
375 :     'contig' => $contig,
376 :     'beg' => $beg,
377 :     'end' => $end,
378 : dsouza 1.3 'size' => $size,
379 : dsouza 1.1 'strand' => $strand,
380 :     'offset' => $offset,
381 :     'offset_beg' => $offset_beg,
382 :     'offset_end' => $offset_end,
383 :     'function' => $func,
384 :     };
385 :     }
386 :    
387 : dsouza 1.3 sub add_functional_coupling {
388 :     my($fig, $pin_desc, $regions, $feature_data) = @_;
389 :    
390 :     my $fc_cutoff = defined($pin_desc->{'fc_cutoff'})? $pin_desc->{'fc_cutoff'} : 4;
391 :    
392 :     foreach my $region ( @$regions )
393 :     {
394 :     my $pin = $region->{'pinned_peg'};
395 :     my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} @{ $region->{'features'} };
396 :    
397 :     foreach my $couple ( $fig->coupled_to_batch(@pegs) )
398 :     {
399 :     my($peg1, $peg2, $sc) = @$couple;
400 :    
401 :     if ( $peg1 eq $pin and $sc >= $fc_cutoff )
402 :     {
403 :     $feature_data->{$peg2}{'fc_score'} = $sc;
404 :     }
405 :     }
406 :     }
407 :     }
408 :    
409 :     sub add_subsystem_data {
410 :     my($fig, $pin_desc, $feature_data) = @_;
411 :    
412 :     # Get subsystem_information for =all= pegs
413 :     my %peg_to_ss = $fig->subsystems_for_pegs([grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data]);
414 :    
415 :     # Count number of occurences of each subsystem
416 :     my %ss_count;
417 :     foreach my $ss ( map {$_->[0]} map {@$_} values %peg_to_ss )
418 :     {
419 :     $ss_count{$ss}++;
420 :     }
421 :    
422 :     # Sort subsystems based on count and create hash with unique index for each subsystem where
423 :     # lower indices correspond to higher number of occurences of the subsystem.
424 :     my %ss_index;
425 :     my $index = 0;
426 :     foreach my $ss ( sort {$ss_count{$b} <=> $ss_count{$a} or $a cmp $b} keys %ss_count )
427 :     {
428 :     $ss_index{$ss} = ++$index;
429 :     }
430 :    
431 :     # Add subsystem information for pegs in subsystems
432 :     foreach my $fid ( keys %peg_to_ss )
433 :     {
434 :     my @subsystems = ();
435 :     foreach my $rec ( @{ $peg_to_ss{$fid} } )
436 :     {
437 :     if ( @$rec )
438 :     {
439 :     push @subsystems, $rec->[0];
440 :     }
441 :     }
442 :    
443 :     if ( @subsystems )
444 :     {
445 :     $feature_data->{$fid}{'subsystems'} = [sort {$a->[1] <=> $b->[1]} map {$_->[0] =~ s/_/ /g; $_} map {[$_, $ss_index{$_}]} @subsystems];
446 :     }
447 :     }
448 :     }
449 :    
450 : dsouza 1.1 sub color_pegs {
451 :     my($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from) = @_;
452 :    
453 :     # Run blastp and get connected pegs
454 :     my $blast_hit = {};
455 :     my $color_sim_cutoff = $pin_desc->{'color_sim_cutoff'};
456 :    
457 :     if ( $sims_from eq 'blast' )
458 :     {
459 :     $blast_hit = &blast_hits($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
460 :     }
461 :     else
462 :     {
463 :     $blast_hit = &blast_hits_from_sims($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
464 :     }
465 :    
466 :     # Assign set numbers to pegs based on blast scores
467 :     my $color_sets = &partition_pegs($blast_hit);
468 :    
469 :     # Sort peg sets to that a) larger sets have smaller set numbers, and
470 :     # b) sets closer to the pinned peg have smaller set numbers
471 :     $color_sets = &sort_color_sets($pin_desc, $pinned_pegs, $color_sets, $feature_data);
472 :    
473 :     # Add color set number into $feature_data
474 :     for (my $i = 0; $i < @$color_sets; $i++)
475 :     {
476 :     # Use an index that starts at 1 (matches with coloring index)
477 :     my $n = $i + 1;
478 :     foreach my $peg ( @{ $color_sets->[$i] } )
479 :     {
480 :     $feature_data->{$peg}{'set_number'} = $n;
481 :    
482 :     # If this is the pinned set, set the 'color' to 'red'
483 :     if ( $n == 1 )
484 :     {
485 :     $feature_data->{$peg}{'color'} = 'red';
486 :     }
487 :     }
488 :     }
489 :     }
490 :    
491 :     sub sort_color_sets {
492 :     my($pin_desc, $pinned_pegs, $color_sets, $feature_data) = @_;
493 :    
494 : dsouza 1.3 # Find the color set containing the set of pinned pegs, i.e. the set containing the input peg.
495 :     # If the input peg is found (or if input is a list of pegs, returned value is the array index for the
496 :     # set.
497 :     # If the input peg is not found, the returned value is an reference to an array containing the input peg.
498 :     my $set = &pinned_set($pin_desc, $pinned_pegs, $color_sets);
499 : dsouza 1.1
500 : dsouza 1.3 my $pinned_color_set;
501 :     if ( $set =~ /^\d+$/ )
502 :     {
503 :     # Splice out the color set containing the input peg
504 :     ($pinned_color_set) = splice(@$color_sets,$set,1);
505 :     }
506 :     else
507 :     {
508 :     $pinned_color_set = $set;
509 :     }
510 : dsouza 1.1
511 : dsouza 1.3 # Add offset (summed distance from
512 :     my @color_sets = map {[$_, &offset($_, $feature_data)]} @$color_sets;
513 : dsouza 1.1 # Sort the remaining color sets based on:
514 :     # a. size (number of pegs) and
515 :     # b. offset from mid-point of region
516 :     @$color_sets = map {$_->[0]}
517 :     sort {@{$b->[0]} <=> @{$a->[0]} or $a->[1] <=> $b->[1]}
518 : dsouza 1.3 # sort {$a->[1] <=> $b->[1] or @{$b->[0]} <=> @{$a->[0]}}
519 : dsouza 1.1 map {[$_, &offset($_, $feature_data)]} @$color_sets;
520 :    
521 :     # Add the set of pinned pegs at the front of the returned list so that it gets colored red
522 :     return [$pinned_color_set, @$color_sets];
523 :     }
524 :    
525 : dsouza 1.3 sub pinned_set {
526 : dsouza 1.1 my($pin_desc, $pinned_pegs, $color_sets) = @_;
527 :    
528 : dsouza 1.3 # Returns an integer if the input is a peg-list or if the input peg is in a set.
529 :     # Returns the input peg (as an array reference) if the input peg is not in a set.
530 :    
531 : dsouza 1.1 if ( @{ $pin_desc->{'pegs'} } == 1 )
532 :     {
533 :     # Get input peg if it exists -- the set containing this peg should be colored red
534 :     my $peg = $pin_desc->{'pegs'}[0];
535 :    
536 :     # Iterate through the color sets until you find the set containing the input peg
537 :     for (my $i = 0; $i < @$color_sets; $i++)
538 :     {
539 :     foreach my $peg2 ( @{ $color_sets->[$i] } )
540 :     {
541 :     if ( $peg2 eq $peg )
542 :     {
543 :     # Return the set index
544 :     return $i;
545 :     }
546 :     }
547 :     }
548 :    
549 : dsouza 1.3 # The input peg may not be in a set if there is only one region or the coloring cutoff is very stringent.
550 :     return [$peg];
551 : dsouza 1.1 }
552 :     else
553 :     {
554 : dsouza 1.3 # Input is a peg-list, which may be split into multiple sets -- the largest will be called the
555 :     # "pinned" set.
556 : dsouza 1.1 my($i, $max_count);
557 :     my %pinned = map {$_ => 1} @$pinned_pegs;
558 :    
559 :     for (my $j = 0; $j < @$color_sets; $j++)
560 :     {
561 :     # count how many pinned pegs are found in each set
562 :     my $count = scalar grep {$pinned{$_}} @{ $color_sets->[$j] };
563 :    
564 :     if ( $max_count < $count )
565 :     {
566 :     # Keep track of the set having the largest number of pinned pegs
567 :     $max_count = $count;
568 :     $i = $j;
569 :     }
570 :     }
571 :    
572 :     return $i;
573 :     }
574 :     }
575 :    
576 :     sub offset {
577 :     my($set, $feature_data) = @_;
578 :    
579 :     my $offset;
580 :     foreach my $peg ( @$set )
581 :     {
582 :     $offset += abs($feature_data->{$peg}{'offset'});
583 :     }
584 : dsouza 1.3
585 :     return sprintf("%.2f", $offset/@$set);
586 : dsouza 1.1 }
587 :    
588 :     sub partition_pegs {
589 :     my($blast_hit) = @_;
590 :    
591 :     my %seen;
592 :     my $sets = [];
593 :    
594 :     # Iterate through the pegs with blast hits in arbitrary order
595 :     foreach my $peg1 ( keys %$blast_hit )
596 :     {
597 :     # If it has not been 'seen' (and placed in a set) use it as the seed for a set
598 :     if ( not $seen{$peg1} )
599 :     {
600 :     my $set = [$peg1];
601 :    
602 :     # Mark $peg1 as seen
603 :     $seen{$peg1} = 1;
604 :    
605 :     # Grow set through transitive closure -- note that @$set keeps growing
606 :     for (my $i = 0; $i < @$set; $i++)
607 :     {
608 :     $peg1 = $set->[$i];
609 :    
610 :     # Iterate through blast hits
611 :     foreach my $peg2 ( keys %{ $blast_hit->{$peg1} } )
612 :     {
613 :     # If $peg2 has not been seen, put it in the set, and mark it as seen
614 :     if ( not exists $seen{$peg2} )
615 :     {
616 :     push @$set, $peg2;
617 :     $seen{$peg2} = 1;
618 :     }
619 :     }
620 :     }
621 :    
622 :     # Add the new set to the set of sets
623 :     push @$sets, $set;
624 :     }
625 :     }
626 :    
627 :     return $sets;
628 :     }
629 :    
630 :     sub blast_hits {
631 :     my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
632 :     my($t0, $dt);
633 :    
634 :     # Set blast environment variables
635 :     $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";
636 :     if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }
637 :    
638 :     # Get amino acid sequences
639 :     my $sequences = &get_peg_sequences($fig, $feature_data);
640 :    
641 :     # Write the entire set of sequences to a fasta file
642 :     my $all_seqs_file = "$FIG_Config::temp/all_seqs.$$.tmp.fasta";
643 :     &write_seqs_to_file($sequences, $all_seqs_file);
644 :    
645 :     # Run formatdb
646 :     &formatdb_file($fig, $all_seqs_file);
647 :    
648 :     # If $fast_color == 0, the complete blast of all vs. all sequences is performed
649 :     # Otherwise, instead of all vs. all, the sequences from a single pinned peg region
650 :     # is blasted against all sequences. If any of the hits are better than a given cutoff
651 :     # ($cutoff_2), the pegs hit are deemed to be 'done', i.e. it is assumed that blasting this
652 :     # peg will not yield additional information for forming the peg sets. These 'done' pegs
653 :     # are then omitted from the blast when the sequences from the region they belong to
654 :     # is blasted.
655 :     my %blast_hit;
656 :     if ( $fast_color )
657 :     {
658 :     my $cutoff_2 = $color_sim_cutoff * 1e-20;
659 :     my %done_with;
660 :    
661 :     foreach my $region ( @$regions )
662 :     {
663 :     # Iterate through each region
664 :     my $fids = $region->{'features'};
665 :    
666 :     # Build up a hash of peg sequences which are not 'done_with'
667 :     my %region_seqs;
668 :     foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} @$fids )
669 :     {
670 :     if ( $sequences->{$peg} and not $done_with{$peg} )
671 :     {
672 :     $region_seqs{$peg} = $sequences->{$peg};
673 :     }
674 :     }
675 :    
676 :     if ( scalar keys %region_seqs )
677 :     {
678 :     # Write the region sequences to a file
679 :     my $region_seqs_file = "$FIG_Config::temp/region_seqs.$$.tmp.fasta";
680 :     &write_seqs_to_file(\%region_seqs, $region_seqs_file);
681 :    
682 :     # BLAST the region sequences against all other sequences
683 :     my $hits = &blast_files($fig, $region_seqs_file, $all_seqs_file, $color_sim_cutoff);
684 :    
685 :     # Build up hash of blast hits
686 :     foreach my $hit ( @$hits )
687 :     {
688 :     my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
689 :    
690 :     if ( $peg1 ne $peg2 )
691 :     {
692 :     $blast_hit{$peg1}{$peg2} = 1;
693 :     $blast_hit{$peg2}{$peg1} = 1;
694 :    
695 :     # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
696 :     # that it will not be blasted with it's region sequences
697 :     if ( $sc <= $cutoff_2 )
698 :     {
699 :     $done_with{$peg2} = 1;
700 :     }
701 :     }
702 :     }
703 :     }
704 :     }
705 :     }
706 :     else
707 :     {
708 :     # BLAST sequence file against itself
709 :     my $hits = &blast_files($fig, $all_seqs_file, $all_seqs_file, $color_sim_cutoff);
710 :    
711 :     # Build up hash of blast hits
712 :     foreach my $hit ( @$hits )
713 :     {
714 :     my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
715 :    
716 :     if ( $peg1 ne $peg2 )
717 :     {
718 :     $blast_hit{$peg1}{$peg2} = 1;
719 :     $blast_hit{$peg2}{$peg1} = 1;
720 :     }
721 :     }
722 :     }
723 :    
724 :     return \%blast_hit;
725 :     }
726 :    
727 :     sub blast_hits_from_sims {
728 :     my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
729 :    
730 :     my %blast_hit;
731 :    
732 :     my $maxN = 2000;
733 :     my $maxP = $color_sim_cutoff;
734 :     my $select = 'fig';
735 :     my $max_expand = '';
736 :     my $filters = '';
737 :    
738 :     # Create a hash of all pegs
739 :     my %region_peg = map {$_ => 1} grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
740 :    
741 :     if ( $fast_color == 1 )
742 :     {
743 :     my $cutoff_2 = $color_sim_cutoff * 1e-20;
744 :     my %done_with;
745 :    
746 :     # Iterate through each peg
747 :     foreach my $peg1 ( keys %region_peg )
748 :     {
749 :     # Skip the 'done_with' pegs
750 :     next if ($done_with{$peg1});
751 :    
752 :     my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
753 :    
754 :     foreach my $sim ( grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
755 :     {
756 :     my($peg2, $sc) = @$sim[1,10];
757 :    
758 :     $blast_hit{$peg1}{$peg2} = 1;
759 :     $blast_hit{$peg2}{$peg1} = 1;
760 :    
761 :     # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
762 :     # that it will not be blasted with it's region sequences
763 :     if ( $sc <= $cutoff_2 )
764 :     {
765 :     $done_with{$peg2} = 1;
766 :     }
767 :     }
768 :     }
769 :     }
770 :     else
771 :     {
772 :     # Iterate through each peg
773 :     foreach my $peg1 ( keys %region_peg )
774 :     {
775 :     my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
776 :    
777 :     foreach my $peg2 ( map {$_->[1]} grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
778 :     {
779 :     $blast_hit{$peg1}{$peg2} = 1;
780 :     $blast_hit{$peg2}{$peg1} = 1;
781 :     }
782 :     }
783 :     }
784 :    
785 :     return \%blast_hit;
786 :     }
787 :    
788 :     sub blast_files {
789 :     my($fig, $input, $database, $cutoff) = @_;
790 :    
791 :     my $cmd = "$FIG_Config::ext_bin/blastall";
792 :     my @args = ('-p', 'blastp', '-i', $input, '-d', $database, '-m', 8, '-e', $cutoff);
793 :     my @blast_out = $fig->run_gathering_output($cmd, @args);
794 :    
795 :     return \@blast_out;
796 :     }
797 :    
798 :     sub formatdb_file {
799 :     my($fig, $file) = @_;
800 :    
801 :     my $cmd = "$FIG_Config::ext_bin/formatdb -i $file -p T";
802 :     $fig->run($cmd);
803 :     }
804 :    
805 :     sub write_seqs_to_file {
806 :     my($seq, $fasta_file) = @_;
807 :    
808 :     open(FASTA, ">$fasta_file") or die "could not create FASTA file '$fasta_file': $!";
809 :     foreach my $peg ( keys %$seq )
810 :     {
811 :     print FASTA ">$peg\n$seq->{$peg}\n";
812 :     }
813 :     close(FASTA) or die "could not close file FASTA file '$fasta_file': $!";
814 :     }
815 :    
816 :     sub get_peg_sequences {
817 :     my($fig, $feature_data) = @_;
818 :     my %sequences;
819 :    
820 :     # Iterate over each peg
821 :     foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data )
822 :     {
823 :     my $seq = $fig->get_translation($peg);
824 :    
825 :     if ( $seq )
826 :     {
827 :     $sequences{$peg} = $seq;
828 :     }
829 :     else
830 :     {
831 :     print STDERR "could not get sqeuence for $peg\n";
832 :     }
833 :     }
834 :    
835 :     return \%sequences;
836 :     }
837 :    
838 :    
839 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3