[Bio] / FigKernelPackages / PinnedRegions.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/PinnedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (view) (download) (as text)

1 : dsouza 1.1 # -*- perl -*-
2 :     ########################################################################
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     ########################################################################
18 :    
19 :    
20 :     package PinnedRegions;
21 :    
22 :     use strict;
23 :     use warnings;
24 :    
25 :     use FIG;
26 : dsouza 1.5 use FigFams;
27 : dsouza 1.1 use FIG_Config;
28 : parrello 1.9 use Tracer;
29 : dsouza 1.1
30 : parrello 1.9 use Time::HiRes qw( usleep gettimeofday tv_interval );
31 : dsouza 1.1
32 :     sub pinned_regions {
33 : bartels 1.8 my($fig, $pin_desc, $fast_color, $sims_from, $map_sz, $add_features) = @_;
34 : parrello 1.11 Trace("Pinned regions method called.") if T(3);
35 : dsouza 1.1 # Get list of pegs required by the description in $pin_desc
36 : dsouza 1.4 my $pinned_pegs = &expand_peg_list($fig, $pin_desc);
37 : parrello 1.9 Trace("Pinned pegs are " . join(", ", @$pinned_pegs) . ".") if T(3);
38 : parrello 1.11 # Filter out the pegs that don't exist.
39 : dsouza 1.1 # Get regions around pinned pegs -- boundaries, features, etc.
40 :     my $regions = &define_regions($fig, $map_sz, $pinned_pegs);
41 :     # Filter out overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
42 :     # are pinned (by similarity or PCH) to the input PEG
43 :     $regions = &filter_regions_1($pin_desc, $regions);
44 :    
45 : dsouza 1.4 # Get information for each feature -- location, strand, function etc.
46 : dsouza 1.1 my $feature_data = &feature_data($fig, $regions);
47 :    
48 : dsouza 1.3 &add_functional_coupling($fig, $pin_desc, $regions, $feature_data);
49 : bartels 1.8 # &add_figfams($fig, $feature_data);
50 : dsouza 1.3 &add_subsystem_data($fig, $pin_desc, $feature_data);
51 : parrello 1.9 Trace("Coloring pegs.") if T(3);
52 : dsouza 1.4 # Assign a set number to some PEGs through transitive closure based on similarity, from blast scores
53 : dsouza 1.1 &color_pegs($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from);
54 :    
55 :     # Filter out regions which have only a single PEG (the pinned one) colored.
56 : dsouza 1.14 # $regions = &filter_regions_2($pin_desc, $regions, $feature_data);
57 : dsouza 1.1
58 : bartels 1.8 if ( defined( $add_features ) ) {
59 : parrello 1.11 &add_features_to_fdata( $fig, $feature_data, $add_features, $regions );
60 : bartels 1.8 }
61 :    
62 : dsouza 1.1 # Add feature data to the regions to make the final maps
63 :     my $maps = &make_maps($fig, $regions, $feature_data);
64 :     return $maps;
65 :     }
66 :    
67 :     sub expand_peg_list {
68 :     my($fig, $pin_desc) = @_;
69 :     my @pegs = ();
70 :    
71 :     # Filter for legitimate genome IDS -- should handle deleted organisms and (in NMPDR) non-NMPDR orgs
72 :     my %ok_genome_id = map {$_ => 1} $fig->genomes;
73 :    
74 : dsouza 1.5 # If the user has selected the genomes to be displayed, filter out all other genomes
75 :     if ( @{ $pin_desc->{'show_genomes'} } )
76 :     {
77 : parrello 1.11 # create new %ok_genome_id hash from intersection of selected genomes and legitimate genome IDs
78 :     %ok_genome_id = map {$_ => 1} grep {$ok_genome_id{$_}} @{ $pin_desc->{'show_genomes'} };
79 : dsouza 1.5 }
80 :    
81 : dsouza 1.1 if ( @{ $pin_desc->{'pegs'} } > 1 )
82 :     {
83 : parrello 1.11 # Input already contains list of pegs, no need for expansion
84 :     @pegs = grep {$ok_genome_id{$fig->genome_of($_)}} @{ $pin_desc->{'pegs'} };
85 : dsouza 1.1 }
86 :     else
87 :     {
88 : parrello 1.11 # Get PCH pinned pegs
89 :     my $pch_pinned_pegs = &pch_pinned_pegs($fig, $pin_desc, \%ok_genome_id);
90 : dsouza 1.1
91 : parrello 1.11 $pch_pinned_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_pch_pins'}, $pch_pinned_pegs);
92 : dsouza 1.1
93 : dsouza 1.14 # Get similarity pinned pegs
94 :     my $sim_pegs = &sim_pegs($fig, $pin_desc, \%ok_genome_id);
95 : dsouza 1.1
96 : parrello 1.11 # Remove PCH PEGs (if any) from similarity pinned PEG list
97 :     my %seen = map {$_ => 1} @$pch_pinned_pegs;
98 :     $sim_pegs = [grep {! $seen{$_}} @$sim_pegs];
99 :    
100 :     $sim_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_sims'}, $sim_pegs);
101 :    
102 :     @pegs = ();
103 :    
104 :     # Get input peg
105 :     my $peg = $pin_desc->{'pegs'}[0];
106 :     if ( $pin_desc->{'sort_by'} eq 'phylogeny' )
107 :     {
108 :     # First add input peg, then sort by phylogeny
109 :     @pegs = $fig->sort_fids_by_taxonomy($peg, @$sim_pegs, @$pch_pinned_pegs);
110 :     }
111 :     else
112 :     {
113 :     # Sort by phylogenetic distance or organism name
114 :     my $g1 = $fig->genome_of($peg);
115 :     @pegs = map {$_->[0] }
116 :     sort {$a->[1] <=> $b->[1] or $a->[2] cmp $b->[2]}
117 :     map {[$_, $fig->crude_estimate_of_distance($g1,$fig->genome_of($_)), $fig->org_of($_)]} @$sim_pegs, @$pch_pinned_pegs;
118 :     # Add input peg at front of list
119 :     unshift @pegs, $peg;
120 :     }
121 : paczian 1.15
122 : dsouza 1.1 }
123 :     return \@pegs
124 :     }
125 :    
126 :     sub pch_pinned_pegs {
127 :     my($fig, $pin_desc, $ok_genome_id) = @_;
128 :    
129 :     my @pegs = ();
130 :    
131 :     if ( $pin_desc->{'n_pch_pins'} > 0 )
132 :     {
133 : parrello 1.11 # Get input peg
134 :     my $peg = $pin_desc->{'pegs'}[0];
135 : dsouza 1.1
136 : parrello 1.11 # Get pegs in pch pin with input peg, and filter out deleted or non-NMPDR organisms
137 :     @pegs = grep {$ok_genome_id->{$fig->genome_of($_)}}
138 :     grep {$_ ne $peg}
139 : dsouza 1.1 map {$_->[0]} $fig->in_pch_pin_with_and_evidence($peg);
140 :     }
141 :    
142 :     return \@pegs;
143 :     }
144 :    
145 :     sub sim_pegs {
146 :     my($fig, $pin_desc, $ok_genome_id) = @_;
147 :    
148 :     # Return a list of PEGS similar to the input PEG, with evalue less than or equal to
149 :     # the user defined similarity cutoff.
150 :     # If the number of sims requested is 0, return the empty list
151 :    
152 :     my $n_sims = $pin_desc->{'n_sims'};
153 :     my $sim_cutoff = $pin_desc->{'sim_cutoff'};
154 :     my @pegs = ();
155 :    
156 :     if ( $n_sims > 0 )
157 :     {
158 : parrello 1.11 my $peg = $pin_desc->{'pegs'}[0];
159 :     my %seen;
160 : paczian 1.15
161 : parrello 1.11 @pegs = grep {$ok_genome_id->{$fig->genome_of($_)}}
162 :     grep {! $seen{$_}++}
163 :     map {$_->[1]} $fig->sims($peg, 10000, $sim_cutoff, "fig");
164 : dsouza 1.1
165 : parrello 1.11 my $n_pegs = @pegs;
166 : dsouza 1.1 }
167 :    
168 :     return \@pegs;
169 :     }
170 :    
171 :     sub select_pegs {
172 :     my($fig, $pin_desc, $n_pegs, $pegs) = @_;
173 :    
174 :     if ( $n_pegs == 0 )
175 :     {
176 : parrello 1.11 return [];
177 : dsouza 1.1 }
178 :    
179 : dsouza 1.3 if ( $pin_desc->{'collapse_close_genomes'} == 1 )
180 : dsouza 1.1 {
181 : parrello 1.11 # The ordering of the PEGs done here is in order to =select= the correct set, the ordering for the
182 :     # display is done later.
183 : dsouza 1.2
184 : parrello 1.11 # To collapse the PEG list, we want to:
185 :     # a. Return at most $n_pegs PEGs,
186 :     # b. include a representative PEG from every genus, subject to a, and
187 :     # c. include a representative PEG from each genus-species, subject to a and b.
188 :     # Individual strains will get represented by a single PEG, chosen arbitrarily.
189 :    
190 :     # The PEG selection is defined by the order of the PEGs. This could be done beter.
191 :    
192 :     my(%seen, @unique_genus, @unique_genus_species);
193 :    
194 :     foreach my $peg ( @$pegs )
195 :     {
196 :     my $org = $fig->org_of($peg);
197 :     # 'org_of' returns undef for deleted PEGs, these need to be omitted from the peg list returned
198 :    
199 :     if ( $org )
200 :     {
201 :     # Use only genus+species to drop strain information
202 :     my($genus, $species) = split(/\s+/, $org);
203 :     my $gs = "$genus $species";
204 :    
205 :     if ( not $seen{$genus}++ )
206 :     {
207 :     # First PEG from this genus, add it to @unique_genus.
208 :     # Mark the genus+species as seen.
209 :     # A subsequent PEG with the same genus and species will be dropped.
210 :     # A subsequent PEG with the same genus but different species will be added to @unique_genus_species.
211 :     $seen{$gs}++;
212 :     push @unique_genus, $peg;
213 :     }
214 :     elsif ( not $seen{$gs}++ )
215 :     {
216 :     # First PEG from this genus+species, add it to @unique_genus_species.
217 :     push @unique_genus_species, $peg;
218 :     }
219 :     }
220 :     }
221 : dsouza 1.1
222 : parrello 1.11 # Keep the unique_genus PEGS at the top, followed by the unique_genus_species
223 :     $pegs = [@unique_genus, @unique_genus_species];
224 : dsouza 1.1 }
225 :    
226 :     # Truncate list if necessary
227 : dsouza 1.3 if ( $n_pegs < @$pegs )
228 : dsouza 1.1 {
229 : parrello 1.11 $#{ $pegs } = $n_pegs - 1;
230 : dsouza 1.1 }
231 :    
232 :     return $pegs;
233 :     }
234 :    
235 :     sub filter_regions_1 {
236 :     my($pin_desc, $regions) = @_;
237 :     my $new_regions;
238 :    
239 :     # Filter out some overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
240 :     # are pinned (by similarity or PCH) to the input PEG
241 :     # Note that all overlaps are NOT eliminated
242 :    
243 :     if ( @{ $pin_desc->{'pegs'} } == 1 )
244 :     {
245 : parrello 1.11 # Input pin description is for a single input peg
246 :     my %seen;
247 :    
248 :     foreach my $region ( @$regions )
249 :     {
250 :     my $pinned_peg = $region->{'pinned_peg'};
251 :    
252 :     # If the pinned peg from a region has already been 'seen', don't
253 :     # add the region into the @$new_regions array
254 :    
255 :     if ( not $seen{$pinned_peg} )
256 :     {
257 :     push @$new_regions, $region;
258 :    
259 :     my $fids = $region->{'features'};
260 :     foreach my $fid ( @$fids )
261 :     {
262 :     $seen{$fid} = 1;
263 :     }
264 :     }
265 :     }
266 : dsouza 1.1 }
267 :     else
268 :     {
269 : parrello 1.11 # input PEGs is a list -- keep all of them
270 :     $new_regions = $regions;
271 : dsouza 1.1 }
272 :    
273 :     return $new_regions;
274 :     }
275 :    
276 :     sub filter_regions_2 {
277 :     my($pin_desc, $regions, $feature_data) = @_;
278 :     my $new_regions;
279 :    
280 :     if ( @{ $pin_desc->{'pegs'} } == 1 )
281 :     {
282 : parrello 1.11 # Input is single peg
283 :     my $input_peg = $pin_desc->{'pegs'}[0];
284 : dsouza 1.1
285 : parrello 1.11 foreach my $region ( @$regions )
286 :     {
287 :     my $fids = $region->{'features'};
288 :     my $n_in_sets = grep {$feature_data->{$_}{'set_number'}} @$fids;
289 :    
290 :     if ( $n_in_sets > 1 or $region->{'pinned_peg'} eq $input_peg )
291 :     {
292 :     # Regions should be displayed only if:
293 :     # a. more than one PEG is in a 'colored set' OR
294 :     # b. the pinned PEG is the input PEG
295 :     push @$new_regions, $region;
296 :     }
297 :     }
298 : dsouza 1.1 }
299 :     else
300 :     {
301 : parrello 1.11 # Input is a list of pegs -- keep all of the regions
302 :     $new_regions = $regions
303 : dsouza 1.1 }
304 :    
305 :     return $new_regions;
306 :     }
307 :    
308 :     sub make_maps {
309 :     my($fig, $regions, $feature_data) = @_;
310 :    
311 :     foreach my $region ( @$regions )
312 :     {
313 : parrello 1.11 my $features = [];
314 :     my $fids = $region->{'features'};
315 : dsouza 1.1
316 : parrello 1.11 foreach my $fid ( @$fids )
317 :     {
318 :     push @$features, $feature_data->{$fid};
319 :     # if ( !( $fid =~ /fig/ ) ) {
320 :     # print STDERR $fid." FIDMAKEMAPS\n";
321 :     # }
322 :     }
323 : dsouza 1.1
324 : parrello 1.11 $region->{'features'} = $features;
325 : dsouza 1.1 }
326 :    
327 :     return $regions;
328 :     }
329 :    
330 :     sub define_regions {
331 :     my($fig, $map_sz, $pinned_pegs) = @_;
332 :    
333 :     # Define the 'region' around the pinned peg, get features, and store some region information
334 :     my $regions = [];
335 :     my $half_map_sz = int($map_sz/2);
336 :    
337 :     foreach my $peg ( @$pinned_pegs )
338 :     {
339 : parrello 1.11 my $genome = $fig->genome_of($peg);
340 :     my $loc = $fig->feature_location($peg);
341 :     # We only proceed if a location was found. Failure to find a location indicates
342 :     # that the feature is deleted.
343 :     if ($loc) {
344 :     my($contig, $beg, $end) = $fig->boundaries_of($loc);
345 :    
346 :     my $region_mid = int(($beg + $end)/2);
347 :     my $region_beg = $region_mid - $half_map_sz;
348 :     my $region_end = $region_mid + $half_map_sz;
349 :    
350 :     my($fids) = $fig->genes_in_region($genome, $contig, $region_beg, $region_end);
351 :    
352 :     my $region = {};
353 :     $region->{'genome_id'} = $fig->genome_of($peg);
354 :     $region->{'org_name'} = $fig->genus_species($region->{'genome_id'});
355 :     $region->{'contig'} = $contig;
356 :     $region->{'beg'} = $region_beg;
357 :     $region->{'mid'} = $region_mid;
358 :     $region->{'end'} = $region_end;
359 :     $region->{'features'} = $fids;
360 :     $region->{'pinned_peg_strand'} = ($beg <= $end)? '+' : '-';
361 :     $region->{'pinned_peg'} = $peg;
362 :    
363 :     push @$regions, $region;
364 :     }
365 : dsouza 1.1 }
366 :    
367 :     return $regions;
368 :     }
369 :    
370 : bartels 1.8 sub add_features_to_fdata {
371 :     my ( $fig, $feature_data, $add_features, $regions ) = @_;
372 :    
373 :     foreach my $region ( @$regions ) {
374 : parrello 1.11 my $new_feats = $add_features->{ $region->{ 'genome_id' } };
375 :     foreach my $nf ( @$new_feats ) {
376 :     if ( $nf->{ 'contig' } eq $region->{ 'contig' } &&
377 :     $nf->{ 'start' } < $region->{ 'end' } &&
378 :     $nf->{ 'start' } > $region->{ 'beg' } ) {
379 :     $feature_data->{ $nf->{ 'name' } } = &new_feature_entry( $fig, $nf->{ 'name' }, $region->{ 'mid' }, $region->{ 'contig' }, $nf->{ 'start' }, $nf->{ 'stop' }, $nf->{ 'type' }, $nf->{ 'function' } );
380 :     push @{ $region->{ 'features' } }, $nf->{ 'name' };
381 :     }
382 :     }
383 : bartels 1.8 }
384 :     }
385 :    
386 :     sub new_feature_entry {
387 :     my ( $fig, $fid, $region_mid, $contig, $beg, $end, $type, $func ) = @_;
388 :    
389 :     if ( !defined( $type ) ) {
390 : parrello 1.11 $type = 'unknown';
391 : bartels 1.8 }
392 :    
393 :     if ( !defined( $func ) ) {
394 : parrello 1.11 $func = '';
395 : bartels 1.8 }
396 :    
397 :     my($left, $right) = sort {$a <=> $b} ($beg, $end);
398 :     my $size = $right - $left + 1;
399 :     my $strand = ($beg <= $end)? '+' : '-';
400 :     my $offset = int(($left + $right)/2) - $region_mid;
401 :     my $offset_beg = $left - $region_mid;
402 :     my $offset_end = $right - $region_mid;
403 :    
404 :     return {
405 : parrello 1.11 'fid' => $fid,
406 :     'type' => $type,
407 :     'contig' => $contig,
408 :     'beg' => $beg,
409 :     'end' => $end,
410 :     'size' => $size,
411 :     'strand' => $strand,
412 :     'offset' => $offset,
413 :     'offset_beg' => $offset_beg,
414 :     'offset_end' => $offset_end,
415 :     'function' => $func,
416 : bartels 1.8 };
417 :     }
418 :    
419 : dsouza 1.1 sub feature_data {
420 :     my($fig, $regions) = @_;
421 :     my %feature_data;
422 :    
423 :     foreach my $region ( @$regions )
424 :     {
425 : parrello 1.11 my $region_mid = $region->{'mid'};
426 :     my $fids = $region->{'features'};
427 : dsouza 1.1
428 : parrello 1.11 foreach my $fid ( @$fids )
429 :     {
430 :     # Get feature data if this feature has not been seen before, this avoids repeating
431 :     # this step when pegs occur in multiple regions
432 :     if ( not exists $feature_data{$fid} )
433 :     {
434 :     $feature_data{$fid} = &feature_entry($fig, $fid, $region_mid);
435 :     }
436 :     }
437 : dsouza 1.1 }
438 :    
439 :     return \%feature_data;
440 :     }
441 :    
442 :     sub feature_entry {
443 :     my($fig, $fid, $region_mid) = @_;
444 :    
445 :     my $type = $fig->ftype($fid);
446 :     my $loc = $fig->feature_location($fid);
447 :     my($contig, $beg, $end) = $fig->boundaries_of($loc);
448 :     my($left, $right) = sort {$a <=> $b} ($beg, $end);
449 : dsouza 1.3 my $size = $right - $left + 1;
450 : dsouza 1.1 my $strand = ($beg <= $end)? '+' : '-';
451 :     my $offset = int(($left + $right)/2) - $region_mid;
452 :     my $offset_beg = $left - $region_mid;
453 :     my $offset_end = $right - $region_mid;
454 :     my $func = scalar $fig->function_of($fid) || '';
455 :    
456 :     return {
457 : parrello 1.11 'fid' => $fid,
458 :     'type' => $type,
459 :     'contig' => $contig,
460 :     'beg' => $beg,
461 :     'end' => $end,
462 :     'size' => $size,
463 :     'strand' => $strand,
464 :     'offset' => $offset,
465 :     'offset_beg' => $offset_beg,
466 :     'offset_end' => $offset_end,
467 :     'function' => $func,
468 :     };
469 : dsouza 1.1 }
470 :    
471 : dsouza 1.3 sub add_functional_coupling {
472 :     my($fig, $pin_desc, $regions, $feature_data) = @_;
473 :    
474 :     my $fc_cutoff = defined($pin_desc->{'fc_cutoff'})? $pin_desc->{'fc_cutoff'} : 4;
475 :    
476 :     foreach my $region ( @$regions )
477 :     {
478 : parrello 1.11 my $pin = $region->{'pinned_peg'};
479 :     my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} @{ $region->{'features'} };
480 : dsouza 1.3
481 : parrello 1.11 foreach my $couple ( $fig->coupled_to_batch(@pegs) )
482 :     {
483 :     my($peg1, $peg2, $sc) = @$couple;
484 :    
485 :     if ( $peg1 eq $pin and $sc >= $fc_cutoff )
486 :     {
487 :     $feature_data->{$peg2}{'fc_score'} = $sc;
488 : paczian 1.15 $feature_data->{$peg2}{'fc_pin'} = $peg1;
489 : parrello 1.11 }
490 :     }
491 : dsouza 1.3 }
492 :     }
493 :    
494 : dsouza 1.5 sub add_figfams {
495 :     my($fig, $feature_data) = @_;
496 :    
497 : dsouza 1.6 # Get FigFams directory from config file
498 : olson 1.10 my $figfam_dir = $fig->get_figfams_data();
499 : dsouza 1.5
500 : dsouza 1.6 # Check if FigFams directory is defined and exists on current machine
501 :     if ( defined($figfam_dir) and (-d $figfam_dir) )
502 : dsouza 1.5 {
503 : parrello 1.11 # Get all PEG IDs
504 :     my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
505 :     # Get $figfams object
506 :     my $figfams = new FigFams($fig, $figfam_dir);
507 :     # Get FigFam family ID for @pegs
508 :     my $figfam = $figfams->families_containing_peg_bulk(\@pegs);
509 :     # Get hash of FigFam ID to family function
510 :     my $family_function = $figfams->family_functions();
511 :    
512 :     # Some FigFams (the ones that are not subsystem related) have the FigFam ID in the family function.
513 :     # This results in the popup displaying the ID twice, so one should be removed.
514 :    
515 :     my %figfam_text;
516 :     foreach my $fid ( keys %$feature_data )
517 :     {
518 :     if ( $figfam->{$fid} )
519 :     {
520 :     my $figfam_id = $figfam->{$fid};
521 :     if ( ! exists $figfam_text{$figfam_id} ) {
522 :     if ( $family_function->{$figfam_id} =~ /^FIG\d+/ ) {
523 :     $figfam_text{$figfam_id} = $family_function->{$figfam_id};
524 :     } else {
525 :     $figfam_text{$figfam_id} = $figfam_id . ': ' . $family_function->{$figfam_id};
526 :     }
527 :     }
528 : dsouza 1.7
529 : parrello 1.11 # Add FigFam information to hash -- to go into the popup text
530 :     $feature_data->{$fid}{'figfam'} = $figfam_text{$figfam_id};
531 :     }
532 :     }
533 : dsouza 1.5 }
534 :     }
535 :    
536 : dsouza 1.3 sub add_subsystem_data {
537 :     my($fig, $pin_desc, $feature_data) = @_;
538 : dsouza 1.14
539 :     # Get subsystem_information for =all= pegs
540 :    
541 :     my %peg_to_ss;
542 :     foreach my $fid ( keys %$feature_data )
543 :     {
544 :     if ( $feature_data->{$fid}{'type'} eq 'peg' )
545 :     {
546 :     my @subsystems = grep { $fig->usable_subsystem($_->[0],1) } $fig->subsystems_for_peg_complete($fid, 1);
547 :    
548 :     foreach my $rec ( @subsystems )
549 :     {
550 :     my($ss_name, $role, $variant, $is_auxiliary) = @$rec;
551 :     $ss_name =~ s/_/ /g;
552 :    
553 :     if ( $variant eq '0' )
554 :     {
555 :     # no subsystem
556 :     }
557 :     elsif ( $variant eq '-1' or $variant eq '*-1' )
558 :     {
559 :     # subsystem not functional in this organism
560 :     my $ss_text = "$ss_name (classified 'not active' in this organism)";
561 :     $peg_to_ss{$fid}{$ss_text} = 1;
562 :     }
563 :     else
564 :     {
565 :     $peg_to_ss{$fid}{$ss_name} = 1;
566 :     }
567 :     }
568 :     }
569 :     }
570 :    
571 : dsouza 1.3 # Count number of occurences of each subsystem
572 :     my %ss_count;
573 : dsouza 1.14 foreach my $fid ( keys %peg_to_ss )
574 : dsouza 1.3 {
575 : dsouza 1.14 foreach my $ss ( keys %{ $peg_to_ss{$fid} } )
576 :     {
577 :     $ss_count{$ss}++;
578 :     }
579 : dsouza 1.3 }
580 :    
581 :     # Sort subsystems based on count and create hash with unique index for each subsystem where
582 :     # lower indices correspond to higher number of occurences of the subsystem.
583 :     my %ss_index;
584 :     my $index = 0;
585 :     foreach my $ss ( sort {$ss_count{$b} <=> $ss_count{$a} or $a cmp $b} keys %ss_count )
586 :     {
587 : parrello 1.11 $ss_index{$ss} = ++$index;
588 : dsouza 1.3 }
589 :    
590 :     # Add subsystem information for pegs in subsystems
591 :     foreach my $fid ( keys %peg_to_ss )
592 :     {
593 : dsouza 1.14 my @subsystems = keys %{ $peg_to_ss{$fid} };
594 : parrello 1.11
595 :     if ( @subsystems )
596 :     {
597 : dsouza 1.14 $feature_data->{$fid}{'subsystems'} = [sort {$a->[1] <=> $b->[1]} map {[$_, $ss_index{$_}]} @subsystems];
598 : parrello 1.11 }
599 : dsouza 1.3 }
600 :     }
601 :    
602 : dsouza 1.1 sub color_pegs {
603 :     my($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from) = @_;
604 :    
605 :     # Run blastp and get connected pegs
606 :     my $blast_hit = {};
607 :     my $color_sim_cutoff = $pin_desc->{'color_sim_cutoff'};
608 :    
609 :     if ( $sims_from eq 'blast' )
610 :     {
611 : parrello 1.11 $blast_hit = &blast_hits($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
612 : dsouza 1.1 }
613 :     else
614 :     {
615 : parrello 1.11 $blast_hit = &blast_hits_from_sims($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
616 : dsouza 1.1 }
617 :    
618 :     # Assign set numbers to pegs based on blast scores
619 :     my $color_sets = &partition_pegs($blast_hit);
620 :    
621 :     # Sort peg sets to that a) larger sets have smaller set numbers, and
622 :     # b) sets closer to the pinned peg have smaller set numbers
623 : dsouza 1.4 $color_sets = &sort_color_sets($pin_desc, $pinned_pegs, $color_sets, $feature_data);
624 : dsouza 1.1
625 :     # Add color set number into $feature_data
626 :     for (my $i = 0; $i < @$color_sets; $i++)
627 :     {
628 : parrello 1.11 # Use an index that starts at 1 (matches with coloring index)
629 :     my $n = $i + 1;
630 :     foreach my $peg ( @{ $color_sets->[$i] } )
631 :     {
632 :     $feature_data->{$peg}{'set_number'} = $n;
633 :    
634 :     # If this is the pinned set, set the 'color' to 'red'
635 :     if ( $n == 1 )
636 :     {
637 :     $feature_data->{$peg}{'color'} = 'red';
638 :     }
639 :     }
640 : dsouza 1.1 }
641 :     }
642 :    
643 :     sub sort_color_sets {
644 :     my($pin_desc, $pinned_pegs, $color_sets, $feature_data) = @_;
645 :    
646 : dsouza 1.3 # Find the color set containing the set of pinned pegs, i.e. the set containing the input peg.
647 :     # If the input peg is found (or if input is a list of pegs, returned value is the array index for the
648 :     # set.
649 :     # If the input peg is not found, the returned value is an reference to an array containing the input peg.
650 :     my $set = &pinned_set($pin_desc, $pinned_pegs, $color_sets);
651 : dsouza 1.1
652 : dsouza 1.3 my $pinned_color_set;
653 :     if ( $set =~ /^\d+$/ )
654 :     {
655 : parrello 1.11 # Splice out the color set containing the input peg
656 :     ($pinned_color_set) = splice(@$color_sets,$set,1);
657 : dsouza 1.3 }
658 :     else
659 :     {
660 : parrello 1.11 $pinned_color_set = $set;
661 : dsouza 1.3 }
662 : dsouza 1.1
663 : dsouza 1.3 # Add offset (summed distance from
664 :     my @color_sets = map {[$_, &offset($_, $feature_data)]} @$color_sets;
665 : dsouza 1.1 # Sort the remaining color sets based on:
666 :     # a. size (number of pegs) and
667 :     # b. offset from mid-point of region
668 :     @$color_sets = map {$_->[0]}
669 :     sort {@{$b->[0]} <=> @{$a->[0]} or $a->[1] <=> $b->[1]}
670 : dsouza 1.3 # sort {$a->[1] <=> $b->[1] or @{$b->[0]} <=> @{$a->[0]}}
671 : dsouza 1.1 map {[$_, &offset($_, $feature_data)]} @$color_sets;
672 :    
673 :     # Add the set of pinned pegs at the front of the returned list so that it gets colored red
674 :     return [$pinned_color_set, @$color_sets];
675 :     }
676 :    
677 : dsouza 1.3 sub pinned_set {
678 : dsouza 1.1 my($pin_desc, $pinned_pegs, $color_sets) = @_;
679 :    
680 : dsouza 1.3 # Returns an integer if the input is a peg-list or if the input peg is in a set.
681 :     # Returns the input peg (as an array reference) if the input peg is not in a set.
682 :    
683 : dsouza 1.1 if ( @{ $pin_desc->{'pegs'} } == 1 )
684 :     {
685 : parrello 1.11 # Get input peg if it exists -- the set containing this peg should be colored red
686 :     my $peg = $pin_desc->{'pegs'}[0];
687 : dsouza 1.1
688 : parrello 1.11 # Iterate through the color sets until you find the set containing the input peg
689 :     for (my $i = 0; $i < @$color_sets; $i++)
690 :     {
691 :     foreach my $peg2 ( @{ $color_sets->[$i] } )
692 :     {
693 :     if ( $peg2 eq $peg )
694 :     {
695 :     # Return the set index
696 :     return $i;
697 :     }
698 :     }
699 :     }
700 : dsouza 1.1
701 : parrello 1.11 # The input peg may not be in a set if there is only one region or the coloring cutoff is very stringent.
702 :     return [$peg];
703 : dsouza 1.1 }
704 :     else
705 :     {
706 : parrello 1.11 # Input is a peg-list, which may be split into multiple sets -- the largest will be called the
707 :     # "pinned" set.
708 : parrello 1.12 my $i;
709 :     my $max_count = 0;
710 : parrello 1.11 my %pinned = map {$_ => 1} @$pinned_pegs;
711 :    
712 :     for (my $j = 0; $j < @$color_sets; $j++)
713 :     {
714 :     # count how many pinned pegs are found in each set
715 : parrello 1.12 my $count = scalar(grep {$pinned{$_}} @{ $color_sets->[$j] }) || 0;
716 : parrello 1.11
717 :     if ( $max_count < $count )
718 :     {
719 :     # Keep track of the set having the largest number of pinned pegs
720 :     $max_count = $count;
721 :     $i = $j;
722 :     }
723 :     }
724 : dsouza 1.1
725 : parrello 1.11 return $i;
726 : dsouza 1.1 }
727 :     }
728 :    
729 :     sub offset {
730 :     my($set, $feature_data) = @_;
731 :    
732 :     my $offset;
733 :     foreach my $peg ( @$set )
734 :     {
735 : parrello 1.11 $offset += abs($feature_data->{$peg}{'offset'});
736 : dsouza 1.1 }
737 : dsouza 1.3
738 :     return sprintf("%.2f", $offset/@$set);
739 : dsouza 1.1 }
740 :    
741 :     sub partition_pegs {
742 :     my($blast_hit) = @_;
743 :    
744 :     my %seen;
745 :     my $sets = [];
746 :    
747 :     # Iterate through the pegs with blast hits in arbitrary order
748 :     foreach my $peg1 ( keys %$blast_hit )
749 :     {
750 : parrello 1.11 # If it has not been 'seen' (and placed in a set) use it as the seed for a set
751 :     if ( not $seen{$peg1} )
752 :     {
753 :     my $set = [$peg1];
754 :    
755 :     # Mark $peg1 as seen
756 :     $seen{$peg1} = 1;
757 :    
758 :     # Grow set through transitive closure -- note that @$set keeps growing
759 :     for (my $i = 0; $i < @$set; $i++)
760 :     {
761 :     $peg1 = $set->[$i];
762 :    
763 :     # Iterate through blast hits
764 :     foreach my $peg2 ( keys %{ $blast_hit->{$peg1} } )
765 :     {
766 :     # If $peg2 has not been seen, put it in the set, and mark it as seen
767 :     if ( not exists $seen{$peg2} )
768 :     {
769 :     push @$set, $peg2;
770 :     $seen{$peg2} = 1;
771 :     }
772 :     }
773 :     }
774 :    
775 :     # Add the new set to the set of sets
776 :     push @$sets, $set;
777 :     }
778 : dsouza 1.1 }
779 :    
780 :     return $sets;
781 :     }
782 :    
783 :     sub blast_hits {
784 :     my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
785 :     my($t0, $dt);
786 :    
787 :     # Set blast environment variables
788 :     $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";
789 :     if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }
790 :    
791 :     # Get amino acid sequences
792 : dsouza 1.4 my $sequences = &get_peg_sequences($fig, $feature_data);
793 : dsouza 1.1
794 :     # Write the entire set of sequences to a fasta file
795 :     my $all_seqs_file = "$FIG_Config::temp/all_seqs.$$.tmp.fasta";
796 :     &write_seqs_to_file($sequences, $all_seqs_file);
797 :    
798 :     # Run formatdb
799 :     &formatdb_file($fig, $all_seqs_file);
800 :    
801 :     # If $fast_color == 0, the complete blast of all vs. all sequences is performed
802 :     # Otherwise, instead of all vs. all, the sequences from a single pinned peg region
803 :     # is blasted against all sequences. If any of the hits are better than a given cutoff
804 :     # ($cutoff_2), the pegs hit are deemed to be 'done', i.e. it is assumed that blasting this
805 :     # peg will not yield additional information for forming the peg sets. These 'done' pegs
806 :     # are then omitted from the blast when the sequences from the region they belong to
807 :     # is blasted.
808 :     my %blast_hit;
809 :     if ( $fast_color )
810 :     {
811 : parrello 1.11 my $cutoff_2 = $color_sim_cutoff * 1e-20;
812 :     my %done_with;
813 : dsouza 1.1
814 : parrello 1.11 foreach my $region ( @$regions )
815 :     {
816 :     # Iterate through each region
817 :     my $fids = $region->{'features'};
818 :    
819 :     # Build up a hash of peg sequences which are not 'done_with'
820 :     my %region_seqs;
821 :     foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} @$fids )
822 :     {
823 :     if ( $sequences->{$peg} and not $done_with{$peg} )
824 :     {
825 :     $region_seqs{$peg} = $sequences->{$peg};
826 :     }
827 :     }
828 :    
829 :     if ( scalar keys %region_seqs )
830 :     {
831 :     # Write the region sequences to a file
832 :     my $region_seqs_file = "$FIG_Config::temp/region_seqs.$$.tmp.fasta";
833 :     &write_seqs_to_file(\%region_seqs, $region_seqs_file);
834 :    
835 :     # BLAST the region sequences against all other sequences
836 :     my $hits = &blast_files($fig, $region_seqs_file, $all_seqs_file, $color_sim_cutoff);
837 :    
838 :     # Build up hash of blast hits
839 :     foreach my $hit ( @$hits )
840 :     {
841 :     my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
842 :    
843 :     if ( $peg1 ne $peg2 )
844 :     {
845 :     $blast_hit{$peg1}{$peg2} = 1;
846 :     $blast_hit{$peg2}{$peg1} = 1;
847 :    
848 :     # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
849 :     # that it will not be blasted with it's region sequences
850 :     if ( $sc <= $cutoff_2 )
851 :     {
852 :     $done_with{$peg2} = 1;
853 :     }
854 :     }
855 :     }
856 :     }
857 :     }
858 : dsouza 1.1 }
859 :     else
860 :     {
861 : parrello 1.11 # BLAST sequence file against itself
862 :     my $hits = &blast_files($fig, $all_seqs_file, $all_seqs_file, $color_sim_cutoff);
863 : dsouza 1.1
864 : parrello 1.11 # Build up hash of blast hits
865 :     foreach my $hit ( @$hits )
866 :     {
867 :     my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
868 :    
869 :     if ( $peg1 ne $peg2 )
870 :     {
871 :     $blast_hit{$peg1}{$peg2} = 1;
872 :     $blast_hit{$peg2}{$peg1} = 1;
873 :     }
874 :     }
875 : dsouza 1.1 }
876 :    
877 :     return \%blast_hit;
878 :     }
879 :    
880 :     sub blast_hits_from_sims {
881 :     my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
882 :    
883 :     my %blast_hit;
884 :    
885 :     my $maxN = 2000;
886 :     my $maxP = $color_sim_cutoff;
887 :     my $select = 'fig';
888 :     my $max_expand = '';
889 :     my $filters = '';
890 :    
891 :     # Create a hash of all pegs
892 :     my %region_peg = map {$_ => 1} grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
893 :    
894 :     if ( $fast_color == 1 )
895 :     {
896 : parrello 1.11 my $cutoff_2 = $color_sim_cutoff * 1e-20;
897 :     my %done_with;
898 : dsouza 1.1
899 : parrello 1.11 # Iterate through each peg
900 :     foreach my $peg1 ( keys %region_peg )
901 :     {
902 :     # Skip the 'done_with' pegs
903 :     next if ($done_with{$peg1});
904 :    
905 :     my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
906 :    
907 :     foreach my $sim ( grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
908 :     {
909 :     my($peg2, $sc) = @$sim[1,10];
910 :    
911 :     $blast_hit{$peg1}{$peg2} = 1;
912 :     $blast_hit{$peg2}{$peg1} = 1;
913 :    
914 :     # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
915 :     # that it will not be blasted with it's region sequences
916 :     if ( $sc <= $cutoff_2 )
917 :     {
918 :     $done_with{$peg2} = 1;
919 :     }
920 :     }
921 :     }
922 : dsouza 1.1 }
923 :     else
924 :     {
925 : parrello 1.11 # Iterate through each peg
926 :     foreach my $peg1 ( keys %region_peg )
927 :     {
928 :     my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
929 :    
930 :     foreach my $peg2 ( map {$_->[1]} grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
931 :     {
932 :     $blast_hit{$peg1}{$peg2} = 1;
933 :     $blast_hit{$peg2}{$peg1} = 1;
934 :     }
935 :     }
936 : dsouza 1.1 }
937 :    
938 :     return \%blast_hit;
939 :     }
940 :    
941 :     sub blast_files {
942 :     my($fig, $input, $database, $cutoff) = @_;
943 :    
944 :     my $cmd = "$FIG_Config::ext_bin/blastall";
945 :     my @args = ('-p', 'blastp', '-i', $input, '-d', $database, '-m', 8, '-e', $cutoff);
946 :     my @blast_out = $fig->run_gathering_output($cmd, @args);
947 :    
948 :     return \@blast_out;
949 :     }
950 :    
951 :     sub formatdb_file {
952 :     my($fig, $file) = @_;
953 :    
954 :     my $cmd = "$FIG_Config::ext_bin/formatdb -i $file -p T";
955 :     $fig->run($cmd);
956 :     }
957 :    
958 :     sub write_seqs_to_file {
959 :     my($seq, $fasta_file) = @_;
960 :    
961 :     open(FASTA, ">$fasta_file") or die "could not create FASTA file '$fasta_file': $!";
962 :     foreach my $peg ( keys %$seq )
963 :     {
964 : parrello 1.11 print FASTA ">$peg\n$seq->{$peg}\n";
965 : dsouza 1.1 }
966 :     close(FASTA) or die "could not close file FASTA file '$fasta_file': $!";
967 :     }
968 :    
969 :     sub get_peg_sequences {
970 :     my($fig, $feature_data) = @_;
971 :     my %sequences;
972 :    
973 :     # Iterate over each peg
974 :     foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data )
975 :     {
976 : parrello 1.11 my $seq = $fig->get_translation($peg);
977 : dsouza 1.1
978 : parrello 1.11 if ( $seq )
979 :     {
980 :     $sequences{$peg} = $seq;
981 :     }
982 :     else
983 :     {
984 :     print STDERR "could not get sqeuence for $peg\n";
985 :     }
986 : dsouza 1.1 }
987 :    
988 :     return \%sequences;
989 :     }
990 :    
991 :    
992 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3