[Bio] / FigKernelPackages / PinnedRegions.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/PinnedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (view) (download) (as text)

1 : dsouza 1.1 # -*- perl -*-
2 :     ########################################################################
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     ########################################################################
18 :    
19 :    
20 :     package PinnedRegions;
21 :    
22 :     use strict;
23 :     use warnings;
24 :    
25 :     use FIG;
26 : dsouza 1.5 use FigFams;
27 : dsouza 1.1 use FIG_Config;
28 : parrello 1.9 use Tracer;
29 : dsouza 1.1
30 : parrello 1.9 use Time::HiRes qw( usleep gettimeofday tv_interval );
31 : dsouza 1.1
32 :     sub pinned_regions {
33 : bartels 1.8 my($fig, $pin_desc, $fast_color, $sims_from, $map_sz, $add_features) = @_;
34 : parrello 1.11 Trace("Pinned regions method called.") if T(3);
35 : dsouza 1.1 # Get list of pegs required by the description in $pin_desc
36 : dsouza 1.4 my $pinned_pegs = &expand_peg_list($fig, $pin_desc);
37 : parrello 1.9 Trace("Pinned pegs are " . join(", ", @$pinned_pegs) . ".") if T(3);
38 : parrello 1.11 # Filter out the pegs that don't exist.
39 : dsouza 1.1 # Get regions around pinned pegs -- boundaries, features, etc.
40 :     my $regions = &define_regions($fig, $map_sz, $pinned_pegs);
41 :     # Filter out overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
42 :     # are pinned (by similarity or PCH) to the input PEG
43 :     $regions = &filter_regions_1($pin_desc, $regions);
44 :    
45 : dsouza 1.4 # Get information for each feature -- location, strand, function etc.
46 : dsouza 1.1 my $feature_data = &feature_data($fig, $regions);
47 :    
48 : dsouza 1.3 &add_functional_coupling($fig, $pin_desc, $regions, $feature_data);
49 : bartels 1.8 # &add_figfams($fig, $feature_data);
50 : dsouza 1.3 &add_subsystem_data($fig, $pin_desc, $feature_data);
51 : parrello 1.9 Trace("Coloring pegs.") if T(3);
52 : dsouza 1.4 # Assign a set number to some PEGs through transitive closure based on similarity, from blast scores
53 : dsouza 1.1 &color_pegs($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from);
54 :    
55 :     # Filter out regions which have only a single PEG (the pinned one) colored.
56 : dsouza 1.3 # $regions = &filter_regions_2($pin_desc, $regions, $feature_data);
57 : dsouza 1.1
58 : bartels 1.8 if ( defined( $add_features ) ) {
59 : parrello 1.11 &add_features_to_fdata( $fig, $feature_data, $add_features, $regions );
60 : bartels 1.8 }
61 :    
62 : dsouza 1.1 # Add feature data to the regions to make the final maps
63 :     my $maps = &make_maps($fig, $regions, $feature_data);
64 :     return $maps;
65 :     }
66 :    
67 :     sub expand_peg_list {
68 :     my($fig, $pin_desc) = @_;
69 :     my @pegs = ();
70 :    
71 :     # Filter for legitimate genome IDS -- should handle deleted organisms and (in NMPDR) non-NMPDR orgs
72 :     my %ok_genome_id = map {$_ => 1} $fig->genomes;
73 :    
74 : dsouza 1.5 # If the user has selected the genomes to be displayed, filter out all other genomes
75 :     if ( @{ $pin_desc->{'show_genomes'} } )
76 :     {
77 : parrello 1.11 # create new %ok_genome_id hash from intersection of selected genomes and legitimate genome IDs
78 :     %ok_genome_id = map {$_ => 1} grep {$ok_genome_id{$_}} @{ $pin_desc->{'show_genomes'} };
79 : dsouza 1.5 }
80 :    
81 : dsouza 1.1 if ( @{ $pin_desc->{'pegs'} } > 1 )
82 :     {
83 : parrello 1.11 # Input already contains list of pegs, no need for expansion
84 :     @pegs = grep {$ok_genome_id{$fig->genome_of($_)}} @{ $pin_desc->{'pegs'} };
85 : dsouza 1.1 }
86 :     else
87 :     {
88 : parrello 1.11 # Get PCH pinned pegs
89 :     my $pch_pinned_pegs = &pch_pinned_pegs($fig, $pin_desc, \%ok_genome_id);
90 : dsouza 1.1
91 : parrello 1.11 $pch_pinned_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_pch_pins'}, $pch_pinned_pegs);
92 : dsouza 1.1
93 : parrello 1.11 # Get similarity pinned pegs
94 :     my $sim_pegs = &sim_pegs($fig, $pin_desc, \%ok_genome_id);
95 : dsouza 1.1
96 : parrello 1.11 # Remove PCH PEGs (if any) from similarity pinned PEG list
97 :     my %seen = map {$_ => 1} @$pch_pinned_pegs;
98 :     $sim_pegs = [grep {! $seen{$_}} @$sim_pegs];
99 :    
100 :     $sim_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_sims'}, $sim_pegs);
101 :    
102 :     @pegs = ();
103 :    
104 :     # Get input peg
105 :     my $peg = $pin_desc->{'pegs'}[0];
106 :     if ( $pin_desc->{'sort_by'} eq 'phylogeny' )
107 :     {
108 :     # First add input peg, then sort by phylogeny
109 :     @pegs = $fig->sort_fids_by_taxonomy($peg, @$sim_pegs, @$pch_pinned_pegs);
110 :     }
111 :     else
112 :     {
113 :     # Sort by phylogenetic distance or organism name
114 :     my $g1 = $fig->genome_of($peg);
115 :     @pegs = map {$_->[0] }
116 :     sort {$a->[1] <=> $b->[1] or $a->[2] cmp $b->[2]}
117 :     map {[$_, $fig->crude_estimate_of_distance($g1,$fig->genome_of($_)), $fig->org_of($_)]} @$sim_pegs, @$pch_pinned_pegs;
118 :     # Add input peg at front of list
119 :     unshift @pegs, $peg;
120 :     }
121 : dsouza 1.1 }
122 :     return \@pegs
123 :     }
124 :    
125 :     sub pch_pinned_pegs {
126 :     my($fig, $pin_desc, $ok_genome_id) = @_;
127 :    
128 :     my @pegs = ();
129 :    
130 :     if ( $pin_desc->{'n_pch_pins'} > 0 )
131 :     {
132 : parrello 1.11 # Get input peg
133 :     my $peg = $pin_desc->{'pegs'}[0];
134 : dsouza 1.1
135 : parrello 1.11 # Get pegs in pch pin with input peg, and filter out deleted or non-NMPDR organisms
136 :     @pegs = grep {$ok_genome_id->{$fig->genome_of($_)}}
137 :     grep {$_ ne $peg}
138 : dsouza 1.1 map {$_->[0]} $fig->in_pch_pin_with_and_evidence($peg);
139 :     }
140 :    
141 :     return \@pegs;
142 :     }
143 :    
144 :     sub sim_pegs {
145 :     my($fig, $pin_desc, $ok_genome_id) = @_;
146 :    
147 :     # Return a list of PEGS similar to the input PEG, with evalue less than or equal to
148 :     # the user defined similarity cutoff.
149 :     # If the number of sims requested is 0, return the empty list
150 :    
151 :     my $n_sims = $pin_desc->{'n_sims'};
152 :     my $sim_cutoff = $pin_desc->{'sim_cutoff'};
153 :     my @pegs = ();
154 :    
155 :     if ( $n_sims > 0 )
156 :     {
157 : parrello 1.11 my $peg = $pin_desc->{'pegs'}[0];
158 :     my %seen;
159 :     @pegs = grep {$ok_genome_id->{$fig->genome_of($_)}}
160 :     grep {! $seen{$_}++}
161 :     map {$_->[1]} $fig->sims($peg, 10000, $sim_cutoff, "fig");
162 : dsouza 1.1
163 : parrello 1.11 my $n_pegs = @pegs;
164 : dsouza 1.1 }
165 :    
166 :     return \@pegs;
167 :     }
168 :    
169 :     sub select_pegs {
170 :     my($fig, $pin_desc, $n_pegs, $pegs) = @_;
171 :    
172 :     if ( $n_pegs == 0 )
173 :     {
174 : parrello 1.11 return [];
175 : dsouza 1.1 }
176 :    
177 : dsouza 1.3 if ( $pin_desc->{'collapse_close_genomes'} == 1 )
178 : dsouza 1.1 {
179 : parrello 1.11 # The ordering of the PEGs done here is in order to =select= the correct set, the ordering for the
180 :     # display is done later.
181 : dsouza 1.2
182 : parrello 1.11 # To collapse the PEG list, we want to:
183 :     # a. Return at most $n_pegs PEGs,
184 :     # b. include a representative PEG from every genus, subject to a, and
185 :     # c. include a representative PEG from each genus-species, subject to a and b.
186 :     # Individual strains will get represented by a single PEG, chosen arbitrarily.
187 :    
188 :     # The PEG selection is defined by the order of the PEGs. This could be done beter.
189 :    
190 :     my(%seen, @unique_genus, @unique_genus_species);
191 :    
192 :     foreach my $peg ( @$pegs )
193 :     {
194 :     my $org = $fig->org_of($peg);
195 :     # 'org_of' returns undef for deleted PEGs, these need to be omitted from the peg list returned
196 :    
197 :     if ( $org )
198 :     {
199 :     # Use only genus+species to drop strain information
200 :     my($genus, $species) = split(/\s+/, $org);
201 :     my $gs = "$genus $species";
202 :    
203 :     if ( not $seen{$genus}++ )
204 :     {
205 :     # First PEG from this genus, add it to @unique_genus.
206 :     # Mark the genus+species as seen.
207 :     # A subsequent PEG with the same genus and species will be dropped.
208 :     # A subsequent PEG with the same genus but different species will be added to @unique_genus_species.
209 :     $seen{$gs}++;
210 :     push @unique_genus, $peg;
211 :     }
212 :     elsif ( not $seen{$gs}++ )
213 :     {
214 :     # First PEG from this genus+species, add it to @unique_genus_species.
215 :     push @unique_genus_species, $peg;
216 :     }
217 :     }
218 :     }
219 : dsouza 1.1
220 : parrello 1.11 # Keep the unique_genus PEGS at the top, followed by the unique_genus_species
221 :     $pegs = [@unique_genus, @unique_genus_species];
222 : dsouza 1.1 }
223 :    
224 :     # Truncate list if necessary
225 : dsouza 1.3 if ( $n_pegs < @$pegs )
226 : dsouza 1.1 {
227 : parrello 1.11 $#{ $pegs } = $n_pegs - 1;
228 : dsouza 1.1 }
229 :    
230 :     return $pegs;
231 :     }
232 :    
233 :     sub filter_regions_1 {
234 :     my($pin_desc, $regions) = @_;
235 :     my $new_regions;
236 :    
237 :     # Filter out some overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
238 :     # are pinned (by similarity or PCH) to the input PEG
239 :     # Note that all overlaps are NOT eliminated
240 :    
241 :     if ( @{ $pin_desc->{'pegs'} } == 1 )
242 :     {
243 : parrello 1.11 # Input pin description is for a single input peg
244 :     my %seen;
245 :    
246 :     foreach my $region ( @$regions )
247 :     {
248 :     my $pinned_peg = $region->{'pinned_peg'};
249 :    
250 :     # If the pinned peg from a region has already been 'seen', don't
251 :     # add the region into the @$new_regions array
252 :    
253 :     if ( not $seen{$pinned_peg} )
254 :     {
255 :     push @$new_regions, $region;
256 :    
257 :     my $fids = $region->{'features'};
258 :     foreach my $fid ( @$fids )
259 :     {
260 :     $seen{$fid} = 1;
261 :     }
262 :     }
263 :     }
264 : dsouza 1.1 }
265 :     else
266 :     {
267 : parrello 1.11 # input PEGs is a list -- keep all of them
268 :     $new_regions = $regions;
269 : dsouza 1.1 }
270 :    
271 :     return $new_regions;
272 :     }
273 :    
274 :     sub filter_regions_2 {
275 :     my($pin_desc, $regions, $feature_data) = @_;
276 :     my $new_regions;
277 :    
278 :     if ( @{ $pin_desc->{'pegs'} } == 1 )
279 :     {
280 : parrello 1.11 # Input is single peg
281 :     my $input_peg = $pin_desc->{'pegs'}[0];
282 : dsouza 1.1
283 : parrello 1.11 foreach my $region ( @$regions )
284 :     {
285 :     my $fids = $region->{'features'};
286 :     my $n_in_sets = grep {$feature_data->{$_}{'set_number'}} @$fids;
287 :    
288 :     if ( $n_in_sets > 1 or $region->{'pinned_peg'} eq $input_peg )
289 :     {
290 :     # Regions should be displayed only if:
291 :     # a. more than one PEG is in a 'colored set' OR
292 :     # b. the pinned PEG is the input PEG
293 :     push @$new_regions, $region;
294 :     }
295 :     }
296 : dsouza 1.1 }
297 :     else
298 :     {
299 : parrello 1.11 # Input is a list of pegs -- keep all of the regions
300 :     $new_regions = $regions
301 : dsouza 1.1 }
302 :    
303 :     return $new_regions;
304 :     }
305 :    
306 :     sub make_maps {
307 :     my($fig, $regions, $feature_data) = @_;
308 :    
309 :     foreach my $region ( @$regions )
310 :     {
311 : parrello 1.11 my $features = [];
312 :     my $fids = $region->{'features'};
313 : dsouza 1.1
314 : parrello 1.11 foreach my $fid ( @$fids )
315 :     {
316 :     push @$features, $feature_data->{$fid};
317 :     # if ( !( $fid =~ /fig/ ) ) {
318 :     # print STDERR $fid." FIDMAKEMAPS\n";
319 :     # }
320 :     }
321 : dsouza 1.1
322 : parrello 1.11 $region->{'features'} = $features;
323 : dsouza 1.1 }
324 :    
325 :     return $regions;
326 :     }
327 :    
328 :     sub define_regions {
329 :     my($fig, $map_sz, $pinned_pegs) = @_;
330 :    
331 :     # Define the 'region' around the pinned peg, get features, and store some region information
332 :     my $regions = [];
333 :     my $half_map_sz = int($map_sz/2);
334 :    
335 :     foreach my $peg ( @$pinned_pegs )
336 :     {
337 : parrello 1.11 my $genome = $fig->genome_of($peg);
338 :     my $loc = $fig->feature_location($peg);
339 :     # We only proceed if a location was found. Failure to find a location indicates
340 :     # that the feature is deleted.
341 :     if ($loc) {
342 :     my($contig, $beg, $end) = $fig->boundaries_of($loc);
343 :    
344 :     my $region_mid = int(($beg + $end)/2);
345 :     my $region_beg = $region_mid - $half_map_sz;
346 :     my $region_end = $region_mid + $half_map_sz;
347 :    
348 :     my($fids) = $fig->genes_in_region($genome, $contig, $region_beg, $region_end);
349 :    
350 :     my $region = {};
351 :     $region->{'genome_id'} = $fig->genome_of($peg);
352 :     $region->{'org_name'} = $fig->genus_species($region->{'genome_id'});
353 :     $region->{'contig'} = $contig;
354 :     $region->{'beg'} = $region_beg;
355 :     $region->{'mid'} = $region_mid;
356 :     $region->{'end'} = $region_end;
357 :     $region->{'features'} = $fids;
358 :     $region->{'pinned_peg_strand'} = ($beg <= $end)? '+' : '-';
359 :     $region->{'pinned_peg'} = $peg;
360 :    
361 :     push @$regions, $region;
362 :     }
363 : dsouza 1.1 }
364 :    
365 :     return $regions;
366 :     }
367 :    
368 : bartels 1.8 sub add_features_to_fdata {
369 :     my ( $fig, $feature_data, $add_features, $regions ) = @_;
370 :    
371 :     foreach my $region ( @$regions ) {
372 : parrello 1.11 my $new_feats = $add_features->{ $region->{ 'genome_id' } };
373 :     foreach my $nf ( @$new_feats ) {
374 :     if ( $nf->{ 'contig' } eq $region->{ 'contig' } &&
375 :     $nf->{ 'start' } < $region->{ 'end' } &&
376 :     $nf->{ 'start' } > $region->{ 'beg' } ) {
377 :     $feature_data->{ $nf->{ 'name' } } = &new_feature_entry( $fig, $nf->{ 'name' }, $region->{ 'mid' }, $region->{ 'contig' }, $nf->{ 'start' }, $nf->{ 'stop' }, $nf->{ 'type' }, $nf->{ 'function' } );
378 :     push @{ $region->{ 'features' } }, $nf->{ 'name' };
379 :     }
380 :     }
381 : bartels 1.8 }
382 :     }
383 :    
384 :     sub new_feature_entry {
385 :     my ( $fig, $fid, $region_mid, $contig, $beg, $end, $type, $func ) = @_;
386 :    
387 :     if ( !defined( $type ) ) {
388 : parrello 1.11 $type = 'unknown';
389 : bartels 1.8 }
390 :    
391 :     if ( !defined( $func ) ) {
392 : parrello 1.11 $func = '';
393 : bartels 1.8 }
394 :    
395 :     my($left, $right) = sort {$a <=> $b} ($beg, $end);
396 :     my $size = $right - $left + 1;
397 :     my $strand = ($beg <= $end)? '+' : '-';
398 :     my $offset = int(($left + $right)/2) - $region_mid;
399 :     my $offset_beg = $left - $region_mid;
400 :     my $offset_end = $right - $region_mid;
401 :    
402 :     return {
403 : parrello 1.11 'fid' => $fid,
404 :     'type' => $type,
405 :     'contig' => $contig,
406 :     'beg' => $beg,
407 :     'end' => $end,
408 :     'size' => $size,
409 :     'strand' => $strand,
410 :     'offset' => $offset,
411 :     'offset_beg' => $offset_beg,
412 :     'offset_end' => $offset_end,
413 :     'function' => $func,
414 : bartels 1.8 };
415 :     }
416 :    
417 : dsouza 1.1 sub feature_data {
418 :     my($fig, $regions) = @_;
419 :     my %feature_data;
420 :    
421 :     foreach my $region ( @$regions )
422 :     {
423 : parrello 1.11 my $region_mid = $region->{'mid'};
424 :     my $fids = $region->{'features'};
425 : dsouza 1.1
426 : parrello 1.11 foreach my $fid ( @$fids )
427 :     {
428 :     # Get feature data if this feature has not been seen before, this avoids repeating
429 :     # this step when pegs occur in multiple regions
430 :     if ( not exists $feature_data{$fid} )
431 :     {
432 :     $feature_data{$fid} = &feature_entry($fig, $fid, $region_mid);
433 :     }
434 :     }
435 : dsouza 1.1 }
436 :    
437 :     return \%feature_data;
438 :     }
439 :    
440 :     sub feature_entry {
441 :     my($fig, $fid, $region_mid) = @_;
442 :    
443 :     my $type = $fig->ftype($fid);
444 :     my $loc = $fig->feature_location($fid);
445 :     my($contig, $beg, $end) = $fig->boundaries_of($loc);
446 :     my($left, $right) = sort {$a <=> $b} ($beg, $end);
447 : dsouza 1.3 my $size = $right - $left + 1;
448 : dsouza 1.1 my $strand = ($beg <= $end)? '+' : '-';
449 :     my $offset = int(($left + $right)/2) - $region_mid;
450 :     my $offset_beg = $left - $region_mid;
451 :     my $offset_end = $right - $region_mid;
452 :     my $func = scalar $fig->function_of($fid) || '';
453 :    
454 :     return {
455 : parrello 1.11 'fid' => $fid,
456 :     'type' => $type,
457 :     'contig' => $contig,
458 :     'beg' => $beg,
459 :     'end' => $end,
460 :     'size' => $size,
461 :     'strand' => $strand,
462 :     'offset' => $offset,
463 :     'offset_beg' => $offset_beg,
464 :     'offset_end' => $offset_end,
465 :     'function' => $func,
466 :     };
467 : dsouza 1.1 }
468 :    
469 : dsouza 1.3 sub add_functional_coupling {
470 :     my($fig, $pin_desc, $regions, $feature_data) = @_;
471 :    
472 :     my $fc_cutoff = defined($pin_desc->{'fc_cutoff'})? $pin_desc->{'fc_cutoff'} : 4;
473 :    
474 :     foreach my $region ( @$regions )
475 :     {
476 : parrello 1.11 my $pin = $region->{'pinned_peg'};
477 :     my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} @{ $region->{'features'} };
478 : dsouza 1.3
479 : parrello 1.11 foreach my $couple ( $fig->coupled_to_batch(@pegs) )
480 :     {
481 :     my($peg1, $peg2, $sc) = @$couple;
482 :    
483 :     if ( $peg1 eq $pin and $sc >= $fc_cutoff )
484 :     {
485 :     $feature_data->{$peg2}{'fc_score'} = $sc;
486 :     }
487 :     }
488 : dsouza 1.3 }
489 :     }
490 :    
491 : dsouza 1.5 sub add_figfams {
492 :     my($fig, $feature_data) = @_;
493 :    
494 : dsouza 1.6 # Get FigFams directory from config file
495 : olson 1.10 my $figfam_dir = $fig->get_figfams_data();
496 : dsouza 1.5
497 : dsouza 1.6 # Check if FigFams directory is defined and exists on current machine
498 :     if ( defined($figfam_dir) and (-d $figfam_dir) )
499 : dsouza 1.5 {
500 : parrello 1.11 # Get all PEG IDs
501 :     my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
502 :     # Get $figfams object
503 :     my $figfams = new FigFams($fig, $figfam_dir);
504 :     # Get FigFam family ID for @pegs
505 :     my $figfam = $figfams->families_containing_peg_bulk(\@pegs);
506 :     # Get hash of FigFam ID to family function
507 :     my $family_function = $figfams->family_functions();
508 :    
509 :     # Some FigFams (the ones that are not subsystem related) have the FigFam ID in the family function.
510 :     # This results in the popup displaying the ID twice, so one should be removed.
511 :    
512 :     my %figfam_text;
513 :     foreach my $fid ( keys %$feature_data )
514 :     {
515 :     if ( $figfam->{$fid} )
516 :     {
517 :     my $figfam_id = $figfam->{$fid};
518 :     if ( ! exists $figfam_text{$figfam_id} ) {
519 :     if ( $family_function->{$figfam_id} =~ /^FIG\d+/ ) {
520 :     $figfam_text{$figfam_id} = $family_function->{$figfam_id};
521 :     } else {
522 :     $figfam_text{$figfam_id} = $figfam_id . ': ' . $family_function->{$figfam_id};
523 :     }
524 :     }
525 : dsouza 1.7
526 : parrello 1.11 # Add FigFam information to hash -- to go into the popup text
527 :     $feature_data->{$fid}{'figfam'} = $figfam_text{$figfam_id};
528 : dsouza 1.7 #->{$fid} . ": " . $family_function->{$figfam->{$fid}};
529 : parrello 1.11 }
530 :     }
531 : dsouza 1.5 }
532 :     }
533 :    
534 : dsouza 1.3 sub add_subsystem_data {
535 :     my($fig, $pin_desc, $feature_data) = @_;
536 :    
537 :     # Get subsystem_information for =all= pegs
538 :     my %peg_to_ss = $fig->subsystems_for_pegs([grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data]);
539 :    
540 :     # Count number of occurences of each subsystem
541 :     my %ss_count;
542 :     foreach my $ss ( map {$_->[0]} map {@$_} values %peg_to_ss )
543 :     {
544 : parrello 1.11 $ss_count{$ss}++;
545 : dsouza 1.3 }
546 :    
547 :     # Sort subsystems based on count and create hash with unique index for each subsystem where
548 :     # lower indices correspond to higher number of occurences of the subsystem.
549 :     my %ss_index;
550 :     my $index = 0;
551 :     foreach my $ss ( sort {$ss_count{$b} <=> $ss_count{$a} or $a cmp $b} keys %ss_count )
552 :     {
553 : parrello 1.11 $ss_index{$ss} = ++$index;
554 : dsouza 1.3 }
555 :    
556 :     # Add subsystem information for pegs in subsystems
557 :     foreach my $fid ( keys %peg_to_ss )
558 :     {
559 : parrello 1.11 my @subsystems = ();
560 :     foreach my $rec ( @{ $peg_to_ss{$fid} } )
561 :     {
562 :     if ( @$rec )
563 :     {
564 :     push @subsystems, $rec->[0];
565 :     }
566 :     }
567 :    
568 :     if ( @subsystems )
569 :     {
570 :     $feature_data->{$fid}{'subsystems'} = [sort {$a->[1] <=> $b->[1]} map {$_->[0] =~ s/_/ /g; $_} map {[$_, $ss_index{$_}]} @subsystems];
571 :     }
572 : dsouza 1.3 }
573 :     }
574 :    
575 : dsouza 1.1 sub color_pegs {
576 :     my($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from) = @_;
577 :    
578 :     # Run blastp and get connected pegs
579 :     my $blast_hit = {};
580 :     my $color_sim_cutoff = $pin_desc->{'color_sim_cutoff'};
581 :    
582 :     if ( $sims_from eq 'blast' )
583 :     {
584 : parrello 1.11 $blast_hit = &blast_hits($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
585 : dsouza 1.1 }
586 :     else
587 :     {
588 : parrello 1.11 $blast_hit = &blast_hits_from_sims($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
589 : dsouza 1.1 }
590 :    
591 :     # Assign set numbers to pegs based on blast scores
592 :     my $color_sets = &partition_pegs($blast_hit);
593 :    
594 :     # Sort peg sets to that a) larger sets have smaller set numbers, and
595 :     # b) sets closer to the pinned peg have smaller set numbers
596 : dsouza 1.4 $color_sets = &sort_color_sets($pin_desc, $pinned_pegs, $color_sets, $feature_data);
597 : dsouza 1.1
598 :     # Add color set number into $feature_data
599 :     for (my $i = 0; $i < @$color_sets; $i++)
600 :     {
601 : parrello 1.11 # Use an index that starts at 1 (matches with coloring index)
602 :     my $n = $i + 1;
603 :     foreach my $peg ( @{ $color_sets->[$i] } )
604 :     {
605 :     $feature_data->{$peg}{'set_number'} = $n;
606 :    
607 :     # If this is the pinned set, set the 'color' to 'red'
608 :     if ( $n == 1 )
609 :     {
610 :     $feature_data->{$peg}{'color'} = 'red';
611 :     }
612 :     }
613 : dsouza 1.1 }
614 :     }
615 :    
616 :     sub sort_color_sets {
617 :     my($pin_desc, $pinned_pegs, $color_sets, $feature_data) = @_;
618 :    
619 : dsouza 1.3 # Find the color set containing the set of pinned pegs, i.e. the set containing the input peg.
620 :     # If the input peg is found (or if input is a list of pegs, returned value is the array index for the
621 :     # set.
622 :     # If the input peg is not found, the returned value is an reference to an array containing the input peg.
623 :     my $set = &pinned_set($pin_desc, $pinned_pegs, $color_sets);
624 : dsouza 1.1
625 : dsouza 1.3 my $pinned_color_set;
626 :     if ( $set =~ /^\d+$/ )
627 :     {
628 : parrello 1.11 # Splice out the color set containing the input peg
629 :     ($pinned_color_set) = splice(@$color_sets,$set,1);
630 : dsouza 1.3 }
631 :     else
632 :     {
633 : parrello 1.11 $pinned_color_set = $set;
634 : dsouza 1.3 }
635 : dsouza 1.1
636 : dsouza 1.3 # Add offset (summed distance from
637 :     my @color_sets = map {[$_, &offset($_, $feature_data)]} @$color_sets;
638 : dsouza 1.1 # Sort the remaining color sets based on:
639 :     # a. size (number of pegs) and
640 :     # b. offset from mid-point of region
641 :     @$color_sets = map {$_->[0]}
642 :     sort {@{$b->[0]} <=> @{$a->[0]} or $a->[1] <=> $b->[1]}
643 : dsouza 1.3 # sort {$a->[1] <=> $b->[1] or @{$b->[0]} <=> @{$a->[0]}}
644 : dsouza 1.1 map {[$_, &offset($_, $feature_data)]} @$color_sets;
645 :    
646 :     # Add the set of pinned pegs at the front of the returned list so that it gets colored red
647 :     return [$pinned_color_set, @$color_sets];
648 :     }
649 :    
650 : dsouza 1.3 sub pinned_set {
651 : dsouza 1.1 my($pin_desc, $pinned_pegs, $color_sets) = @_;
652 :    
653 : dsouza 1.3 # Returns an integer if the input is a peg-list or if the input peg is in a set.
654 :     # Returns the input peg (as an array reference) if the input peg is not in a set.
655 :    
656 : dsouza 1.1 if ( @{ $pin_desc->{'pegs'} } == 1 )
657 :     {
658 : parrello 1.11 # Get input peg if it exists -- the set containing this peg should be colored red
659 :     my $peg = $pin_desc->{'pegs'}[0];
660 : dsouza 1.1
661 : parrello 1.11 # Iterate through the color sets until you find the set containing the input peg
662 :     for (my $i = 0; $i < @$color_sets; $i++)
663 :     {
664 :     foreach my $peg2 ( @{ $color_sets->[$i] } )
665 :     {
666 :     if ( $peg2 eq $peg )
667 :     {
668 :     # Return the set index
669 :     return $i;
670 :     }
671 :     }
672 :     }
673 : dsouza 1.1
674 : parrello 1.11 # The input peg may not be in a set if there is only one region or the coloring cutoff is very stringent.
675 :     return [$peg];
676 : dsouza 1.1 }
677 :     else
678 :     {
679 : parrello 1.11 # Input is a peg-list, which may be split into multiple sets -- the largest will be called the
680 :     # "pinned" set.
681 :     my($i, $max_count);
682 :     my %pinned = map {$_ => 1} @$pinned_pegs;
683 :    
684 :     for (my $j = 0; $j < @$color_sets; $j++)
685 :     {
686 :     # count how many pinned pegs are found in each set
687 :     my $count = scalar grep {$pinned{$_}} @{ $color_sets->[$j] };
688 :    
689 :     if ( $max_count < $count )
690 :     {
691 :     # Keep track of the set having the largest number of pinned pegs
692 :     $max_count = $count;
693 :     $i = $j;
694 :     }
695 :     }
696 : dsouza 1.1
697 : parrello 1.11 return $i;
698 : dsouza 1.1 }
699 :     }
700 :    
701 :     sub offset {
702 :     my($set, $feature_data) = @_;
703 :    
704 :     my $offset;
705 :     foreach my $peg ( @$set )
706 :     {
707 : parrello 1.11 $offset += abs($feature_data->{$peg}{'offset'});
708 : dsouza 1.1 }
709 : dsouza 1.3
710 :     return sprintf("%.2f", $offset/@$set);
711 : dsouza 1.1 }
712 :    
713 :     sub partition_pegs {
714 :     my($blast_hit) = @_;
715 :    
716 :     my %seen;
717 :     my $sets = [];
718 :    
719 :     # Iterate through the pegs with blast hits in arbitrary order
720 :     foreach my $peg1 ( keys %$blast_hit )
721 :     {
722 : parrello 1.11 # If it has not been 'seen' (and placed in a set) use it as the seed for a set
723 :     if ( not $seen{$peg1} )
724 :     {
725 :     my $set = [$peg1];
726 :    
727 :     # Mark $peg1 as seen
728 :     $seen{$peg1} = 1;
729 :    
730 :     # Grow set through transitive closure -- note that @$set keeps growing
731 :     for (my $i = 0; $i < @$set; $i++)
732 :     {
733 :     $peg1 = $set->[$i];
734 :    
735 :     # Iterate through blast hits
736 :     foreach my $peg2 ( keys %{ $blast_hit->{$peg1} } )
737 :     {
738 :     # If $peg2 has not been seen, put it in the set, and mark it as seen
739 :     if ( not exists $seen{$peg2} )
740 :     {
741 :     push @$set, $peg2;
742 :     $seen{$peg2} = 1;
743 :     }
744 :     }
745 :     }
746 :    
747 :     # Add the new set to the set of sets
748 :     push @$sets, $set;
749 :     }
750 : dsouza 1.1 }
751 :    
752 :     return $sets;
753 :     }
754 :    
755 :     sub blast_hits {
756 :     my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
757 :     my($t0, $dt);
758 :    
759 :     # Set blast environment variables
760 :     $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";
761 :     if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }
762 :    
763 :     # Get amino acid sequences
764 : dsouza 1.4 my $sequences = &get_peg_sequences($fig, $feature_data);
765 : dsouza 1.1
766 :     # Write the entire set of sequences to a fasta file
767 :     my $all_seqs_file = "$FIG_Config::temp/all_seqs.$$.tmp.fasta";
768 :     &write_seqs_to_file($sequences, $all_seqs_file);
769 :    
770 :     # Run formatdb
771 :     &formatdb_file($fig, $all_seqs_file);
772 :    
773 :     # If $fast_color == 0, the complete blast of all vs. all sequences is performed
774 :     # Otherwise, instead of all vs. all, the sequences from a single pinned peg region
775 :     # is blasted against all sequences. If any of the hits are better than a given cutoff
776 :     # ($cutoff_2), the pegs hit are deemed to be 'done', i.e. it is assumed that blasting this
777 :     # peg will not yield additional information for forming the peg sets. These 'done' pegs
778 :     # are then omitted from the blast when the sequences from the region they belong to
779 :     # is blasted.
780 :     my %blast_hit;
781 :     if ( $fast_color )
782 :     {
783 : parrello 1.11 my $cutoff_2 = $color_sim_cutoff * 1e-20;
784 :     my %done_with;
785 : dsouza 1.1
786 : parrello 1.11 foreach my $region ( @$regions )
787 :     {
788 :     # Iterate through each region
789 :     my $fids = $region->{'features'};
790 :    
791 :     # Build up a hash of peg sequences which are not 'done_with'
792 :     my %region_seqs;
793 :     foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} @$fids )
794 :     {
795 :     if ( $sequences->{$peg} and not $done_with{$peg} )
796 :     {
797 :     $region_seqs{$peg} = $sequences->{$peg};
798 :     }
799 :     }
800 :    
801 :     if ( scalar keys %region_seqs )
802 :     {
803 :     # Write the region sequences to a file
804 :     my $region_seqs_file = "$FIG_Config::temp/region_seqs.$$.tmp.fasta";
805 :     &write_seqs_to_file(\%region_seqs, $region_seqs_file);
806 :    
807 :     # BLAST the region sequences against all other sequences
808 :     my $hits = &blast_files($fig, $region_seqs_file, $all_seqs_file, $color_sim_cutoff);
809 :    
810 :     # Build up hash of blast hits
811 :     foreach my $hit ( @$hits )
812 :     {
813 :     my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
814 :    
815 :     if ( $peg1 ne $peg2 )
816 :     {
817 :     $blast_hit{$peg1}{$peg2} = 1;
818 :     $blast_hit{$peg2}{$peg1} = 1;
819 :    
820 :     # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
821 :     # that it will not be blasted with it's region sequences
822 :     if ( $sc <= $cutoff_2 )
823 :     {
824 :     $done_with{$peg2} = 1;
825 :     }
826 :     }
827 :     }
828 :     }
829 :     }
830 : dsouza 1.1 }
831 :     else
832 :     {
833 : parrello 1.11 # BLAST sequence file against itself
834 :     my $hits = &blast_files($fig, $all_seqs_file, $all_seqs_file, $color_sim_cutoff);
835 : dsouza 1.1
836 : parrello 1.11 # Build up hash of blast hits
837 :     foreach my $hit ( @$hits )
838 :     {
839 :     my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
840 :    
841 :     if ( $peg1 ne $peg2 )
842 :     {
843 :     $blast_hit{$peg1}{$peg2} = 1;
844 :     $blast_hit{$peg2}{$peg1} = 1;
845 :     }
846 :     }
847 : dsouza 1.1 }
848 :    
849 :     return \%blast_hit;
850 :     }
851 :    
852 :     sub blast_hits_from_sims {
853 :     my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
854 :    
855 :     my %blast_hit;
856 :    
857 :     my $maxN = 2000;
858 :     my $maxP = $color_sim_cutoff;
859 :     my $select = 'fig';
860 :     my $max_expand = '';
861 :     my $filters = '';
862 :    
863 :     # Create a hash of all pegs
864 :     my %region_peg = map {$_ => 1} grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
865 :    
866 :     if ( $fast_color == 1 )
867 :     {
868 : parrello 1.11 my $cutoff_2 = $color_sim_cutoff * 1e-20;
869 :     my %done_with;
870 : dsouza 1.1
871 : parrello 1.11 # Iterate through each peg
872 :     foreach my $peg1 ( keys %region_peg )
873 :     {
874 :     # Skip the 'done_with' pegs
875 :     next if ($done_with{$peg1});
876 :    
877 :     my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
878 :    
879 :     foreach my $sim ( grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
880 :     {
881 :     my($peg2, $sc) = @$sim[1,10];
882 :    
883 :     $blast_hit{$peg1}{$peg2} = 1;
884 :     $blast_hit{$peg2}{$peg1} = 1;
885 :    
886 :     # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
887 :     # that it will not be blasted with it's region sequences
888 :     if ( $sc <= $cutoff_2 )
889 :     {
890 :     $done_with{$peg2} = 1;
891 :     }
892 :     }
893 :     }
894 : dsouza 1.1 }
895 :     else
896 :     {
897 : parrello 1.11 # Iterate through each peg
898 :     foreach my $peg1 ( keys %region_peg )
899 :     {
900 :     my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
901 :    
902 :     foreach my $peg2 ( map {$_->[1]} grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
903 :     {
904 :     $blast_hit{$peg1}{$peg2} = 1;
905 :     $blast_hit{$peg2}{$peg1} = 1;
906 :     }
907 :     }
908 : dsouza 1.1 }
909 :    
910 :     return \%blast_hit;
911 :     }
912 :    
913 :     sub blast_files {
914 :     my($fig, $input, $database, $cutoff) = @_;
915 :    
916 :     my $cmd = "$FIG_Config::ext_bin/blastall";
917 :     my @args = ('-p', 'blastp', '-i', $input, '-d', $database, '-m', 8, '-e', $cutoff);
918 :     my @blast_out = $fig->run_gathering_output($cmd, @args);
919 :    
920 :     return \@blast_out;
921 :     }
922 :    
923 :     sub formatdb_file {
924 :     my($fig, $file) = @_;
925 :    
926 :     my $cmd = "$FIG_Config::ext_bin/formatdb -i $file -p T";
927 :     $fig->run($cmd);
928 :     }
929 :    
930 :     sub write_seqs_to_file {
931 :     my($seq, $fasta_file) = @_;
932 :    
933 :     open(FASTA, ">$fasta_file") or die "could not create FASTA file '$fasta_file': $!";
934 :     foreach my $peg ( keys %$seq )
935 :     {
936 : parrello 1.11 print FASTA ">$peg\n$seq->{$peg}\n";
937 : dsouza 1.1 }
938 :     close(FASTA) or die "could not close file FASTA file '$fasta_file': $!";
939 :     }
940 :    
941 :     sub get_peg_sequences {
942 :     my($fig, $feature_data) = @_;
943 :     my %sequences;
944 :    
945 :     # Iterate over each peg
946 :     foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data )
947 :     {
948 : parrello 1.11 my $seq = $fig->get_translation($peg);
949 : dsouza 1.1
950 : parrello 1.11 if ( $seq )
951 :     {
952 :     $sequences{$peg} = $seq;
953 :     }
954 :     else
955 :     {
956 :     print STDERR "could not get sqeuence for $peg\n";
957 :     }
958 : dsouza 1.1 }
959 :    
960 :     return \%sequences;
961 :     }
962 :    
963 :    
964 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3