[Bio] / FigKernelPackages / PinnedRegions.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/PinnedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.43 - (view) (download) (as text)

1 : dsouza 1.1 # -*- perl -*-
2 :     ########################################################################
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     ########################################################################
18 :    
19 :     package PinnedRegions;
20 :    
21 :     use strict;
22 :     use warnings;
23 :    
24 : olson 1.31 use Data::Dumper;
25 :    
26 : dsouza 1.1 use FIG;
27 : olson 1.32 use FFs;
28 : dsouza 1.1 use FIG_Config;
29 : parrello 1.9 use Tracer;
30 : paczian 1.27 use BasicLocation;
31 : olson 1.40 use KmerClient;
32 : olson 1.41 use ConservedDomainSearch;
33 : dsouza 1.1
34 : parrello 1.9 use Time::HiRes qw( usleep gettimeofday tv_interval );
35 : dsouza 1.1
36 :     sub pinned_regions {
37 : olson 1.41 my($fig, $pin_desc, $fast_color, $sims_from, $map_sz, $add_features, $extended, $color_by_function, $peg_functions, $fids_for_cds) = @_;
38 :    
39 :     my $cds = ConservedDomainSearch->new($fig);
40 :    
41 :     my $align = $pin_desc->{pin_alignment} || 'center';
42 :    
43 : parrello 1.11 Trace("Pinned regions method called.") if T(3);
44 : dsouza 1.1 # Get list of pegs required by the description in $pin_desc
45 : dsouza 1.25 my $pinned_pegs = &expand_peg_list($fig, $pin_desc);
46 : parrello 1.9 Trace("Pinned pegs are " . join(", ", @$pinned_pegs) . ".") if T(3);
47 : parrello 1.11 # Filter out the pegs that don't exist.
48 : dsouza 1.1 # Get regions around pinned pegs -- boundaries, features, etc.
49 : olson 1.41 my $regions = &define_regions($fig, $map_sz, $pinned_pegs, $align);
50 : dsouza 1.1 # Filter out overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
51 :     # are pinned (by similarity or PCH) to the input PEG
52 :     $regions = &filter_regions_1($pin_desc, $regions);
53 :    
54 : olson 1.41 #
55 :     # Add contig info to regions
56 :     #
57 :     my @cinfo = map { [ @$_{qw(genome_id contig)} ] } @$regions;
58 :     my $contig_lens = $fig->contig_ln_bulk(\@cinfo);
59 :     $_->{contig_length} = $contig_lens->{$_->{genome_id}}->{$_->{contig}} foreach @$regions;
60 :     # print STDERR Dumper($regions);
61 :    
62 : dsouza 1.4 # Get information for each feature -- location, strand, function etc.
63 : olson 1.37 my $feature_data = &feature_data($fig, $regions, $extended, $peg_functions);
64 : dsouza 1.1
65 : dsouza 1.3 &add_functional_coupling($fig, $pin_desc, $regions, $feature_data);
66 : paczian 1.16
67 : bartels 1.8 # &add_figfams($fig, $feature_data);
68 : olson 1.42 &add_pattyfams($fig, $feature_data);
69 : paczian 1.17 &add_subsystem_data($fig, $pin_desc, $feature_data, $extended);
70 : paczian 1.16
71 : olson 1.41 my $cdd_trans = {};
72 :     if (ref($fids_for_cds) eq 'ARRAY' && @$fids_for_cds)
73 :     {
74 :     $cdd_trans = &add_cdd($regions, $fig, $cds, $fids_for_cds, $feature_data);
75 :     }
76 :    
77 : parrello 1.9 Trace("Coloring pegs.") if T(3);
78 : dsouza 1.4 # Assign a set number to some PEGs through transitive closure based on similarity, from blast scores
79 : olson 1.37 if ($color_by_function)
80 :     {
81 :     &color_pegs_by_function($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from);
82 :     }
83 :     else
84 :     {
85 : olson 1.41 &color_pegs($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from, $cdd_trans);
86 : olson 1.37 }
87 : dsouza 1.1
88 :     # Filter out regions which have only a single PEG (the pinned one) colored.
89 : dsouza 1.14 # $regions = &filter_regions_2($pin_desc, $regions, $feature_data);
90 : dsouza 1.1
91 : bartels 1.8 if ( defined( $add_features ) ) {
92 : parrello 1.11 &add_features_to_fdata( $fig, $feature_data, $add_features, $regions );
93 : bartels 1.8 }
94 :    
95 : dsouza 1.1 # Add feature data to the regions to make the final maps
96 :     my $maps = &make_maps($fig, $regions, $feature_data);
97 : paczian 1.16
98 : dsouza 1.1 return $maps;
99 :     }
100 :    
101 :     sub expand_peg_list {
102 : dsouza 1.25 my($fig, $pin_desc) = @_;
103 : dsouza 1.1 my @pegs = ();
104 :    
105 : olson 1.37 if ($pin_desc->{pegs}->[0] =~ /^fig/ && @{$pin_desc->{pegs}} > 1)
106 :     {
107 :     @pegs = @{$pin_desc->{pegs}};
108 :     return \@pegs;
109 :     }
110 : dsouza 1.1 # Filter for legitimate genome IDS -- should handle deleted organisms and (in NMPDR) non-NMPDR orgs
111 :     my %ok_genome_id = map {$_ => 1} $fig->genomes;
112 :    
113 : dsouza 1.5 # If the user has selected the genomes to be displayed, filter out all other genomes
114 :     if ( @{ $pin_desc->{'show_genomes'} } )
115 :     {
116 : parrello 1.11 # create new %ok_genome_id hash from intersection of selected genomes and legitimate genome IDs
117 :     %ok_genome_id = map {$_ => 1} grep {$ok_genome_id{$_}} @{ $pin_desc->{'show_genomes'} };
118 : dsouza 1.5 }
119 :    
120 : dsouza 1.1 if ( @{ $pin_desc->{'pegs'} } > 1 )
121 :     {
122 : parrello 1.11 # Input already contains list of pegs, no need for expansion
123 : paczian 1.30 if ($pin_desc->{'pegs'}->[0] =~ /^\d/) {
124 :     @pegs = @{ $pin_desc->{'pegs'} };
125 :     } else {
126 :     @pegs = grep {$ok_genome_id{$fig->genome_of($_)}} @{ $pin_desc->{'pegs'} };
127 :     }
128 : dsouza 1.1 }
129 :     else
130 :     {
131 : parrello 1.11 # Get PCH pinned pegs
132 :     my $pch_pinned_pegs = &pch_pinned_pegs($fig, $pin_desc, \%ok_genome_id);
133 : dsouza 1.1
134 : parrello 1.11 $pch_pinned_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_pch_pins'}, $pch_pinned_pegs);
135 : dsouza 1.1
136 : dsouza 1.14 # Get similarity pinned pegs
137 :     my $sim_pegs = &sim_pegs($fig, $pin_desc, \%ok_genome_id);
138 : dsouza 1.1
139 : parrello 1.11 # Remove PCH PEGs (if any) from similarity pinned PEG list
140 :     my %seen = map {$_ => 1} @$pch_pinned_pegs;
141 :     $sim_pegs = [grep {! $seen{$_}} @$sim_pegs];
142 :    
143 : olson 1.41 # print STDERR Dumper(simpegs1 => $sim_pegs);
144 :     my $n_pegs = $pin_desc->{'n_sims'} + $pin_desc->{n_kmers} + $pin_desc->{n_plfams} + $pin_desc->{n_pgfams};
145 :     $sim_pegs = &select_pegs($fig, $pin_desc, $n_pegs, $sim_pegs);
146 :     # print STDERR Dumper(simpegs2 => $sim_pegs);
147 : parrello 1.11
148 :     @pegs = ();
149 :    
150 :     # Get input peg
151 :     my $peg = $pin_desc->{'pegs'}[0];
152 :     if ( $pin_desc->{'sort_by'} eq 'phylogeny' )
153 :     {
154 :     # First add input peg, then sort by phylogeny
155 :     @pegs = $fig->sort_fids_by_taxonomy($peg, @$sim_pegs, @$pch_pinned_pegs);
156 :     }
157 : dsouza 1.21 elsif ( $pin_desc->{'sort_by'} eq 'phylogenetic_distance' )
158 : parrello 1.11 {
159 :     # Sort by phylogenetic distance or organism name
160 :     my $g1 = $fig->genome_of($peg);
161 :     @pegs = map {$_->[0] }
162 :     sort {$a->[1] <=> $b->[1] or $a->[2] cmp $b->[2]}
163 :     map {[$_, $fig->crude_estimate_of_distance($g1,$fig->genome_of($_)), $fig->org_of($_)]} @$sim_pegs, @$pch_pinned_pegs;
164 :     # Add input peg at front of list
165 :     unshift @pegs, $peg;
166 :     }
167 : dsouza 1.21 elsif ( $pin_desc->{'sort_by'} eq 'similarity' )
168 :     {
169 :     # Sort by similarity to input peg (first peg in list of arguments)
170 : dsouza 1.25 @pegs = &sort_pegs_by_similarity($fig, $peg, @$sim_pegs, @$pch_pinned_pegs);
171 : dsouza 1.21 }
172 : olson 1.41
173 :     if ( $n_pegs < @pegs )
174 :     {
175 :     $#pegs = $n_pegs - 1;
176 :     }
177 :     }
178 : dsouza 1.1 return \@pegs
179 :     }
180 :    
181 :     sub pch_pinned_pegs {
182 :     my($fig, $pin_desc, $ok_genome_id) = @_;
183 :    
184 :     my @pegs = ();
185 :    
186 :     if ( $pin_desc->{'n_pch_pins'} > 0 )
187 :     {
188 : parrello 1.11 # Get input peg
189 :     my $peg = $pin_desc->{'pegs'}[0];
190 : dsouza 1.1
191 : parrello 1.11 # Get pegs in pch pin with input peg, and filter out deleted or non-NMPDR organisms
192 :     @pegs = grep {$ok_genome_id->{$fig->genome_of($_)}}
193 :     grep {$_ ne $peg}
194 : dsouza 1.1 map {$_->[0]} $fig->in_pch_pin_with_and_evidence($peg);
195 :     }
196 :    
197 :     return \@pegs;
198 :     }
199 :    
200 :     sub sim_pegs {
201 :     my($fig, $pin_desc, $ok_genome_id) = @_;
202 :    
203 :     # Return a list of PEGS similar to the input PEG, with evalue less than or equal to
204 :     # the user defined similarity cutoff.
205 :     # If the number of sims requested is 0, return the empty list
206 :    
207 :     my $n_sims = $pin_desc->{'n_sims'};
208 : olson 1.40 my $n_kmers = $pin_desc->{'n_kmers'};
209 : olson 1.41 my $n_plfams = $pin_desc->{'n_plfams'};
210 :     my $n_pgfams = $pin_desc->{'n_pgfams'};
211 : dsouza 1.1 my $sim_cutoff = $pin_desc->{'sim_cutoff'};
212 :     my @pegs = ();
213 :    
214 : olson 1.41 my $peg = $pin_desc->{'pegs'}[0];
215 : dsouza 1.1 if ( $n_sims > 0 )
216 :     {
217 : parrello 1.11 my %seen;
218 : paczian 1.15
219 : parrello 1.11 @pegs = grep {$ok_genome_id->{$fig->genome_of($_)}}
220 :     grep {! $seen{$_}++}
221 :     map {$_->[1]} $fig->sims($peg, 10000, $sim_cutoff, "fig");
222 : olson 1.40 }
223 :     elsif ($n_kmers > 0)
224 :     {
225 :     my %seen;
226 : dsouza 1.1
227 : olson 1.41 my $kc;
228 :     eval {
229 :     $kc = KmerClient->new($fig,
230 :     $FIG_Config::kmer_server_host,
231 :     $FIG_Config::kmer_server_port,
232 :     $FIG_Config::kmer_genome_map,
233 :     $FIG_Config::kmer_memcache_host,
234 :     $FIG_Config::kmer_memcache_port);
235 :     };
236 :     if ($@)
237 :     {
238 :     warn "Cannot create kmer_client: $@";
239 :     }
240 :     else
241 :     {
242 :     my @kp = $kc->get_close_pegs($peg, 10000);
243 :    
244 :     @pegs = grep {$ok_genome_id->{$fig->genome_of($_)}} grep {! $seen{$_}++} map {$_->[1]} @kp;
245 : olson 1.40
246 : olson 1.41 open(O, ">","/homes/olson/tmp/kmer.out");
247 :     print O Dumper($peg, \@kp, \@pegs);
248 :     close(O);
249 :     }
250 : dsouza 1.1 }
251 : olson 1.41 elsif ($n_pgfams > 0 || $n_plfams > 0)
252 :     {
253 :     my $re = ($n_pgfams > 0) ? "LIKE 'PGF%'" : "LIKE 'PLF%'";
254 :     my $res = $fig->db_handle->SQL("SELECT family, score FROM family_membership WHERE fid = ? AND family $re", 0, $peg);
255 : dsouza 1.1
256 : olson 1.41 # print STDERR Dumper('plfams', $n_pgfams, $n_plfams, $peg, $re, $res);
257 :     if (@$res)
258 :     {
259 :     my $fam = $res->[0]->[0];
260 :     my $score = $res->[0]->[1];
261 :     my $members = $fig->db_handle->SQL('SELECT fid, abs(score - ?) FROM family_membership WHERE family = ? ORDER BY score', undef,
262 :     $score, $fam);
263 :     # my $members = $fig->db_handle->SQL('SELECT fid, abs(score - ?) FROM family_membership WHERE family = ? ORDER BY abs(score - ?) DESC', undef,
264 :     # $score, $fam, $score);
265 :     @pegs = map { $_->[0] } @$members;
266 :     # print STDERR Dumper('plfams2', $fam, $score, $members, \@pegs);
267 :     }
268 :     }
269 :    
270 : dsouza 1.1 return \@pegs;
271 :     }
272 :    
273 :     sub select_pegs {
274 :     my($fig, $pin_desc, $n_pegs, $pegs) = @_;
275 :    
276 :     if ( $n_pegs == 0 )
277 :     {
278 : parrello 1.11 return [];
279 : dsouza 1.1 }
280 :    
281 : dsouza 1.3 if ( $pin_desc->{'collapse_close_genomes'} == 1 )
282 : dsouza 1.1 {
283 : parrello 1.11 # To collapse the PEG list, we want to:
284 :     # a. Return at most $n_pegs PEGs,
285 :     # b. include a representative PEG from every genus, subject to a, and
286 :     # c. include a representative PEG from each genus-species, subject to a and b.
287 :     # Individual strains will get represented by a single PEG, chosen arbitrarily.
288 :    
289 :     # The PEG selection is defined by the order of the PEGs. This could be done beter.
290 :    
291 :     my(%seen, @unique_genus, @unique_genus_species);
292 :    
293 :     foreach my $peg ( @$pegs )
294 :     {
295 :     my $org = $fig->org_of($peg);
296 :     # 'org_of' returns undef for deleted PEGs, these need to be omitted from the peg list returned
297 :    
298 :     if ( $org )
299 :     {
300 :     # Use only genus+species to drop strain information
301 :     my($genus, $species) = split(/\s+/, $org);
302 :     my $gs = "$genus $species";
303 :    
304 :     if ( not $seen{$genus}++ )
305 :     {
306 :     # First PEG from this genus, add it to @unique_genus.
307 :     # Mark the genus+species as seen.
308 :     # A subsequent PEG with the same genus and species will be dropped.
309 :     # A subsequent PEG with the same genus but different species will be added to @unique_genus_species.
310 :     $seen{$gs}++;
311 :     push @unique_genus, $peg;
312 :     }
313 :     elsif ( not $seen{$gs}++ )
314 :     {
315 :     # First PEG from this genus+species, add it to @unique_genus_species.
316 :     push @unique_genus_species, $peg;
317 :     }
318 :     }
319 : olson 1.35 }
320 : dsouza 1.1
321 : parrello 1.11 # Keep the unique_genus PEGS at the top, followed by the unique_genus_species
322 :     $pegs = [@unique_genus, @unique_genus_species];
323 : dsouza 1.1 }
324 : olson 1.35 elsif ( $pin_desc->{'collapse_close_genomes'} == 2 )
325 :     {
326 :     #
327 :     # Reduce the list to contain only one instance of any given tax id.
328 :     #
329 :    
330 :     my %by_tax;
331 :    
332 :     my %keep;
333 :    
334 :     foreach my $peg ( @$pegs )
335 :     {
336 :     my ($org, $tax) = $peg =~ /^fig\|((\d+)\.\d+)/;
337 :     if ($tax =~ /^666666(6?)/)
338 :     {
339 :     $keep{$peg}++;
340 :     next;
341 :     }
342 :    
343 :     if ( $tax )
344 :     {
345 :     if (defined(my $last = $by_tax{$tax}))
346 :     {
347 :     my($last_org, $last_tax) = @$last;
348 :     if (&FIG::by_genome_id($org, $last_org) > 0)
349 :     {
350 :     $by_tax{$tax} = [$org, $tax, $peg];
351 :     }
352 :     }
353 :     else
354 :     {
355 :     $by_tax{$tax} = [$org, $tax, $peg];
356 :     }
357 :     }
358 :     }
359 :     $keep{$_} = 1 foreach map { $_->[2] } values %by_tax;
360 :     @$pegs = grep { $keep{$_} } @$pegs;
361 :     }
362 : olson 1.41 elsif ( $pin_desc->{'collapse_close_genomes'} == 3 )
363 :     {
364 :     #
365 :     # Use the OTU reps to minimize the list.
366 :     #
367 :    
368 :     my $gfile = "$FIG_Config::global/genome.sets";
369 :     if (open(G, "<", $gfile))
370 :     {
371 :     my %rep;
372 :     my %fam;
373 :     while (<G>)
374 :     {
375 :     chomp;
376 :     my($fam, $gid, $gname) = split(/\t/);
377 :     if (!$rep{$fam})
378 :     {
379 :     $rep{$fam} = $gid;
380 :     $fam{$gid} = $fam;
381 :     }
382 :     }
383 :     close(G);
384 :    
385 :     my @keep;
386 :     foreach my $peg ( @$pegs )
387 :     {
388 :     my ($org, $tax) = $peg =~ /^fig\|((\d+)\.\d+)/;
389 :     push(@keep, $peg) if defined($fam{$org});
390 :     }
391 :     @$pegs = @keep;
392 :     }
393 :     }
394 : dsouza 1.1
395 :     # Truncate list if necessary
396 : olson 1.41 # if ( $n_pegs < @$pegs )
397 :     # {
398 :     # $#{ $pegs } = $n_pegs - 1;
399 :     # }
400 : dsouza 1.1
401 :     return $pegs;
402 :     }
403 :    
404 : dsouza 1.21 sub sort_pegs_by_similarity {
405 : dsouza 1.25 my($fig, $input_peg, @other_pegs) = @_;
406 :    
407 :     # Set blast environment variables
408 :     $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";
409 :     if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }
410 : dsouza 1.21
411 :     # Get amino acid sequences
412 : dsouza 1.25 my $sequences = &get_peg_sequences($fig, [$input_peg, @other_pegs]);
413 : dsouza 1.21
414 :     # Write the input peg sequence to a fasta file
415 :     my $input_seq = {$input_peg => $sequences->{$input_peg}};
416 :     my $input_seq_file = "$FIG_Config::temp/input_seq.$$.tmp.fasta";
417 :     &write_seqs_to_file($input_seq, $input_seq_file);
418 :    
419 :     # Write the other peg sequences to a fasta file
420 :     delete $sequences->{$input_peg};
421 :     my $other_seq_file = "$FIG_Config::temp/other_seq.$$.tmp.fasta";
422 :     &write_seqs_to_file($sequences, $other_seq_file);
423 :    
424 : dsouza 1.25 # Run formatdb
425 :     &formatdb_file($fig, $other_seq_file);
426 :    
427 :     # BLAST input peg sequence against other peg sequences
428 :     # set high evalue cutoff so all hits get reported
429 : dsouza 1.24 my $cutoff = 10;
430 : dsouza 1.25 my $hits = &blast_files($fig, $input_seq_file, $other_seq_file, $cutoff);
431 : dsouza 1.21
432 :     my @sorted = ($input_peg);
433 : olson 1.41 #
434 :     # We might have multiple hits; just use the first.
435 :     #
436 :     my %seen;
437 : dsouza 1.21 foreach my $hit ( @$hits )
438 :     {
439 : dsouza 1.25 # BLAST output is already sorted by similarity
440 : dsouza 1.21 my($peg2) = (split(/\s+/, $hit))[1];
441 : olson 1.41
442 :     push @sorted, $peg2 unless $seen{$peg2}++;
443 : dsouza 1.21 }
444 :    
445 : olson 1.33 for my $file ($input_seq_file, $other_seq_file)
446 :     {
447 :     for my $suffix ('', qw(.phr .psq .pin))
448 :     {
449 : olson 1.34 my $path = "$file$suffix";
450 : olson 1.33 unlink($path);
451 :     }
452 :     }
453 :    
454 : dsouza 1.21 return @sorted;
455 :     }
456 :    
457 : dsouza 1.1 sub filter_regions_1 {
458 :     my($pin_desc, $regions) = @_;
459 :     my $new_regions;
460 :    
461 :     # Filter out some overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
462 :     # are pinned (by similarity or PCH) to the input PEG
463 :     # Note that all overlaps are NOT eliminated
464 :    
465 :     if ( @{ $pin_desc->{'pegs'} } == 1 )
466 :     {
467 : parrello 1.11 # Input pin description is for a single input peg
468 :     my %seen;
469 :    
470 :     foreach my $region ( @$regions )
471 :     {
472 :     my $pinned_peg = $region->{'pinned_peg'};
473 :    
474 :     # If the pinned peg from a region has already been 'seen', don't
475 :     # add the region into the @$new_regions array
476 :    
477 :     if ( not $seen{$pinned_peg} )
478 :     {
479 :     push @$new_regions, $region;
480 :    
481 :     my $fids = $region->{'features'};
482 :     foreach my $fid ( @$fids )
483 :     {
484 :     $seen{$fid} = 1;
485 :     }
486 :     }
487 :     }
488 : dsouza 1.1 }
489 :     else
490 :     {
491 : parrello 1.11 # input PEGs is a list -- keep all of them
492 :     $new_regions = $regions;
493 : dsouza 1.1 }
494 :    
495 :     return $new_regions;
496 :     }
497 :    
498 :     sub filter_regions_2 {
499 :     my($pin_desc, $regions, $feature_data) = @_;
500 :     my $new_regions;
501 :    
502 :     if ( @{ $pin_desc->{'pegs'} } == 1 )
503 :     {
504 : parrello 1.11 # Input is single peg
505 :     my $input_peg = $pin_desc->{'pegs'}[0];
506 : dsouza 1.1
507 : parrello 1.11 foreach my $region ( @$regions )
508 :     {
509 :     my $fids = $region->{'features'};
510 :     my $n_in_sets = grep {$feature_data->{$_}{'set_number'}} @$fids;
511 :    
512 :     if ( $n_in_sets > 1 or $region->{'pinned_peg'} eq $input_peg )
513 :     {
514 :     # Regions should be displayed only if:
515 :     # a. more than one PEG is in a 'colored set' OR
516 :     # b. the pinned PEG is the input PEG
517 :     push @$new_regions, $region;
518 :     }
519 :     }
520 : dsouza 1.1 }
521 :     else
522 :     {
523 : parrello 1.11 # Input is a list of pegs -- keep all of the regions
524 :     $new_regions = $regions
525 : dsouza 1.1 }
526 :    
527 :     return $new_regions;
528 :     }
529 :    
530 :     sub make_maps {
531 :     my($fig, $regions, $feature_data) = @_;
532 :    
533 : olson 1.41 for my $i (0..$#$regions)
534 : dsouza 1.1 {
535 : olson 1.41 my $region = $regions->[$i];
536 :    
537 : parrello 1.11 my $features = [];
538 :     my $fids = $region->{'features'};
539 : dsouza 1.1
540 : parrello 1.11 foreach my $fid ( @$fids )
541 :     {
542 :     push @$features, $feature_data->{$fid};
543 :     # if ( !( $fid =~ /fig/ ) ) {
544 :     # print STDERR $fid." FIDMAKEMAPS\n";
545 :     # }
546 :     }
547 : dsouza 1.1
548 : parrello 1.11 $region->{'features'} = $features;
549 : dsouza 1.1 }
550 :    
551 :     return $regions;
552 :     }
553 :    
554 :     sub define_regions {
555 : olson 1.41 my($fig, $map_sz, $pinned_pegs, $align) = @_;
556 : dsouza 1.1
557 :     # Define the 'region' around the pinned peg, get features, and store some region information
558 :     my $regions = [];
559 :     my $half_map_sz = int($map_sz/2);
560 :    
561 : olson 1.41 # print STDERR Dumper($pinned_pegs, $align);
562 :    
563 :     my $align_offset;
564 : dsouza 1.1 foreach my $peg ( @$pinned_pegs )
565 :     {
566 : paczian 1.30 my $genome;
567 : paczian 1.29 my $loc;
568 : paczian 1.30 if ($peg =~ /^(fig\|)*(\d+\.\d+)[\.\:]{1}(.+\_\d+\_\d+)$/) {
569 :     $loc = $3;
570 :     $genome = $2;
571 : paczian 1.29 } else {
572 :     $loc = $fig->feature_location($peg);
573 : paczian 1.30 $genome = $fig->genome_of($peg);
574 : paczian 1.29 }
575 : parrello 1.11 # We only proceed if a location was found. Failure to find a location indicates
576 :     # that the feature is deleted.
577 :     if ($loc) {
578 :     my($contig, $beg, $end) = $fig->boundaries_of($loc);
579 : olson 1.41
580 :     #
581 :     # Ensure that the focus peg is centered, even if we are aligning on start or stop.
582 :     #
583 :     my $align_point;
584 :     if ($align eq 'start') {
585 :     $align_point = $beg;
586 :     } elsif ($align eq 'center') {
587 :     $align_point = int(($beg + $end)/2);
588 :     } else {
589 :     $align_point = $end;
590 :     };
591 :    
592 :     my $center = int(($beg + $end) / 2);
593 :    
594 :     my $region_mid;
595 :     if (!defined($align_offset))
596 :     {
597 :     if ($beg < $end) {
598 :     $align_offset = $center - $align_point;
599 :     } else {
600 :     $align_offset = $align_point - $center;
601 :     }
602 :     $region_mid = $center;
603 :     }
604 :     else
605 :     {
606 :     if ($beg < $end) {
607 :     $region_mid = $align_point + $align_offset;
608 :     } else {
609 :     $region_mid = $align_point - $align_offset;
610 :     }
611 :    
612 :     }
613 :     print STDERR "$peg AO=$align_offset AP=$align_point C=$center M=$region_mid\n";
614 :    
615 : parrello 1.11 my $region_beg = $region_mid - $half_map_sz;
616 :     my $region_end = $region_mid + $half_map_sz;
617 :    
618 :     my($fids) = $fig->genes_in_region($genome, $contig, $region_beg, $region_end);
619 : olson 1.37 $fids = [ grep { $fig->is_real_feature($_) } @$fids];
620 :    
621 : parrello 1.11 my $region = {};
622 :     $region->{'genome_id'} = $fig->genome_of($peg);
623 :     $region->{'org_name'} = $fig->genus_species($region->{'genome_id'});
624 :     $region->{'contig'} = $contig;
625 :     $region->{'beg'} = $region_beg;
626 :     $region->{'mid'} = $region_mid;
627 :     $region->{'end'} = $region_end;
628 :     $region->{'features'} = $fids;
629 :     $region->{'pinned_peg_strand'} = ($beg <= $end)? '+' : '-';
630 :     $region->{'pinned_peg'} = $peg;
631 :    
632 :     push @$regions, $region;
633 :     }
634 : dsouza 1.1 }
635 :    
636 :     return $regions;
637 :     }
638 :    
639 : bartels 1.8 sub add_features_to_fdata {
640 :     my ( $fig, $feature_data, $add_features, $regions ) = @_;
641 :     foreach my $region ( @$regions ) {
642 : parrello 1.11 my $new_feats = $add_features->{ $region->{ 'genome_id' } };
643 :     foreach my $nf ( @$new_feats ) {
644 :     if ( $nf->{ 'contig' } eq $region->{ 'contig' } &&
645 :     $nf->{ 'start' } < $region->{ 'end' } &&
646 :     $nf->{ 'start' } > $region->{ 'beg' } ) {
647 :     $feature_data->{ $nf->{ 'name' } } = &new_feature_entry( $fig, $nf->{ 'name' }, $region->{ 'mid' }, $region->{ 'contig' }, $nf->{ 'start' }, $nf->{ 'stop' }, $nf->{ 'type' }, $nf->{ 'function' } );
648 :     push @{ $region->{ 'features' } }, $nf->{ 'name' };
649 :     }
650 :     }
651 : bartels 1.8 }
652 :     }
653 :    
654 :     sub new_feature_entry {
655 :     my ( $fig, $fid, $region_mid, $contig, $beg, $end, $type, $func ) = @_;
656 :    
657 :     if ( !defined( $type ) ) {
658 : parrello 1.11 $type = 'unknown';
659 : bartels 1.8 }
660 :    
661 :     if ( !defined( $func ) ) {
662 : parrello 1.11 $func = '';
663 : bartels 1.8 }
664 :    
665 :     my($left, $right) = sort {$a <=> $b} ($beg, $end);
666 :     my $size = $right - $left + 1;
667 :     my $strand = ($beg <= $end)? '+' : '-';
668 :     my $offset = int(($left + $right)/2) - $region_mid;
669 :     my $offset_beg = $left - $region_mid;
670 :     my $offset_end = $right - $region_mid;
671 :    
672 :     return {
673 : parrello 1.11 'fid' => $fid,
674 :     'type' => $type,
675 :     'contig' => $contig,
676 :     'beg' => $beg,
677 :     'end' => $end,
678 :     'size' => $size,
679 :     'strand' => $strand,
680 :     'offset' => $offset,
681 :     'offset_beg' => $offset_beg,
682 :     'offset_end' => $offset_end,
683 :     'function' => $func,
684 : paczian 1.28 'location' => $contig."_".$beg."_".$end
685 : bartels 1.8 };
686 :     }
687 :    
688 : dsouza 1.1 sub feature_data {
689 : olson 1.37 my($fig, $regions, $extended, $peg_functions) = @_;
690 : dsouza 1.1 my %feature_data;
691 :    
692 : olson 1.31 my %all_fids;
693 : olson 1.41 my %all_genomes;
694 : olson 1.31
695 :     foreach my $region ( @$regions )
696 :     {
697 :     my $fids = $region->{'features'};
698 : olson 1.41 $all_fids{$_}++, $all_genomes{&FIG::genome_of($_)}++ foreach @$fids;
699 : olson 1.31 }
700 :    
701 :     my @all_fids = keys(%all_fids);
702 :     my @locations = $fig->feature_location_bulk(\@all_fids);
703 : olson 1.37
704 : olson 1.31 my %locations = map { $_->[0] => $_->[1] } @locations;
705 :    
706 :     my $aliases = $fig->feature_aliases_bulk(\@all_fids);
707 : olson 1.37
708 : olson 1.38 my $functions;
709 :     if (ref($peg_functions) eq 'HASH')
710 :     {
711 :     $functions = $peg_functions;
712 :     }
713 :     elsif (ref($peg_functions) eq 'PG')
714 :     {
715 :     $functions = $peg_functions->function_of_bulk(\@all_fids);
716 :     }
717 :     else
718 :     {
719 :     $functions = $fig->function_of_bulk(\@all_fids);
720 :     }
721 : olson 1.31
722 :     # print STDERR Dumper(\%locations, $aliases, $functions);
723 : dsouza 1.1 foreach my $region ( @$regions )
724 :     {
725 : parrello 1.11 my $region_mid = $region->{'mid'};
726 :     my $fids = $region->{'features'};
727 : dsouza 1.1
728 : parrello 1.11 foreach my $fid ( @$fids )
729 :     {
730 :     # Get feature data if this feature has not been seen before, this avoids repeating
731 :     # this step when pegs occur in multiple regions
732 :     if ( not exists $feature_data{$fid} )
733 :     {
734 : olson 1.41 $feature_data{$fid} = &feature_entry($fig, $fid, $region_mid, $extended, $functions, \%locations, $aliases);
735 : parrello 1.11 }
736 :     }
737 : dsouza 1.1 }
738 :    
739 : paczian 1.16 if ($extended) {
740 :     # get the evidence codes
741 :     my @all_fids = keys(%feature_data);
742 :     my @codes = grep { $_->[1] =~ /^evidence_code/i } $fig->get_attributes(\@all_fids, 'evidence_code');
743 :     my $pretty_codes = {};
744 :     foreach my $code (@codes) {
745 :     my $pretty_code = $code->[2];
746 :     if ($pretty_code =~ /;/) {
747 :     my ($cd, $ss) = split(";", $code->[2]);
748 :     $pretty_code = $cd;
749 :     }
750 :     $pretty_code =~ s/lit\((\d+)\)/lit\(<a href='http:\/\/www\.ncbi\.nlm\.nih\.gov\/sites\/entrez\?cmd=Retrieve&db=PubMed&list_uids=$1&dopt=AbstractPlus' target=_blank>$1<\/a>\)/;
751 :     push(@{$pretty_codes->{$code->[0]}}, $pretty_code);
752 :     }
753 :     foreach my $entry (keys(%$pretty_codes)) {
754 :     $feature_data{$entry}{'evcodes'} = join(", ", @{$pretty_codes->{$entry}});
755 :     }
756 :     }
757 :    
758 : dsouza 1.1 return \%feature_data;
759 :     }
760 :    
761 :     sub feature_entry {
762 : olson 1.31 my($fig, $fid, $region_mid, $extended, $all_functions, $all_locations, $all_aliases) = @_;
763 : dsouza 1.1
764 : olson 1.41 my $genome = $fig->genome_of($fid);
765 : dsouza 1.1 my $type = $fig->ftype($fid);
766 : olson 1.31 my $loc = $all_locations->{$fid};
767 : dsouza 1.1 my($contig, $beg, $end) = $fig->boundaries_of($loc);
768 :     my($left, $right) = sort {$a <=> $b} ($beg, $end);
769 :     my $strand = ($beg <= $end)? '+' : '-';
770 :     my $offset = int(($left + $right)/2) - $region_mid;
771 :     my $offset_beg = $left - $region_mid;
772 :     my $offset_end = $right - $region_mid;
773 : olson 1.31 my $func = $all_functions->{$fid};
774 : paczian 1.27 my $location = FullLocation->new($fig, $genome, $loc);
775 :     my $size = 0;
776 :     map { $size += $_->Length } @{$location->Locs};
777 : dsouza 1.1
778 : paczian 1.16 my $retval = { 'fid' => $fid,
779 : paczian 1.26 'location' => $loc,
780 : paczian 1.16 'type' => $type,
781 :     'contig' => $contig,
782 :     'beg' => $beg,
783 :     'end' => $end,
784 :     'size' => $size,
785 :     'strand' => $strand,
786 :     'offset' => $offset,
787 :     'offset_beg' => $offset_beg,
788 :     'offset_end' => $offset_end,
789 : dsouza 1.25 'function' => $func };
790 : paczian 1.16 if ($extended) {
791 : olson 1.31 my $a = $all_aliases->{$fid};
792 :     my $aliases = ref($a) ? join("|", @$a) : '';
793 :    
794 : paczian 1.16 $retval->{aliases} = $aliases;
795 :     }
796 :    
797 :     return $retval;
798 : dsouza 1.1 }
799 :    
800 : dsouza 1.3 sub add_functional_coupling {
801 :     my($fig, $pin_desc, $regions, $feature_data) = @_;
802 :    
803 :     my $fc_cutoff = defined($pin_desc->{'fc_cutoff'})? $pin_desc->{'fc_cutoff'} : 4;
804 :    
805 :     foreach my $region ( @$regions )
806 :     {
807 : parrello 1.11 my $pin = $region->{'pinned_peg'};
808 :     my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} @{ $region->{'features'} };
809 : dsouza 1.3
810 : parrello 1.11 foreach my $couple ( $fig->coupled_to_batch(@pegs) )
811 :     {
812 : olson 1.43 next unless $couple;
813 : parrello 1.11 my($peg1, $peg2, $sc) = @$couple;
814 :    
815 :     if ( $peg1 eq $pin and $sc >= $fc_cutoff )
816 :     {
817 :     $feature_data->{$peg2}{'fc_score'} = $sc;
818 : paczian 1.15 $feature_data->{$peg2}{'fc_pin'} = $peg1;
819 : parrello 1.11 }
820 :     }
821 : dsouza 1.3 }
822 :     }
823 :    
824 : dsouza 1.5 sub add_figfams {
825 : olson 1.32 return;
826 :     # my($fig, $feature_data) = @_;
827 : dsouza 1.5
828 : olson 1.32 # # Get FigFams directory from config file
829 :     # my $figfam_dir = $fig->get_figfams_data();
830 : dsouza 1.5
831 : olson 1.32 # # Check if FigFams directory is defined and exists on current machine
832 :     # if ( defined($figfam_dir) and (-d $figfam_dir) )
833 :     # {
834 :     # # Get all PEG IDs
835 :     # my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
836 :     # # Get $figfams object
837 :     # my $figfams = new FigFams($fig, $figfam_dir);
838 :     # # Get FigFam family ID for @pegs
839 :     # my $figfam = $figfams->families_containing_peg_bulk(\@pegs);
840 :     # # Get hash of FigFam ID to family function
841 :     # my $family_function = $figfams->family_functions();
842 : parrello 1.11
843 : olson 1.32 # # Some FigFams (the ones that are not subsystem related) have the FigFam ID in the family function.
844 :     # # This results in the popup displaying the ID twice, so one should be removed.
845 : parrello 1.11
846 : olson 1.32 # my %figfam_text;
847 :     # foreach my $fid ( keys %$feature_data )
848 :     # {
849 :     # if ( $figfam->{$fid} )
850 :     # {
851 :     # my $figfam_id = $figfam->{$fid};
852 :     # if ( ! exists $figfam_text{$figfam_id} ) {
853 :     # if ( $family_function->{$figfam_id} =~ /^FIG\d+/ ) {
854 :     # $figfam_text{$figfam_id} = $family_function->{$figfam_id};
855 :     # } else {
856 :     # $figfam_text{$figfam_id} = $figfam_id . ': ' . $family_function->{$figfam_id};
857 :     # }
858 :     # }
859 :    
860 :     # # Add FigFam information to hash -- to go into the popup text
861 :     # $feature_data->{$fid}{'figfam'} = $figfam_text{$figfam_id};
862 :     # }
863 :     # }
864 :     # }
865 : dsouza 1.5 }
866 :    
867 : olson 1.42 sub add_pattyfams {
868 :     my($fig, $feature_data) = @_;
869 :    
870 :     my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
871 :     my $fams = $fig->pattyfams_for_proteins(\@pegs);
872 :     foreach my $fid ( keys %$feature_data )
873 :     {
874 :     my $entlist = $fams->{$fid};
875 :     next unless ref($entlist);
876 :     my $fdata = $feature_data->{$fid};
877 :     for my $ent (@$entlist)
878 :     {
879 :     my($fam, $fun) = @$ent;
880 :     my $istart = my $iend = "";
881 :     if ($fun ne $fdata->{function})
882 :     {
883 :     $istart = "<i>";
884 :     $iend = "</i>";
885 :     }
886 :     if ($fam =~ /^PLF/)
887 :     {
888 :     $fdata->{'plfam'} = "$fam: $istart$fun$iend";
889 :     }
890 :     elsif ($fam =~ /^PGF/)
891 :     {
892 :     $fdata->{'pgfam'} = "$fam: $istart$fun$iend";
893 :     }
894 :     }
895 :     }
896 :     }
897 :    
898 : dsouza 1.3 sub add_subsystem_data {
899 : paczian 1.17 my($fig, $pin_desc, $feature_data, $extended) = @_;
900 : dsouza 1.14
901 :     # Get subsystem_information for =all= pegs
902 :    
903 : paczian 1.16 my @fids = keys(%$feature_data);
904 : olson 1.35 my %ssdata;
905 :    
906 :     if ($FIG_Config::use_subsystem_estimates)
907 :     {
908 :     %ssdata = $fig->subsystems_for_pegs_complete_estimate(\@fids);
909 :     }
910 :     else
911 :     {
912 :     %ssdata = $fig->subsystems_for_pegs_complete(\@fids);
913 :     }
914 :    
915 : paczian 1.16 my @subsystems = ();
916 :     foreach my $p (keys(%ssdata)) {
917 :     foreach my $entry (@{$ssdata{$p}}) {
918 :     push(@subsystems, [ $entry->[0], $entry->[1], $entry->[2], $p ]);
919 :     }
920 :     }
921 : paczian 1.17 unless ($extended) {
922 : paczian 1.19 @subsystems = grep { $fig->usable_subsystem($_->[0]) } @subsystems;
923 : paczian 1.18 @subsystems = grep { $fig->is_exchangable_subsystem($_->[0]) } @subsystems;
924 : paczian 1.17 }
925 : paczian 1.16
926 : dsouza 1.14 my %peg_to_ss;
927 : paczian 1.16 foreach my $rec ( @subsystems ) {
928 :     my($ss_name, $role, $variant, $fid) = @$rec;
929 :     $ss_name =~ s/_/ /g;
930 :    
931 :     if ( $variant eq '0' ) {
932 :     # no subsystem
933 : paczian 1.19 if ($extended) {
934 :     my $ss_text = "$ss_name (not yet classified for this organism)";
935 : paczian 1.20 $peg_to_ss{$fid}{$ss_name} = $ss_text;
936 : paczian 1.19 }
937 : paczian 1.16 } elsif ( $variant eq '-1' or $variant eq '*-1' ) {
938 :     # subsystem not functional in this organism
939 :     my $ss_text = "$ss_name (classified 'not active' in this organism)";
940 : paczian 1.20 $peg_to_ss{$fid}{$ss_name} = $ss_text;
941 : paczian 1.16 } else {
942 :     $peg_to_ss{$fid}{$ss_name} = 1;
943 : dsouza 1.14 }
944 :     }
945 :    
946 : dsouza 1.3 # Count number of occurences of each subsystem
947 :     my %ss_count;
948 : dsouza 1.14 foreach my $fid ( keys %peg_to_ss )
949 : dsouza 1.3 {
950 : dsouza 1.14 foreach my $ss ( keys %{ $peg_to_ss{$fid} } )
951 :     {
952 :     $ss_count{$ss}++;
953 :     }
954 : dsouza 1.3 }
955 :    
956 :     # Sort subsystems based on count and create hash with unique index for each subsystem where
957 :     # lower indices correspond to higher number of occurences of the subsystem.
958 :     my %ss_index;
959 :     my $index = 0;
960 :     foreach my $ss ( sort {$ss_count{$b} <=> $ss_count{$a} or $a cmp $b} keys %ss_count )
961 :     {
962 : parrello 1.11 $ss_index{$ss} = ++$index;
963 : dsouza 1.3 }
964 :    
965 :     # Add subsystem information for pegs in subsystems
966 :     foreach my $fid ( keys %peg_to_ss )
967 :     {
968 : paczian 1.20 my @subsystems;
969 :     foreach my $key (keys(%{$peg_to_ss{$fid}})) {
970 :     if ($peg_to_ss{$fid}{$key} ne "1") {
971 :     push(@subsystems, [ $peg_to_ss{$fid}{$key}, $ss_index{$key} ]);
972 :     } else {
973 :     push(@subsystems, [ $key, $ss_index{$key} ]);
974 :     }
975 :     }
976 : parrello 1.11
977 :     if ( @subsystems )
978 :     {
979 : paczian 1.20 $feature_data->{$fid}{'subsystems'} = [sort {$a->[1] <=> $b->[1]} @subsystems];
980 : parrello 1.11 }
981 : dsouza 1.3 }
982 :     }
983 :    
984 : dsouza 1.1 sub color_pegs {
985 : olson 1.41 my($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from, $cdd_trans) = @_;
986 : dsouza 1.1
987 :     # Run blastp and get connected pegs
988 :     my $blast_hit = {};
989 :     my $color_sim_cutoff = $pin_desc->{'color_sim_cutoff'};
990 :    
991 : olson 1.41 my $color_sets;
992 :     if ($fast_color == 2 || $fast_color == 3)
993 : dsouza 1.1 {
994 : olson 1.41 my $like = ($fast_color == 2) ? 'PLF%' : 'PGF%';
995 :    
996 :     my @pegs = grep {$feature_data->{$_}{'type'} && $feature_data->{$_}{'type'} =~ /peg|domain/} keys %$feature_data;
997 :    
998 :     my %sets;
999 :     if (@pegs)
1000 :     {
1001 :     my $in = join(",", map { "?" } @pegs);
1002 :     # print STDERR Dumper(query => \@pegs, $in, $feature_data);
1003 :     my $res = $fig->db_handle->SQL("SELECT fid, family FROM family_membership WHERE fid IN ($in) AND family LIKE '$like'",
1004 :     undef, @pegs);
1005 :     # print STDERR Dumper($res);
1006 :     for my $ent (@$res)
1007 :     {
1008 :     my($fid, $fam) = @$ent;
1009 :     push(@{$sets{$fam}}, $fid);
1010 :     }
1011 :     }
1012 :     $color_sets = [values(%sets)];
1013 :     # print STDERR Dumper(color_sets => \%sets, $color_sets);
1014 : dsouza 1.1 }
1015 :     else
1016 :     {
1017 : olson 1.41 if ( $sims_from eq 'blast' )
1018 :     {
1019 :     $blast_hit = &blast_hits($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff, $cdd_trans);
1020 :     }
1021 :     else
1022 :     {
1023 :     $blast_hit = &blast_hits_from_sims($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff, $cdd_trans);
1024 :     }
1025 :    
1026 :     # Assign set numbers to pegs based on blast scores
1027 :     $color_sets = &partition_pegs($blast_hit);
1028 : dsouza 1.1 }
1029 :    
1030 :     # Sort peg sets to that a) larger sets have smaller set numbers, and
1031 :     # b) sets closer to the pinned peg have smaller set numbers
1032 : dsouza 1.4 $color_sets = &sort_color_sets($pin_desc, $pinned_pegs, $color_sets, $feature_data);
1033 : dsouza 1.1
1034 :     # Add color set number into $feature_data
1035 :     for (my $i = 0; $i < @$color_sets; $i++)
1036 :     {
1037 : parrello 1.11 # Use an index that starts at 1 (matches with coloring index)
1038 :     my $n = $i + 1;
1039 :     foreach my $peg ( @{ $color_sets->[$i] } )
1040 :     {
1041 :     $feature_data->{$peg}{'set_number'} = $n;
1042 :    
1043 :     # If this is the pinned set, set the 'color' to 'red'
1044 :     if ( $n == 1 )
1045 :     {
1046 :     $feature_data->{$peg}{'color'} = 'red';
1047 :     }
1048 :     }
1049 : dsouza 1.1 }
1050 :     }
1051 :    
1052 : olson 1.37 sub color_pegs_by_function {
1053 :     my($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from) = @_;
1054 :    
1055 :     my %by_function;
1056 :     my $next;
1057 :     while (my($fid, $data) = each %$feature_data)
1058 :     {
1059 :     my $set = $by_function{$data->{function}};
1060 :     if (!defined($set))
1061 :     {
1062 :     $set = [];
1063 :     $by_function{$data->{function}} = $set;
1064 :     }
1065 :     push(@$set, $fid);
1066 :     }
1067 :    
1068 :     my $color_sets = [values %by_function];
1069 :    
1070 :     # Sort peg sets to that a) larger sets have smaller set numbers, and
1071 :     # b) sets closer to the pinned peg have smaller set numbers
1072 :     $color_sets = &sort_color_sets($pin_desc, $pinned_pegs, $color_sets, $feature_data);
1073 :    
1074 :     # Add color set number into $feature_data
1075 :     for (my $i = 0; $i < @$color_sets; $i++)
1076 :     {
1077 :     # Use an index that starts at 1 (matches with coloring index)
1078 :     my $n = $i + 1;
1079 :     foreach my $peg ( @{ $color_sets->[$i] } )
1080 :     {
1081 :     $feature_data->{$peg}{'set_number'} = $n;
1082 :    
1083 :     # If this is the pinned set, set the 'color' to 'red'
1084 :     if ( $n == 1 )
1085 :     {
1086 :     $feature_data->{$peg}{'color'} = 'red';
1087 :     }
1088 :     }
1089 :     }
1090 :     }
1091 :    
1092 : dsouza 1.1 sub sort_color_sets {
1093 :     my($pin_desc, $pinned_pegs, $color_sets, $feature_data) = @_;
1094 :    
1095 : dsouza 1.3 # Find the color set containing the set of pinned pegs, i.e. the set containing the input peg.
1096 :     # If the input peg is found (or if input is a list of pegs, returned value is the array index for the
1097 :     # set.
1098 :     # If the input peg is not found, the returned value is an reference to an array containing the input peg.
1099 :     my $set = &pinned_set($pin_desc, $pinned_pegs, $color_sets);
1100 : dsouza 1.1
1101 : dsouza 1.3 my $pinned_color_set;
1102 :     if ( $set =~ /^\d+$/ )
1103 :     {
1104 : parrello 1.11 # Splice out the color set containing the input peg
1105 :     ($pinned_color_set) = splice(@$color_sets,$set,1);
1106 : dsouza 1.3 }
1107 :     else
1108 :     {
1109 : parrello 1.11 $pinned_color_set = $set;
1110 : dsouza 1.3 }
1111 : dsouza 1.1
1112 : dsouza 1.3 # Add offset (summed distance from
1113 :     my @color_sets = map {[$_, &offset($_, $feature_data)]} @$color_sets;
1114 : dsouza 1.1 # Sort the remaining color sets based on:
1115 :     # a. size (number of pegs) and
1116 :     # b. offset from mid-point of region
1117 :     @$color_sets = map {$_->[0]}
1118 :     sort {@{$b->[0]} <=> @{$a->[0]} or $a->[1] <=> $b->[1]}
1119 : dsouza 1.3 # sort {$a->[1] <=> $b->[1] or @{$b->[0]} <=> @{$a->[0]}}
1120 : dsouza 1.1 map {[$_, &offset($_, $feature_data)]} @$color_sets;
1121 :    
1122 :     # Add the set of pinned pegs at the front of the returned list so that it gets colored red
1123 :     return [$pinned_color_set, @$color_sets];
1124 :     }
1125 :    
1126 : dsouza 1.3 sub pinned_set {
1127 : dsouza 1.1 my($pin_desc, $pinned_pegs, $color_sets) = @_;
1128 :    
1129 : dsouza 1.3 # Returns an integer if the input is a peg-list or if the input peg is in a set.
1130 :     # Returns the input peg (as an array reference) if the input peg is not in a set.
1131 :    
1132 : dsouza 1.1 if ( @{ $pin_desc->{'pegs'} } == 1 )
1133 :     {
1134 : parrello 1.11 # Get input peg if it exists -- the set containing this peg should be colored red
1135 :     my $peg = $pin_desc->{'pegs'}[0];
1136 : dsouza 1.1
1137 : parrello 1.11 # Iterate through the color sets until you find the set containing the input peg
1138 :     for (my $i = 0; $i < @$color_sets; $i++)
1139 :     {
1140 :     foreach my $peg2 ( @{ $color_sets->[$i] } )
1141 :     {
1142 :     if ( $peg2 eq $peg )
1143 :     {
1144 :     # Return the set index
1145 :     return $i;
1146 :     }
1147 :     }
1148 :     }
1149 : dsouza 1.1
1150 : parrello 1.11 # The input peg may not be in a set if there is only one region or the coloring cutoff is very stringent.
1151 :     return [$peg];
1152 : dsouza 1.1 }
1153 :     else
1154 :     {
1155 : parrello 1.11 # Input is a peg-list, which may be split into multiple sets -- the largest will be called the
1156 :     # "pinned" set.
1157 : paczian 1.30 my $i = 0;
1158 : parrello 1.12 my $max_count = 0;
1159 : parrello 1.11 my %pinned = map {$_ => 1} @$pinned_pegs;
1160 :    
1161 :     for (my $j = 0; $j < @$color_sets; $j++)
1162 :     {
1163 :     # count how many pinned pegs are found in each set
1164 : parrello 1.12 my $count = scalar(grep {$pinned{$_}} @{ $color_sets->[$j] }) || 0;
1165 : parrello 1.11
1166 :     if ( $max_count < $count )
1167 :     {
1168 :     # Keep track of the set having the largest number of pinned pegs
1169 :     $max_count = $count;
1170 :     $i = $j;
1171 :     }
1172 :     }
1173 : dsouza 1.1
1174 : parrello 1.11 return $i;
1175 : dsouza 1.1 }
1176 :     }
1177 :    
1178 :     sub offset {
1179 :     my($set, $feature_data) = @_;
1180 :    
1181 :     my $offset;
1182 :     foreach my $peg ( @$set )
1183 :     {
1184 : parrello 1.11 $offset += abs($feature_data->{$peg}{'offset'});
1185 : dsouza 1.1 }
1186 : dsouza 1.3
1187 :     return sprintf("%.2f", $offset/@$set);
1188 : dsouza 1.1 }
1189 :    
1190 :     sub partition_pegs {
1191 :     my($blast_hit) = @_;
1192 :    
1193 :     my %seen;
1194 :     my $sets = [];
1195 :    
1196 :     # Iterate through the pegs with blast hits in arbitrary order
1197 :     foreach my $peg1 ( keys %$blast_hit )
1198 :     {
1199 : parrello 1.11 # If it has not been 'seen' (and placed in a set) use it as the seed for a set
1200 :     if ( not $seen{$peg1} )
1201 :     {
1202 :     my $set = [$peg1];
1203 :    
1204 :     # Mark $peg1 as seen
1205 :     $seen{$peg1} = 1;
1206 :    
1207 :     # Grow set through transitive closure -- note that @$set keeps growing
1208 :     for (my $i = 0; $i < @$set; $i++)
1209 :     {
1210 :     $peg1 = $set->[$i];
1211 :    
1212 :     # Iterate through blast hits
1213 :     foreach my $peg2 ( keys %{ $blast_hit->{$peg1} } )
1214 :     {
1215 :     # If $peg2 has not been seen, put it in the set, and mark it as seen
1216 :     if ( not exists $seen{$peg2} )
1217 :     {
1218 :     push @$set, $peg2;
1219 :     $seen{$peg2} = 1;
1220 :     }
1221 :     }
1222 :     }
1223 :    
1224 :     # Add the new set to the set of sets
1225 :     push @$sets, $set;
1226 :     }
1227 : dsouza 1.1 }
1228 :    
1229 :     return $sets;
1230 :     }
1231 :    
1232 :     sub blast_hits {
1233 : olson 1.41 my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff, $cdd_trans) = @_;
1234 : dsouza 1.25
1235 :     # Set blast environment variables
1236 :     $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";
1237 :     if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }
1238 : dsouza 1.1
1239 :     # Get amino acid sequences
1240 : olson 1.41 my @pegs = grep {$feature_data->{$_}{'type'} && $feature_data->{$_}{'type'} =~ /peg|domain/} keys %$feature_data;
1241 :     my $sequences = &get_peg_sequences($fig, \@pegs, $cdd_trans);
1242 : dsouza 1.1
1243 :     # Write the entire set of sequences to a fasta file
1244 :     my $all_seqs_file = "$FIG_Config::temp/all_seqs.$$.tmp.fasta";
1245 :     &write_seqs_to_file($sequences, $all_seqs_file);
1246 :    
1247 : dsouza 1.25 # Run formatdb
1248 :     &formatdb_file($fig, $all_seqs_file);
1249 :    
1250 : olson 1.33 my @files_to_remove = ($all_seqs_file);
1251 :    
1252 : dsouza 1.25 # If $fast_color == 0, the complete blast of all vs. all sequences is performed
1253 :     # Otherwise, instead of all vs. all, the sequences from a single pinned peg region
1254 :     # is blasted against all sequences. If any of the hits are better than a given cutoff
1255 :     # ($cutoff_2), the pegs hit are deemed to be 'done', i.e. it is assumed that blasting this
1256 :     # peg will not yield additional information for forming the peg sets. These 'done' pegs
1257 :     # are then omitted from the blast when the sequences from the region they belong to
1258 :     # is blasted.
1259 :     my %blast_hit;
1260 : olson 1.41 if ( $fast_color == 1)
1261 : dsouza 1.1 {
1262 : dsouza 1.25 my $cutoff_2 = $color_sim_cutoff * 1e-20;
1263 :     my %done_with;
1264 :    
1265 :     foreach my $region ( @$regions )
1266 :     {
1267 :     # Iterate through each region
1268 :     my $fids = $region->{'features'};
1269 :    
1270 :     # Build up a hash of peg sequences which are not 'done_with'
1271 :     my %region_seqs;
1272 :     foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} @$fids )
1273 :     {
1274 :     if ( $sequences->{$peg} and not $done_with{$peg} )
1275 :     {
1276 :     $region_seqs{$peg} = $sequences->{$peg};
1277 :     }
1278 :     }
1279 :    
1280 :     if ( scalar keys %region_seqs )
1281 :     {
1282 :     # Write the region sequences to a file
1283 :     my $region_seqs_file = "$FIG_Config::temp/region_seqs.$$.tmp.fasta";
1284 :     &write_seqs_to_file(\%region_seqs, $region_seqs_file);
1285 : olson 1.33 push(@files_to_remove, $region_seqs_file);
1286 : dsouza 1.25
1287 :     # BLAST the region sequences against all other sequences
1288 :     my $hits = &blast_files($fig, $region_seqs_file, $all_seqs_file, $color_sim_cutoff);
1289 :    
1290 :     # Build up hash of blast hits
1291 :     foreach my $hit ( @$hits )
1292 :     {
1293 :     my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
1294 :    
1295 :     if ( $peg1 ne $peg2 )
1296 :     {
1297 :     $blast_hit{$peg1}{$peg2} = 1;
1298 :     $blast_hit{$peg2}{$peg1} = 1;
1299 :    
1300 :     # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
1301 :     # that it will not be blasted with it's region sequences
1302 :     if ( $sc <= $cutoff_2 )
1303 :     {
1304 :     $done_with{$peg2} = 1;
1305 :     }
1306 :     }
1307 :     }
1308 :     }
1309 :     }
1310 : dsouza 1.1 }
1311 :     else
1312 :     {
1313 : parrello 1.11 # BLAST sequence file against itself
1314 : dsouza 1.25 my $hits = &blast_files($fig, $all_seqs_file, $all_seqs_file, $color_sim_cutoff);
1315 : dsouza 1.1
1316 : dsouza 1.25 # Build up hash of blast hits
1317 :     foreach my $hit ( @$hits )
1318 :     {
1319 :     my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
1320 : parrello 1.11
1321 : dsouza 1.25 if ( $peg1 ne $peg2 )
1322 :     {
1323 :     $blast_hit{$peg1}{$peg2} = 1;
1324 :     $blast_hit{$peg2}{$peg1} = 1;
1325 :     }
1326 :     }
1327 : dsouza 1.1 }
1328 :    
1329 : olson 1.33 for my $file (@files_to_remove)
1330 :     {
1331 :     for my $suffix ('', qw(.phr .psq .pin))
1332 :     {
1333 :     my $path = "$file$suffix";
1334 :     unlink($path);
1335 :     }
1336 :     }
1337 :    
1338 :    
1339 : dsouza 1.1 return \%blast_hit;
1340 :     }
1341 :    
1342 :     sub blast_hits_from_sims {
1343 : olson 1.41 my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff, $cdd_trans) = @_;
1344 : dsouza 1.1
1345 :     my %blast_hit;
1346 :    
1347 :     my $maxN = 2000;
1348 :     my $maxP = $color_sim_cutoff;
1349 :     my $select = 'fig';
1350 :     my $max_expand = '';
1351 :     my $filters = '';
1352 :    
1353 :     # Create a hash of all pegs
1354 :     my %region_peg = map {$_ => 1} grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
1355 :    
1356 : dsouza 1.25 if ( $fast_color == 1 )
1357 : dsouza 1.1 {
1358 : dsouza 1.25 my $cutoff_2 = $color_sim_cutoff * 1e-20;
1359 :     my %done_with;
1360 :    
1361 :     # Iterate through each peg
1362 :     foreach my $peg1 ( keys %region_peg )
1363 :     {
1364 :     # Skip the 'done_with' pegs
1365 :     next if ($done_with{$peg1});
1366 : dsouza 1.1
1367 : dsouza 1.25 my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
1368 : parrello 1.11
1369 : dsouza 1.25 foreach my $sim ( grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
1370 :     {
1371 :     my($peg2, $sc) = @$sim[1,10];
1372 : parrello 1.11
1373 : dsouza 1.25 $blast_hit{$peg1}{$peg2} = 1;
1374 :     $blast_hit{$peg2}{$peg1} = 1;
1375 :    
1376 :     # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
1377 :     # that it will not be blasted with it's region sequences
1378 :     if ( $sc <= $cutoff_2 )
1379 :     {
1380 :     $done_with{$peg2} = 1;
1381 :     }
1382 :     }
1383 :     }
1384 :     }
1385 :     else
1386 : dsouza 1.1 {
1387 : dsouza 1.25 # Iterate through each peg
1388 :     foreach my $peg1 ( keys %region_peg )
1389 :     {
1390 :     my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
1391 :    
1392 :     foreach my $peg2 ( map {$_->[1]} grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
1393 :     {
1394 :     $blast_hit{$peg1}{$peg2} = 1;
1395 :     $blast_hit{$peg2}{$peg1} = 1;
1396 :     }
1397 :     }
1398 : dsouza 1.1 }
1399 : dsouza 1.25
1400 :     return \%blast_hit;
1401 : dsouza 1.1 }
1402 :    
1403 :     sub blast_files {
1404 :     my($fig, $input, $database, $cutoff) = @_;
1405 :    
1406 : gdpusch 1.39 my $trouble = 0;
1407 :     if (!-s $input) {
1408 :     $trouble = 1;
1409 :     warn "ERROR: query file \'$input\' is empty";
1410 :     }
1411 :    
1412 :     if (!-s $database) {
1413 :     $trouble = 1;
1414 :     warn "ERROR: database file \'$database\' is empty";
1415 :     }
1416 :    
1417 :     if ($trouble) { return []; }
1418 :    
1419 : dsouza 1.24 my $cmd = "$FIG_Config::ext_bin/blastall";
1420 : olson 1.36 my @args = ('-p', 'blastp', '-i', $input, '-d', $database, '-m', 8, '-e', $cutoff, '-F', 'F');
1421 : dsouza 1.24 my @blast_out = $fig->run_gathering_output($cmd, @args);
1422 : dsouza 1.25
1423 : dsouza 1.24 return \@blast_out;
1424 :     }
1425 : overbeek 1.23
1426 : dsouza 1.1 sub formatdb_file {
1427 :     my($fig, $file) = @_;
1428 :    
1429 :     my $cmd = "$FIG_Config::ext_bin/formatdb -i $file -p T";
1430 :     $fig->run($cmd);
1431 :     }
1432 :    
1433 :     sub write_seqs_to_file {
1434 :     my($seq, $fasta_file) = @_;
1435 :    
1436 :     open(FASTA, ">$fasta_file") or die "could not create FASTA file '$fasta_file': $!";
1437 :     foreach my $peg ( keys %$seq )
1438 :     {
1439 : parrello 1.11 print FASTA ">$peg\n$seq->{$peg}\n";
1440 : dsouza 1.1 }
1441 :     close(FASTA) or die "could not close file FASTA file '$fasta_file': $!";
1442 :     }
1443 :    
1444 :     sub get_peg_sequences {
1445 : olson 1.41 my($fig, $pegs, $cdd_trans) = @_;
1446 : dsouza 1.1 my %sequences;
1447 :    
1448 :     # Iterate over each peg
1449 : olson 1.42 my @lookup;
1450 : dsouza 1.21 foreach my $peg ( @$pegs )
1451 : dsouza 1.1 {
1452 : olson 1.42 my $seq = $cdd_trans->{$peg};
1453 : dsouza 1.1
1454 : parrello 1.11 if ( $seq )
1455 :     {
1456 :     $sequences{$peg} = $seq;
1457 :     }
1458 :     else
1459 :     {
1460 : olson 1.42 push(@lookup, $peg);
1461 :     }
1462 : dsouza 1.1 }
1463 : olson 1.42 my $res = $fig->get_translation_bulk(\@lookup);
1464 :     $sequences{$_} = $res->{$_} foreach keys %$res;
1465 : dsouza 1.1
1466 :     return \%sequences;
1467 :     }
1468 :    
1469 : olson 1.41 sub add_cdd
1470 :     {
1471 :     my($regions, $fig, $cds, $fids_for_cds, $feature_data) = @_;
1472 :    
1473 :     return unless ref($fids_for_cds) eq 'ARRAY';
1474 :     my %genomes = map { $fig->genome_of($_) => 1 } @$fids_for_cds;
1475 :     my %fids = map { $_ => 1 } @$fids_for_cds;
1476 :    
1477 :     my @new;
1478 :     my $trans = {};
1479 :    
1480 :     for my $row (@$regions)
1481 :     {
1482 :     push(@new, $row);
1483 :     next unless $genomes{$row->{genome_id}};
1484 :    
1485 :     #
1486 :     # new row with cdd data.
1487 :     #
1488 :    
1489 :     my $feats = [];
1490 :     my $new = {
1491 :     beg => $row->{beg},
1492 :     mid => $row->{mid},
1493 :     end => $row->{end},
1494 :     contig => $row->{contig},
1495 :     contig_length => $row->{contig_length},
1496 :     pinned_peg_strand => $row->{pinned_peg_strand},
1497 :     org_name => "$row->{org_name} CDD",
1498 :     genome_id => $row->{genome_id},
1499 :     features => $feats,
1500 :     };
1501 :     for my $fid (grep { $fids{$_} } @{$row->{features}})
1502 :     {
1503 :     my @x = $cds->create_cdd_features($fid, { data_mode => 'rep', cached_only => 0 });
1504 :     for my $f (@x)
1505 :     {
1506 :     my($cfid, $type, $canno, $cloc, $ctrans) = @$f;
1507 :     push(@$feats, $cfid);
1508 :     $trans->{$cfid} = $ctrans;
1509 :    
1510 :     my $genome = $row->{genome_id};
1511 :     my $loc = $cloc;
1512 :     my($contig, $beg, $end) = $fig->boundaries_of($loc);
1513 :     my($left, $right) = sort {$a <=> $b} ($beg, $end);
1514 :     my $strand = ($beg <= $end)? '+' : '-';
1515 :     my $offset = int(($left + $right)/2) - $row->{mid};
1516 :     my $offset_beg = $left - $row->{mid};
1517 :     my $offset_end = $right - $row->{mid};
1518 :     my $func = $canno;
1519 :     my $location = FullLocation->new($fig, $genome, $loc);
1520 :     my $size = 0;
1521 :     map { $size += $_->Length } @{$location->Locs};
1522 :    
1523 :     $feature_data->{$cfid} = {
1524 :     'parent' => $fid,
1525 :     'fid' => $cfid,
1526 :     'location' => $loc,
1527 :     'type' => $type,
1528 :     'contig' => $contig,
1529 :     'beg' => $beg,
1530 :     'end' => $end,
1531 :     'size' => $size,
1532 :     'strand' => $strand,
1533 :     'offset' => $offset,
1534 :     'offset_beg' => $offset_beg,
1535 :     'offset_end' => $offset_end,
1536 :     'function' => $func
1537 :     };
1538 :     }
1539 :     }
1540 :     push(@new, $new);
1541 :     }
1542 :     @$regions = @new;
1543 :     return $trans;
1544 :     }
1545 : dsouza 1.1
1546 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3