[Bio] / FigKernelScripts / svr_corresponding_genes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_corresponding_genes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (view) (download) (as text)

1 : overbeek 1.10 ########################################################################
2 : overbeek 1.7 use SeedEnv;
3 : overbeek 1.10 use gjoseqlib;
4 :    
5 : overbeek 1.1 use strict;
6 :     use Data::Dumper;
7 :     use Carp;
8 :    
9 :     =head1 svr_corresponding_genes
10 :    
11 :     Attempt to Tabulate Corresponding Genes from Two Complete Genomes
12 :    
13 :     ------
14 :     Example: svr_corresponding_genes 107806.1 198804.1
15 :    
16 : overbeek 1.19 would produce a 18-column table that is an attempt to present the
17 : overbeek 1.1 correspondence between the genes in two genomes (in this case
18 :     107806.1 and 198804.1, which are two Buchnera genomes).
19 :     ------
20 :    
21 :     There is no input other than the command-line arguments. The two genomes
22 :     must be specified, and there are two optional arguments that relate to determining
23 :     how to determine the "context" of genes.
24 :    
25 :     One important aspect of the tool is that it tries to establish the correspondence,
26 :     and then for a corresponding pair of genes Ga and Gb, it attemptes to determine
27 :     how many genes in the "context" of Ga map to genes in the "context" of Gb. This is
28 :     important, since preservation of context increases the confidence of the mapping
29 :     between Ga and Gb considerably. The optional parameters effect the determination
30 :     of the genes in the "context". Using
31 :    
32 :     -n 5
33 :    
34 :     would indicate that the context of G should include 5 distinct genes
35 :     to the left of G and 5 distinct genes to the right of G. This notion of distinct
36 :     was added due to the existence of numerous splice variants in some eukaryotic
37 :     genomes. Genes are considered to be distinct if the size of the overlap
38 :     between the genes is less than a threshhold. The threshhold can be set using
39 :     the -o parameter. Thus, use of
40 :    
41 :     -o 1000
42 :    
43 :     would indicate that two genes are distinct iff the boundaries of the two genes
44 :     overlap by less than 1000 bp. The default is a very high value, so if you specify
45 :     nothing (which is appropriate for prokaryotic genomes), any two genes will be
46 :     considered distinct.
47 :    
48 :     =head2 Command-Line Options
49 :    
50 :     The program is invoked using
51 :    
52 : overbeek 1.19 svr_corresponding_genes [-u ServerUrl] [-n HalfSzOfContext] [-o MaxOverlap] [-d Genome1Dir] Genome1 Genome2
53 : overbeek 1.1
54 : parrello 1.21 =over 4
55 :    
56 : overbeek 1.1 =item -n HalfSizeOfRegion
57 :    
58 :     This is used to specify how many genes to the left and right you want to
59 :     be considered in the context. The default is 10.
60 :    
61 :     =item -o MaxOverlap
62 :    
63 :     This allows the user the specify a maximum overlap that would result in two genes
64 :     being considered "distinct" in the computation of genes to be added to the context.
65 :     It defaults to a very large value.
66 :    
67 : overbeek 1.10 =item -d Genome1Dir
68 :    
69 :     This allows the user to give a SEED "genome directory" that is used
70 :     to get data for Genome1, rather than taking it from the SEED itself.
71 :     This allows one, for example, to use a RAST directory.
72 :    
73 : overbeek 1.19 =item -u ServerUrl
74 :    
75 :     This allows the user to specify the URL for the Sapling server. If it is
76 :     "localhost", then the Sapling method will be run on the local SEED.
77 :    
78 : overbeek 1.1 =head2 Output Format
79 :    
80 : overbeek 1.9 The standard output is a 18-column tab-delimited file:
81 : overbeek 1.1
82 :     =item Column-1
83 :     The ID of a PEG in Genome1.
84 :    
85 :     =item Column-2
86 :    
87 :     The ID of a PEG in Genome2 that is our best estimate of a "corresponding gene".
88 :    
89 :     =item Column-3
90 : overbeek 1.3 Count of the number of pairs of matching genes were found in the context
91 :    
92 :     =item Column-4
93 : overbeek 1.1
94 : overbeek 1.2 Pairs of corresponding genes from the contexts
95 : overbeek 1.1
96 : overbeek 1.3 =item Column-5
97 : overbeek 1.1
98 :     The function of the gene in Genome1
99 :    
100 : overbeek 1.3 =item Column-6
101 : overbeek 1.1
102 :     The function of the gene in Genome2
103 :    
104 : overbeek 1.3 =item Column-7
105 : overbeek 1.1
106 :     Aliases of the gene in Genome1 (any protein with an identical sequence
107 :     is considered an alias, whether or not it is actually the name of the
108 :     same gene in the same genome)
109 :    
110 : overbeek 1.3 =item Column-8
111 : overbeek 1.1
112 :     Aliases of the gene in Genome2 (any protein with an identical sequence
113 :     is considered an alias, whether or not it is actually the name of the
114 :     same gene in the same genome)
115 :    
116 : overbeek 1.3 =item Column-9
117 : overbeek 1.1
118 :     Bi-directional best hits will contain "<=>" in this column.
119 : overbeek 1.19 Otherwise, an "->" or an "<-" will appear.
120 : overbeek 1.1
121 : overbeek 1.3 =item Column-10
122 : overbeek 1.1
123 :     Percent identity over the region of the detected match
124 :    
125 : overbeek 1.3 =item Column-11
126 : overbeek 1.1
127 :     The P-sc for the detected match
128 :    
129 : overbeek 1.3 =item Column-12
130 : overbeek 1.1
131 :     Beginning match coordinate in the protein encoded by the gene in Genome1.
132 :    
133 : overbeek 1.3 =item Column-13
134 : overbeek 1.1
135 :     Ending match coordinate in the protein encoded by the gene in Genome1.
136 :    
137 : overbeek 1.3 =item Column-14
138 : overbeek 1.1
139 :     Length of the protein encoded by the gene in Genome1.
140 :    
141 : overbeek 1.3 =item Column-15
142 : overbeek 1.1
143 :     Beginning match coordinate in the protein encoded by the gene in Genome2.
144 :    
145 : overbeek 1.3 =item Column-16
146 : overbeek 1.1
147 :     Ending match coordinate in the protein encoded by the gene in Genome2
148 :    
149 : overbeek 1.3 =item Column-17
150 : overbeek 1.1
151 :     Length of the protein encoded by the gene in Genome2.
152 :    
153 : overbeek 1.9 =item Column-18
154 :    
155 :     Bit score for the match. Divide by the length of the longer PEG to get
156 :     what we often refer to as a "normalized bit score".
157 :    
158 : parrello 1.21 =back
159 :    
160 : overbeek 1.1 =cut
161 :    
162 : overbeek 1.6 use SeedEnv;
163 : overbeek 1.1 use SeedUtils;
164 :     use SAPserver;
165 : overbeek 1.2 use ProtSims;
166 : olson 1.22 use SeedAware;
167 : overbeek 1.24 use Getopt::Long;
168 : overbeek 1.1
169 : overbeek 1.19 my $usage = "usage: svr_corresponding_genes [-u SERVERURL] [-o N1] [-n N2] [-d RASTdirectory] Genome1 Genome2";
170 : overbeek 1.1
171 :     my $ignore_ov = 1000000;
172 :     my $sz_context = 5;
173 : overbeek 1.10 my $g1dir;
174 : overbeek 1.19 my $url;
175 : overbeek 1.1
176 : overbeek 1.24 my $rc = GetOptions("o" => \$ignore_ov,
177 :     "n=i" => \$sz_context,
178 :     "d=s" => \$g1dir,
179 :     "u=s" => \$url
180 :     );
181 :     if (! $rc) { print STDERR $usage; exit }
182 : overbeek 1.1
183 : overbeek 1.20 my $sapObject = SAPserver->new(url => $url);
184 : overbeek 1.19 my $genomes = $sapObject->all_genomes(-complete => 1);
185 :    
186 : overbeek 1.1 my($genome1, $genome2);
187 :     (
188 :     ($genome1 = shift @ARGV) &&
189 : overbeek 1.7 ($genome2 = shift @ARGV)
190 : overbeek 1.1 )
191 :     || die $usage;
192 :    
193 : overbeek 1.10 if ((! $g1dir) &&
194 :     (! $genomes->{$genome1})) { print STDERR "$genome1 is not in the DB\n"; exit }
195 :     if (! $genomes->{$genome2}) { print STDERR "$genome2 is not in the DB\n"; exit }
196 : overbeek 1.7
197 : olson 1.22 my $tmp_dir = SeedAware::location_of_tmp();
198 :    
199 :     my $formatdb = SeedAware::executable_for("formatdb");
200 :    
201 :     my $tmp1 = "$tmp_dir/tmp1_$$.fasta";
202 :     my $tmp2 = "$tmp_dir/tmp2_$$.fasta";
203 : overbeek 1.1
204 : overbeek 1.10 my $lens1 = $g1dir ? &get_fastaD($genome1,$tmp1,$g1dir) :
205 :     &get_fasta($genome1,$tmp1,$sapObject);
206 : overbeek 1.1 my $lens2 = &get_fasta($genome2,$tmp2,$sapObject);
207 : overbeek 1.18 # print STDERR "GOT SIMS\n";
208 : olson 1.22 system($formatdb, '-i', $tmp2, '-p', 'T');
209 : overbeek 1.1
210 :     my($sims1,$sims2) = &get_sims($tmp1,$tmp2,$lens1,$lens2);
211 :     unlink($tmp1,$tmp2,"$tmp2.psq","$tmp2.pin","$tmp2.phr");
212 :    
213 : overbeek 1.10 my $functions = &get_functions($sapObject,[keys(%$lens1)],[keys(%$lens2)],$g1dir);
214 : overbeek 1.1 # print STDERR "GOT Functions\n";
215 : overbeek 1.10 my $aliases = $g1dir ? &get_aliases($sapObject,[keys(%$lens2)])
216 :     : &get_aliases($sapObject,[keys(%$lens1),keys(%$lens2)]);
217 : overbeek 1.1 # print STDERR "GOT Aliases\n";
218 :    
219 : olson 1.11 my $matching_context = &matching_neighbors($genome1,$g1dir,$sims1,$genome2,$sims2,$sz_context,$ignore_ov);
220 : overbeek 1.1 # print STDERR "GOT Context\n";
221 :    
222 :     foreach my $peg1 (keys(%$lens1))
223 :     {
224 :     my $peg2 = $sims1->{$peg1}->[0];
225 :     if ($peg2)
226 :     {
227 :     my $context = "";
228 : overbeek 1.3 my $context_count = 0;
229 : overbeek 1.1 my $function2 = "";
230 :     my $aliases2 = "";
231 :    
232 :     my $function1 = $functions->{$peg1} ? $functions->{$peg1} : "";
233 : overbeek 1.19 my $aliases1 = $aliases->{$peg1} ? $aliases->{$peg1} : "";
234 : overbeek 1.1 my $peg3 = $sims2->{$peg2}->[0];
235 :     my $bbh = ($peg3 && ($peg3 eq $peg1)) ? "<=>" : "->";
236 :    
237 : overbeek 1.19 if ($_ = $matching_context->{"$peg1,$peg2"})
238 : overbeek 1.3 {
239 : overbeek 1.19 $context = $_;
240 : overbeek 1.10 $context_count = ($context =~ tr/,//) + 1;
241 : overbeek 1.3 }
242 : overbeek 1.9 my($iden,$psc,$b1,$e1,$b2,$e2,$ln1,$ln2,$bitsc);
243 :     (undef,$iden,$psc,$bitsc,$b1,$e1,$b2,$e2,$ln1,$ln2) = @{$sims1->{$peg1}};
244 : overbeek 1.1 if ($functions->{$peg2}) { $function2 = $functions->{$peg2} }
245 :     if ($aliases->{$peg2}) { $aliases2 = $aliases->{$peg2} }
246 : overbeek 1.3 print join("\t",($peg1,$peg2,$context_count,$context,$function1,$function2,
247 : overbeek 1.1 $aliases1,$aliases2,$bbh,$iden,$psc,
248 : overbeek 1.9 $b1,$e1,$ln1,$b2,$e2,$ln2,$bitsc)),"\n";
249 : overbeek 1.1 }
250 :     }
251 :    
252 : overbeek 1.19 foreach my $peg2 (keys(%$lens2))
253 :     {
254 :     my $peg1 = $sims2->{$peg2}->[0];
255 :     if ($peg1)
256 :     {
257 :     my $context = "";
258 :     my $context_count = 0;
259 :     my $function1 = "";
260 :     my $aliases1 = "";
261 :    
262 :     my $function2 = $functions->{$peg2} ? $functions->{$peg2} : "";
263 :     my $aliases2 = $aliases->{$peg2} ? $aliases->{$peg2} : "";
264 :     my $peg3 = $sims1->{$peg1}->[0];
265 :     if ($peg3 ne $peg2)
266 :     {
267 :     if ($_ = $matching_context->{"$peg1,$peg2"})
268 :     {
269 :     $context = $_;
270 :     $context_count = ($context =~ tr/,//) + 1;
271 :     }
272 :     my($iden,$psc,$b1,$e1,$b2,$e2,$ln1,$ln2,$bitsc);
273 :     (undef,$iden,$psc,$bitsc,$b2,$e2,$b1,$e1,$ln2,$ln1) = @{$sims2->{$peg2}};
274 :     if ($functions->{$peg1}) { $function1 = $functions->{$peg1} }
275 :     if ($aliases->{$peg1}) { $aliases1 = $aliases->{$peg1} }
276 :     print join("\t",($peg1,$peg2,$context_count,$context,$function1,$function2,
277 :     $aliases1,$aliases2,"<-",$iden,$psc,
278 :     $b1,$e1,$ln1,$b2,$e2,$ln2,$bitsc)),"\n";
279 :     }
280 :     }
281 :     }
282 :    
283 : olson 1.22 unlink($tmp1);
284 :     unlink($tmp2);
285 :    
286 : overbeek 1.1 sub get_sims {
287 :     my($tmp1,$tmp2) = @_;
288 :    
289 : overbeek 1.19 my @sims = &ProtSims::blastP($tmp1,$tmp2,1,1); # this last argument forces the use of blast, bypassing blat
290 : overbeek 1.1 my $sims1 = {};
291 :     my $sims2 = {};
292 :     my %seen;
293 : overbeek 1.2 foreach my $sim (@sims)
294 : overbeek 1.1 {
295 : overbeek 1.2 my $id1 = $sim->id1;
296 :     my $id2 = $sim->id2;
297 :     my $iden = $sim->iden;
298 :     my $b1 = $sim->b1;
299 :     my $e1 = $sim->e1;
300 :     my $b2 = $sim->b2;
301 :     my $e2 = $sim->e2;
302 :     my $psc = $sim->psc;
303 :     my $bit_sc = $sim->bsc;
304 :    
305 : overbeek 1.19 my $x = $sims1->{$id1};
306 :     if ((! $x) || (($x->[0] ne $id2) && ($psc < $x->[2])))
307 : overbeek 1.1 {
308 :     $sims1->{$id1} = [$id2,$iden,$psc,$bit_sc,$b1,$e1,$b2,$e2,$lens1->{$id1},$lens2->{$id2}];
309 : overbeek 1.19 }
310 :     elsif ($x && ($x->[0] eq $id2))
311 :     {
312 :     ($b1,$e1,$b2,$e2) = &merge($b1,$e1,$b2,$e2,$x->[4],$x->[5],$x->[6],$x->[7]);
313 :     $x->[4] = $b1;
314 :     $x->[5] = $e1;
315 :     $x->[6] = $b2;
316 :     $x->[7] = $e2;
317 :     }
318 :    
319 :     $x = $sims2->{$id2};
320 :     if ((! $x) || (($x->[0] ne $id2) && ($psc < $x->[2])))
321 :     {
322 :     $sims2->{$id2} = [$id1,$iden,$psc,$bit_sc,$b2,$e2,$b1,$e1,$lens2->{$id2},$lens1->{$id1}];
323 :     }
324 :     elsif ($x && ($x->[0] eq $id2))
325 :     {
326 :     ($b2,$e2,$b1,$e1) = &merge($b2,$e2,$b1,$e1,$x->[4],$x->[5],$x->[6],$x->[7]);
327 :     $x->[4] = $b2;
328 :     $x->[5] = $e2;
329 :     $x->[6] = $b1;
330 :     $x->[7] = $e1;
331 : overbeek 1.15 }
332 : overbeek 1.1 }
333 : overbeek 1.19 # close(BLAST); # This file apparently no longer exists ??
334 : overbeek 1.1 return ($sims1,$sims2);
335 :     }
336 :    
337 : overbeek 1.19 sub merge {
338 :     my($b1a,$e1a,$b2a,$e2a,$b1b,$e1b,$b2b,$e2b) = @_;
339 :    
340 :     if (($b1a < $b1b) && (abs($b1b - $e1a) < 10) &&
341 :     ($b2a < $b2b) && (abs($b2b - $e2a) < 10))
342 :     {
343 :     return ($b1a,$e1b,$b2a,$b2b);
344 :     }
345 :     elsif (($b1b < $b1a) && (abs($b1a - $e1b) < 10) &&
346 :     ($b2b < $b2a) && (abs($b2a - $e2b) < 10))
347 :     {
348 :     return ($b1b,$e1a,$b2b,$b2a);
349 :     }
350 :     else
351 :     {
352 :     return ($b1b,$e1b,$b2b,$e2b);
353 :     }
354 :     }
355 :    
356 : overbeek 1.10 sub get_fastaD {
357 :     my($genome,$file,$g1dir) = @_;
358 :    
359 :     (-s "$g1dir/Features/peg/fasta") || die "$g1dir/Features/peg/fasta does not exist";
360 :     my @seqs = &gjoseqlib::read_fasta("$g1dir/Features/peg/fasta");
361 :     &gjoseqlib::print_alignment_as_fasta($file,\@seqs);
362 :     my $lens = {};
363 :     foreach $_ (@seqs) { $lens->{$_->[0]} = length($_->[2]) }
364 :     return $lens;
365 :     }
366 :    
367 : overbeek 1.1 sub get_fasta {
368 :     my($genome,$file,$sapObject) = @_;
369 :    
370 :     my $lens = {};
371 :     my $pegHash = $sapObject->all_features(-ids => $genome, -type => 'peg');
372 :     my $pegs = $pegHash->{$genome};
373 : overbeek 1.10 my $fastaHash = $sapObject->ids_to_sequences(-ids => $pegs,
374 :     -protein => 1);
375 : overbeek 1.1
376 :     open(FASTA,">$file") || die "could not open $file";
377 :     foreach my $peg (@$pegs)
378 :     {
379 :     my $seq = $fastaHash->{$peg};
380 : overbeek 1.6 if ($seq)
381 :     {
382 :     print FASTA ">$peg\n$seq\n";
383 :     $lens->{$peg} = length($seq);
384 :     }
385 : overbeek 1.1 }
386 :     close(FASTA);
387 :     return $lens;
388 :     }
389 :    
390 :     sub matching_neighbors {
391 : olson 1.11 my($genome1,$g1dir,$sims1,$genome2,$sims2,$sz_context,$ignore_ov) = @_;
392 : overbeek 1.1
393 :     my %by_genome;
394 : olson 1.11 my @peg_loc_tuples_in_genome;
395 :    
396 :     if ($g1dir)
397 :     {
398 :     @peg_loc_tuples_in_genome = &get_peg_loc_tuplesD($genome1, $g1dir);
399 :     }
400 :     else
401 : overbeek 1.1 {
402 : olson 1.11 @peg_loc_tuples_in_genome = &get_peg_loc_tuples($genome1);
403 : overbeek 1.1 }
404 : olson 1.11 $by_genome{$genome1} = &set_neighbors(\@peg_loc_tuples_in_genome, $sz_context, $ignore_ov);
405 :     @peg_loc_tuples_in_genome = &get_peg_loc_tuples($genome2);
406 :     $by_genome{$genome2} = &set_neighbors(\@peg_loc_tuples_in_genome, $sz_context, $ignore_ov);
407 :    
408 : overbeek 1.2 my %matched_pairs;
409 : overbeek 1.1 foreach my $peg1 (keys(%$sims1))
410 :     {
411 :     my $peg2 = $sims1->{$peg1}->[0];
412 :     my $neigh1 = $by_genome{$genome1}->{$peg1};
413 :     my $neigh2 = $by_genome{$genome2}->{$peg2};
414 :     my %neigh2H = map { $_ => 1 } @$neigh2;
415 : overbeek 1.2 my @pairs = ();
416 : overbeek 1.1 foreach my $n1 (@$neigh1)
417 :     {
418 :     my $maps_to = $sims1->{$n1}->[0];
419 :     if ($maps_to && $neigh2H{$maps_to})
420 :     {
421 : overbeek 1.2 push(@pairs,"$n1:$maps_to");
422 : overbeek 1.1 }
423 :     }
424 : overbeek 1.19 $matched_pairs{"$peg1,$peg2"} = join(",",@pairs);
425 : overbeek 1.1 }
426 : overbeek 1.2 return \%matched_pairs;
427 : overbeek 1.1 }
428 :    
429 : olson 1.11 sub get_peg_loc_tuples
430 :     {
431 :     my($genome) = @_;
432 :     my $fidHash = $sapObject->all_features(-ids => $genome, -type => 'peg');
433 :     # print STDERR "GOT pegs for $genome\n";
434 :    
435 :     my $all_fids = $fidHash->{$genome};
436 :     my $locHash = $sapObject->fid_locations(-ids => $all_fids);
437 :     my @peg_loc_tuples_in_genome =
438 :     sort { &compare_locs($a->[1],$b->[1]) }
439 :     map { [$_,$locHash->{$_}] }
440 :     keys(%$locHash);
441 :     return @peg_loc_tuples_in_genome;
442 :     }
443 :    
444 :     sub get_peg_loc_tuplesD
445 :     {
446 :     my($genome, $gdir) = @_;
447 : olson 1.23
448 :     my %locH;
449 :     if (open(my $fh, "<", "$gdir/Features/peg/tbl"))
450 :     {
451 :     while (<$fh>)
452 :     {
453 :     if (/^(\S+)\t(\S+)/)
454 :     {
455 :     $locH{$1} = $2;
456 :     }
457 :     }
458 :     close($fh);
459 :     }
460 :     # my %locH = map { $_ =~ /^(\S+)\t(\S+)/; ($1 => $2) } `cat $gdir/Features/peg/tbl`;
461 : overbeek 1.18 my @all_fids = keys(%locH);
462 :    
463 : olson 1.11 my @peg_loc_tuples_in_genome =
464 :     sort { &compare_locs($a->[1],$b->[1]) }
465 : overbeek 1.18 map { [$_, [split(/,/,$locH{$_})]] }
466 : olson 1.11 @all_fids;
467 :     return @peg_loc_tuples_in_genome;
468 :     }
469 :    
470 : overbeek 1.18
471 : overbeek 1.1 sub compare_locs {
472 :     my($loc1,$loc2) = @_;
473 :    
474 :     my($contig1,$min1,$max1) = &SeedUtils::boundaries_of($loc1);
475 :     my($contig2,$min2,$max2) = &SeedUtils::boundaries_of($loc2);
476 :     return (($contig1 cmp $contig2) or (($min1+$max1) <=> ($min2+$max2)));
477 :     }
478 :    
479 :     sub set_neighbors {
480 :     my($peg_loc_tuples,$N,$ignore_ov) = @_;
481 :    
482 :     my $peg_to_neighbors = {};
483 :     my $i;
484 :    
485 :     for ($i=0; ($i < @$peg_loc_tuples); $i++)
486 :     {
487 :     my($contigI,$minI,$maxI) = &SeedUtils::boundaries_of($peg_loc_tuples->[$i]->[1]);
488 :     $contigI || confess "BAD";
489 :     my $neighbors = [];
490 :     my $j = $i-1;
491 :     my $to_add = $N;
492 :     while (($j >= 0) && ($to_add > 0) &&
493 :     &same_contig($peg_loc_tuples->[$j]->[1],$contigI))
494 :     {
495 :     $j--;
496 :     if (&distinct($peg_loc_tuples->[$j]->[1],$peg_loc_tuples->[$j+1]->[1],$ignore_ov))
497 :     {
498 :     $to_add--;
499 :     }
500 :     }
501 :     $j++;
502 :     while ($j < $i) { push(@$neighbors,$peg_loc_tuples->[$j]->[0]); $j++ }
503 :    
504 :     $j = $i+1;
505 :     $to_add = $N;
506 :     while (($j < @$peg_loc_tuples) && ($to_add > 0) &&
507 :     &same_contig($peg_loc_tuples->[$j]->[1],$contigI))
508 :     {
509 :     push(@$neighbors,$peg_loc_tuples->[$j]->[0]);
510 :     if (&distinct($peg_loc_tuples->[$j]->[1],$peg_loc_tuples->[$j-1]->[1],$ignore_ov))
511 :     {
512 :     $to_add--;
513 :     }
514 :     $j++;
515 :     }
516 :     $peg_to_neighbors->{$peg_loc_tuples->[$i]->[0]} = $neighbors;
517 :     }
518 :     return $peg_to_neighbors;
519 :     }
520 :    
521 :     sub distinct {
522 :     my($x,$y,$ignore_ov) = @_;
523 :    
524 :     return ($ignore_ov > &overlap($x,$y));
525 :     }
526 :    
527 :     sub overlap {
528 :     my($x,$y) = @_;
529 :    
530 :     my($contig1,$min1,$max1) = &SeedUtils::boundaries_of($x);
531 :     my($contig2,$min2,$max2) = &SeedUtils::boundaries_of($y);
532 :     if ($contig1 ne $contig2) { return 0 }
533 :     if (&SeedUtils::between($min1,$min2,$max1)) { return ($max1 - $min2)+1 }
534 :     if (&SeedUtils::between($min2,$min1,$max2)) { return ($max2 - $min1)+1 }
535 :     return 0;
536 :     }
537 :    
538 :     sub same_contig {
539 :     my($entry,$contig) = @_;
540 :    
541 :     $contig || confess "BAD";
542 :     my($contig1,$minI,$maxI) = &SeedUtils::boundaries_of($entry);
543 :     return ($contig eq $contig1);
544 :     }
545 :    
546 :     sub get_functions {
547 : overbeek 1.10 my($sapObject,$pegs1,$pegs2,$g1dir) = @_;
548 :    
549 :     if(! $g1dir)
550 :     {
551 :     return $sapObject->ids_to_functions(-ids => [@$pegs1,@$pegs2]);
552 :     }
553 :     else
554 :     {
555 :     my $functions = $sapObject->ids_to_functions(-ids => $pegs2);
556 : olson 1.23 # (-s "$g1dir/assigned_functions") || die "$g1dir/assigned_functions is missing";
557 :     # foreach (`cat $g1dir/assigned_functions`)
558 :    
559 :     if (open(my $fh, "<", "$g1dir/assigned_functions"))
560 : overbeek 1.10 {
561 : olson 1.23 while (<$fh>)
562 : overbeek 1.10 {
563 : olson 1.23 if (/^(fig\|\S+)\t(\S.*\S)/)
564 :     {
565 :     $functions->{$1} = $2;
566 :     }
567 : overbeek 1.10 }
568 : olson 1.23 close($fh);
569 : overbeek 1.10 }
570 :     return $functions;
571 :     }
572 : overbeek 1.1 }
573 :    
574 :     sub get_aliases {
575 : overbeek 1.10 my($sapObject,$pegs) = @_;
576 : overbeek 1.1
577 :     my $aliases = {};
578 : overbeek 1.10 my $aliasHash = $sapObject->fids_to_ids(-ids => $pegs);
579 :    
580 : overbeek 1.1 foreach my $peg (@$pegs)
581 :     {
582 : overbeek 1.10 my $typeH = $aliasHash->{$peg} ? $aliasHash->{$peg} : {};
583 :     my @all_aliases = map { @{$typeH->{$_}} } keys(%$typeH);
584 : overbeek 1.1 my $aliasStr = (@all_aliases > 0) ? join(",",@all_aliases) : "";
585 :     $aliases->{$peg} = $aliasStr;
586 :     }
587 :     return $aliases;
588 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3