[Bio] / FigKernelPackages / ANNO.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/ANNO.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :     use strict;
3 :    
4 :     #
5 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
6 :     # for Interpretations of Genomes. All Rights Reserved.
7 :     #
8 :     # This file is part of the SEED Toolkit.
9 :     #
10 :     # The SEED Toolkit is free software. You can redistribute
11 :     # it and/or modify it under the terms of the SEED Toolkit
12 :     # Public License.
13 :     #
14 :     # You should have received a copy of the SEED Toolkit Public License
15 :     # along with this program; if not write to the University of Chicago
16 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
17 :     # Genomes at veronika@thefig.info or download a copy from
18 :     # http://www.theseed.org/LICENSE.TXT.
19 :     #
20 :     package ANNO;
21 :    
22 : olson 1.10 use Data::Dumper;
23 :     use FIG_Config;
24 : olson 1.1 use strict;
25 :     use ERDB;
26 :     use Tracer;
27 :     use SeedUtils;
28 :     use ServerThing;
29 : olson 1.11 use KmerMgr;
30 : olson 1.10
31 :     my $rna_tool = "/vol/search_for_rnas-2007-0625/search_for_rnas";
32 : olson 1.1
33 :     sub new {
34 : parrello 1.9 my ($class, $sapDB) = @_;
35 : olson 1.1 # Create the sapling object.
36 : parrello 1.9 my $sap = $sapDB || ERDB::GetDatabase('Sapling');
37 : olson 1.1 # Create the server object.
38 :     my $retVal = { db => $sap };
39 :     # Bless and return it.
40 :     bless $retVal, $class;
41 : olson 1.10 $retVal->init_kmers();
42 : olson 1.1 return $retVal;
43 :     }
44 :    
45 :    
46 :     =head2 Primary Methods
47 :    
48 :     =head3 methods
49 :    
50 :     my $methodList = $ssObject->methods();
51 :    
52 :     Return a list of the methods allowed on this object.
53 :    
54 :     =cut
55 :    
56 :     use constant METHODS => [qw(metabolic_reconstruction
57 :     assign_function_to_prot
58 :     call_genes
59 :     find_rnas
60 :     assign_functions_to_DNA
61 : parrello 1.4 find_special_proteins
62 : parrello 1.6 assign_functions_to_dna_small
63 : olson 1.8 get_dataset
64 :     get_vector_basis_sets
65 :     get_active_datasets
66 : olson 1.1 )];
67 :    
68 : olson 1.10 use constant RAW_METHODS => [qw(assign_function_to_prot
69 :     call_genes
70 :     find_rnas
71 :     assign_functions_to_DNA
72 :     )];
73 :    
74 :     sub raw_methods
75 :     {
76 :     my ($self) = @_;
77 :     return RAW_METHODS;
78 :     }
79 :    
80 : olson 1.1 sub methods {
81 :     # Get the parameters.
82 :     my ($self) = @_;
83 :     # Return the result.
84 :     return METHODS;
85 :     }
86 :    
87 :     #
88 :     # Docs are in ANNOserver.pm.
89 :     #
90 :    
91 : parrello 1.4 sub find_special_proteins {
92 :     # Get the parameters.
93 :     my ($self, $args) = @_;
94 :     # Pull in the special protein finder.
95 :     require find_special_proteins;
96 :     # Convert the hash to the form expected by find_special_proteins.
97 :     my $params = {
98 :     contigs => $args->{-contigs},
99 :     is_init => $args->{-is_init},
100 :     is_alt => $args->{-is_alt},
101 :     is_term => $args->{-is_term},
102 :     comment => $args->{-comment}
103 :     };
104 :     if (exists $args->{-templates}) {
105 :     my $templates = $args->{-templates};
106 :     if (ref $templates eq 'ARRAY') {
107 :     $params->{references} = $templates;
108 :     } elsif ($templates =~ /^pyr/) {
109 :     $params->{pyrrolysine} = 1
110 :     }
111 :     }
112 :     # Process the input.
113 :     my @retVal = find_special_proteins::find_selenoproteins($params);
114 :     # Return the result.
115 :     return \@retVal;
116 :     }
117 :    
118 : olson 1.1 sub metabolic_reconstruction {
119 :     # Get the parameters.
120 :     my ($self, $args) = @_;
121 :    
122 :     my $sapling = $self->{db};
123 :     my $retVal = [];
124 :    
125 :     # This counter will be used to generate user IDs for roles without them.
126 :     my $next = 1000;
127 :    
128 :     my $id_roles = $args->{-roles};
129 :     my @id_roles1 = map { (ref $_ ? $_ : [$_, "FR" . ++$next]) } @$id_roles;
130 :    
131 :     my @id_roles = ();
132 :     foreach my $tuple (@id_roles1)
133 :     {
134 :     my($function,$id) = @$tuple;
135 : parrello 1.3 foreach my $role (split(/(?:; )|(?: [\]\@] )/,$function))
136 : olson 1.1 {
137 :     push(@id_roles,[$role,$id]);
138 :     }
139 :     }
140 :    
141 :     my %big;
142 :     my $id_display = 1;
143 :     map {push(@{$big{$_->[0]}}, $_->[1])} @id_roles;
144 :     my @resultRows = $sapling->GetAll("Subsystem Includes Role",
145 : parrello 1.5 'Subsystem(usable) = ? ORDER BY Subsystem(id), Includes(sequence)',
146 :     [1], [qw(Subsystem(id) Role(id) Includes(abbreviation))]);
147 : olson 1.1 my %ss_roles;
148 :     foreach my $row (@resultRows) {
149 :     my ($sub, $role, $abbr) = @$row;
150 :     $ss_roles{$sub}->{$role} = $abbr;
151 :     }
152 :     foreach my $sub (keys %ss_roles) {
153 :     my $roles = $ss_roles{$sub};
154 : parrello 1.3 my @rolesubset = grep { $big{$_} } keys %$roles;
155 :     my @abbr = map{$roles->{$_}} @rolesubset;
156 : olson 1.1 my $set = join(" ", @abbr);
157 :     if (@abbr > 0) {
158 :     my ($variant, $size) = $self->get_max_subset($sub, $set);
159 :     if ($variant) {
160 :     foreach my $role (keys %$roles) {
161 :     if ($id_display) {
162 : parrello 1.3 if (exists $big{$role}) {
163 :     foreach my $id (@{$big{$role}}) {
164 :     push (@$retVal, [$variant, $role, $id]);
165 :     }
166 :     }
167 : olson 1.1 } else {
168 :     push (@$retVal, [$variant, $role]);
169 :     }
170 :     }
171 :     }
172 :     }
173 :     }
174 :     # Return the result.
175 :     return $retVal;
176 :     }
177 :    
178 : parrello 1.6 =head3 assign_functions_to_dna_small
179 :    
180 :     my $idHash = $annoObject->assign_functions_to_dna_small({
181 :     -seqs => [[$id1, $comment1, $seq1],
182 :     [$id2, $comment2, $seq2],
183 :     ... ],
184 :     -kmer => 10,
185 :     -minHits => 3,
186 :     -maxGap => 600,
187 :     });
188 :    
189 :     This method uses FIGfams to assign functions to sequences. It is intended for smaller
190 :     sequence sets than the main method, because it eschews the normal flow control; however,
191 :     it is easier to use for things like the EXCEL interface.
192 :    
193 :     The parameters are as follows.
194 :    
195 :     =item parameter
196 :    
197 :     The parameter should be a reference to a hash with the following keys.
198 :    
199 :     =over 8
200 :    
201 :     =item -seqs
202 :    
203 :     Reference to a list of 3-tuples, each consisting of (0) an arbitrary unique ID and
204 :     (1) a comment, and (2) a sequence associated with the ID.
205 :    
206 :     =item -kmer
207 :    
208 :     KMER size (7 to 12) to use for the FIGfam analysis. Larger sizes are faster, smaller
209 :     sizes are more accurate.
210 :    
211 :     =item -minHits (optional)
212 :    
213 :     A number from 1 to 10, indicating the minimum number of matches required to
214 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
215 :     indicates a more reliable matching algorithm; the default is C<3>.
216 :    
217 :     =item -maxGap (optional)
218 :    
219 :     When looking for a match, if two sequence elements match and are closer than
220 :     this distance, then they will be considered part of a single match. Otherwise,
221 :     the match will be split. The default is C<600>.
222 :    
223 :     =back
224 :    
225 :     =item RETURN
226 :    
227 :     Returns a hash mapping each incoming ID to a list of hit regions. Each hit
228 : parrello 1.7 region is a n-tuple consisting of (0) the number of matches to the function, (1) the
229 :     start location, (2) the stop location, (3) the proposed function, (4) the name
230 :     of the Genome Set from which the gene is likely to have originated, (5) the ID
231 :     number of the OTU (or C<undef> if the OTU was not found), and (6) the IDs of the
232 :     roles represented in the function, if any of them have IDs.
233 :    
234 : parrello 1.6
235 :     =back
236 :    
237 :     =cut
238 :    
239 :     sub assign_functions_to_dna_small {
240 :     # Get the parameters.
241 :     my ($self, $args) = @_;
242 :     # Get the Kmers object.
243 : olson 1.11 my $kmers = $self->{kmer_mgr}->get_default_kmer_object();
244 :    
245 : parrello 1.6 # Analyze the options.
246 :     my $maxGap = $args->{-maxGap} || 600;
247 :     my $minHits = $args->{-minHits} || 3;
248 :     # Get the KMER size.
249 :     my $kmer = $args->{-kmer};
250 :     # Declare the return variable.
251 :     my $retVal = {};
252 : parrello 1.7 # Get the sapling database.
253 :     my $sap = $self->{db};
254 : parrello 1.6 # Get the sequence tuples.
255 :     my $seqs = ServerThing::GetIdList(-seqs => $args);
256 :     # Loop through the sequences, finding assignments.
257 :     for my $seqTuple (@$seqs) {
258 :     # Extract the ID and sequence.
259 :     my ($id, undef, $seq) = @$seqTuple;
260 :     # Compute the assignment.
261 :     my $assignment = $kmers->assign_functions_to_PEGs_in_DNA($kmer, $seq,
262 :     $minHits, $maxGap);
263 : parrello 1.7 # Loop through the assignments, adding the function and OTU IDs.
264 :     for my $tuple (@$assignment) {
265 :     # Extract the function and OTU.
266 :     my $function = $tuple->[3];
267 :     my $otu = $tuple->[4];
268 :     # Get the IDs for the roles (if any).
269 :     my @roleIdx;
270 :     if ($function) {
271 :     # We have a function, so split it into roles.
272 :     my @roles = roles_of_function($function);
273 :     # Accumulate the IDs for the roles found.
274 :     for my $role (@roles) {
275 :     push @roleIdx, $sap->GetEntityValues(Role => $role, ['role-index']);
276 :     }
277 :     }
278 :     # Get the ID for the OTU (if any).
279 :     my $otuIdx;
280 :     if ($otu) {
281 :     ($otuIdx) = $sap->GetFlat("Genome IsCollectedInto",
282 :     'Genome(scientific-name) = ?', [$otu],
283 :     'IsCollectedInto(to-link)');
284 :     }
285 :     # Update the tuple.
286 :     splice @$tuple, 5, undef, $otuIdx, @roleIdx;
287 :     }
288 : parrello 1.6 # Store the result.
289 :     $retVal->{$id} = $assignment;
290 :     }
291 :     # Return the results.
292 :     return $retVal;
293 :     }
294 :    
295 : olson 1.10 =head3 assign_function_to_prot
296 :    
297 :     Raw CGI handler for the Kmer protein assignment code.
298 :    
299 :     =cut
300 :    
301 :     sub assign_function_to_prot
302 :     {
303 :     my($self, $cgi) = @_;
304 :    
305 :     my @id = $cgi->param('id_seq');
306 :    
307 :     my %params = map { my $v = $cgi->param($_); defined($v) ? ($_ => $cgi->param($_)) : () }
308 :     qw(-kmer -scoreThreshold -hitThreshold -seqHitThreshold -normalizeScores -detailed
309 :     -assignToAll -kmerDataset -determineFamily -returnFamilySims);
310 :    
311 :     $params{-all} = 1;
312 :    
313 : olson 1.11 my $ds = $params{-kmerDataset} || $self->{kmer_mgr}->default_dataset;
314 :    
315 :     my $kmers = $self->{kmer_mgr}->get_kmer_object($ds);
316 :     ref($kmers) or myerror($cgi, "500 invalid dataset name $ds");
317 :     my $kmer_fasta = $self->{kmer_mgr}->get_extra_fasta_path($ds);
318 : olson 1.10
319 :     @id or die "figfam server missing id_seq argument";
320 :    
321 :     my @output;
322 :     my @missing;
323 :     foreach my $parm (@id) {
324 :     my ($id, $seq) = split /,/, $parm;
325 :     my $triple = [$id, undef, $seq];
326 :     my @res = $kmers->assign_functions_to_prot_set(-seqs => [$triple], %params);
327 :     my $res = $res[0];
328 :     if ($res->[1] || !$params{-assignToAll})
329 :     {
330 :     push(@output, $res);
331 :     }
332 :     elsif ($params{-assignToAll})
333 :     {
334 :     push(@missing, $triple);
335 :     }
336 :     }
337 :     if ($params{-assignToAll} && @missing)
338 :     {
339 :     my @rest = $kmers->assign_functions_using_similarity(-seqs => \@missing,
340 :     -simDb => $kmer_fasta);
341 :    
342 :     push(@output, @rest);
343 :     }
344 :    
345 :     return @output;
346 :     }
347 :    
348 :     =head3 call_genes
349 :    
350 :     Raw CGI handler for the gene caller.
351 :    
352 :     =cut
353 :    
354 :     sub call_genes
355 :     {
356 :     my($self, $cgi) = @_;
357 :    
358 :     my @id = $cgi->param('id_seq');
359 :     @id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument");
360 :    
361 :     my %params = map { my $v = $cgi->param($_); defined($v) ? ($_ => $cgi->param($_)) : () }
362 :     qw(-geneticCode -minContigLen -verbose);
363 :    
364 :     my $genetic_code = ($params{-geneticCode} =~ /^(\d+)$/ ? $1 : 11);
365 :    
366 : olson 1.11 my $min_training_len = ($params{-minContigLen} =~ /^(\d+)$/ ? $1 : 2000);
367 : olson 1.10 #
368 :     # Create fasta of the contig data.
369 :     #
370 :    
371 :     my $fh;
372 :     my $tmp = "$FIG_Config::temp/contigs.$$";
373 :     my $tmp2 = "$FIG_Config::temp/contigs.aa.$$";
374 :     my $tbl = "$FIG_Config::temp/tbl.$$";
375 :     my $tbl2 = "$FIG_Config::temp/tbl2.$$";
376 :     open($fh, ">", $tmp) or die "Cannot write $tmp: $!";
377 :    
378 :     foreach my $parm (@id) {
379 :     my ($id, $seq) = split /,/, $parm;
380 :     &FIG::display_id_and_seq($id, \$seq, $fh);
381 :     }
382 :     close($fh);
383 :    
384 :     # Training stuff.
385 :     my $trainingParms = "";
386 :     my @trainSet = $cgi->param('training_set');
387 :     if (@trainSet) {
388 :     my $tmp3 = "$FIG_Config::temp/tbl3.$$";
389 :     my $fh3;
390 :     open($fh3, ">", $tmp3) or die "Cannot write $tmp3: $!";
391 :     while (@trainSet) {
392 :     my $loc = pop @trainSet;
393 :     my $id = pop @trainSet;
394 :     print $fh3 "$id\t$loc\n";
395 :     }
396 :     $trainingParms = "-train=$tmp3";
397 :     my @trainContigs = $cgi->param('train_seq');
398 :     if (@trainContigs) {
399 :     undef $fh3;
400 :     my $tmp4 = "$FIG_Config::temp/fasta1.$$";
401 :     open($fh3, ">", $tmp4);
402 :     foreach my $parm (@trainContigs) {
403 :     my ($id, $seq) = split /,/, $parm;
404 :     &FIG::display_id_and_seq($id, \$seq, $fh3);
405 :     }
406 :     close($fh3);
407 :     $trainingParms .= ",$tmp4";
408 :     }
409 :     }
410 :     # Verbose check
411 :     my $verbose = ($params{-verbose} ? "-verbose" : "");
412 :     if ($verbose) {
413 :     warn "Input file: $tmp; training parameter=$trainingParms\n";
414 :     }
415 :     # Call glimmer
416 :     my $res = system("$FIG_Config::bin/run_glimmer3 $verbose -minlen=$min_training_len $trainingParms -code=$genetic_code 1.1 $tmp > $tbl");
417 :     if ($res != 0)
418 :     {
419 :     die "glimmer failed with rc=$res";
420 :     }
421 :    
422 :     my $fh2;
423 :     open($fh, "<", $tbl) or die "cannot read $tbl: $!";
424 :     open($fh2, ">", $tbl2)or die "cannot write $tbl2: $!";
425 :     my $ctr = 1;
426 :     my $encoded_tbl = [];
427 :     while (<$fh>)
428 :     {
429 :     chomp;
430 :     my(@a) = split(/\t/);
431 :     $a[0] = sprintf("prot_%05d", $ctr++);
432 :     push(@a, $a[1]);
433 :     print $fh2 join("\t", @a), "\n";
434 :     my ($contig, $beg, $end) = ($a[1] =~ /^(\S+)_(\d+)_(\d+)$/);
435 :     push @$encoded_tbl, [$a[0], $contig, $beg, $end];
436 :     }
437 :     close($fh);
438 :     close($fh2);
439 :    
440 :     $res = system("$FIG_Config::bin/get_fasta_for_tbl_entries -code=$genetic_code $tmp < $tbl2 > $tmp2");
441 :     if ($res != 0)
442 :     {
443 :     die "error rc=$res running get_fasta_for_tbl_entries\n";
444 :     }
445 :    
446 :     if (!open($fh,"<", $tmp2))
447 :     {
448 :     die "Cannot read $tmp2: $!";
449 :     }
450 :    
451 :     my $out;
452 :     my $buf;
453 :     while (read($fh, $buf, 4096))
454 :     {
455 :     $out .= $buf;
456 :     }
457 :    
458 :     close($fh);
459 :     return [$out, $encoded_tbl];
460 :     }
461 :    
462 :     =head3 find_rnas
463 :    
464 :     Raw CGI handler for the rna finder
465 :    
466 :     =cut
467 :    
468 :     sub find_rnas
469 :     {
470 :     my($self, $cgi) = @_;
471 :     my @id = $cgi->param('id_seq');
472 :    
473 :     my %params = map { my $v = $cgi->param($_); defined($v) ? ($_ => $cgi->param($_)) : () }
474 : gdpusch 1.14 qw(-genus -species -domain -rnas);
475 : olson 1.10
476 :     @id or die "missing id_seq parameter";
477 :    
478 :     $params{-genus} or die "missing genus parameter";
479 :     $params{-species} or die "missing species parameter";
480 :     $params{-domain} or die "missing domain parameter";
481 :    
482 :     #
483 :     # Create fasta of the contig data.
484 :     #
485 :    
486 :     my $fh;
487 :     my $tmp_dir = "$FIG_Config::temp/find_rnas.$$";
488 :     my $log = "$tmp_dir/log";
489 :     &FIG::verify_dir($tmp_dir);
490 :     my $tmp = "$tmp_dir/contigs";
491 :     my $tmp2 = "$tmp_dir/contigs2";
492 :     my $tbl = "$tmp_dir/tbl";
493 :     my $tbl2 = "$tmp_dir/tbl2";
494 :    
495 :     open($fh, ">", $tmp) or die "cannot write $tmp: $!";
496 :    
497 :     foreach my $parm (@id) {
498 :     my ($id, $seq) = split /,/, $parm;
499 :     &FIG::display_id_and_seq($id, \$seq, $fh);
500 :     }
501 :     close($fh);
502 :    
503 : gdpusch 1.14 my $opt_rna_types = $params{-rnas} ? "-rnas=$params{-rnas}" : "";
504 :     my $cmd = "$rna_tool $opt_rna_types --tmpdir=$tmp_dir --contigs=$tmp --orgid=1 --domain=$params{-domain} --genus=$params{-genus} --species=$params{-species}";
505 : olson 1.10 warn "Run: $cmd\n";
506 :     #
507 :     # Need to clear the PERL5LIB from the environment since tool is configured to use its own.
508 :     #
509 :     my $res = system("cd $tmp_dir; env PERL5LIB= $cmd > $tbl 2> $log");
510 :     if ($res != 0)
511 :     {
512 :     die "cmd failed with rc=$res: $cmd";
513 :     }
514 :    
515 :     my $fh2;
516 :     open($fh, "<", $tbl) or die "cannot read $tbl: $!";
517 :     open($fh2, ">", $tbl2) or die "cannot write $tbl2: $!";
518 :     my $ctr = 1;
519 :     my $encoded_tbl = [];
520 :     while (<$fh>)
521 :     {
522 :     chomp;
523 :     my(@a) = split(/\t/);
524 :    
525 :     my $new = sprintf("rna_%05d", $ctr++);
526 :    
527 :     print $fh2 join("\t", $new, $a[1]), "\n";
528 :     my ($contig, $beg, $end) = ($a[1] =~ /^(\S+)_(\d+)_(\d+)$/);
529 :     push @$encoded_tbl, [$new, $contig, $beg, $end, $a[2]];
530 :     }
531 :     close($fh);
532 :     close($fh2);
533 :    
534 :     $res = system("$FIG_Config::bin/get_dna $tmp < $tbl2 > $tmp2");
535 :     if ($res != 0)
536 :     {
537 :     die "get_dna $tmp failed with rc=$res";
538 :     }
539 :    
540 :     open($fh,"<", $tmp2) or die "Cannot read $tmp2: $!";
541 :    
542 :     my $out;
543 :     my $buf;
544 :     while (read($fh, $buf, 4096))
545 :     {
546 :     $out .= $buf;
547 :     }
548 :     close($fh);
549 :     return [$out, $encoded_tbl];
550 :     #unlink($tmp);
551 :     #unlink($tmp2);
552 :     #unlink($tbl);
553 :     #unlink($tbl2);
554 :     }
555 :    
556 :    
557 :     =head3 assign_functions_to_DNA
558 :    
559 :     Raw CGI handler for the DNA kmer code
560 :    
561 :     =cut
562 :    
563 :     sub assign_functions_to_DNA
564 :     {
565 :     my($self, $cgi) = @_;
566 :    
567 :     my @id = $cgi->param('id_seq');
568 :     my %params = map { my $v = $cgi->param($_); defined($v) ? ($_ => $cgi->param($_)) : () }
569 : olson 1.12 qw(-kmer -minHits -maxGap -kmerDataset -detailed);
570 : olson 1.10
571 :     my $min_hits = $params{-minHits};
572 :     my $max_gap = $params{-maxGap};
573 : olson 1.12
574 :     my $details = $params{-detailed} ? 1 : 0;
575 : olson 1.10
576 :     my $kmer = $params{-kmer};
577 :    
578 : olson 1.11 my $ds = $params{-kmerDataset} || $self->{kmer_mgr}->default_dataset;
579 :    
580 :     my $kmers = $self->{kmer_mgr}->get_kmer_object($ds);
581 :     ref($kmers) or myerror($cgi, "500 invalid dataset name $ds");
582 : olson 1.10
583 :     @id or die "missing id_seq argument";
584 :    
585 :     my @out;
586 :     foreach my $parm (@id) {
587 :     my ($id, $seq) = split /,/, $parm;
588 :     # print L "try $id\n$seq\n";
589 :     my $res;
590 :     eval {
591 : olson 1.13 $res = $kmers->assign_functions_to_PEGs_in_DNA($kmer, $seq, $min_hits, $max_gap, 0, $details);
592 : olson 1.10
593 :     };
594 :     if ($@)
595 :     {
596 :     die "failure on assign_functions_to_PEGs_in_DNA: $@";
597 :     }
598 :     push(@out, map { [$id, $_ ] } @$res);
599 :     }
600 :     return @out;
601 :     }
602 :    
603 :     =head3 get_dataset
604 :    
605 :     Return the default dataset; if -kmerDataset is passed, verify that it is
606 :     a valid dataset.
607 :    
608 :     =cut
609 :    
610 :     sub get_dataset
611 :     {
612 :     my($self, $args) = @_;
613 :     my $ds;
614 :     if (defined($ds = $args->{'-kmerDataset'}))
615 :     {
616 : olson 1.11 my $kmers = $self->{kmer_mgr}->get_kmer_object($ds);
617 : olson 1.10 ref($kmers) or return undef;
618 :     }
619 :     else
620 :     {
621 : olson 1.11 $ds = $self->{kmer_mgr}->default_dataset;
622 : olson 1.10 }
623 :     return $ds;
624 :     }
625 :    
626 :     =head3 get_vector_basis_sets
627 :    
628 :     Return the vector basis sets for the given dataset name.
629 :    
630 :     =cut
631 :    
632 :     sub get_vector_basis_sets
633 :     {
634 :     my($self, $args) = @_;
635 : olson 1.11
636 :     my $ds = $args->{'-kmerDataset'} || $self->{kmer_mgr}->default_dataset;
637 :     my $kmers = $self->{kmer_mgr}->get_kmer_object($ds);
638 :     ref($kmers) or return undef;
639 : olson 1.10
640 :     my $dir = $kmers->dir();
641 :    
642 :     my $res = {};
643 :     my @todo = ([function => "$dir/family.vector.def"], [otu => "$dir/setI"]);
644 :     for my $ent (@todo)
645 :     {
646 :     my($what, $file) = @$ent;
647 :     if (open(my $fh, "<", $file))
648 :     {
649 :     local $/ = undef;
650 :     $res->{$what} = <$fh>;
651 :     close($fh);
652 :     }
653 :     else
654 :     {
655 :     push(@{$res->{errors}}, "Cannot open $file: $!");
656 :     }
657 :     }
658 :     return $res;
659 :     }
660 :    
661 :     =head3 get_active_datasets
662 :    
663 :     Return a list of the currently active Kmer datasets.
664 :    
665 :     =cut
666 :    
667 :     sub get_active_datasets
668 :     {
669 : olson 1.11 my($self) = @_;
670 : olson 1.10
671 : olson 1.11 return $self->{kmer_mgr}->get_active_datasets;
672 : olson 1.10 }
673 : parrello 1.6
674 : olson 1.1 =head2 Internal Utility Methods
675 :    
676 : olson 1.10 =head3 init_kmers
677 :    
678 :     $annoObject->init_kmers()
679 :    
680 :     Initialize the kmer data set information from the FIG environment. Used
681 :     on the annotation servers.
682 :    
683 :     =cut
684 :    
685 :     sub init_kmers
686 :     {
687 :     my($self) = @_;
688 :    
689 : olson 1.11 my $kmer_dir = $FIG_Config::KmerBase || "/vol/figfam-prod";
690 :     -d $kmer_dir or die "Cannot find kmer base directory $kmer_dir";
691 : parrello 1.6
692 : olson 1.11 my $kmgr = KmerMgr->new(base_dir => $kmer_dir);
693 : parrello 1.6
694 : olson 1.11 $self->{kmer_mgr} = $kmgr;
695 : parrello 1.6 }
696 :    
697 : olson 1.1 =head3 get_max_subset
698 :    
699 :     my ($max_variant, $max_size) = $ssObject->get_max_subset($sub, $setA);
700 :    
701 :     Given a subsystem ID and a role rule, return the ID of the variant for
702 :     the subsystem that matches the most roles in the rule and the number of
703 :     roles matched.
704 :    
705 :     =over 4
706 :    
707 :     =item sub
708 :    
709 :     Name (ID) of the subsystem whose variants are to be examined.
710 :    
711 :     =item setA
712 :    
713 :     A space-delimited list of role abbreviations, lexically ordered. This provides
714 :     a unique specification of the roles in the set.
715 :    
716 :     =item RETURN
717 :    
718 : parrello 1.2 Returns a 2-element list consisting of name variant found (subsystem name, colon,
719 :     and variant code) and the number of roles matched.
720 : olson 1.1
721 :     =back
722 :    
723 :     =cut
724 :    
725 :     sub get_max_subset {
726 :     my ($self, $sub, $setA) = @_;
727 :     my $sapling = $self->{db};
728 :     my $max_size = 0;
729 :     my $max_set;
730 :     my $max_variant;
731 :     my %set_hash;
732 :     my $qh = $sapling->Get("Subsystem Describes Variant", 'Subsystem(id) = ? AND Variant(type) = ?', [$sub, 'normal']);
733 :     while (my $resultRow = $qh->Fetch()) {
734 :     my @variantRoleRule = $resultRow->Value('Variant(role-rule)');
735 :     my ($variantCode) = $resultRow->Value('Variant(code)');
736 :     my $variantId = $sub.":".$variantCode;
737 :     foreach my $setB (@variantRoleRule) {
738 :     my $size = is_A_a_superset_of_B($setA, $setB);
739 :     if ($size && $size > $max_size) {
740 :     $max_size = $size;
741 :     $max_set = $setB;
742 :     $max_variant = $variantId;
743 :     }
744 :     }
745 :     }
746 :     #if ($max_size) {
747 :     #print STDERR "Success $max_variant, $max_set\n";
748 :     #}
749 :     return($max_variant, $max_size);
750 :     }
751 :    
752 :    
753 :     =head3 is_A_a_superset_of_B
754 :    
755 :     my $size = SS::is_A_a_superset_of_B($a, $b);
756 :    
757 :     This method takes as input two role rules, and returns 0 if the first
758 :     role rule is NOT a superset of the second; otherwise, it returns the size
759 :     of the second rule. A role rule is a space-delimited list of role
760 :     abbreviations in lexical order. This provides a unique identifier for a
761 :     set of roles in a subsystem.
762 :    
763 :     =over 4
764 :    
765 :     =item a
766 :    
767 :     First role rule.
768 :    
769 :     =item b
770 :    
771 :     Second role rule.
772 :    
773 :     =item RETURN
774 :    
775 :     Returns 0 if the first rule is NOT a superset of the second and the size of the
776 :     second rule if it is. As a result, if the first rule IS a superset, this method
777 :     will evaluate to TRUE, and to FALSE otherwise.
778 :    
779 :     =back
780 :    
781 :     =cut
782 :    
783 :     sub is_A_a_superset_of_B {
784 :     my ($a, $b) = @_;
785 :     my @a = split(" ", $a);
786 :     my @b = split(" ", $b);
787 :     if (@b > @a) {
788 :     return(0);
789 :     }
790 :     my %given;
791 :     map { $given{$_} = 1} @a;
792 :     map { if (! $given{$_}) {return 0}} split(" ", $b);
793 :     my $l = scalar(@b);
794 :     return scalar(@b);
795 :     }
796 :    
797 :    
798 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3