[Bio] / FigKernelPackages / FIG.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FIG.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.286, Wed Jun 8 23:05:21 2005 UTC revision 1.287, Thu Jun 9 05:36:30 2005 UTC
# Line 19  Line 19 
19  use Tracer;  use Tracer;
20    
21  eval { require FigGFF; };  eval { require FigGFF; };
22  if ($@ and $ENV{USER} eq "olson")  if ($@ and $ENV{USER} eq "olson") {
 {  
23      warn $@;      warn $@;
24  }  }
25    
# Line 49  Line 48 
48  };  };
49    
50  my $xmlrpc_available = 1;  my $xmlrpc_available = 1;
51  if ($@ ne "")  if ($@ ne "") {
 {  
52      $xmlrpc_available = 0;      $xmlrpc_available = 0;
53  }  }
54    
# Line 86  Line 84 
84  C<FIG_Config.pm> file. This file contains names and URLs for the key  C<FIG_Config.pm> file. This file contains names and URLs for the key
85  directories as well as the type and login information for the database.  directories as well as the type and login information for the database.
86    
87    FIG was designed to operate as a series of peer instances. Each instance is
88    updated independently by its owner, and the instances can be synchronized
89    using a process called a I<peer-to-peer update>. The terms
90    I<SEED instance> and I<peer> are used more-or-less interchangeably.
91    
92    The POD documentation for this module is still in progress, and is provided
93    on an AS IS basis without warranty. If you have a correction and you're
94    not a developer, EMAIL the details to B<bruce@gigabarb.com> and I'll fold
95    it in.
96    
97    B<NOTE>: The usage example for each method specifies whether it is static
98    
99        FIG::something
100    
101    or dynamic
102    
103        $fig->something
104    
105    If the method is static and has no parameters (C<FIG::something()>) it can
106     also be invoked dynamically. This is a general artifact of the
107    way PERL implements object-oriented programming.
108    
109    =head2 Hiding/Caching in a FIG object
110    
111    We save the DB handle, cache taxonomies, and put a few other odds and ends in the
112    FIG object.  We expect users to invoke these services using the object $fig constructed
113    using:
114    
115        use FIG;
116        my $fig = new FIG;
117    
118    $fig is then used as the basic mechanism for accessing FIG services.  It is, of course,
119    just a hash that is used to retain/cache data.  The most commonly accessed item is the
120    DB filehandle, which is accessed via $self->db_handle.
121    
122    We cache genus/species expansions, taxonomies, distances (very crudely estimated) estimated
123    between genomes, and a variety of other things.
124    
125  =cut  =cut
126    
127    
128  #: Constructor FIG->new();  #: Constructor FIG->new();
129    
130  =head2 Public Methods  =head2 Public Methods
# Line 114  Line 151 
151          my $figrpc = new FIGrpc($ENV{FIG_URL});          my $figrpc = new FIGrpc($ENV{FIG_URL});
152          return $figrpc;          return $figrpc;
153      }      }
154        # Here we have the normal case. Check for default tracing.
155        if (defined $FIG_Config::trace_levels) {
156            # It's on, so we set it up. If the type is omitted, we default to WARN, which
157            # is the standard error output.
158            my $trace_type = (defined $FIG_Config::trace_type ? $FIG_Config::trace_type : "WARN");
159            TSetup($FIG_Config::trace_levels, $trace_type);
160        }
161        # Connect to the database, then return ourselves.
162      my $rdbH = new DBrtns;      my $rdbH = new DBrtns;
163      bless {      bless {
164          _dbf  => $rdbH,          _dbf  => $rdbH,
165      }, $class;      }, $class;
166  }  }
167    
168    =head3 db_handle
169    
170    C<< my $dbh = $fig->db_handle; >>
171    
172    Return the handle to the internal B<DBrtns> object. This allows direct access to
173    the database methods.
174    
175    =cut
176    
177    sub db_handle {
178        my($self) = @_;
179        return $self->{_dbf};
180    }
181    
182    =head3 cached
183    
184    C<< my $x = $fig->cached($name); >>
185    
186    Return a reference to a hash containing transient data. If no hash exists with the
187    specified name, create an empty one under that name and return it.
188    
189    The idea behind this method is to allow clients to cache data in the FIG object for
190    later use. (For example, a method might cache feature data so that it can be
191    retrieved later without using the database.) This facility should be used sparingly,
192    since different clients may destroy each other's data if they use the same name.
193    
194    =over 4
195    
196    =item name
197    
198    Name assigned to the cached data.
199    
200    =item RETURN
201    
202    Returns a reference to a hash that is permanently associated with the specified name.
203    If no such hash exists, an empty one will be created for the purpose.
204    
205    =back
206    
207    =cut
208    
209    sub cached {
210        my($self,$what) = @_;
211    
212        my $x = $self->{$what};
213        if (! $x) {
214            $x = $self->{$what} = {};
215        }
216        return $x;
217    }
218    
219  =head3 get_system_name  =head3 get_system_name
220    
# Line 135  Line 229 
229      return "seed";      return "seed";
230  }  }
231    
232  # Destructor: releases the database handle.  =head3 DESTROY
233    
234    The destructor releases the database handle.
235    
236    =cut
237    
238  sub DESTROY {  sub DESTROY {
239      my($self) = @_;      my($self) = @_;
# Line 164  Line 262 
262      open(TMP,">$tmpD") || die "could not open $tmpD";      open(TMP,">$tmpD") || die "could not open $tmpD";
263    
264      my $genome;      my $genome;
265      foreach $genome ($self->genomes)      foreach $genome ($self->genomes) {
266      {          if (! $to_del{$genome}) {
         if (! $to_del{$genome})  
         {  
267              print TMP "$genome\n";              print TMP "$genome\n";
268          }          }
269      }      }
# Line 188  Line 284 
284  C<< my $ok = $fig->add_genome($genomeF); >>  C<< my $ok = $fig->add_genome($genomeF); >>
285    
286  Add a new genome to the data store. A genome's data is kept in a directory  Add a new genome to the data store. A genome's data is kept in a directory
287  by itself, underneath the main organism directory.  by itself, underneath the main organism directory. This method essentially
288    moves genome data from an external directory to the main directory and
289    performs some indexing tasks to integrate it.
290    
291  =over 4  =over 4
292    
293  =item genomeF  =item genomeF
294    
295  Name of the directory containing the genome files.  Name of the directory containing the genome files. This should be a
296    fully-qualified directory name. The last segment of the directory
297    name should be the genome ID.
298    
299  =item RETURN  =item RETURN
300    
# Line 211  Line 311 
311    
312      my(undef, $path, $genome) = File::Spec->splitpath($genomeF);      my(undef, $path, $genome) = File::Spec->splitpath($genomeF);
313    
314      if ($genome !~ /^\d+\.\d+$/)      if ($genome !~ /^\d+\.\d+$/) {
     {  
315          warn "Invalid genome filename $genomeF\n";          warn "Invalid genome filename $genomeF\n";
316          return $rc;          return $rc;
317      }      }
318    
319      if (-d $FIG_Config::organisms/$genome)      if (-d $FIG_Config::organisms/$genome) {
     {  
320          warn "Organism already exists for $genome\n";          warn "Organism already exists for $genome\n";
321          return $rc;          return $rc;
322      }      }
# Line 230  Line 328 
328    
329      my @errors = `$FIG_Config::bin/verify_genome_directory $genomeF`;      my @errors = `$FIG_Config::bin/verify_genome_directory $genomeF`;
330    
331      if (@errors)      if (@errors) {
     {  
332          warn "Errors found while verifying genome directory $genomeF:\n";          warn "Errors found while verifying genome directory $genomeF:\n";
333          print join("", @errors);          print join("", @errors);
334          return $rc;          return $rc;
# Line 245  Line 342 
342      &run("load_features $genome");      &run("load_features $genome");
343    
344      $rc = 1;      $rc = 1;
345      if (-s "$FIG_Config::organisms/$genome/Features/peg/fasta")      if (-s "$FIG_Config::organisms/$genome/Features/peg/fasta") {
     {  
346          &run("index_translations $genome");          &run("index_translations $genome");
347          my @tmp = `cut -f1 $FIG_Config::organisms/$genome/Features/peg/tbl`;          my @tmp = `cut -f1 $FIG_Config::organisms/$genome/Features/peg/tbl`;
348          chomp @tmp;          chomp @tmp;
# Line 254  Line 350 
350          &enqueue_similarities(\@tmp);          &enqueue_similarities(\@tmp);
351      }      }
352      if ((-s "$FIG_Config::organisms/$genome/assigned_functions") ||      if ((-s "$FIG_Config::organisms/$genome/assigned_functions") ||
353          (-d "$FIG_Config::organisms/$genome/UserModels"))          (-d "$FIG_Config::organisms/$genome/UserModels")) {
     {  
354          &run("add_assertions_of_function $genome");          &run("add_assertions_of_function $genome");
355      }      }
356    
357      return $rc;      return $rc;
358  }  }
359    
360    =head3 parse_genome_args
361    
362    C<< my ($mode, @genomes) = FIG::parse_genome_args(@args); >>
363    
364    Extract a list of genome IDs from an argument list. If the argument list is empty,
365    return all the genomes in the data store.
366    
367    This is a function that is performed by many of the FIG command-line utilities. The
368    user has the option of specifying a list of specific genome IDs or specifying none
369    in order to get all of them. If your command requires additional arguments in the
370    command line, you can still use this method if you shift them out of the argument list
371    before calling. The $mode return value will be C<all> if the user asked for all of
372    the genomes or C<some> if he specified a list of IDs. This is useful to know if,
373    for example, we are loading a table. If we're loading everything, we can delete the
374    entire table; if we're only loading some genomes, we must delete them individually.
375    
376    This method uses the genome directory rather than the database because it may be used
377    before the database is ready.
378    
379    =over 4
380    
381    =item args1, args2, ... argsN
382    
383    List of genome IDs. If all genome IDs are to be processed, then this list should be
384    empty.
385    
386    =item RETURN
387    
388    Returns a list. The first element of the list is C<all> if the user is asking for all
389    the genome IDs and C<some> otherwise. The remaining elements of the list are the
390    desired genome IDs.
391    
392    =back
393    
394    =cut
395    
396    sub parse_genome_args {
397        # Get the parameters.
398        my @args = @_;
399        # Check the mode.
400        my $mode = (@args > 0 ? 'some' : 'all');
401        # Build the return list.
402        my @retVal = ($mode);
403        # Process according to the mode.
404        if ($mode eq 'all') {
405            # We want all the genomes, so we get them from the organism directory.
406            my $orgdir = "$FIG_Config::organisms";
407            opendir( GENOMES, $orgdir ) || Confess("Could not open directory $orgdir");
408            push @retVal, grep { $_ =~ /^\d/ } readdir( GENOMES );
409            closedir( GENOMES );
410        } else {
411            # We want only the genomes specified by the user.
412            push @retVal, @args;
413        }
414        # Return the result.
415        return @retVal;
416    }
417    
418    =head3 reload_table
419    
420    C<< $fig->reload_table($mode, $table, $flds, $xflds, $fileName, $keyList, $keyName); >>
421    
422    Reload a database table from a sequential file. If I<$mode> is C<all>, the table
423    will be dropped and re-created. If I<$mode> is C<some>, the data for the individual
424    items in I<$keyList> will be deleted before the table is loaded. Thus, the load
425    process is optimized for the type of reload.
426    
427    =over 4
428    
429    =item mode
430    
431    C<all> if we are reloading the entire table, C<some> if we are only reloading
432    specific entries.
433    
434    =item table
435    
436    Name of the table to reload.
437    
438    =item flds
439    
440    String defining the table columns, in SQL format. In general, this is a
441    comma-delimited set of field specifiers, each specifier consisting of the
442    field name followed by the field type and any optional qualifiers (such as
443    C<NOT NULL> or C<DEFAULT>); however, it can be anything that would appear
444    between the parentheses in a C<CREATE TABLE> statement. The order in which
445    the fields are specified is important, since it is presumed that is the
446    order in which they are appearing in the load file.
447    
448    =item xflds
449    
450    Reference to a hash that describes the indexes. The hash is keyed by index name.
451    The value is the index's field list. This is a comma-delimited list of field names
452    in order from most significant to least significant. If a field is to be indexed
453    in descending order, its name should be followed by the qualifier C<DESC>. For
454    example, the following I<$xflds> value will create two indexes, one for name followed
455    by creation date in reverse chronological order, and one for ID.
456    
457        { name_index => "name, createDate DESC", id_index => "id" }
458    
459    =item fileName
460    
461    Fully-qualified name of the file containing the data to load. Each line of the
462    file must correspond to a record, and the fields must be arranged in order and
463    tab-delimited.
464    
465    =item keyList
466    
467    Reference to a list of the IDs for the objects being reloaded. This parameter is
468    only used if I<$mode> is C<some>.
469    
470    =item keyName (optional)
471    
472    Name of the key field containing the IDs in the keylist. If omitted, C<genome> is
473    assumed.
474    
475    =back
476    
477    =cut
478    
479    sub reload_table {
480            # Get the parameters.
481            my ($self, $mode, $table, $flds, $xflds, $fileName, $keyList, $keyName) = @_;
482        if (!defined $keyName) {
483            $keyName = 'genome';
484        }
485        # Get the database handler.
486        my $dbf = $self->{_dbf};
487            # If we're in ALL mode, we drop and re-create the table. Otherwise,
488            # we delete the obsolete genomes.
489            if ( $mode eq 'all') {
490                    Trace("Recreating $table.") if T(2);
491                    $dbf->drop_table( tbl  => $table );
492                    $dbf->create_table( tbl  => $table, flds => $flds );
493            } else {
494                    Trace("Clearing obsolete data from $table.") if T(2);
495                    foreach my $key ( @{$keyList} ) {
496                            $dbf->SQL("DELETE FROM $table WHERE ( $keyName = \'$key\' )");
497                    }
498            }
499            # The table is now reading for loading.
500            Trace("Loading $table from $fileName.") if T(2);
501            $dbf->load_table( tbl  => $table, file => $fileName );
502            # If we're in ALL mode, we need to build the indexes.
503            if ( $mode eq 'all' ) {
504                    Trace("Creating indexes for $table.") if T(2);
505                    # Loop through the indexes in the index hash.
506                    for my $idxName (keys %{$xflds}) {
507                            Trace("Creating index $idxName.") if T(3);
508                            $dbf->create_index( idx  => $idxName,
509                                    tbl  => $table,
510                                    type => "btree",
511                                    flds => $xflds->{$idxName}
512                            );
513                    }
514                    $dbf->vacuum_it( "$table" );
515            }
516    }
517    
518  =head3 enqueue_similarities  =head3 enqueue_similarities
519    
520  usage: enqueue_similarities(\@sims)  C<< FIG::enqueue_similarities(\@fids); >>
521    
522    Queue the passed Feature IDs for similarity computation. The actual
523    computation is performed by L</create_sim_askfor_pool>. The queue is a
524    persistent text file in the global data directory, and this method
525    essentially writes new IDs on the end of it.
526    
527  Queue the passed fids (a reference to a list) for similarity  =over 4
528  computation.  
529    =item fids
530    
531    Reference to a list of feature IDs.
532    
533    =back
534    
535  =cut  =cut
536  #: Return Type ;  #: Return Type ;
# Line 287  Line 550 
550    
551      flock(TMP, LOCK_EX) or die "Cannot lock $sim_q\n";      flock(TMP, LOCK_EX) or die "Cannot lock $sim_q\n";
552    
553      foreach $fid (@$fids)      foreach $fid (@$fids) {
     {  
554          print TMP "$fid\n";          print TMP "$fid\n";
555      }      }
556      close(TMP);      close(TMP);
# Line 303  Line 565 
565    
566  =cut  =cut
567    
568  sub export_similarity_request  sub export_similarity_request {
 {  
569      my($self, $nr_file, $fasta_file) = @_;      my($self, $nr_file, $fasta_file) = @_;
570    
571      my $req_dir = "$FIG_Config::fig/var/sim_requests";      my $req_dir = "$FIG_Config::fig/var/sim_requests";
# Line 344  Line 605 
605      copy("$sim_q", "$req_dir/q") or confess "Copy $sim_q $req_dir/q failed: $!";      copy("$sim_q", "$req_dir/q") or confess "Copy $sim_q $req_dir/q failed: $!";
606    
607      my($buf);      my($buf);
608      while (1)      while (1) {
     {  
609          my $n = read($nr_read_fh, $buf, 4096);          my $n = read($nr_read_fh, $buf, 4096);
610          defined($n) or confess "Error reading nr: $!";          defined($n) or confess "Error reading nr: $!";
611          last unless $n;          last unless $n;
# Line 371  Line 631 
631      #      #
632    
633      open(my $q_fh, "<$req_dir/q");      open(my $q_fh, "<$req_dir/q");
634      while (my $id = <$q_fh>)      while (my $id = <$q_fh>) {
     {  
635          chomp $id;          chomp $id;
636    
637          my $seq = $self->get_translation($id);          my $seq = $self->get_translation($id);
# Line 388  Line 647 
647    
648  =head3 create_sim_askfor_pool  =head3 create_sim_askfor_pool
649    
650  usage: create_sim_askfor_pool()  C<< $fig->create_sim_askfor_pool($chunk_size); >>
651    
652  Creates an askfor pool, a snapshot of the current NR and similarity  Creates an askfor pool, which a snapshot of the current NR and similarity
653  queue. Zeros out the old queue.  queue. This process clears the old queue.
654    
655  The askfor pool needs to keep track of which sequences need to be  The askfor pool needs to keep track of which sequences need to be
656  calculated, which have been handed out, etc. To simplify this task we  calculated, which have been handed out, etc. To simplify this task we
# Line 403  Line 662 
662  populating the sim_askfor_index table with this information and with  populating the sim_askfor_index table with this information and with
663  initial status information.  initial status information.
664    
665    =over 4
666    
667    =item chunk_size
668    
669    Number of features to put into a processing chunk. The default is 15.
670    
671    =back
672    
673  =cut  =cut
674  #: Return Type $;  #: Return Type $;
675  sub create_sim_askfor_pool  sub create_sim_askfor_pool {
 {  
676      my($self, $chunk_size) = @_;      my($self, $chunk_size) = @_;
677    
678      $chunk_size = 20000 unless $chunk_size =~ /^\d+$/;      $chunk_size = 20000 unless $chunk_size =~ /^\d+$/;
# Line 422  Line 688 
688      flock($lock, LOCK_EX);      flock($lock, LOCK_EX);
689    
690      my $num = 0;      my $num = 0;
691      if (open(my $toc, "<$pool_dir/TOC"))      if (open(my $toc, "<$pool_dir/TOC")) {
692      {          while (<$toc>) {
         while (<$toc>)  
         {  
693              chomp;              chomp;
694              # print STDERR "Have toc entry  $_\n";              # print STDERR "Have toc entry  $_\n";
695              my ($idx, $time, $str) = split(/\s+/, $_, 3);              my ($idx, $time, $str) = split(/\s+/, $_, 3);
# Line 472  Line 736 
736      # We've created our pool; we can now run the formatdb and      # We've created our pool; we can now run the formatdb and
737      # extract the sequences for the blast run.      # extract the sequences for the blast run.
738      #      #
739      my $child_pid = $self->run_in_background(sub {      my $child_pid = $self->run_in_background(
740            sub {
741          #          #
742          # Need to close db or there's all sorts of trouble.          # Need to close db or there's all sorts of trouble.
743          #          #
# Line 490  Line 755 
755      close(FPID);      close(FPID);
756    
757      my $db = $self->db_handle();      my $db = $self->db_handle();
758      if (!$db->table_exists("sim_queue"))      if (!$db->table_exists("sim_queue")) {
     {  
759          $db->create_table(tbl => "sim_queue",          $db->create_table(tbl => "sim_queue",
760                            flds => "qid varchar(32), chunk_id INTEGER, seek INTEGER, len INTEGER, " .                            flds => "qid varchar(32), chunk_id INTEGER, seek INTEGER, len INTEGER, " .
761                            "assigned BOOL, finished BOOL, output_file varchar(255), " .                            "assigned BOOL, finished BOOL, output_file varchar(255), " .
# Line 520  Line 784 
784      open(my $tmpfh, ">$tmpfile") or confess "Cannot open tmpfile $tmpfile: $!";      open(my $tmpfh, ">$tmpfile") or confess "Cannot open tmpfile $tmpfile: $!";
785    
786      open(my $q_fh, "<$cpool_dir/q");      open(my $q_fh, "<$cpool_dir/q");
787      while (my $id = <$q_fh>)      while (my $id = <$q_fh>) {
     {  
788          chomp $id;          chomp $id;
789    
790          my $seq = $self->get_translation($id);          my $seq = $self->get_translation($id);
# Line 537  Line 800 
800          #          #
801    
802          $cur_size += length($seq);          $cur_size += length($seq);
803          if ($cur_size >= $chunk_size)          if ($cur_size >= $chunk_size) {
         {  
804              my $chunk_end = tell($seq_fh);              my $chunk_end = tell($seq_fh);
805              my $chunk_len = $chunk_end - $chunk_begin;              my $chunk_len = $chunk_end - $chunk_begin;
806    
# Line 551  Line 813 
813          $seq_idx++;          $seq_idx++;
814      }      }
815    
816      if ($cur_size > 0)      if ($cur_size > 0) {
     {  
817          my $chunk_end = tell($seq_fh);          my $chunk_end = tell($seq_fh);
818          my $chunk_len = $chunk_end - $chunk_begin;          my $chunk_len = $chunk_end - $chunk_begin;
819    
# Line 591  Line 852 
852  #  #
853  #=cut  #=cut
854    
855  sub get_sim_queue  sub get_sim_queue  {
 {  
856      my($self, $pool_id, $all_sims) = @_;      my($self, $pool_id, $all_sims) = @_;
   
   
857  }  }
858    
   
   
859  =head3 get_sim_work  =head3 get_sim_work
860    
861    C<< my ($nrPath, $fasta) = $fig->get_sim_work(); >>
862    
863  Get the next piece of sim computation work to be performed. Returned are  Get the next piece of sim computation work to be performed. Returned are
864  the path to the NR and a string containing the fasta data.  the path to the NR and a string containing the fasta data.
865    
866  =cut  =cut
867    
868  sub get_sim_work  sub get_sim_work {
869  {  
870      my($self) = @_;      my($self) = @_;
871    
872      #      #
# Line 624  Line 882 
882                             LIMIT 1));                             LIMIT 1));
883      print "Got work ", Dumper($work), "\n";      print "Got work ", Dumper($work), "\n";
884    
885      if (not $work or @$work == 0)      if (not $work or @$work == 0) {
     {  
886          return undef;          return undef;
887      }      }
888    
# Line 645  Line 902 
902    
903  =head3 sim_work_done  =head3 sim_work_done
904    
905    C<< $fig->sim_work_done($pool_id, $chunk_id, $out_file); >>
906    
907  Declare that the work in pool_id/chunk_id has been completed, and output written  Declare that the work in pool_id/chunk_id has been completed, and output written
908  to the pool directory (get_sim_work gave it the path).  to the pool directory (get_sim_work gave it the path).
909    
910    =over 4
911    
912    =item pool_id
913    
914    The ID number of the pool containing the work that just completed.
915    
916    =item chunk_id
917    
918    The ID number of the chunk completed.
919    
920    =item out_file
921    
922    The file into which the work was placed.
923    
924    =back
925    
926  =cut  =cut
927    
928  sub sim_work_done  sub sim_work_done {
 {  
929      my($self, $pool_id, $chunk_id, $out_file) = @_;      my($self, $pool_id, $chunk_id, $out_file) = @_;
930    
931      if (! -f $out_file)      if (! -f $out_file) {
932      {          Confess("sim_work_done: output file $out_file does not exist");
         confess "sim_work_done: output file $out_file does not exist";  
933      }      }
934    
935      my $db = $self->db_handle();      my $db = $self->db_handle();
# Line 667  Line 940 
940      my $rows = $dbh->do(qq(UPDATE sim_queue      my $rows = $dbh->do(qq(UPDATE sim_queue
941                             SET finished = TRUE, output_file = ?                             SET finished = TRUE, output_file = ?
942                             WHERE qid = ? and chunk_id = ?), undef, $out_file, $pool_id, $chunk_id);                             WHERE qid = ? and chunk_id = ?), undef, $out_file, $pool_id, $chunk_id);
943      if ($rows != 1)      if ($rows != 1) {
944      {          if ($dbh->errstr) {
945          if ($dbh->errstr)              Confess("Update not able to set finished=TRUE: ", $dbh->errstr);
946          {          } else {
947              confess "Update not able to set finished=TRUE: ", $dbh->errstr;              Confess("Update not able to set finished=TRUE");
         }  
         else  
         {  
             confess "Update not able to set finished=TRUE";  
948          }          }
949      }      }
   
950      #      #
951      # Determine if this was the last piece of work for this pool. If so, we can      # Determine if this was the last piece of work for this pool. If so, we can
952      # schedule the postprocessing work.      # schedule the postprocessing work.
# Line 689  Line 957 
957      my $out = $db->SQL(qq(SELECT chunk_id      my $out = $db->SQL(qq(SELECT chunk_id
958                            FROM sim_queue                            FROM sim_queue
959                            WHERE qid = ? AND not finished), undef, $pool_id);                            WHERE qid = ? AND not finished), undef, $pool_id);
960      if (@$out == 0)      if (@$out == 0) {
     {  
961          #          #
962          # Pool is done.          # Pool is done.
963          #          #
# Line 700  Line 967 
967    
968  =head3 schedule_sim_pool_postprocessing  =head3 schedule_sim_pool_postprocessing
969    
970  Schedule a job to do the similarity postprocessing for pool $pool_id.  C<< $fig->schedule_sim_pool_postprocessing($pool_id); >>
971    
972    Schedule a job to do the similarity postprocessing for the specified pool.
973    
974    =over 4
975    
976    =item pool_id
977    
978    ID of the pool whose similarity postprocessing needs to be scheduled.
979    
980    =back
981    
982  =cut  =cut
983    
984  sub schedule_sim_pool_postprocessing  sub schedule_sim_pool_postprocessing {
985  {  
986      my($self, $pool_id) = @_;      my($self, $pool_id) = @_;
987    
988      my $pool_dir = "$FIG_Config::fig/var/sim_pools";      my $pool_dir = "$FIG_Config::fig/var/sim_pools";
# Line 739  Line 1015 
1015    
1016  =head3 postprocess_computed_sims  =head3 postprocess_computed_sims
1017    
1018  We build a pipe to this pipeline:  C<< $fig->postprocess_computed_sims($pool_id); >>
1019    
1020    Set up to reduce, reformat, and split the similarities in a given pool. We build
1021    a pipe to this pipeline:
1022    
1023      reduce_sims peg.synonyms 300 | reformat_sims nr | split_sims dest prefix      reduce_sims peg.synonyms 300 | reformat_sims nr | split_sims dest prefix
1024    
1025  We put the new sims in the pool directory, and then copy to NewSims.  Then we put the new sims in the pool directory, and then copy to NewSims.
1026    
1027    =over 4
1028    
1029    =item pool_id
1030    
1031    ID of the pool whose similarities are to be post-processed.
1032    
1033    =back
1034    
1035  =cut  =cut
1036    
1037  sub postprocess_computed_sims  sub postprocess_computed_sims {
 {  
1038      my($self, $pool_id) = @_;      my($self, $pool_id) = @_;
1039    
1040      #      #
# Line 780  Line 1066 
1066                                              FROM sim_queue                                              FROM sim_queue
1067                                              WHERE qid = ? and output_file IS NOT NULL                                              WHERE qid = ? and output_file IS NOT NULL
1068                                              ORDER BY chunk_id), undef, $pool_id);                                              ORDER BY chunk_id), undef, $pool_id);
1069      for my $file (@$files)      for my $file (@$files) {
     {  
1070          my $buf;          my $buf;
1071          open(my $fh, "<$file") or confess "Cannot sim input file $file: $!";          open(my $fh, "<$file") or confess "Cannot sim input file $file: $!";
1072          while (read($fh, $buf, 4096))          while (read($fh, $buf, 4096)) {
         {  
1073              print $process $buf;              print $process $buf;
1074          }          }
1075          close($fh);          close($fh);
1076      }      }
1077      my $res = close($process);      my $res = close($process);
1078      if (!$res)      if (!$res) {
1079      {          if ($!) {
         if ($!)  
         {  
1080              confess "Error closing process pipeline: $!";              confess "Error closing process pipeline: $!";
1081          }          } else {
         else  
         {  
1082              confess "Process pipeline exited with status $?";              confess "Process pipeline exited with status $?";
1083          }          }
1084      }      }
# Line 813  Line 1093 
1093    
1094      &verify_dir("$FIG_Config::data/NewSims");      &verify_dir("$FIG_Config::data/NewSims");
1095    
1096      for my $sim_file (@new_sims)      for my $sim_file (@new_sims) {
     {  
1097          my $target = "$FIG_Config::data/NewSims/$sim_file";          my $target = "$FIG_Config::data/NewSims/$sim_file";
1098          if (-s $target)          if (-s $target) {
1099          {              Confess("$target already exists");
             confess "$target already exists";  
1100          }          }
1101          print "copying sim file $sim_file\n";          print "copying sim file $sim_file\n";
1102          &FIG::run("cp $sim_dir/$sim_file $target");          &FIG::run("cp $sim_dir/$sim_file $target");
# Line 826  Line 1104 
1104      }      }
1105  }  }
1106    
   
1107  =head3 get_active_sim_pools  =head3 get_active_sim_pools
1108    
1109  usage: get_active_sim_pools()  C<< @pools = $fig->get_active_sim_pools(); >>
1110    
1111  Return a list of the pool id's for the sim processing queues that have entries awaiting  Return a list of the pool IDs for the sim processing queues that have
1112  computation.  entries awaiting computation.
1113    
1114  =cut  =cut
1115  #: Return Type @;  #: Return Type @;
1116  sub get_active_sim_pools  sub get_active_sim_pools {
 {  
1117      my($self) = @_;      my($self) = @_;
1118    
1119      my $dbh = $self->db_handle();      my $dbh = $self->db_handle();
# Line 850  Line 1126 
1126    
1127  =head3 get_sim_pool_info  =head3 get_sim_pool_info
1128    
1129  usage: get_sim_pool_info($pool_id)  C<< my ($total_entries, $n_finished, $n_assigned, $n_unassigned) = $fig->get_sim_pool_info($pool_id); >>
1130    
1131    Return information about the given sim pool.
1132    
1133    =over 4
1134    
1135    =item pool_id
1136    
1137    Pool ID of the similarity processing queue whose information is desired.
1138    
1139    =item RETURN
1140    
1141    Returns a four-element list. The first is the number of features in the
1142    queue; the second is the number of features that have been processed; the
1143    third is the number of features that have been assigned to a
1144    processor, and the fourth is the number of features left over.
1145    
1146  Return information about the given sim pool. Return value  =back
 is a list ($total_entries, $n_finished, $n_assigned, $n_unassigned)  
1147    
1148  =cut  =cut
1149  #: Return Type @;  #: Return Type @;
1150  sub get_sim_pool_info  sub get_sim_pool_info {
1151  {  
1152      my($self, $pool_id) = @_;      my($self, $pool_id) = @_;
1153      my($dbh, $res, $total_entries, $n_finished, $n_assigned, $n_unassigned);      my($dbh, $res, $total_entries, $n_finished, $n_assigned, $n_unassigned);
1154    
# Line 908  Line 1198 
1198  #  #
1199  #=cut  #=cut
1200    
1201  sub get_sim_chunk  sub get_sim_chunk  {
 {  
1202      my($self, $n_seqs, $worker_id) = @_;      my($self, $n_seqs, $worker_id) = @_;
   
   
1203  }  }
1204    
1205  =head3 get_local_hostname  =head3 get_local_hostname
1206    
1207  usage: my $result = $fig->get_local_hostname();  C<< my $result = FIG::get_local_hostname(); >>
1208    
1209    Return the local host name for the current processor. The name may be
1210    stored in a configuration file, or we may have to get it from the
1211    operating system.
1212    
1213  =cut  =cut
1214  #: Return Type $;  #: Return Type $;
# Line 929  Line 1220 
1220      #      #
1221    
1222      my $hostfile = "$FIG_Config::fig_disk/config/hostname";      my $hostfile = "$FIG_Config::fig_disk/config/hostname";
1223      if (-f $hostfile)      if (-f $hostfile) {
     {  
1224          my $fh;          my $fh;
1225          if (open($fh, $hostfile))          if (open($fh, $hostfile)) {
         {  
1226              my $hostname = <$fh>;              my $hostname = <$fh>;
1227              chomp($hostname);              chomp($hostname);
1228              return $hostname;              return $hostname;
# Line 953  Line 1242 
1242    
1243      my @hostent = gethostbyname($hostname);      my @hostent = gethostbyname($hostname);
1244    
1245      if (@hostent > 0)      if (@hostent > 0) {
     {  
1246          my $sock;          my $sock;
1247          my $ip = $hostent[4];          my $ip = $hostent[4];
1248    
1249          socket($sock, PF_INET, SOCK_STREAM, $tcp);          socket($sock, PF_INET, SOCK_STREAM, $tcp);
1250          if (bind($sock, sockaddr_in(0, $ip)))          if (bind($sock, sockaddr_in(0, $ip))) {
         {  
1251              #              #
1252              # It worked. Reverse-map back to a hopefully fqdn.              # It worked. Reverse-map back to a hopefully fqdn.
1253              #              #
1254    
1255              my @rev = gethostbyaddr($ip, AF_INET);              my @rev = gethostbyaddr($ip, AF_INET);
1256              if (@rev > 0)              if (@rev > 0) {
             {  
1257                  my $host = $rev[0];                  my $host = $rev[0];
1258                  #                  #
1259                  # Check to see if we have a FQDN.                  # Check to see if we have a FQDN.
1260                  #                  #
1261    
1262                  if ($host =~ /\./)                  if ($host =~ /\./) {
                 {  
1263                      #                      #
1264                      # Good.                      # Good.
1265                      #                      #
1266                      return $host;                      return $host;
1267                  }                  } else {
                 else  
                 {  
1268                      #                      #
1269                      # We didn't get a fqdn; bail and return the IP address.                      # We didn't get a fqdn; bail and return the IP address.
1270                      #                      #
1271                      return get_hostname_by_adapter()                      return get_hostname_by_adapter()
1272                  }                  }
1273              }              } else {
             else  
             {  
1274                  return inet_ntoa($ip);                  return inet_ntoa($ip);
1275              }              }
1276          }           } else {
         else  
         {  
1277              #              #
1278              # Our hostname must be wrong; we can't bind to the IP              # Our hostname must be wrong; we can't bind to the IP
1279              # address it maps to.              # address it maps to.
# Line 1002  Line 1281 
1281              #              #
1282              return get_hostname_by_adapter()              return get_hostname_by_adapter()
1283          }          }
1284      }      } else {
     else  
     {  
1285          #          #
1286          # Our hostname isn't known to DNS. This isn't good.          # Our hostname isn't known to DNS. This isn't good.
1287          # Return the name associated with the adapter.          # Return the name associated with the adapter.
# Line 1015  Line 1292 
1292    
1293  =head3 get_hostname_by_adapter  =head3 get_hostname_by_adapter
1294    
1295  usage: my $name = $fig->get_hostname_by_adapter();  C<< my $name = FIG::get_hostname_by_adapter(); >>
1296    
1297    Return the local host name for the current network environment.
1298    
1299  =cut  =cut
1300  #: Return Type $;  #: Return Type $;
1301  sub get_hostname_by_adapter {  sub get_hostname_by_adapter {
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
1302      #      #
1303      # Attempt to determine our local hostname based on the      # Attempt to determine our local hostname based on the
1304      # network environment.      # network environment.
# Line 1076  Line 1354 
1354    
1355      my($fh);      my($fh);
1356    
1357      if (!open($fh, "netstat -rn |"))      if (!open($fh, "netstat -rn |")) {
     {  
1358          warn "Cannot run netstat to determine local IP address\n";          warn "Cannot run netstat to determine local IP address\n";
1359          return "localhost";          return "localhost";
1360      }      }
1361    
1362      my $interface_name;      my $interface_name;
1363    
1364      while (<$fh>)      while (<$fh>) {
     {  
1365          my @cols = split();          my @cols = split();
1366    
1367          if ($cols[0] eq "default" || $cols[0] eq "0.0.0.0")          if ($cols[0] eq "default" || $cols[0] eq "0.0.0.0") {
         {  
1368              $interface_name = $cols[$#cols];              $interface_name = $cols[$#cols];
1369          }          }
1370      }      }
# Line 1103  Line 1378 
1378    
1379      my $ifconfig;      my $ifconfig;
1380    
1381      for my $dir ((split(":", $ENV{PATH}), "/sbin", "/usr/sbin"))      for my $dir ((split(":", $ENV{PATH}), "/sbin", "/usr/sbin")) {
1382      {          if (-x "$dir/ifconfig") {
         if (-x "$dir/ifconfig")  
         {  
1383              $ifconfig = "$dir/ifconfig";              $ifconfig = "$dir/ifconfig";
1384              last;              last;
1385          }          }
1386      }      }
1387    
1388      if ($ifconfig eq "")      if ($ifconfig eq "") {
     {  
1389          warn "Ifconfig not found\n";          warn "Ifconfig not found\n";
1390          return "localhost";          return "localhost";
1391      }      }
1392      # print "Foudn $ifconfig\n";      # print "Foudn $ifconfig\n";
1393    
1394      if (!open($fh, "$ifconfig $interface_name |"))      if (!open($fh, "$ifconfig $interface_name |")) {
     {  
1395          warn "Could not run $ifconfig: $!\n";          warn "Could not run $ifconfig: $!\n";
1396          return "localhost";          return "localhost";
1397      }      }
1398    
1399      my $ip;      my $ip;
1400      while (<$fh>)      while (<$fh>) {
     {  
1401          #          #
1402          # Mac:          # Mac:
1403          #         inet 140.221.10.153 netmask 0xfffffc00 broadcast 140.221.11.255          #         inet 140.221.10.153 netmask 0xfffffc00 broadcast 140.221.11.255
# Line 1139  Line 1409 
1409          s/^\s*//;          s/^\s*//;
1410    
1411          # print "Have '$_'\n";          # print "Have '$_'\n";
1412          if (/inet\s+addr:(\d+\.\d+\.\d+\.\d+)\s+/)          if (/inet\s+addr:(\d+\.\d+\.\d+\.\d+)\s+/) {
         {  
1413              #              #
1414              # Linux hit.              # Linux hit.
1415              #              #
1416              $ip = $1;              $ip = $1;
1417              # print "Got linux $ip\n";              # print "Got linux $ip\n";
1418              last;              last;
1419          }          } elsif (/inet\s+(\d+\.\d+\.\d+\.\d+)\s+/) {
         elsif (/inet\s+(\d+\.\d+\.\d+\.\d+)\s+/)  
         {  
1420              #              #
1421              # Mac hit.              # Mac hit.
1422              #              #
# Line 1160  Line 1427 
1427      }      }
1428      close($fh);      close($fh);
1429    
1430      if ($ip eq "")      if ($ip eq "") {
     {  
1431          warn "Didn't find an IP\n";          warn "Didn't find an IP\n";
1432          return "localhost";          return "localhost";
1433      }      }
# Line 1171  Line 1437 
1437    
1438  =head3 get_seed_id  =head3 get_seed_id
1439    
1440  usage: my $id = $fig->get_seed_id();  C<< my $id = FIG::get_seed_id(); >>
1441    
1442    Return the Universally Unique ID for this SEED instance. If one
1443    does not exist, it will be created.
1444    
1445  =cut  =cut
1446  #: Return type $;  #: Return type $;
# Line 1181  Line 1450 
1450      #      #
1451      # If it's not there, create one, and make it readonly.      # If it's not there, create one, and make it readonly.
1452      #      #
   
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
1453      my $id;      my $id;
1454      my $id_file = "$FIG_Config::fig_disk/config/seed_id";      my $id_file = "$FIG_Config::fig_disk/config/seed_id";
1455      if (! -f $id_file)      if (! -f $id_file) {
     {  
1456          my $newid = `uuidgen`;          my $newid = `uuidgen`;
1457          if (!$newid)          if (!$newid) {
         {  
1458              die "Cannot run uuidgen: $!";              die "Cannot run uuidgen: $!";
1459          }          }
1460    
1461          chomp($newid);          chomp($newid);
1462          my $fh = new FileHandle(">$id_file");          my $fh = new FileHandle(">$id_file");
1463          if (!$fh)          if (!$fh) {
         {  
1464              die "error creating $id_file: $!";              die "error creating $id_file: $!";
1465          }          }
1466          print $fh "$newid\n";          print $fh "$newid\n";
# Line 1209  Line 1473 
1473      return $id;      return $id;
1474  }  }
1475    
1476  =pod  =head3 get_release_info
1477    
1478  =head1 get_release_info  C<< my ($name, $id, $inst, $email, $parent_id, $description) = FIG::get_release_info();  >>
1479    
1480  Return the current data release information.  It is returned as the list  Return the current data release information..
 ($name, $id, $inst, $email, $parent_id, $description).  
1481    
1482  The release info comes from the file FIG/Data/RELEASE. It is formatted as:  The release info comes from the file FIG/Data/RELEASE. It is formatted as:
1483    
# Line 1226  Line 1489 
1489  <description>  <description>
1490    
1491  For instance:  For instance:
1492    
1493  -----  -----
1494  SEED Data Release, 09/15/2004.  SEED Data Release, 09/15/2004.
1495  4148208C-1DF2-11D9-8417-000A95D52EF6  4148208C-1DF2-11D9-8417-000A95D52EF6
# Line 1241  Line 1505 
1505    
1506  =cut  =cut
1507  #: Return Type @;  #: Return Type @;
1508  sub get_release_info  sub get_release_info {
 {  
1509      my($fig, $no_create) = @_;      my($fig, $no_create) = @_;
1510    
1511      my $rel_file = "$FIG_Config::data/RELEASE";      my $rel_file = "$FIG_Config::data/RELEASE";
1512    
1513      if (! -f $rel_file and !$no_create)      if (! -f $rel_file and !$no_create) {
     {  
1514          #          #
1515          # Create a new one.          # Create a new one.
1516          #          #
1517    
1518          my $newid = `uuidgen`;          my $newid = `uuidgen`;
1519          if (!$newid)          if (!$newid) {
         {  
1520              die "Cannot run uuidgen: $!";              die "Cannot run uuidgen: $!";
1521          }          }
1522    
# Line 1270  Line 1531 
1531          $description .= "Contains $a archaeal, $b bacterial, $e eukaryal, $v viral and $env environmental genomes.\n";          $description .= "Contains $a archaeal, $b bacterial, $e eukaryal, $v viral and $env environmental genomes.\n";
1532    
1533          my $fh = new FileHandle(">$rel_file");          my $fh = new FileHandle(">$rel_file");
1534          if (!$fh)          if (!$fh) {
         {  
1535              warn "error creating $rel_file: $!";              warn "error creating $rel_file: $!";
1536              return undef;              return undef;
1537          }          }
# Line 1285  Line 1545 
1545          chmod(0444, $rel_file);          chmod(0444, $rel_file);
1546      }      }
1547    
1548      if (open(my $fh, $rel_file))      if (open(my $fh, $rel_file)) {
     {  
1549          my(@lines) = <$fh>;          my(@lines) = <$fh>;
1550          close($fh);          close($fh);
1551    
# Line 1300  Line 1559 
1559      return undef;      return undef;
1560  }  }
1561    
1562  =pod  =head3 get_peer_last_update
1563    
1564  =head1 get_peer_last_update  C<< my $date = $fig->get_peer_last_update($peer_id); >>
   
 usage: my $date = $fig->get_peer_last_update($peer_id);  
1565    
1566  Return the timestamp from the last successful peer-to-peer update with  Return the timestamp from the last successful peer-to-peer update with
1567  the given peer.  the given peer. If the specified peer has made updates, comparing this
1568    timestamp to the timestamp of the updates can tell you whether or not
1569    the updates have been integrated into your SEED data store.
1570    
1571  We store this information in FIG/Data/Global/Peers/<peer-id>.  We store this information in FIG/Data/Global/Peers/<peer-id>.
1572    
1573    =over 4
1574    
1575    =item peer_id
1576    
1577    Universally Unique ID for the desired peer.
1578    
1579    =item RETURN
1580    
1581    Returns the date/time stamp for the last peer-to-peer updated performed
1582    with the identified SEED instance.
1583    
1584    =back
1585    
1586  =cut  =cut
1587  #: Return Type $;  #: Return Type $;
1588  sub get_peer_last_update  sub get_peer_last_update  {
 {  
1589      my($self, $peer_id) = @_;      my($self, $peer_id) = @_;
1590    
1591      my $dir = "$FIG_Config::data/Global/Peers";      my $dir = "$FIG_Config::data/Global/Peers";
# Line 1323  Line 1594 
1594      &verify_dir($dir);      &verify_dir($dir);
1595    
1596      my $update_file = "$dir/last_update";      my $update_file = "$dir/last_update";
1597      if (-f $update_file)      if (-f $update_file)  {
     {  
1598          my $time = file_head($update_file, 1);          my $time = file_head($update_file, 1);
1599          chomp $time;          chomp $time;
1600          return $time;          return $time;
1601      }      } else {
     else  
     {  
1602          return undef;          return undef;
1603      }      }
1604  }  }
1605    
1606  =pod  =head3 set_peer_last_update
1607    
1608  =head1 set_peer_last_update  C<< $fig->set_peer_last_update($peer_id, $time); >>
1609    
1610  usage: $fig->set_peer_last_update($peer_id, $time);  Manually set the update timestamp for a specified peer. This informs
1611    the SEED that you have all of the assignments and updates from a
1612    particular SEED instance as of a certain date.
1613    
1614  =cut  =cut
1615  #: Return Type ;  #: Return Type ;
1616    
1617  sub set_peer_last_update  sub set_peer_last_update {
 {  
1618      my($self, $peer_id, $time) = @_;      my($self, $peer_id, $time) = @_;
1619    
1620      my $dir = "$FIG_Config::data/Global/Peers";      my $dir = "$FIG_Config::data/Global/Peers";
# Line 1361  Line 1630 
1630    
1631  =head3 cgi_url  =head3 cgi_url
1632    
1633  usage: my $url = $fig->cgi_url();  C<< my $url = FIG::$fig->cgi_url(); >>
1634    
1635    Return the URL for the CGI script directory.
1636    
1637  =cut  =cut
1638  #: Return Type $;  #: Return Type $;
1639  sub cgi_url {  sub cgi_url {
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
1640      return &plug_url($FIG_Config::cgi_url);      return &plug_url($FIG_Config::cgi_url);
1641  }  }
1642    
1643  =head3 temp_url  =head3 temp_url
1644    
1645  usage: my $url = $fig->temp_url();  C<< my $url = FIG::temp_url(); >>
1646    
1647    Return the URL of the temporary file directory.
1648    
1649  =cut  =cut
1650  #: Return Type $;  #: Return Type $;
1651  sub temp_url {  sub temp_url {
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
1652      return &plug_url($FIG_Config::temp_url);      return &plug_url($FIG_Config::temp_url);
1653  }  }
1654    
1655  =head3 plug_url  =head3 plug_url
1656    
1657  usage: my $url2 = $fig->plug_url($url);  C<< my $url2 = $fig->plug_url($url); >>
1658    
1659    or
1660    
1661    C<< my $url2 = $fig->plug_url($url); >>
1662    
1663    Change the domain portion of a URL to point to the current domain. This essentially
1664    relocates URLs into the current environment.
1665    
1666    =over 4
1667    
1668    =item url
1669    
1670    URL to relocate.
1671    
1672    =item RETURN
1673    
1674    Returns a new URL with the base portion converted to the current operating host.
1675    If the URL does not begin with C<http://>, the URL will be returned unmodified.
1676    
1677    =back
1678    
1679  =cut  =cut
1680  #: Return Type $;  #: Return Type $;
# Line 1412  Line 1703 
1703    
1704  =head3 file_read  =head3 file_read
1705    
1706  usage: my $text = $fig->file_read($fileName);  C<< my $text = $fig->file_read($fileName); >>
1707    
1708    or
1709    
1710    C<< my @lines = $fig->file_read($fileName); >>
1711    
1712    or
1713    
1714    C<< my $text = FIG::file_read($fileName); >>
1715    
1716    or
1717    
1718    C<< my @lines = FIG::file_read($fileName); >>
1719    
1720    Read an entire file into memory. In a scalar context, the file is returned
1721    as a single text string with line delimiters included. In a list context, the
1722    file is returned as a list of lines, each line terminated by a line
1723    delimiter. (For a method that automatically strips the line delimiters,
1724    use C<Tracer::GetFile>.)
1725    
1726  usage: my @lines = $fig->file_read($fileName);  =over 4
1727    
1728    =item fileName
1729    
1730    Fully-qualified name of the file to read.
1731    
1732    =item RETURN
1733    
1734    In a list context, returns a list of the file lines. In a scalar context, returns
1735    a string containing all the lines of the file with delimiters included.
1736    
1737    =back
1738    
1739  =cut  =cut
1740  #: Return Type $;  #: Return Type $;
1741  #: Return Type @;  #: Return Type @;
1742  sub file_read  sub file_read {
1743  {  
1744      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
1745      my($file) = @_;      my($fileName) = @_;
1746        return file_head($fileName, '*');
1747    
     if (open(my $fh, "<$file"))  
     {  
         if (wantarray)  
         {  
             my @ret = <$fh>;  
             return @ret;  
         }  
         else  
         {  
             local $/;  
             my $text = <$fh>;  
             close($fh);  
             return $text;  
         }  
     }  
1748  }  }
1749    
1750    
1751  =head3 file_head  =head3 file_head
1752    
1753  usage: my $text = $fig->file_read($fileName, $count);  C<< my $text = $fig->file_head($fileName, $count); >>
1754    
1755    or
1756    
1757    C<< my @lines = $fig->file_head($fileName, $count); >>
1758    
1759    or
1760    
1761    C<< my $text = FIG::file_head($fileName, $count); >>
1762    
1763    or
1764    
1765    C<< my @lines = FIG::file_head($fileName, $count); >>
1766    
1767    Read a portion of a file into memory. In a scalar context, the file portion is
1768    returned as a single text string with line delimiters included. In a list
1769    context, the file portion is returned as a list of lines, each line terminated
1770    by a line delimiter.
1771    
1772    =over 4
1773    
1774    =item fileName
1775    
1776    Fully-qualified name of the file to read.
1777    
1778    =item count (optional)
1779    
1780    Number of lines to read from the file. If omitted, C<1> is assumed. If the
1781    non-numeric string C<*> is specified, the entire file will be read.
1782    
1783    =item RETURN
1784    
1785    In a list context, returns a list of the desired file lines. In a scalar context, returns
1786    a string containing the desired lines of the file with delimiters included.
1787    
1788  usage: my @lines = $fig->file_read($fileName, $count);  =back
1789    
1790  =cut  =cut
1791  #: Return Type $;  #: Return Type $;
1792  #: Return Type @;  #: Return Type @;
1793  sub file_head  sub file_head {
1794  {  
1795      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
1796      my($file, $n) = @_;      my($file, $count) = @_;
1797    
1798      if (!$n)      my ($n, $allFlag);
1799      {      if ($count eq '*') {
1800          $n = 1;          Trace("Full file read for \"$file\".") if T(2);
1801            $allFlag = 1;
1802            $n = 0;
1803        } else {
1804            $allFlag = 0;
1805            $n = (!$count ? 1 : $count);
1806            Trace("Reading $n record(s) from \"$file\".") if T(2);
1807      }      }
1808    
1809      if (open(my $fh, "<$file"))      if (open(my $fh, "<$file")) {
     {  
1810          my(@ret, $i);          my(@ret, $i);
   
1811          $i = 0;          $i = 0;
1812          while (<$fh>)          while (<$fh>) {
         {  
1813              push(@ret, $_);              push(@ret, $_);
1814              $i++;              $i++;
1815              last if $i >= $n;              last if !$allFlag && $i >= $n;
1816          }          }
1817          close($fh);          close($fh);
1818            if (wantarray) {
         if (wantarray)  
         {  
1819              return @ret;              return @ret;
1820          }          } else {
         else  
         {  
1821              return join("", @ret);              return join("", @ret);
1822          }          }
1823      }      }
1824  }  }
1825    
1826    ################ Basic Routines [ existed since WIT ] ##########################
1827    
1828  =pod  =head3 min
   
 =head1 hiding/caching in a FIG object  
   
 We save the DB handle, cache taxonomies, and put a few other odds and ends in the  
 FIG object.  We expect users to invoke these services using the object $fig constructed  
 using:  
   
     use FIG;  
     my $fig = new FIG;  
   
 $fig is then used as the basic mechanism for accessing FIG services.  It is, of course,  
 just a hash that is used to retain/cache data.  The most commonly accessed item is the  
 DB filehandle, which is accessed via $self->db_handle.  
   
 We cache genus/species expansions, taxonomies, distances (very crudely estimated) estimated  
 between genomes, and a variety of other things.  I am not sure that using cached/2 was a  
 good idea, but I did it.  
   
 =cut  
1829    
1830  sub db_handle {  C<< my $min = FIG::min(@x); >>
     my($self) = @_;  
1831    
1832      return $self->{_dbf};  or
 }  
1833    
1834  sub cached {  C<< my $min = $fig->min(@x); >>
     my($self,$what) = @_;  
1835    
1836      my $x = $self->{$what};  Return the minimum numeric value from a list.
     if (! $x)  
     {  
         $x = $self->{$what} = {};  
     }  
     return $x;  
 }  
1837    
1838  ################ Basic Routines [ existed since WIT ] ##########################  =over 4
1839    
1840    =item x1, x2, ... xN
1841    
1842  =pod  List of numbers to process.
1843    
1844  =head1 min  =item RETURN
1845    
1846  usage: $n = &FIG::min(@x)  Returns the numeric value of the list entry possessing the lowest value. Returns
1847    C<undef> if the list is empty.
1848    
1849  Assumes @x contains numeric values.  Returns the minimum of the values.  =back
1850    
1851  =cut  =cut
1852  #: Return Type $;  #: Return Type $;
# Line 1544  Line 1857 
1857    
1858      (@x > 0) || return undef;      (@x > 0) || return undef;
1859      $min = $x[0];      $min = $x[0];
1860      for ($i=1; ($i < @x); $i++)      for ($i=1; ($i < @x); $i++) {
     {  
1861          $min = ($min > $x[$i]) ? $x[$i] : $min;          $min = ($min > $x[$i]) ? $x[$i] : $min;
1862      }      }
1863      return $min;      return $min;
1864  }  }
1865    
1866  =pod  =head3 max
1867    
1868    C<< my $max = FIG::max(@x); >>
1869    
1870    or
1871    
1872    C<< my $max = $fig->max(@x); >>
1873    
1874  =head1 max  Return the maximum numeric value from a list.
1875    
1876  usage: $n = &FIG::max(@x)  =over 4
1877    
1878    =item x1, x2, ... xN
1879    
1880    List of numbers to process.
1881    
1882    =item RETURN
1883    
1884    Returns the numeric value of t/he list entry possessing the highest value. Returns
1885    C<undef> if the list is empty.
1886    
1887  Assumes @x contains numeric values.  Returns the maximum of the values.  =back
1888    
1889  =cut  =cut
1890  #: Return Type $;  #: Return Type $;
# Line 1568  Line 1895 
1895    
1896      (@x > 0) || return undef;      (@x > 0) || return undef;
1897      $max = $x[0];      $max = $x[0];
1898      for ($i=1; ($i < @x); $i++)      for ($i=1; ($i < @x); $i++) {
     {  
1899          $max = ($max < $x[$i]) ? $x[$i] : $max;          $max = ($max < $x[$i]) ? $x[$i] : $max;
1900      }      }
1901      return $max;      return $max;
1902  }  }
1903    
1904  =pod  =head3 between
1905    
1906  =head1 between  C<< my $flag = FIG::between($x, $y, $z); >>
1907    
1908  usage: &FIG::between($x,$y,$z)  or
1909    
1910    C<< my $flag = $fig->between($x, $y, $z); >>
1911    
1912    Determine whether or not $y is between $x and $z.
1913    
1914    =over 4
1915    
1916    =item x
1917    
1918    First edge number.
1919    
1920    =item y
1921    
1922  Returns true iff $y is between $x and $z.  Number to examine.
1923    
1924    =item z
1925    
1926    Second edge number.
1927    
1928    =item RETURN
1929    
1930    Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
1931    is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
1932    TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
1933    I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
1934    then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
1935    
1936    =back
1937    
1938  =cut  =cut
1939  #: Return Type $;  #: Return Type $;
# Line 1589  Line 1941 
1941      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
1942      my($x,$y,$z) = @_;      my($x,$y,$z) = @_;
1943    
1944      if ($x < $z)      if ($x < $z) {
     {  
1945          return (($x <= $y) && ($y <= $z));          return (($x <= $y) && ($y <= $z));
1946      }      } else {
     else  
     {  
1947          return (($x >= $y) && ($y >= $z));          return (($x >= $y) && ($y >= $z));
1948      }      }
1949  }  }
1950    
1951  =pod  =head3 standard_genetic_code
1952    
1953  =head1 standard_genetic_code  C<< my $code = FIG::standard_genetic_code(); >>
1954    
1955  usage: $code = &FIG::standard_genetic_code()  Return a hash containing the standard translation of nucleotide triples to proteins.
1956    Methods such as L</translate> can take a translation scheme as a parameter. This method
1957  Routines like "translate" can take a "genetic code" as an argument.  I implemented such  returns the default translation scheme. The scheme is implemented as a reference to a
1958  codes using hashes that assumed uppercase DNA triplets as keys.  hash that contains nucleotide triplets as keys and has protein letters as values.
1959    
1960  =cut  =cut
1961  #: Return Type $;  #: Return Type $;
1962  sub standard_genetic_code {  sub standard_genetic_code {
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
1963    
1964      my $code = {};      my $code = {};
1965    
# Line 1683  Line 2031 
2031      return $code;      return $code;
2032  }  }
2033    
2034  =pod  =head3 translate
2035    
2036    C<< my $aa_seq = &FIG::translate($dna_seq, $code, $fix_start);  >>
2037    
2038    Translate a DNA sequence to a protein sequence using the specified genetic code.
2039    If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
2040    C<M>. (In the standard genetic code, these two combinations normally translate
2041    to C<V> and C<L>, respectively.)
2042    
2043    =over 4
2044    
2045    =item dna_seq
2046    
2047    DNA sequence to translate. Note that the DNA sequence can only contain
2048    known nucleotides.
2049    
2050    =item code
2051    
2052    Reference to a hash specifying the translation code. The hash is keyed by
2053    nucleotide triples, and the value for each key is the corresponding protein
2054    letter. If this parameter is omitted, the L</standard_genetic_code> will be
2055    used.
2056    
2057    =item fix_start
2058    
2059    TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
2060    then a value of C<TTG> or C<GTG> in the first position will be translated to
2061    C<M> instead of the value specified in the translation code.
2062    
2063  =head1 translate  =item RETURN
2064    
2065  usage: $aa_seq = &FIG::translate($dna_seq,$code,$fix_start);  Returns a string resulting from translating each nucleotide triple into a
2066    protein letter.
2067    
2068  If $code is undefined, I use the standard genetic code.  If $fix_start is true, I  =back
 will translate initial TTG or GTG to 'M'.  
2069    
2070  =cut  =cut
2071  #: Return Type $;  #: Return Type $;
2072  sub translate {  sub translate {
2073      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
2074    
2075      my( $dna,$code,$start) = @_;      my( $dna,$code,$start) = @_;
2076      my( $i,$j,$ln );      my( $i,$j,$ln );
2077      my( $x,$y );      my( $x,$y );
2078      my( $prot );      my( $prot );
2079    
2080      if (! defined($code))      if (! defined($code)) {
     {  
2081          $code = &FIG::standard_genetic_code;          $code = &FIG::standard_genetic_code;
2082      }      }
2083      $ln = length($dna);      $ln = length($dna);
2084      $prot = "X" x ($ln/3);      $prot = "X" x ($ln/3);
2085      $dna =~ tr/a-z/A-Z/;      $dna =~ tr/a-z/A-Z/;
2086    
2087      for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++)      for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
     {  
2088          $x = substr($dna,$i,3);          $x = substr($dna,$i,3);
2089          if ($y = $code->{$x})          if ($y = $code->{$x}) {
         {  
2090              substr($prot,$j,1) = $y;              substr($prot,$j,1) = $y;
2091          }          }
2092      }      }
2093    
2094      if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/))      if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
     {  
2095          substr($prot,0,1) = 'M';          substr($prot,0,1) = 'M';
2096      }      }
2097      return $prot;      return $prot;
2098  }  }
2099    
2100  =pod  =head3 reverse_comp
2101    
2102    C<< my $dnaR = FIG::reverse_comp($dna); >>
2103    
2104    or
2105    
2106    C<< my $dnaR = $fig->reverse_comp($dna); >>
2107    
2108    Return the reverse complement os the specified DNA sequence.
2109    
2110    NOTE: for extremely long DNA strings, use L</rev_comp>, which allows you to
2111    pass the strings around in the form of pointers.
2112    
2113    =over 4
2114    
2115    =item dna
2116    
2117  =head1 reverse_comp  DNA sequence whose reverse complement is desired.
2118    
2119  usage: $dnaR = &FIG::reverse_comp($dna) or  =item RETURN
2120         $dnaRP = &FIG::rev_comp($seqP)  
2121    Returns the reverse complement of the incoming DNA sequence.
2122    
2123  In WIT, we implemented reverse complement passing a pointer to a sequence and returning  =back
 a pointer to a sequence.  In most cases the pointers are a pain (although in a few they  
 are just what is needed).  Hence, I kept both versions of the function to allow you  
 to use whichever you like.  Use rev_comp only for long strings where passing pointers is a  
 reasonable effeciency issue.  
2124    
2125  =cut  =cut
2126  #: Return Type $;  #: Return Type $;
# Line 1747  Line 2131 
2131      return ${&rev_comp(\$seq)};      return ${&rev_comp(\$seq)};
2132  }  }
2133    
2134  =head1 rev_comp  =head3 rev_comp
2135    
2136    C<< my $dnaRP = FIG::rev_comp(\$dna); >>
2137    
2138    or
2139    
2140    C<< my $dnaRP = $fig->rev_comp(\$dna); >>
2141    
2142    Return the reverse complement of the specified DNA sequence. The DNA sequence
2143    is passed in as a string reference rather than a raw string for performance
2144    reasons. If this is unnecessary, use L</reverse_comp>, which processes strings
2145    instead of references to strings.
2146    
2147    =over 4
2148    
2149    =item dna
2150    
2151    Reference to the DNA sequence whose reverse complement is desired.
2152    
2153    =item RETURN
2154    
2155    Returns a reference to the reverse complement of the incoming DNA sequence.
2156    
2157    =back
2158    
2159  =cut  =cut
2160  #: Return Type $;  #: Return Type $;
# Line 1762  Line 2169 
2169      return \$rev;      return \$rev;
2170  }  }
2171    
2172  =pod  =head3 verify_dir
2173    
2174    C<< FIG::verify_dir($dir); >>
2175    
2176  =head1 verify_dir  or
2177    
2178  usage: &FIG::verify_dir($dir)  C<< $fig->verify_dir($dir); >>
2179    
2180  Makes sure that $dir exists.  If it has to create it, it sets permissions to 0777.  Insure that the specified directory exists.  If it must be created, the permissions will
2181    be set to C<0777>.
2182    
2183  =cut  =cut
2184    
# Line 1776  Line 2186 
2186      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
2187      my($dir) = @_;      my($dir) = @_;
2188    
2189      if (-d $dir) { return }      if (-d $dir) {
2190      if ($dir =~ /^(.*)\/[^\/]+$/)          return
2191      {      }
2192        if ($dir =~ /^(.*)\/[^\/]+$/) {
2193          &verify_dir($1);          &verify_dir($1);
2194      }      }
2195      mkdir($dir,0777) || die "could not make $dir: $!";      mkdir($dir,0777) || Confess("Could not make directory $dir: $!");
     # chmod 02777,$dir;  
2196  }  }
2197    
2198  =pod  =head3 run
   
 =head1 table_exists  
   
 usage: if ($fig->table_exists($table)) { }  
   
 Test for existence of table in the relational DB. This returns 0 if the table  
 does not exist; 1 if it does  
   
 =cut  
2199    
2200    C<< FIG::run($cmd); >>
2201    
2202  =pod  or
   
 =head1 run  
2203    
2204  usage: &FIG::run($cmd)  C<< $fig->run($cmd); >>
2205    
2206  Runs $cmd and fails (with trace) if the command fails.  Run a command. If the command fails, the error will be traced.
2207    
2208  =cut  =cut
2209    
# Line 1811  Line 2211 
2211      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
2212      my($cmd) = @_;      my($cmd) = @_;
2213    
2214      if ($ENV{VERBOSE})      if ($ENV{VERBOSE}) {
2215      {          my @tmp = `date`;
2216          my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n";          chomp @tmp;
2217            print STDERR "$tmp[0]: running $cmd\n";
2218      }      }
2219      (system($cmd) == 0) || confess "FAILED: $cmd";      Trace("Running command: $cmd") if T(3);
2220        (system($cmd) == 0) || Confess("FAILED: $cmd");
2221  }  }
2222    
2223    =head3 augment_path
2224    
2225    C<< FIG::augment_path($dirName); >>
2226    
2227  =pod  Add a directory to the system path.
2228    
2229  =head1 read_fasta_record(\*FILEHANDLE)  This method adds a new directory to the front of the system path. It looks in the
2230    configuration file to determine whether this is Windows or Unix, and uses the
2231    appropriate separator.
2232    
2233  Usage:     ( $seq_id, $seq_pointer, $comment ) = &read_fasta_record(\*FILEHANDLE);  =over 4
2234    
2235  Function:  Reads a FASTA-formatted sequence file one record at a time.  =item dirName
     The input filehandle defaults to STDIN if not specified.  
     Returns a sequence ID, a pointer to the sequence, and an optional  
     record comment (NOTE: Record comments are deprecated, as some tools  
     such as BLAST do not handle them gracefully). Returns an empty list  
     if attempting to read a record results in an undefined value  
     (e.g., due to reaching the EOF).  
2236    
2237  Author:    Gordon D. Pusch  Name of the directory to add to the path.
2238    
2239  Date:      2004-Feb-18  =back
2240    
2241  =cut  =cut
 #: Return Type @;  
 sub read_fasta_record  
 {  
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
     my ($file_handle) = @_;  
     my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record );  
2242    
2243      if (not defined($file_handle))  { $file_handle = \*STDIN; }  sub augment_path {
2244        my ($dirName) = @_;
2245        if ($FIG_Config::win_mode) {
2246            $ENV{PATH} = "$dirName;$ENV{PATH}";
2247        } else {
2248            $ENV{PATH} = "$dirName:$ENV{PATH}";
2249        }
2250    }
2251    
2252      $old_end_of_record = $/;  =head3 read_fasta_record
     $/ = "\n>";  
2253    
2254      if (defined($fasta_record = <$file_handle>))  C<< my ($seq_id, $seq_pointer, $comment) = FIG::read_fasta_record(\*FILEHANDLE); >>
2255      {  
2256          chomp $fasta_record;  or
2257          @lines  =  split( /\n/, $fasta_record );  
2258          $head   =  shift @lines;  C<< my ($seq_id, $seq_pointer, $comment) = $fig->read_fasta_record(\*FILEHANDLE); >>
2259          $head   =~ s/^>?//;  
2260    Read and parse the next logical record of a FASTA file. A FASTA logical record
2261    consists of multiple lines of text. The first line begins with a C<< > >> symbol
2262    and contains the sequence ID followed by an optional comment. (NOTE: comments
2263    are currently deprecated, because not all tools handle them properly.) The
2264    remaining lines contain the sequence data.
2265    
2266    This method uses a trick to smooth its operation: the line terminator character
2267    is temporarily changed to C<< \n> >> so that a single read operation brings in
2268    the entire logical record.
2269    
2270    =over 4
2271    
2272    =item FILEHANDLE
2273    
2274    Open handle of the FASTA file. If not specified, C<STDIN> is assumed.
2275    
2276    =item RETURN
2277    
2278    If we are at the end of the file, returns C<undef>. Otherwise, returns a
2279    three-element list. The first element is the sequence ID, the second is
2280    a pointer to the sequence data (that is, a string reference as opposed to
2281    as string), and the third is the comment.
2282    
2283    =back
2284    
2285    =cut
2286    #: Return Type @;
2287    sub read_fasta_record {
2288    
2289        shift if UNIVERSAL::isa($_[0],__PACKAGE__);
2290        my ($file_handle) = @_;
2291        my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record);
2292    
2293        if (not defined($file_handle))  { $file_handle = \*STDIN; }
2294    
2295        $old_end_of_record = $/;
2296        $/ = "\n>";
2297    
2298        if (defined($fasta_record = <$file_handle>)) {
2299            chomp $fasta_record;
2300            @lines  =  split( /\n/, $fasta_record );
2301            $head   =  shift @lines;
2302            $head   =~ s/^>?//;
2303          $head   =~ m/^(\S+)/;          $head   =~ m/^(\S+)/;
2304          $seq_id = $1;          $seq_id = $1;
   
2305          if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }          if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }
   
2306          $sequence  =  join( "", @lines );          $sequence  =  join( "", @lines );
   
2307          @parsed_fasta_record = ( $seq_id, \$sequence, $comment );          @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
2308      }      } else {
     else  
     {  
2309          @parsed_fasta_record = ();          @parsed_fasta_record = ();
2310      }      }
2311    
# Line 1876  Line 2314 
2314      return @parsed_fasta_record;      return @parsed_fasta_record;
2315  }  }
2316    
2317    =head3 display_id_and_seq
2318    
2319  =pod  C<< FIG::display_id_and_seq($id_and_comment, $seqP, $fh); >>
2320    
2321  =head1 display_id_and_seq  or
2322    
2323  usage: &FIG::display_id_and_seq($id_and_comment,$seqP,$fh)  C<< $fig->display_id_and_seq($id_and_comment, $seqP, $fh); >>
2324    
2325  This command has always been used to put out fasta sequences.  Note that it  Display a fasta ID and sequence to the specified open file. This method is designed
2326  takes a pointer to the sequence.  $fh is optional and defalts to STDOUT.  to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as
2327    input a string pointer rather than a string. If the file handle is omitted it
2328    defaults to STDOUT.
2329    
2330    The output is formatted into a FASTA record. The first line of the output is
2331    preceded by a C<< > >> symbol, and the sequence is split into 60-character
2332    chunks displayed one per line. Thus, this method can be used to produce
2333    FASTA files from data gathered by the rest of the system.
2334    
2335  =cut  =over 4
2336    
2337    =item id_and_comment
2338    
2339    The sequence ID and (optionally) the comment from the sequence's FASTA record.
2340    The ID
2341    
2342    =item seqP
2343    
2344    Reference to a string containing the sequence. The sequence is automatically
2345    formatted into 60-character chunks displayed one per line.
2346    
2347    =item fh
2348    
2349    Open file handle to which the ID and sequence should be output. If omitted,
2350    C<STDOUT> is assumed.
2351    
2352    =back
2353    
2354    =cut
2355    
2356  sub display_id_and_seq {  sub display_id_and_seq {
2357    
2358      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
2359    
2360      my( $id, $seq, $fh ) = @_;      my( $id, $seq, $fh ) = @_;
2361    
2362      if (! defined($fh) )  { $fh = \*STDOUT; }      if (! defined($fh) )  { $fh = \*STDOUT; }
# Line 1899  Line 2365 
2365      &display_seq($seq, $fh);      &display_seq($seq, $fh);
2366  }  }
2367    
2368    =head3 display_id_and_seq
2369    
2370    C<< FIG::display_seq($seqP, $fh); >>
2371    
2372    or
2373    
2374    C<< $fig->display_seq($seqP, $fh); >>
2375    
2376    Display a fasta sequence to the specified open file. This method is designed
2377    to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as
2378    input a string pointer rather than a string. If the file handle is omitted it
2379    defaults to STDOUT.
2380    
2381    The sequence is split into 60-character chunks displayed one per line for
2382    readability.
2383    
2384    =over 4
2385    
2386    =item seqP
2387    
2388    Reference to a string containing the sequence.
2389    
2390    =item fh
2391    
2392    Open file handle to which the sequence should be output. If omitted,
2393    C<STDOUT> is assumed.
2394    
2395    =back
2396    
2397    =cut
2398    
2399  sub display_seq {  sub display_seq {
2400    
2401      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
2402    
2403      my ( $seq, $fh ) = @_;      my ( $seq, $fh ) = @_;
2404      my ( $i, $n, $ln );      my ( $i, $n, $ln );
2405    
# Line 1908  Line 2407 
2407    
2408      $n = length($$seq);      $n = length($$seq);
2409  #   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );  #   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
2410      for ($i=0; ($i < $n); $i += 60)      for ($i=0; ($i < $n); $i += 60) {
2411      {          if (($i + 60) <= $n) {
         if (($i + 60) <= $n)  
         {  
2412              $ln = substr($$seq,$i,60);              $ln = substr($$seq,$i,60);
2413          }          } else {
         else  
         {  
2414              $ln = substr($$seq,$i,($n-$i));              $ln = substr($$seq,$i,($n-$i));
2415          }          }
2416          print $fh "$ln\n";          print $fh "$ln\n";
# Line 1926  Line 2421 
2421  ##########  be part of the SOAP/WSTL interface  ##########  be part of the SOAP/WSTL interface
2422  #=pod  #=pod
2423  #  #
2424  #=head1 file2N  #=head3 file2N
2425  #  #
2426  #usage: $n = $fig->file2N($file)  #usage: $n = $fig->file2N($file)
2427  #  #
# Line 1942  Line 2437 
2437      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
2438    
2439      if (($relational_db_response = $rdbH->SQL("SELECT fileno FROM file_table WHERE ( file = \'$file\')")) &&      if (($relational_db_response = $rdbH->SQL("SELECT fileno FROM file_table WHERE ( file = \'$file\')")) &&
2440          (@$relational_db_response == 1))          (@$relational_db_response == 1)) {
     {  
2441          return $relational_db_response->[0]->[0];          return $relational_db_response->[0]->[0];
2442      }      } elsif (($relational_db_response = $rdbH->SQL("SELECT MAX(fileno) FROM file_table "))  && (@$relational_db_response == 1) && ($relational_db_response->[0]->[0])) {
     elsif (($relational_db_response = $rdbH->SQL("SELECT MAX(fileno) FROM file_table "))  && (@$relational_db_response == 1) && ($relational_db_response->[0]->[0]))  
     {  
2443          my $fileno = $relational_db_response->[0]->[0] + 1;          my $fileno = $relational_db_response->[0]->[0] + 1;
2444          if ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', $fileno )"))          if ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', $fileno )")) {
         {  
2445              return $fileno;              return $fileno;
2446          }          }
2447      }      } elsif ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', 1 )")) {
     elsif ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', 1 )"))  
     {  
2448          return 1;          return 1;
2449      }      }
2450      return undef;      return undef;
# Line 1963  Line 2452 
2452    
2453  #=pod  #=pod
2454  #  #
2455  #=head1 N2file  #=head3 N2file
2456  #  #
2457  #usage: $filename = $fig->N2file($n)  #usage: $filename = $fig->N2file($n)
2458  #  #
# Line 1979  Line 2468 
2468      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
2469    
2470      if (($relational_db_response = $rdbH->SQL("SELECT file FROM file_table WHERE ( fileno = $fileno )")) &&      if (($relational_db_response = $rdbH->SQL("SELECT file FROM file_table WHERE ( fileno = $fileno )")) &&
2471          (@$relational_db_response == 1))          (@$relational_db_response == 1)) {
     {  
2472          return $relational_db_response->[0]->[0];          return $relational_db_response->[0]->[0];
2473      }      }
2474      return undef;      return undef;
# Line 1989  Line 2477 
2477    
2478  #=pod  #=pod
2479  #  #
2480  #=head1 openF  #=head3 openF
2481  #  #
2482  #usage: $fig->openF($filename)  #usage: $fig->openF($filename)
2483  #  #
# Line 2006  Line 2494 
2494      my($fxs,$x,@fxs,$fh);      my($fxs,$x,@fxs,$fh);
2495    
2496      $fxs = $self->cached('_openF');      $fxs = $self->cached('_openF');
2497      if ($x = $fxs->{$file})      if ($x = $fxs->{$file}) {
     {  
2498          $x->[1] = time();          $x->[1] = time();
2499          return $x->[0];          return $x->[0];
2500      }      }
2501    
2502      @fxs = keys(%$fxs);      @fxs = keys(%$fxs);
2503      if (defined($fh = new FileHandle "<$file"))      if (defined($fh = new FileHandle "<$file")) {
2504      {          if (@fxs >= 50) {
         if (@fxs >= 50)  
         {  
2505              @fxs = sort { $fxs->{$a}->[1] <=> $fxs->{$b}->[1] } @fxs;              @fxs = sort { $fxs->{$a}->[1] <=> $fxs->{$b}->[1] } @fxs;
2506              $x = $fxs->{$fxs[0]};              $x = $fxs->{$fxs[0]};
2507              undef $x->[0];              undef $x->[0];
# Line 2030  Line 2515 
2515    
2516  #=pod  #=pod
2517  #  #
2518  #=head1 closeF  #=head3 closeF
2519  #  #
2520  #usage: $fig->closeF($filename)  #usage: $fig->closeF($filename)
2521  #  #
# Line 2046  Line 2531 
2531      my($self,$file) = @_;      my($self,$file) = @_;
2532      my($fxs,$x);      my($fxs,$x);
2533    
2534      if (($fxs = $self->{_openF}) &&      if (($fxs = $self->{_openF}) && ($x = $fxs->{$file})) {
         ($x = $fxs->{$file}))  
     {  
2535          undef $x->[0];          undef $x->[0];
2536          delete $fxs->{$file};          delete $fxs->{$file};
2537      }      }
2538  }  }
2539    
2540  =pod  =head3 ec_name
2541    
2542    C<< my $enzymatic_function = $fig->ec_name($ec); >>
2543    
2544    Returns the enzymatic name corresponding to the specified enzyme code.
2545    
2546    =over 4
2547    
2548    =item ec
2549    
2550  =head1 ec_name  Code number for the enzyme whose name is desired. The code number is actually
2551    a string of digits and periods (e.g. C<1.2.50.6>).
2552    
2553  usage: $enzymatic_function = $fig->ec_name($ec)  =item RETURN
2554    
2555  Returns enzymatic name for EC.  Returns the name of the enzyme specified by the indicated code, or a null string
2556    if the code is not found in the database.
2557    
2558    =back
2559    
2560  =cut  =cut
2561    
# Line 2075  Line 2570 
2570      return "";      return "";
2571  }  }
2572    
2573  =pod  =head3 all_roles
2574    
2575  =head1 all_roles  C<< my @roles = $fig->all_roles; >>
2576    
2577  usage: @roles = $fig->all_roles  Return a list of the known roles. Currently, this is a list of the enzyme codes and names.
2578    
2579  Supposed to return all known roles.  For now, we get all ECs with "names".  The return value is a list of list references. Each element of the big list contains an
2580    enzyme code (EC) followed by the enzymatic name.
2581    
2582  =cut  =cut
2583    
# Line 2094  Line 2590 
2590      return @$relational_db_response;      return @$relational_db_response;
2591  }  }
2592    
2593  =pod  =head3 expand_ec
   
 =head1 expand_ec  
2594    
2595  usage: $expanded_ec = $fig->expand_ec($ec)  C<< my $expanded_ec = $fig->expand_ec($ec); >>
2596    
2597  Expands "1.1.1.1" to "1.1.1.1 - alcohol dehydrogenase" or something like that.  Expands "1.1.1.1" to "1.1.1.1 - alcohol dehydrogenase" or something like that.
2598    
# Line 2111  Line 2605 
2605      return ($name = $self->ec_name($ec)) ? "$ec - $name" : $ec;      return ($name = $self->ec_name($ec)) ? "$ec - $name" : $ec;
2606  }  }
2607    
2608    =head3 clean_tmp
2609    
2610  =pod  C<< FIG::clean_tmp(); >>
   
 =head1 clean_tmp  
2611    
2612  usage: &FIG::clean_tmp  Delete temporary files more than two days old.
2613    
2614  We store temporary files in $FIG_Config::temp.  There are specific classes of files  We store temporary files in $FIG_Config::temp.  There are specific classes of files
2615  that are created and should be saved for at least a few days.  This routine can be  that are created and should be saved for at least a few days.  This routine can be
# Line 2127  Line 2620 
2620  sub clean_tmp {  sub clean_tmp {
2621    
2622      my($file);      my($file);
2623      if (opendir(TMP,"$FIG_Config::temp"))      if (opendir(TMP,"$FIG_Config::temp")) {
     {  
2624  #       change the pattern to pick up other files that need to be cleaned up  #       change the pattern to pick up other files that need to be cleaned up
2625          my @temp = grep { $_ =~ /^(Geno|tmp)/ } readdir(TMP);          my @temp = grep { $_ =~ /^(Geno|tmp)/ } readdir(TMP);
2626          foreach $file (@temp)          foreach $file (@temp) {
2627          {              if (-M "$FIG_Config::temp/$file" > 2) {
             if (-M "$FIG_Config::temp/$file" > 2)  
             {  
2628                  unlink("$FIG_Config::temp/$file");                  unlink("$FIG_Config::temp/$file");
2629              }              }
2630          }          }
# Line 2144  Line 2634 
2634  ################ Routines to process genomes and genome IDs  ##########################  ################ Routines to process genomes and genome IDs  ##########################
2635    
2636    
2637  =pod  =head3 genomes
2638    
2639  =head1 genomes  C<< my @genome_ids = $fig->genomes($complete, $restrictions, $domain); >>
2640    
2641  usage: @genome_ids = $fig->genomes( $complete, $restrictions, $domain );  Return a list of genome IDs. If called with no parameters, all genome IDs
2642    in the database will be returned.
2643    
2644  Genomes are assigned ids of the form X.Y where X is the taxonomic id maintained by  Genomes are assigned ids of the form X.Y where X is the taxonomic id maintained by
2645  NCBI for the species (not the specific strain), and Y is a sequence digit assigned to  NCBI for the species (not the specific strain), and Y is a sequence digit assigned to
2646  this particular genome (as one of a set with the same genus/species).  Genomes also  this particular genome (as one of a set with the same genus/species).  Genomes also
2647  have versions, but that is a separate issue.  have versions, but that is a separate issue.
2648    
2649    =over 4
2650    
2651    =item complete
2652    
2653    TRUE if only complete genomes should be returned, else FALSE.
2654    
2655    =item restrictions
2656    
2657    TRUE if only restriction genomes should be returned, else FALSE.
2658    
2659    =item domain
2660    
2661    Name of the domain from which the genomes should be returned. Possible values are
2662    C<Bacteria>, C<Virus>, C<Eukaryota>, C<unknown>, C<Archaea>, and
2663    C<Environmental Sample>. If no domain is specified, all domains will be
2664    eligible.
2665    
2666    =item RETURN
2667    
2668    Returns a list of all the genome IDs with the specified characteristics.
2669    
2670    =back
2671    
2672  =cut  =cut
2673    
2674  sub genomes  :remote :list {  sub genomes  :remote :list {
# Line 2163  Line 2677 
2677      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
2678    
2679      my @where = ();      my @where = ();
2680      if ($complete)      if ($complete) {
     {  
2681          push(@where,"( complete = \'1\' )")          push(@where,"( complete = \'1\' )")
2682      }      }
2683    
2684      if ($restrictions)      if ($restrictions) {
     {  
2685          push(@where,"( restrictions = \'1\' )")          push(@where,"( restrictions = \'1\' )")
2686      }      }
2687    
2688      if ($domain)      if ($domain) {
     {  
2689          push( @where, "( maindomain = '$domain' )" )          push( @where, "( maindomain = '$domain' )" )
2690      }      }
2691    
2692      my $relational_db_response;      my $relational_db_response;
2693      if (@where > 0)      if (@where > 0) {
     {  
2694          my $where = join(" AND ",@where);          my $where = join(" AND ",@where);
2695          $relational_db_response = $rdbH->SQL("SELECT genome  FROM genome where $where");          $relational_db_response = $rdbH->SQL("SELECT genome  FROM genome where $where");
2696      }      } else {
     else  
     {  
2697          $relational_db_response = $rdbH->SQL("SELECT genome  FROM genome");          $relational_db_response = $rdbH->SQL("SELECT genome  FROM genome");
2698      }      }
2699      my @genomes = sort { $a <=> $b } map { $_->[0] } @$relational_db_response;      my @genomes = sort { $a <=> $b } map { $_->[0] } @$relational_db_response;
2700      return @genomes;      return @genomes;
2701  }  }
2702    
2703    =head3 is_complete
2704    
2705    C<< my $flag = $fig->is_complete($genome); >>
2706    
2707    Return TRUE if the genome with the specified ID is complete, else FALSE.
2708    
2709    =over 4
2710    
2711    =item genome
2712    
2713    ID of the relevant genome.
2714    
2715    =item RETURN
2716    
2717    Returns TRUE if there is a complete genome in the database with the specified ID,
2718    else FALSE.
2719    
2720    =back
2721    
2722    =cut
2723    
2724  sub is_complete {  sub is_complete {
2725      my($self,$genome) = @_;      my($self,$genome) = @_;
2726    
# Line 2200  Line 2729 
2729      return (@$relational_db_response == 1)      return (@$relational_db_response == 1)
2730      }      }
2731    
2732    =head3 genome_counts
2733    
2734    C<< my ($arch, $bact, $euk, $vir, $env, $unk) = $fig->genome_counts($complete); >>
2735    
2736    Count the number of genomes in each domain. If I<$complete> is TRUE, only complete
2737    genomes will be included in the counts.
2738    
2739    =over 4
2740    
2741    =item complete
2742    
2743    TRUE if only complete genomes are to be counted, FALSE if all genomes are to be
2744    counted
2745    
2746    =item RETURN
2747    
2748    A six-element list containing the number of genomes in each of six categories--
2749    Archaea, Bacteria, Eukaryota, Viral, Environmental, and Unknown, respectively.
2750    
2751    =back
2752    
2753    =cut
2754    
2755  sub genome_counts {  sub genome_counts {
2756      my($self,$complete) = @_;      my($self,$complete) = @_;
2757      my($x,$relational_db_response);      my($x,$relational_db_response);
2758    
2759      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
2760    
2761      if ($complete)      if ($complete) {
     {  
2762          $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome where complete = '1'");          $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome where complete = '1'");
2763      }      } else {
     else  
     {  
2764          $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome");          $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome");
2765      }      }
2766    
2767      my ($arch, $bact, $euk, $vir, $env, $unk) = (0, 0, 0, 0, 0, 0);      my ($arch, $bact, $euk, $vir, $env, $unk) = (0, 0, 0, 0, 0, 0);
2768      if (@$relational_db_response > 0)      if (@$relational_db_response > 0) {
2769      {          foreach $x (@$relational_db_response) {
         foreach $x (@$relational_db_response)  
         {  
2770              if    ($x->[1] =~ /^archaea/i)  { ++$arch }              if    ($x->[1] =~ /^archaea/i)  { ++$arch }
2771              elsif ($x->[1] =~ /^bacter/i)   { ++$bact }              elsif ($x->[1] =~ /^bacter/i)   { ++$bact }
2772              elsif ($x->[1] =~ /^eukar/i)    { ++$euk }              elsif ($x->[1] =~ /^eukar/i)    { ++$euk }
# Line 2233  Line 2780 
2780  }  }
2781    
2782    
2783  =pod  =head3 genome_domain
2784    
2785  =head1 genome_domain  C<< my $domain = $fig->genome_domain($genome_id); >>
2786    
2787  usage: $domain = $fig->genome_domain($genome_id);  Find the domain of a genome.
2788    
2789  Returns the domain of a genome ID, and 'undef' if it is not in the database.  =over 4
2790    
2791    =item genome_id
2792    
2793    ID of the genome whose domain is desired.
2794    
2795    =item RETURN
2796    
2797    Returns the name of the genome's domain (archaea, bacteria, etc.), or C<undef> if
2798    the genome is not in the database.
2799    
2800  =cut  =cut
2801    
# Line 2248  Line 2804 
2804      my $relational_db_response;      my $relational_db_response;
2805      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
2806    
2807      if ($genome)      if ($genome) {
     {  
2808          if (($relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome WHERE ( genome = \'$genome\' )"))          if (($relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome WHERE ( genome = \'$genome\' )"))
2809              && (@$relational_db_response == 1))              && (@$relational_db_response == 1)) {
         {  
2810              # die Dumper($relational_db_response);              # die Dumper($relational_db_response);
2811              return $relational_db_response->[0]->[1];              return $relational_db_response->[0]->[1];
2812          }          }
# Line 2261  Line 2815 
2815  }  }
2816    
2817    
2818  =pod  =head3 genome_pegs
2819    
2820  =head1 genome_pegs  C<< my $num_pegs = $fig->genome_pegs($genome_id); >>
2821    
2822  usage: $num_pegs = $fig->genome_pegs($genome_id);  Return the number of protein-encoding genes (PEGs) for a specified
2823    genome.
2824    
2825  Returns the number of protein-encoding genes (PEGs) in $genome_id if  =over 4
2826  "$genome_id" is indexed in the "genome" database, and 'undef' otherwise.  
2827    =item genome_id
2828    
2829    ID of the genome whose PEG count is desired.
2830    
2831    =item RETURN
2832    
2833    Returns the number of PEGs for the specified genome, or C<undef> if the genome
2834    is not indexed in the database.
2835    
2836    =back
2837    
2838  =cut  =cut
2839    
# Line 2277  Line 2842 
2842      my $relational_db_response;      my $relational_db_response;
2843      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
2844    
2845      if ($genome)      if ($genome) {
     {  
2846          if (($relational_db_response = $rdbH->SQL("SELECT pegs FROM genome WHERE ( genome = \'$genome\' )"))          if (($relational_db_response = $rdbH->SQL("SELECT pegs FROM genome WHERE ( genome = \'$genome\' )"))
2847              && (@$relational_db_response == 1))              && (@$relational_db_response == 1)) {
         {  
2848              return $relational_db_response->[0]->[0];              return $relational_db_response->[0]->[0];
2849          }          }
2850      }      }
# Line 2289  Line 2852 
2852  }  }
2853    
2854    
2855  =pod  =head3 genome_rnas
   
 =head1 genome_rnas  
2856    
2857  usage: $num_rnas = $fig->genome_rnas($genome_id);  C<< my $num_rnas = $fig->genome_rnas($genome_id); >>
2858    
2859  Returns the number of RNA-encoding genes (RNAs) in $genome_id if  Return the number of RNA-encoding genes for a genome.
2860  "$genome_id" is indexed in the "genome" database, and 'undef' otherwise.  "$genome_id" is indexed in the "genome" database, and 'undef' otherwise.
2861    
2862    =over 4
2863    
2864    =item genome_id
2865    
2866    ID of the genome whose RNA count is desired.
2867    
2868    =item RETURN
2869    
2870    Returns the number of RNAs for the specified genome, or C<undef> if the genome
2871    is not indexed in the database.
2872    
2873    =back
2874    
2875  =cut  =cut
2876    
2877  sub genome_rnas {  sub genome_rnas {
# Line 2305  Line 2879 
2879      my $relational_db_response;      my $relational_db_response;
2880      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
2881    
2882      if ($genome)      if ($genome) {
     {  
2883          if (($relational_db_response = $rdbH->SQL("SELECT rnas FROM genome WHERE ( genome = \'$genome\' )"))          if (($relational_db_response = $rdbH->SQL("SELECT rnas FROM genome WHERE ( genome = \'$genome\' )"))
2884              && (@$relational_db_response == 1))              && (@$relational_db_response == 1)) {
         {  
2885              return $relational_db_response->[0]->[0];              return $relational_db_response->[0]->[0];
2886          }          }
2887      }      }
# Line 2317  Line 2889 
2889  }  }
2890    
2891    
2892  =pod  =head3 genome_szdna
   
 =head1 genome_szdna  
2893    
2894  usage: $szdna = $fig->genome_szdna($genome_id);  usage: $szdna = $fig->genome_szdna($genome_id);
2895    
2896  Returns the number of DNA base-pairs in the genome contigs file(s) of $genome_id  Return the number of DNA base-pairs in a genome's contigs.
2897  "$genome_id" is indexed in the "genome" database, and 'undef' otherwise.  
2898    =over 4
2899    
2900    =item genome_id
2901    
2902    ID of the genome whose base-pair count is desired.
2903    
2904    =item RETURN
2905    
2906    Returns the number of base pairs in the specified genome's contigs, or C<undef>
2907    if the genome is not indexed in the database.
2908    
2909    =back
2910    
2911  =cut  =cut
2912    
# Line 2333  Line 2915 
2915      my $relational_db_response;      my $relational_db_response;
2916      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
2917    
2918      if ($genome)      if ($genome) {
2919      {          if (($relational_db_response =
2920          if (($relational_db_response = $rdbH->SQL("SELECT szdna FROM genome WHERE ( genome = \'$genome\' )"))              $rdbH->SQL("SELECT szdna FROM genome WHERE ( genome = \'$genome\' )"))
2921              && (@$relational_db_response == 1))              && (@$relational_db_response == 1)) {
2922          {  
2923              return $relational_db_response->[0]->[0];              return $relational_db_response->[0]->[0];
2924    
2925          }          }
2926      }      }
2927      return undef;      return undef;
2928  }  }
2929    
2930    =head3 genome_version
2931    
2932  =pod  C<< my $version = $fig->genome_version($genome_id); >>
2933    
2934  =head1 genome_version  Return the version number of the specified genome.
   
 usage: $version = $fig->genome_version($genome_id);  
2935    
2936  Versions are incremented for major updates.  They are put in as major  Versions are incremented for major updates.  They are put in as major
2937  updates of the form 1.0, 2.0, ...  updates of the form 1.0, 2.0, ...
# Line 2363  Line 2945 
2945  the DNA or amino acid sequences).  However, the basic intent of the system is to  the DNA or amino acid sequences).  However, the basic intent of the system is to
2946  support editing by the main group issuing periodic major updates.  support editing by the main group issuing periodic major updates.
2947    
2948    =over 4
2949    
2950    =item genome_id
2951    
2952    ID of the genome whose version is desired.
2953    
2954    =item RETURN
2955    
2956    Returns the version number of the specified genome, or C<undef> if the genome is not in
2957    the data store or no version number has been assigned.
2958    
2959    =back
2960    
2961  =cut  =cut
2962    
2963  sub genome_version :scalar {  sub genome_version :scalar {
# Line 2371  Line 2966 
2966      my(@tmp);      my(@tmp);
2967      if ((-s "$FIG_Config::organisms/$genome/VERSION") &&      if ((-s "$FIG_Config::organisms/$genome/VERSION") &&
2968          (@tmp = `cat $FIG_Config::organisms/$genome/VERSION`) &&          (@tmp = `cat $FIG_Config::organisms/$genome/VERSION`) &&
2969          ($tmp[0] =~ /^(\S+)$/))                  ($tmp[0] =~ /^(\S+)$/)) {
     {  
2970          return $1;          return $1;
2971      }      }
2972      return undef;      return undef;
2973  }  }
2974    
2975  =head1 genome_md5sum  =head3 genome_md5sum
2976    
2977  usage: $md5sum = $fig->genome_md5sum($genome_id);  C<< my $md5sum = $fig->genome_md5sum($genome_id); >>
2978    
2979  Returns the MD5 checksum of genome $genome_id.  Returns the MD5 checksum of the specified genome.
2980    
2981  The checksum of a genome is defined as the checksum of its signature file. The signature  The checksum of a genome is defined as the checksum of its signature file. The signature
2982  file consists of tab-separated lines, one for each contig, ordered by the contig id.  file consists of tab-separated lines, one for each contig, ordered by the contig id.
2983  Each line contains the contig ID, the length of the contig in nucleotides, and the  Each line contains the contig ID, the length of the contig in nucleotides, and the
2984  MD5 checksum of the nucleotide data, with uppercase letters forced to lower case.  MD5 checksum of the nucleotide data, with uppercase letters forced to lower case.
2985    
2986    The checksum is indexed in the database. If you know a genome's checksum, you can use
2987    the L</genome_with_md5sum> method to find its ID in the database.
2988    
2989    =over 4
2990    
2991    =item genome
2992    
2993    ID of the genome whose checksum is desired.
2994    
2995    =item RETURN
2996    
2997    Returns the specified genome's checksum, or C<undef> if the genome is not in the
2998    database.
2999    
3000    =back
3001    
3002  =cut  =cut
3003    
# Line 2397  Line 3006 
3006      my $relational_db_response;      my $relational_db_response;
3007      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
3008    
3009      if ($genome)      if ($genome) {
3010      {          if (($relational_db_response =
3011          if (($relational_db_response = $rdbH->SQL("SELECT md5sum FROM genome_md5sum WHERE ( genome = \'$genome\' )"))               $rdbH->SQL("SELECT md5sum FROM genome_md5sum WHERE ( genome = \'$genome\' )"))
3012              && (@$relational_db_response == 1))              && (@$relational_db_response == 1)) {
         {  
3013              return $relational_db_response->[0]->[0];              return $relational_db_response->[0]->[0];
3014          }          }
3015      }      }
3016      return undef;      return undef;
3017  }  }
3018    
3019  =head1 genome_with_md5sum  =head3 genome_with_md5sum
3020    
3021    C<< my $genome = $fig->genome_with_md5sum($cksum); >>
3022    
3023    Find a genome with the specified checksum.
3024    
3025  usage: $genome = $fig->genome_with_md5sum($cksum);  The MD5 checksum is computed from the content of the genome (see L</genome_md5sum>). This method
3026    can be used to determine if a genome already exists for a specified content.
3027    
3028    =over 4
3029    
3030    =item cksum
3031    
3032    Checksum to use for searching the genome table.
3033    
3034    =item RETURN
3035    
3036  Find a genome with checksum $cksum.  The ID of a genome with the specified checksum, or C<undef> if no such genome exists.
3037    
3038    =back
3039    
3040  =cut  =cut
3041    
# Line 2421  Line 3044 
3044      my $relational_db_response;      my $relational_db_response;
3045      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
3046    
3047      if (($relational_db_response = $rdbH->SQL("SELECT genome FROM genome_md5sum WHERE ( md5sum = \'$cksum\' )"))      if (($relational_db_response =
3048          && (@$relational_db_response == 1))           $rdbH->SQL("SELECT genome FROM genome_md5sum WHERE ( md5sum = \'$cksum\' )"))
3049      {               && (@$relational_db_response == 1)) {
3050          return $relational_db_response->[0]->[0];          return $relational_db_response->[0]->[0];
3051      }      }
3052    
3053      return undef;      return undef;
3054  }  }
3055    
3056    =head3 contig_md5sum
3057    
3058    C<< my $cksum = $fig->contig_md5sum($genome, $contig); >>
3059    
3060    Return the MD5 checksum for a contig. The MD5 checksum is computed from the content
3061    of the contig. This method retrieves the checksum stored in the database. The checksum
3062    can be compared to the checksum of an external contig as a cheap way of seeing if they
3063    match.
3064    
3065    =over 4
3066    
3067    =item genome
3068    
3069    ID of the genome containing the contig.
3070    
3071    =item contig
3072    
3073    ID of the relevant contig.
3074    
3075    =item RETURN
3076    
3077    Returns the checksum of the specified contig, or C<undef> if the contig is not in the
3078    database.
3079    
3080    =back
3081    
3082    =cut
3083    
3084  sub contig_md5sum :scalar {  sub contig_md5sum :scalar {
3085      my($self, $genome, $contig) = @_;      my($self, $genome, $contig) = @_;
3086      my $relational_db_response;      my $relational_db_response;
3087      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
3088    
3089      if ($genome)      if ($genome) {
3090      {          if (($relational_db_response =
3091          if (($relational_db_response = $rdbH->SQL(qq(SELECT md5 FROM contig_md5sums               $rdbH->SQL(qq(SELECT md5 FROM contig_md5sums WHERE (genome = ? AND contig = ?)), undef, $genome, $contig))
3092                                                       WHERE (genome = ? AND               && (@$relational_db_response == 1)) {
                                                             contig = ?)), undef, $genome, $contig))  
             && (@$relational_db_response == 1))  
         {  
3093              return $relational_db_response->[0]->[0];              return $relational_db_response->[0]->[0];
3094          }          }
3095      }      }
3096      return undef;      return undef;
3097  }  }
3098    
3099  =pod  =head3 genus_species
3100    
3101    C<< my $gs = $fig->genus_species($genome_id); >>
3102    
3103    Return the genus, species, and possibly also the strain of a specified genome.
3104    
3105  =head1 genus_species  This method converts a genome ID into a more recognizble species name. The species name
3106    is stored directly in the genome table of the database. Essentially, if the strain is
3107    present in the database, it will be returned by this method, and if it's not present,
3108    it won't.
3109    
3110  usage: $gs = $fig->genus_species($genome_id)  =over 4
3111    
3112    =item genome_id
3113    
3114  Returns the genus and species (and strain if that has been properly recorded)  ID of the genome whose name is desired.
3115  in a printable form.  
3116    =item RETURN
3117    
3118    Returns the scientific species name associated with the specified ID, or C<undef> if the
3119    ID is not in the database.
3120    
3121    =back
3122    
3123  =cut  =cut
3124    
# Line 2464  Line 3127 
3127      my $ans;      my $ans;
3128    
3129      my $genus_species = $self->cached('_genus_species');      my $genus_species = $self->cached('_genus_species');
3130      if (! ($ans = $genus_species->{$genome}))      if (! ($ans = $genus_species->{$genome})) {
     {  
3131          my $rdbH = $self->db_handle;          my $rdbH = $self->db_handle;
3132          my $relational_db_response = $rdbH->SQL("SELECT genome,gname  FROM genome");          my $relational_db_response = $rdbH->SQL("SELECT genome,gname  FROM genome");
3133          my $pair;          my $pair;
3134          foreach $pair (@$relational_db_response)          foreach $pair (@$relational_db_response) {
         {  
3135              $genus_species->{$pair->[0]} = $pair->[1];              $genus_species->{$pair->[0]} = $pair->[1];
3136          }          }
3137          $ans = $genus_species->{$genome};          $ans = $genus_species->{$genome};
# Line 2478  Line 3139 
3139      return $ans;      return $ans;
3140  }  }
3141    
3142  =pod  =head3 org_of
3143    
3144  =head1 org_of  C<< my $org = $fig->org_of($prot_id); >>
3145    
3146  usage: $org = $fig->org_of($prot_id)  Return the genus/species name of the organism containing a protein. Note that in this context
3147    I<protein> is not a certain string of amino acids but a protein encoding region on a specific
3148    contig.
3149    
3150    For a FIG protein ID (e.g. C<fig|134537.1.peg.123>), the organism and strain
3151    information is always available. In the case of external proteins, we can usually
3152    determine an organism, but not anything more precise than genus/species (and
3153    often not that). When the organism name is not present, a null string is returned.
3154    
3155  In the case of external proteins, we can usually determine an organism, but not  =over 4
3156  anything more precise than genus/species (and often not that).  This routine takes  
3157  a protein ID (which may be a feature ID) and returns "the organism".  =item prot_id
3158    
3159    Protein or feature ID.
3160    
3161    =item RETURN
3162    
3163    Returns the displayable scientific name (genus, species, and strain) of the organism containing
3164    the identified PEG. If the name is not available, returns a null string. If the PEG is not found,
3165    returns C<undef>.
3166    
3167    =back
3168    
3169  =cut  =cut
3170    
# Line 2495  Line 3173 
3173      my $relational_db_response;      my $relational_db_response;
3174      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
3175    
3176      if ($prot_id =~ /^fig\|/)      if ($prot_id =~ /^fig\|/) {
     {  
         #  
         #  Trying to guess what Ross wanted (there was a servere bug):  
         #  
         #  deleted -> undefined  
         #  failed lookup -> ""  
         #  
3177          return  $self->is_deleted_fid( $prot_id) ? undef          return  $self->is_deleted_fid( $prot_id) ? undef
3178                : $self->genus_species( $self->genome_of( $prot_id ) ) || "";                : $self->genus_species( $self->genome_of( $prot_id ) ) || "";
3179      }      }
3180    
3181      if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) &&      if (($relational_db_response =
3182          (@$relational_db_response >= 1))              $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) &&
3183      {              (@$relational_db_response >= 1)) {
3184          $relational_db_response->[0]->[0] =~ s/^\d+://;          $relational_db_response->[0]->[0] =~ s/^\d+://;
3185          return $relational_db_response->[0]->[0];          return $relational_db_response->[0]->[0];
3186      }      }
3187      return "";      return "";
3188  }  }
3189    
3190  #  =head3 genus_species_domain
3191  #  Support for colorizing organisms by domain  
3192  #  -- GJO  C<< my ($gs, $domain) = $fig->genus_species_domain($genome_id); >>
3193  #  
3194  =pod  Returns a genome's genus and species (and strain if that has been properly
3195    recorded) in a printable form, along with its domain. This method is similar
3196    to L</genus_species>, except it also returns the domain name (archaea,
3197    bacteria, etc.).
3198    
3199  =head1 genus_species_domain  =over 4
3200    
3201    =item genome_id
3202    
3203  usage: ($gs, $domain) = $fig->genus_species_domain($genome_id)  ID of the genome whose species and domain information is desired.
3204    
3205    =item RETURN
3206    
3207  Returns the genus and species (and strain if that has been properly recorded)  Returns a two-element list. The first element is the species name and the
3208  in a printable form, and domain.  second is the domain name.
3209    
3210    =back
3211    
3212  =cut  =cut
3213    
# Line 2535  Line 3215 
3215      my ($self, $genome) = @_;      my ($self, $genome) = @_;
3216    
3217      my $genus_species_domain = $self->cached('_genus_species_domain');      my $genus_species_domain = $self->cached('_genus_species_domain');
3218      if ( ! $genus_species_domain->{ $genome } )      if ( ! $genus_species_domain->{ $genome } ) {
     {  
3219          my $rdbH = $self->db_handle;          my $rdbH = $self->db_handle;
3220          my $relational_db_response = $rdbH->SQL("SELECT genome,gname,maindomain FROM genome");          my $relational_db_response = $rdbH->SQL("SELECT genome,gname,maindomain FROM genome");
3221          my $triple;          my $triple;
3222          foreach $triple ( @$relational_db_response )          foreach $triple ( @$relational_db_response ) {
         {  
3223              $genus_species_domain->{ $triple->[0] } = [ $triple->[1], $triple->[2] ];              $genus_species_domain->{ $triple->[0] } = [ $triple->[1], $triple->[2] ];
3224          }          }
3225      }      }
# Line 2549  Line 3227 
3227      return $gsdref ? @$gsdref : ( "", "" );      return $gsdref ? @$gsdref : ( "", "" );
3228  }  }
3229    
3230    =head3 domain_color
3231    
3232    C<< my $web_color = FIG::domain_color($domain); >>
3233    
3234    Return the web color string associated with a specified domain. The colors are
3235    extremely subtle (86% luminance), so they absolutely require a black background.
3236    Archaea are slightly cyan, bacteria are slightly magenta, eukaryota are slightly
3237    yellow, viruses are slightly silver, environmental samples are slightly gray,
3238    and unknown or invalid domains are pure white.
3239    
3240    =over 4
3241    
3242    =item domain
3243    
3244    Name of the domain whose color is desired.
3245    
3246    =item RETURN
3247    
3248    Returns a web color string for the specified domain (e.g. C<#FFDDFF> for
3249    bacteria).
3250    
3251    =back
3252    
3253    =cut
3254    
3255  my %domain_color = ( AR => "#DDFFFF", BA => "#FFDDFF", EU => "#FFFFDD",  my %domain_color = ( AR => "#DDFFFF", BA => "#FFDDFF", EU => "#FFFFDD",
3256                       VI => "#DDDDDD", EN => "#BBBBBB" );                       VI => "#DDDDDD", EN => "#BBBBBB" );
# Line 2559  Line 3261 
3261      return $domain_color{ uc substr($domain, 0, 2) } || "#FFFFFF";      return $domain_color{ uc substr($domain, 0, 2) } || "#FFFFFF";
3262  }  }
3263    
3264    =head3 org_and_color_of
3265    
3266  =pod  C<< my ($org, $color) = $fig->org_and_domain_of($prot_id); >>
   
 =head1 org_and_color_of  
   
 usage: ($org, $color) = $fig->org_and_domain_of($prot_id)  
3267    
3268  Return the best guess organism and domain html color string of an organism.  Return the best guess organism and domain html color string of an organism.
3269  In the case of external proteins, we can usually determine an organism, but not  In the case of external proteins, we can usually determine an organism, but not
3270  anything more precise than genus/species (and often not that).  This routine takes  anything more precise than genus/species (and often not that).
3271  a protein ID (which may be a feature ID) and returns "the organism".  
3272    =over 4
3273    
3274    =item prot_id
3275    
3276    Relevant protein or feature ID.
3277    
3278    =item RETURN
3279    
3280    Returns a two-element list. The first element is the displayable organism name, and the second
3281    is an HTML color string based on the domain (see L</domain_color>).
3282    
3283    =back
3284    
3285  =cut  =cut
3286    
# Line 2578  Line 3289 
3289      my $relational_db_response;      my $relational_db_response;
3290      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
3291    
3292      if ($prot_id =~ /^fig\|/)      if ($prot_id =~ /^fig\|/) {
     {  
3293          my( $gs, $domain ) = $self->genus_species_domain($self->genome_of($prot_id));          my( $gs, $domain ) = $self->genus_species_domain($self->genome_of($prot_id));
3294          return ( $gs, domain_color( $domain ) );          return ( $gs, domain_color( $domain ) );
3295      }      }
3296    
3297      if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) &&      if (($relational_db_response =
3298          (@$relational_db_response >= 1))           $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) &&
3299      {           (@$relational_db_response >= 1)) {
3300          return ($relational_db_response->[0]->[0], "#FFFFFF");          return ($relational_db_response->[0]->[0], "#FFFFFF");
3301      }      }
3302      return ("", "#FFFFFF");      return ("", "#FFFFFF");
3303  }  }
3304    
3305  #  =head3 abbrev
3306  #  End of support for colorizing organisms by domain  
3307  #  -- GJO  C<< my $abbreviated_name = FIG::abbrev($genome_name); >>
 #  
3308    
3309  =pod  or
3310    
3311  =head1 abbrev  C<< my $abbreviated_name = $fig->abbrev($genome_name); >>
3312    
3313  usage: $abbreviated_name = $fig->abbrev($genome_name)  Abbreviate a genome name to 10 characters or less.
3314    
3315  For alignments and such, it is very useful to be able to produce an abbreviation of genus/species.  For alignments and such, it is very useful to be able to produce an abbreviation of genus/species.
3316  That's what this does.  Note that multiple genus/species might reduce to the same abbreviation, so  That's what this does.  Note that multiple genus/species might reduce to the same abbreviation, so
3317  be careful (disambiguate them, if you must).  be careful (disambiguate them, if you must).
3318    
3319    The abbreviation is formed from the first three letters of the species name followed by the
3320    first three letters of the genus name followed by the first three letters of the species name and
3321    then the next four nonblank characters.
3322    
3323    =over 4
3324    
3325    =item genome_name
3326    
3327    The name to abbreviate.
3328    
3329    =item RETURN
3330    
3331    An abbreviated version of the specified name.
3332    
3333    =back
3334    
3335  =cut  =cut
3336    
3337  sub abbrev :scalar {  sub abbrev :scalar {
# Line 2616  Line 3341 
3341      $genome_name =~ s/^(\S{3})\S+/$1./;      $genome_name =~ s/^(\S{3})\S+/$1./;
3342      $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./;      $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./;
3343      $genome_name =~ s/ //g;      $genome_name =~ s/ //g;
3344      if (length($genome_name) > 10)      if (length($genome_name) > 10) {
     {  
3345          $genome_name = substr($genome_name,0,10);          $genome_name = substr($genome_name,0,10);
3346      }      }
3347      return $genome_name;      return $genome_name;
# Line 2625  Line 3349 
3349    
3350  ################ Routines to process Features and Feature IDs  ##########################  ################ Routines to process Features and Feature IDs  ##########################
3351    
3352  =pod  =head3 ftype
3353    
3354    C<< my $type = FIG::ftype($fid); >>
3355    
3356  =head1 ftype  or
3357    
3358  usage: $type = &FIG::ftype($fid)  C<< my $type = $fig->ftype($fid); >>
3359    
3360  Returns the type of a feature, given the feature ID.  This just amounts  Returns the type of a feature, given the feature ID.  This just amounts
3361  to lifting it out of the feature ID, since features have IDs of tghe form  to lifting it out of the feature ID, since features have IDs of the form
3362    
3363      fig|x.y.f.n      fig|x.y.f.n
3364    
3365  where  where
3366          x.y is the genome ID          x.y is the genome ID
3367          f   is the type pf feature          f   is the type of feature
3368          n   is an integer that is unique within the genome/type          n   is an integer that is unique within the genome/type
3369    
3370    =over 4
3371    
3372    =item fid
3373    
3374    FIG ID of the feature whose type is desired.
3375    
3376    =item RETURN
3377    
3378    Returns the feature type (e.g. C<peg>, C<rna>, C<pi>, or C<pp>), or C<undef> if the
3379    feature ID is not a FIG ID.
3380    
3381    =back
3382    
3383  =cut  =cut
3384    
3385  sub ftype {  sub ftype {
3386      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
3387      my($feature_id) = @_;      my($feature_id) = @_;
3388    
3389      if ($feature_id =~ /^fig\|\d+\.\d+\.([^\.]+)/)      if ($feature_id =~ /^fig\|\d+\.\d+\.([^\.]+)/) {
     {  
3390          return $1;          return $1;
3391      }      }
3392      return undef;      return undef;
3393  }  }
3394    
3395  =pod  =head3 genome_of
3396    
3397    C<< my $genome_id = $fig->genome_of($fid); >>
3398    
3399    or
3400    
3401    C<< my $genome_id = FIG::genome_of($fid); >>
3402    
3403    Return the genome ID from a feature ID.
3404    
3405  =head1 genome_of  =over 4
3406    
3407  usage: $genome_id = $fig->genome_of($fid)  =item fid
3408    
3409  This just extracts the genome ID from a feature ID.  ID of the feature whose genome ID is desired.
3410    
3411    =item RETURN
3412    
3413    If the feature ID is a FIG ID, returns the genome ID embedded inside it; otherwise, it
3414    returns C<undef>.
3415    
3416    =back
3417    
3418  =cut  =cut
3419    
# Line 2673  Line 3426 
3426      return undef;      return undef;
3427  }  }
3428    
3429  =head1 genome_and_peg_of  =head3 genome_and_peg_of
3430    
3431  usage: ($genome_id, $peg_number = $fig->genome_and_peg_of($fid)  C<< my ($genome_id, $peg_number = FIG::genome_and_peg_of($fid); >>
3432    
3433  This just extracts the genome ID and peg number from a feature ID.  C<< my ($genome_id, $peg_number = $fig->genome_and_peg_of($fid); >>
3434    
3435  =cut  Return the genome ID and peg number from a feature ID.
3436    
3437    =over 4
3438    
3439    =item prot_id
3440    
3441    ID of the feature whose genome and PEG number as desired.
3442    
3443    =item RETURN
3444    
3445    Returns the genome ID and peg number associated with a feature if the feature
3446    is represented by a FIG ID, else C<undef>.
3447    
3448    =back
3449    
3450    =cut
3451    
3452  sub genome_and_peg_of {  sub genome_and_peg_of {
3453      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
3454      my $prot_id = (@_ == 1) ? $_[0] : $_[1];      my $prot_id = (@_ == 1) ? $_[0] : $_[1];
3455    
3456      if ($prot_id =~ /^fig\|(\d+\.\d+)\.peg\.(\d+)/)      if ($prot_id =~ /^fig\|(\d+\.\d+)\.peg\.(\d+)/) {
     {  
3457          return ($1, $2);          return ($1, $2);
3458      }      }
3459      return undef;      return undef;
3460  }  }
3461    
3462  =pod  =head3 by_fig_id
3463    
3464    C<< my @sorted_by_fig_id = sort { FIG::by_fig_id($a,$b) } @fig_ids; >>
3465    
3466    Compare two feature IDs.
3467    
3468    This function is designed to assist in sorting features by ID. The sort is by
3469    genome ID followed by feature type and then feature number.
3470    
3471    =over 4
3472    
3473    =item a
3474    
3475  =head1 by_fig_id  First feature ID.
3476    
3477  usage: @sorted_by_fig_id = sort { &FIG::by_fig_id($a,$b) } @fig_ids  =item b
3478    
3479  This is a bit of a clutzy way to sort a list of FIG feature IDs, but it works.  Second feature ID.
3480    
3481    =item RETURN
3482    
3483    Returns a negative number if the first parameter is smaller, zero if both parameters
3484    are equal, and a positive number if the first parameter is greater.
3485    
3486    =back
3487    
3488  =cut  =cut
3489    
# Line 2707  Line 3491 
3491      my($a,$b) = @_;      my($a,$b) = @_;
3492      my($g1,$g2,$t1,$t2,$n1,$n2);      my($g1,$g2,$t1,$t2,$n1,$n2);
3493      if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&      if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
3494          ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3)))               ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
     {  
3495          ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);          ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
3496      }      } else {
     else  
     {  
3497          $a cmp $b;          $a cmp $b;
3498      }      }
3499  }  }
3500    
3501  =pod  =head3 genes_in_region
3502    
3503  =head1 genes_in_region  C<< my ($features_in_region, $beg1, $end1) = $fig->genes_in_region($genome, $contig, $beg, $end, size_limit); >>
3504    
3505  usage: ($features_in_region,$beg1,$end1) = $fig->genes_in_region($genome,$contig,$beg,$end)  Locate features that overlap a specified region of a contig. This includes features that begin or end
3506    outside that region, just so long as some part of the feature can be found in the region of interest.
3507    
3508  It is often important to be able to find the genes that occur in a specific region on  It is often important to be able to find the genes that occur in a specific region on
3509  a chromosome.  This routine is designed to provide this information.  It returns all genes  a chromosome.  This routine is designed to provide this information.  It returns all genes
3510  that overlap the region ($genome,$contig,$beg,$end).  $beg1 is set to the minimum coordinate of  that overlap positions from I<$beg> through I<$end> in the specified contig.
3511  the returned genes (which may be before the given region), and $end1 the maximum coordinate.  
3512    The I<$size_limit> parameter limits the search process. It is presumed that no features are longer than the
3513    specified size limit. A shorter size limit means you'll miss some features; a longer size limit significantly
3514    slows the search process. For prokaryotes, a value of C<10000> (the default) seems to work best.
3515    
3516    =over 4
3517    
3518  The routine assumes that genes are not more than 10000 bases long, which is certainly not true  =item genome
3519  in eukaryotes.  Hence, in euks you may well miss genes that overlap the boundaries of the specified  
3520  region (sorry).  ID of the genome containing the relevant contig.
3521    
3522    =item contig
3523    
3524    ID of the relevant contig.
3525    
3526    =item beg
3527    
3528    Position of the first base pair in the region of interest.
3529    
3530    =item end
3531    
3532    Position of the last base pair in the region of interest.
3533    
3534    =item size_limit
3535    
3536    Maximum allowable size for a feature. If omitted, C<10000> is assumed.
3537    
3538    =item RETURN
3539    
3540    Returns a three-element list. The first element is a reference to a list of the feature IDs found. The second
3541    element is the position of the leftmost base pair in any feature found. This may be well before the region of
3542    interest begins or it could be somewhere inside. The third element is the position of the rightmost base pair
3543    in any feature found. Again, this can be somewhere inside the region or it could be well to the right of it.
3544    
3545    =back
3546    
3547  =cut  =cut
3548  #: Return Type @;  #: Return Type @;
3549  sub genes_in_region {  sub genes_in_region {
3550      my($self,$genome,$contig,$beg,$end) = @_;      my($self, $genome, $contig, $beg, $end, $pad) = @_;
3551        if (!defined $pad) { $pad = 10000; }
3552      my($x,$relational_db_response,$feature_id,$b1,$e1,@feat,@tmp,$l,$u);      my($x,$relational_db_response,$feature_id,$b1,$e1,@feat,@tmp,$l,$u);
3553    
     my $pad = 10000;  
3554      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
3555    
3556      my $minV = $beg - $pad;      my $minV = $beg - $pad;
3557      my $maxV = $end + $pad;      my $maxV = $end + $pad;
3558      if (($relational_db_response = $rdbH->SQL("SELECT id FROM features      if (($relational_db_response = $rdbH->SQL("SELECT id FROM features "
3559                                           WHERE ( minloc > $minV ) AND ( minloc < $maxV ) AND ( maxloc < $maxV) AND                                                  . " WHERE ( minloc > $minV ) AND ( minloc < $maxV ) AND ( maxloc < $maxV) AND "
3560                                                 ( genome = \'$genome\' )  AND ( contig = \'$contig\' );")) &&                                                  . " ( genome = \'$genome\' )  AND ( contig = \'$contig\' );")) &&
3561           (@$relational_db_response >= 1))          (@$relational_db_response >= 1)) {
3562      {              @tmp =  sort { ($a->[1] cmp $b->[1]) or (($a->[2]+$a->[3]) <=> ($b->[2]+$b->[3])) }
3563          @tmp =  sort { ($a->[1] cmp $b->[1]) or                  map  { $feature_id = $_->[0]; $x = $self->feature_location($feature_id); $x ? [$feature_id,&boundaries_of($x)] : ()
                        (($a->[2]+$a->[3]) <=> ($b->[2]+$b->[3]))  
                      }  
                 map  { $feature_id = $_->[0];  
                        $x = $self->feature_location($feature_id);  
                        $x ? [$feature_id,&boundaries_of($x)] : ()  
3564                       } @$relational_db_response;                       } @$relational_db_response;
3565    
3566    
# Line 2789  Line 3596 
3596  #  #
3597  #  =pod  #  =pod
3598  #  #
3599  #  =head1 genes_in_region  #  =head3 genes_in_region
3600  #  #
3601  #  usage: ( $features_in_region, $min_coord, $max_coord )  #  usage: ( $features_in_region, $min_coord, $max_coord )
3602  #           = $fig->genes_in_region( $genome, $contig, $beg, $end )  #           = $fig->genes_in_region( $genome, $contig, $beg, $end )
# Line 2853  Line 3660 
3660  #                           : ( [], undef, undef );  #                           : ( [], undef, undef );
3661  #  }  #  }
3662    
3663  #  These will be part of the fix to genes_in_region.  -- GJO  #  These will be part of the fix to genes_in_region.  -- GJO
3664    
3665    =head3 regions_spanned
3666    
3667    C<< my ( [ $contig, $beg, $end ], ... ) = $fig->regions_spanned( $loc ); >>
3668    
3669    or
3670    
3671    C<< my ( [ $contig, $beg, $end ], ... ) = FIG::regions_spanned( $loc ); >>
3672    
3673    The location of a feature in a scalar context is
3674    
3675        contig_b1_e1, contig_b2_e2, ...   [one contig_b_e for each segment]
3676    
3677    This routine takes as input a scalar location in the above form
3678    and reduces it to one or more regions spanned by the gene. This
3679    involves combining regions in the location string that are on the
3680    same contig and going in the same direction. Unlike L</boundaries_of>,
3681    which returns one region in which the entire gene can be found,
3682    B<regions_spanned> handles wrapping through the orgin, features
3683    split over contigs and exons that are not ordered nicely along
3684    the chromosome (ugly but true).
3685    
3686  =pod  =over 4
3687    
3688  =head1 regions_spanned  =item loc
3689    
3690  usage: ( [ $contig, $beg, $end ], ... ) = $fig->regions_spanned( $loc )  The location string for a feature.
3691    
3692  The location of a feature in a scalar context is  =item RETURN
3693    
3694      contig_b1_e1,contig_b2_e2,...   [one contig_b_e for each segment]  Returns a list of list references. Each inner list contains a contig ID, a starting
3695    position, and an ending position. The starting position may be numerically greater
3696    than the ending position (which indicates a backward-traveling gene). It is
3697    guaranteed that the entire feature is covered by the regions in the list.
3698    
3699  This routine takes as input a fig location and reduces it to one or more  =back
 regions spanned by the gene.  Unlike boundaries_of, regions_spanned handles  
 wrapping through the orgin, features split over contigs and exons that are  
 not ordered nicely along the chromosome (ugly but true).  
3700    
3701  =cut  =cut
3702    
# Line 2923  Line 3751 
3751      return wantarray ? @regions : \@regions;      return wantarray ? @regions : \@regions;
3752  }  }
3753    
3754    =head3 filter_regions
3755    
3756    C<< my  @regions = FIG::filter_regions( $contig, $min, $max,  @regions ); >>
3757    
3758    or
3759    
3760    C<< my \@regions = FIG::filter_regions( $contig, $min, $max,  @regions ); >>
3761    
3762    or
3763    
3764    C<< my @regions = FIG::filter_regions( $contig, $min, $max, \@regions ); >>
3765    
3766    or
3767    
3768    C<< my \@regions = FIG::filter_regions( $contig, $min, $max, \@regions ); >>
3769    
3770    Filter a list of regions to those that overlap a specified section of a
3771    particular contig. Region definitions correspond to those produced
3772    by L</regions_spanned>.  That is, C<[>I<contig>C<,>I<beg>C<,>I<end>C<]>.
3773    In the function call, either I<$contig> or I<$min> and I<$max> can be
3774    undefined (permitting anything). So, for example,
3775    
3776        my @regions = FIG::filter_regions(undef, 1, 5000, $regionList);
3777    
3778    would return all regions in C<$regionList> that overlap the first
3779    5000 base pairs in any contig. Conversely,
3780    
3781        my @regions = FIG::filter_regions('NC_003904', undef, undef, $regionList);
3782    
3783    would return all regions on the contig C<NC_003904>.
3784    
3785    =over 4
3786    
3787    =item contig
3788    
3789    ID of the contig whose regions are to be passed by the filter, or C<undef>
3790    if the contig doesn't matter.
3791    
3792    =item min
3793    
3794    Leftmost position of the region used for filtering. Only regions which contain
3795    at least one base pair at or beyond this position will be passed. A value
3796    of C<undef> is equivalent to zero.
3797    
3798    =item max
3799    
3800    Rightmost position of the region used for filtering. Only regions which contain
3801    at least one base pair on or before this position will be passed. A value
3802    of C<undef> is equivalent to the length of the contig.
3803    
3804  =pod  =item regionList
3805    
3806  =head1 filter_regions  A list of regions, or a reference to a list of regions. Each region is a
3807    reference to a three-element list, the first element of which is a contig
3808    ID, the second element of which is the start position, and the third
3809    element of which is the ending position. (The ending position can be
3810    before the starting position if the region is backward-traveling.)
3811    
3812  usage:  @regions = filter_regions( $contig, $min, $max,  @regions )  =item RETURN
3813         \@regions = filter_regions( $contig, $min, $max,  @regions )  
3814          @regions = filter_regions( $contig, $min, $max, \@regions )  In a scalar context, returns a reference to a list of the filtered regions.
3815         \@regions = filter_regions( $contig, $min, $max, \@regions )  In a list context, returns the list itself.
3816    
3817  This function provides a simple filter for extracting a list of genome regions  =back
 for those that overlap a particular interval.  Region definitions correspond  
 to those produced by regions_spanned.  That is, [ contig, beg, end ].  In the  
 function call, either $contig or $min and $max can be undefined (permitting  
 anything).  
3818    
3819  =cut  =cut
3820    
# Line 2973  Line 3850 
3850      return wantarray ? @filtered : \@filtered;      return wantarray ? @filtered : \@filtered;
3851  }  }
3852    
3853    =head3 close_genes
3854    
3855    C<< my @features = $fig->close_genes($fid, $dist); >>
3856    
3857    Return all features within a certain distance of a specified other feature.
3858    
3859    This method is a quick way to get genes that are near another gene. It calls
3860    L</boundaries_of> to get the boundaries of the incoming gene, then passes
3861    the region computed to L</genes_in_region>.
3862    
3863    So, for example, if the specified I<$dist> is 500, the method would select
3864    a region that extends 500 base pairs to either side of the boundaries for
3865    the gene I<$fid>, and pass it to C<genes_in_region> for analysis. The
3866    features returned would be those that overlap the selected region. Note
3867    that the flaws inherent in C<genes_in_region> are also inherent in this
3868    method: if a feature is more than 10000 base pairs long, it may not
3869    be caught even though it has an overlap in the specified region.
3870    
3871    =over 4
3872    
3873    =item fid
3874    
3875    ID of the relevant feature.
3876    
3877    =item dist
3878    
3879    Desired maximum distance.
3880    
3881    =item RETURN
3882    
3883    Returns a list of feature IDs for genes that overlap or are close to the boundaries
3884    for the specified incoming feature.
3885    
3886    =back
3887    
3888    =cut
3889    
3890  sub close_genes {  sub close_genes {
3891      my($self,$fid,$dist) = @_;      my($self,$fid,$dist) = @_;
# Line 2995  Line 3908 
3908      return ();      return ();
3909  }  }
3910    
3911    =head3 adjacent_genes
3912    
3913    C<< my ($left_fid, $right_fid) = $fig->adjacent_genes($fid, $dist); >>
3914    
3915    Return the IDs of the genes immediately to the left and right of a specified
3916    feature.
3917    
3918    This method gets a list of the features within the specified distance of
3919    the incoming feature (using L</close_genes>), and then chooses the two
3920    closest to the feature found. If the incoming feature is on the + strand,
3921    these are features to the left and the right. If the incoming feature is
3922    on the - strand, the features will be returned in reverse order.
3923    
3924    =over 4
3925    
3926    =item fid
3927    
3928    ID of the feature whose neighbors are desired.
3929    
3930    =item dist
3931    
3932    Maximum permissible distance to the neighbors.
3933    
3934    =item RETURN
3935    
3936    Returns a two-element list containing the IDs of the features on either side
3937    of the incoming feature.
3938    
3939    =back
3940    
3941    =cut
3942    
3943  sub adjacent_genes  sub adjacent_genes
3944  {  {
3945      my ($self, $fid, $dist) = @_;      my ($self, $fid, $dist) = @_;
# Line 3022  Line 3967 
3967    
3968      my ($left_neighbor, $right_neighbor) = ($close[$i-1], $close[$i+1]);      my ($left_neighbor, $right_neighbor) = ($close[$i-1], $close[$i+1]);
3969    
3970      if (0) # this was if ($i > 0) I just skip this whole section!  #    if (0) # this was if ($i > 0) I just skip this whole section!
3971      {  #    {
3972          if ($self->strand_of($close[$i-1]) eq $strand) { $left_neighbor  = $close[$i-1]; }  #           if ($self->strand_of($close[$i-1]) eq $strand) { $left_neighbor  = $close[$i-1]; }
3973          # else {$left_neighbor=$close[$i+1]} # RAE: this is the alternative that is needed if you do it by strand  #    }
     }  
3974    
3975      if ($i < $#close)      if ($i < $#close)
3976      {      {
3977          if ($self->strand_of($close[$i+1]) eq $strand) { $right_neighbor = $close[$i+1]; }          if ($self->strand_of($close[$i+1]) eq $strand) { $right_neighbor = $close[$i+1]; }
         # else {$right_neighbor = $close[$i-1]} # RAE: this is the alternative that is needed if you do it by strand  
3978      }      }
3979    
3980      # ...return genes in transcription order...      # ...return genes in transcription order...
# Line 3043  Line 3986 
3986      return ($left_neighbor, $right_neighbor) ;      return ($left_neighbor, $right_neighbor) ;
3987  }  }
3988    
3989    =head3 feature_location
3990    
3991  =pod  C<< my $loc = $fig->feature_location($fid); >>
3992    
3993  =head1 feature_location  or
3994    
3995  usage: $loc = $fig->feature_location($fid)   OR  C<< my @loc = $fig->feature_location($fid);; >>
        @loc = $fig->feature_location($fid)  
3996    
3997  The location of a feature in a scalar context is  Return the location of a feature. The location consists
3998    of a list of (contigID, begin, end) triples encoded
3999    as strings with an underscore delimiter. So, for example,
4000    C<NC_002755_100_199> indicates a location starting at position
4001    100 and extending through 199 on the contig C<NC_002755>. If
4002    the location goes backward, the start location will be higher
4003    than the end location (e.g. C<NC_002755_199_100>).
4004    
4005      contig_b1_e1,contig_b2_e2,...   [one contig_b_e for each exon]  In a scalar context, this method returns the locations as a
4006    comma-delimited string
4007    
4008        NC_002755_100_199,NC_002755_210_498
4009    
4010    In a list context, the locations are returned as a list
4011    
4012        (NC_002755_100_199, NC_002755_210_498)
4013    
4014    =over 4
4015    
4016    =item fid
4017    
4018    ID of the feature whose location is desired.
4019    
4020    =item RETURN
4021    
4022  In a list context it is  Returns the locations of a feature, either as a comma-delimited
4023    string or a list.
4024    
4025      (contig_b1_e1,contig_b2_e2,...)  =back
4026    
4027  =cut  =cut
4028    
# Line 3070  Line 4035 
4035      if ($self->is_deleted_fid($feature_id)) { return undef }      if ($self->is_deleted_fid($feature_id)) { return undef }
4036    
4037      $locations = $self->cached('_location');      $locations = $self->cached('_location');
4038      if (! ($location = $locations->{$feature_id}))      if (! ($location = $locations->{$feature_id})) {
     {  
4039          my $rdbH = $self->db_handle;          my $rdbH = $self->db_handle;
4040          if (($relational_db_response = $rdbH->SQL("SELECT location FROM features WHERE ( id = \'$feature_id\' )")) &&          if (($relational_db_response = $rdbH->SQL("SELECT location FROM features WHERE ( id = \'$feature_id\' )")) &&
4041              (@$relational_db_response == 1))              (@$relational_db_response == 1)) {
         {  
4042              $locations->{$feature_id} = $location = $relational_db_response->[0]->[0];              $locations->{$feature_id} = $location = $relational_db_response->[0]->[0];
4043          }          }
4044      }      }
4045    
4046      if ($location)      if ($location) {
     {  
4047          return wantarray() ? split(/,/,$location) : $location;          return wantarray() ? split(/,/,$location) : $location;
4048      }      }
4049      return undef;      return undef;
4050  }  }
4051    
4052    #TODO: BRUCE IS HERE
4053    
4054  sub contig_of  sub contig_of
4055  {  {
4056      my ($self, $locus) = @_;      my ($self, $locus) = @_;
# Line 3126  Line 4090 
4090      if ($beg < $end) { return '+'; } else { return '-'; }      if ($beg < $end) { return '+'; } else { return '-'; }
4091  }  }
4092    
4093  =pod  =head3 find_contig_with_checksum
   
 =head1 find_contig_with_checksum  
4094    
4095  Find a contig in the given genome with the given checksum.  Find a contig in the given genome with the given checksum.
4096    
   
4097  =cut  =cut
4098    
4099  sub find_contig_with_checksum  sub find_contig_with_checksum
# Line 3255  Line 4216 
4216      }      }
4217  }  }
4218    
4219  =pod  =head3 read_contig
   
 =head1 read_contig  
4220    
4221  Read a single contig from the contigs file.  Read a single contig from the contigs file.
4222    
# Line 3302  Line 4261 
4261      return $contig_txt;      return $contig_txt;
4262  }  }
4263    
4264  =pod  =head3 boundaries_of
   
 =head1 boundaries_of  
4265    
4266  usage: ($contig,$beg,$end) = $fig->boundaries_of($loc)  usage: ($contig,$beg,$end) = $fig->boundaries_of($loc)
4267    
# Line 3338  Line 4295 
4295      return undef;      return undef;
4296  }  }
4297    
4298  =pod  =head3 all_features_detailed
   
 =head1 all_features_detailed  
4299    
4300  usage: $fig->all_features_detailed($genome)  usage: $fig->all_features_detailed($genome)
4301    
# Line 3362  Line 4317 
4317  }  }
4318    
4319    
4320  =pod  =head3 all_features
   
 =head1 all_features  
4321    
4322  usage: $fig->all_features($genome,$type)  usage: $fig->all_features($genome,$type)
4323    
# Line 3392  Line 4345 
4345  }  }
4346    
4347    
4348  =pod  =head3 pegs_of
   
 =head1 pegs_of  
4349    
4350  usage: $fig->pegs_of($genome)  usage: $fig->pegs_of($genome)
4351    
# Line 3410  Line 4361 
4361  }  }
4362    
4363    
4364  =pod  =head3 rnas_of
   
 =head1 rnas_of  
4365    
4366  usage: $fig->rnas_of($genome)  usage: $fig->rnas_of($genome)
4367    
# Line 3426  Line 4375 
4375      return $self->all_features($genome,"rna");      return $self->all_features($genome,"rna");
4376  }  }
4377    
4378  =pod  =head3 feature_aliases
   
 =head1 feature_aliases  
4379    
4380  usage: @aliases = $fig->feature_aliases($fid)  OR  usage: @aliases = $fig->feature_aliases($fid)  OR
4381         $aliases = $fig->feature_aliases($fid)         $aliases = $fig->feature_aliases($fid)
# Line 3469  Line 4416 
4416      return wantarray() ? @aliases : join(",",@aliases);      return wantarray() ? @aliases : join(",",@aliases);
4417  }  }
4418    
4419  =pod  =head3 by_alias
   
 =head1 by_alias  
4420    
4421  usage: $peg = $fig->by_alias($alias)  usage: $peg = $fig->by_alias($alias)
4422    
# Line 3524  Line 4469 
4469      }      }
4470  }  }
4471    
4472  =pod  =head3 possibly_truncated
   
 =head1 possibly_truncated  
4473    
4474  usage: $fig->possibly_truncated($fid)  usage: $fig->possibly_truncated($fid)
4475    
# Line 3569  Line 4512 
4512    
4513  ################ Routines to process functional coupling for PEGs  ##########################  ################ Routines to process functional coupling for PEGs  ##########################
4514    
4515  =pod  =head3 coupling_and_evidence
   
 =head1 coupled_to  
   
 usage: @coupled_to = $fig->coupled_to($peg)  
   
 The new form of coupling and evidence computation is based on precomputed data.  
 The old form took minutes to dynamically compute things when needed.  The old form  
 still works, ikf the directory Data/CouplingData is not present.  If it is present,  
 it is assumed to contain comprehensive coupling data in the form of precomputed scores  
 and PCHs.  
   
 If Data/CouplingData is present, this routine returns a list of 2-tuples [Peg,Sc].  It  
 returns the empty list if the peg is not coupled.  It returns undef, if Data/CouplingData  
 is not there.  
   
 =cut  
   
 sub coupled_to {  
     my($self,$peg) = @_;  
   
     my $rdbH = $self->db_handle;  
     if (! $rdbH->table_exists('fc_pegs')) { return undef }  
   
     my $relational_db_response = $rdbH->SQL("SELECT peg2,score FROM fc_pegs  
                                              WHERE  peg1 = \'$peg\' ");  
     return @$relational_db_response;  
 }  
   
 =pod  
   
 =head1 coupling_evidence  
   
 usage: @evidence = $fig->coupling_evidence($peg1,$peg2)  
   
 The new form of coupling and evidence computation is based on precomputed data.  
 The old form took minutes to dynamically compute things when needed.  The old form  
 still works, ikf the directory Data/CouplingData is not present.  If it is present,  
 it is assumed to contain comprehensive coupling data in the form of precomputed scores  
 and PCHs.  
   
 If Data/CouplingData is present, this routine returns a list of 3-tuples [Peg3,Peg4,Rep].  
 Here, Peg3 is similar to Peg1, Peg4 is similar to Peg2, and Rep == 1 iff this is a  
 "representative pair".  That is, we take all pairs and create a representative set  
 in which each pair is not "too close" to any other pair in the representative set.  
 Think of "too close" as being roughly 95% identical at the DNA level.  This keeps (usually)  
 a single pair from a set of different genomes from the same species.  
   
 It returns the empty list if the peg is not coupled.  It returns undef, if Data/CouplingData  
 is not there.  
   
 =cut  
   
 sub coupling_evidence {  
     my($self,$peg1,$peg2) = @_;  
   
     my $rdbH = $self->db_handle;  
     if (! $rdbH->table_exists('pchs')) { return undef }  
   
     my $relational_db_response = $rdbH->SQL("SELECT peg3,peg4,rep FROM pchs  
                                              WHERE  peg1 = \'$peg1\' AND  
                                                     peg2 = \'$peg2\'");  
     return @$relational_db_response;  
 }  
   
 =pod  
   
 =head1 coupling_and_evidence  
4516    
4517  usage: @coupling_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,$keep_record)  usage: @coupling_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,$keep_record)
4518    
# Line 3649  Line 4525 
4525  a "pair of close homologs" of [$peg,CoupledToFID]).  The maximum score for a single  a "pair of close homologs" of [$peg,CoupledToFID]).  The maximum score for a single
4526  PCH is 1, but "Score" is the sum of the scores for the entire set of PCHs.  PCH is 1, but "Score" is the sum of the scores for the entire set of PCHs.
4527    
 NOTE: once the new version of precomputed coupling is installed (i.e., when Data/CouplingData  
 is filled with the precomputed relations), the parameters on computing evidence are ignored.  
   
4528  If $keep_record is true, the system records the information, asserting coupling for each  If $keep_record is true, the system records the information, asserting coupling for each
4529  of the pairs in the set of evidence, and asserting a pin from the given $fd through all  of the pairs in the set of evidence, and asserting a pin from the given $fd through all
4530  of the PCH entries used in forming the score.  of the PCH entries used in forming the score.
# Line 3659  Line 4532 
4532  =cut  =cut
4533    
4534  sub coupling_and_evidence {  sub coupling_and_evidence {
     my($self,$peg1,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_;  
   
     my $rdbH = $self->db_handle;  
     if ($rdbH->table_exists('fc_pegs') && $self->is_complete(&FIG::genome_of($peg1)))  
     {  
         my @ans = ();  
         my $tuple;  
         foreach $tuple ($self->coupled_to($peg1))  
         {  
             my($peg2,$sc) = @$tuple;  
             my $evidence = [map { [$_->[0],$_->[1]] } $self->coupling_evidence($peg1,$peg2)];  
             push(@ans,[$sc,$peg2,$evidence]);  
         }  
         return @ans;  
     }  
     else  
     {  
         return &coupling_and_evidence1($self,$peg1,$bound,$sim_cutoff,$coupling_cutoff,$keep_record);  
     }  
 }  
   
 sub coupling_and_evidence1 {  
4535      my($self,$feature_id,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_;      my($self,$feature_id,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_;
4536      my($neighbors,$neigh,$similar1,$similar2,@hits,$sc,$ev,$genome1);      my($neighbors,$neigh,$similar1,$similar2,@hits,$sc,$ev,$genome1);
4537    
# Line 3733  Line 4584 
4584      my @ans = ();      my @ans = ();
4585    
4586      $genome = &genome_of($peg);      $genome = &genome_of($peg);
4587      foreach $peg1 ($self->in_pch_pin_with($peg))      foreach $peg1 ($self->in_pch_pin_with($peg)) {
     {  
4588          $peg1 =~ s/,.*$//;          $peg1 =~ s/,.*$//;
4589          if ($peg ne $peg1)          if ($peg ne $peg1) {
         {  
4590              $genome1 = &genome_of($peg1);              $genome1 = &genome_of($peg1);
4591              $maps{$peg}->{$genome1} = $peg1;              $maps{$peg}->{$genome1} = $peg1;
4592          }          }
4593      }      }
4594    
4595      $loc = [&boundaries_of(scalar $self->feature_location($peg))];      $loc = [&boundaries_of(scalar $self->feature_location($peg))];
4596      foreach $peg1 ($self->in_cluster_with($peg))      foreach $peg1 ($self->in_cluster_with($peg)) {
4597      {          if ($peg ne $peg1) {
         if ($peg ne $peg1)  
         {  
4598  #           print STDERR "peg1=$peg1\n";  #           print STDERR "peg1=$peg1\n";
4599              $loc1 = [&boundaries_of(scalar $self->feature_location($peg1))];              $loc1 = [&boundaries_of(scalar $self->feature_location($peg1))];
4600              if (&close_enough($loc,$loc1,$bound))              if (&close_enough($loc,$loc1,$bound)) {
4601              {                  foreach $peg2 ($self->in_pch_pin_with($peg1)) {
                 foreach $peg2 ($self->in_pch_pin_with($peg1))  
                 {  
4602                      $genome2 = &genome_of($peg2);                      $genome2 = &genome_of($peg2);
4603                      if (($peg3 = $maps{$peg}->{$genome2}) && ($peg2 ne $peg3))                      if (($peg3 = $maps{$peg}->{$genome2}) && ($peg2 ne $peg3)) {
                     {  
4604                          $loc2 = [&boundaries_of(scalar $self->feature_location($peg2))];                          $loc2 = [&boundaries_of(scalar $self->feature_location($peg2))];
4605                          $loc3 = [&boundaries_of(scalar $self->feature_location($peg3))];                          $loc3 = [&boundaries_of(scalar $self->feature_location($peg3))];
4606                          if (&close_enough($loc2,$loc3,$bound))                          if (&close_enough($loc2,$loc3,$bound)) {
4607                          {                              push @{$ev{$peg1}}, [$peg3,$peg2];
                             push(@{$ev{$peg1}},[$peg3,$peg2]);  
4608                          }                          }
4609                      }                      }
4610                  }                  }
4611              }              }
4612          }          }
4613      }      }
4614      foreach $peg1 (keys(%ev))      foreach $peg1 (keys(%ev)) {
     {  
4615          $pairs = $ev{$peg1};          $pairs = $ev{$peg1};
4616          $sc = $self->score([$peg,map { $_->[0] } @$pairs]);          my @pegMap = $peg, map { $_->[0] } @$pairs;
4617          if ($sc >= $coupling_cutoff)          $sc = $self->score(\@pegMap);
4618          {          if ($sc >= $coupling_cutoff) {
4619              push(@ans,[$sc,$peg1]);              push(@ans,[$sc,$peg1]);
4620          }          }
4621      }      }
4622      return sort { $b->[0] <=> $a->[0] } @ans;      return sort { $b->[0] <=> $a->[0] } @ans;
4623  }  }
4624    
   
4625  sub score {  sub score {
4626      my($self,$pegs) = @_;      my($self,$pegs) = @_;
4627      my(@ids);      my(@ids);
# Line 3846  Line 4687 
4687      return $count;      return $count;
4688  }  }
4689    
4690  =pod  =head3 add_chr_clusters_and_pins
   
 =head1 add_chr_clusters_and_pins  
4691    
4692  usage: $fig->add_chr_clusters_and_pins($peg,$hits)  usage: $fig->add_chr_clusters_and_pins($peg,$hits)
4693    
# Line 3867  Line 4706 
4706      my(@clusters,@pins,$x,$sc,$neigh,$pairs,$y,@corr,@orgs,%projection);      my(@clusters,@pins,$x,$sc,$neigh,$pairs,$y,@corr,@orgs,%projection);
4707      my($genome,$cluster,$pin,$peg2);      my($genome,$cluster,$pin,$peg2);
4708    
4709      if (@$hits > 0)      if (@$hits > 0) {
     {  
4710          @clusters = ();          @clusters = ();
4711          @pins = ();          @pins = ();
4712          push(@clusters,[$peg,map { $_->[1] } @$hits]);          my @pinMap = ($peg, map { $_->[1] } @$hits);
4713          foreach $x (@$hits)          push @clusters, \@pinMap;
4714          {          foreach $x (@$hits) {
4715              ($sc,$neigh,$pairs) = @$x;              ($sc,$neigh,$pairs) = @$x;
4716              push(@pins,[$neigh,map { $_->[1] } @$pairs]);              my @mapped = ($neigh, map { $_->[1] } @$pairs);
4717              foreach $y (@$pairs)              push @pins, \@mapped;
4718              {              foreach $y (@$pairs) {
4719                  $peg2 = $y->[0];                  $peg2 = $y->[0];
4720                  if ($peg2 =~ /^fig\|(\d+\.\d+)/)                  if ($peg2 =~ /^fig\|(\d+\.\d+)/) {
                 {  
4721                      $projection{$1}->{$peg2} = 1;                      $projection{$1}->{$peg2} = 1;
4722                  }                  }
4723              }              }
4724          }          }
4725          @corr = ();          @corr = ();
4726          @orgs = keys(%projection);          @orgs = keys(%projection);
4727          if (@orgs > 0)          if (@orgs > 0) {
4728          {              foreach $genome (sort { $a <=> $b } @orgs) {
4729              foreach $genome (sort { $a <=> $b } @orgs)                  push @corr, sort { &FIG::by_fig_id($a,$b) } keys(%{$projection{$genome}});
             {  
                 push(@corr,sort { &FIG::by_fig_id($a,$b) } keys(%{$projection{$genome}}));  
4730              }              }
4731              push(@pins,[$peg,@corr]);              push(@pins,[$peg,@corr]);
4732          }          }
4733    
4734          foreach $cluster (@clusters)          foreach $cluster (@clusters) {
         {  
4735              $self->add_chromosomal_cluster($cluster);              $self->add_chromosomal_cluster($cluster);
4736          }          }
4737    
4738          foreach $pin (@pins)          foreach $pin (@pins) {
         {  
4739              $self->add_pch_pin($pin);              $self->add_pch_pin($pin);
4740          }          }
4741      }      }
# Line 3933  Line 4766 
4766              $j++;              $j++;
4767          }          }
4768      }      }
4769      return ($self->score([map { $_->[0] } @$ev]),$ev);      my @mapped = map { $_->[0] } @$ev;
4770        return ($self->score(\@mapped),$ev);
4771  }  }
4772    
4773  sub accumulate_ev {  sub accumulate_ev {
# Line 3996  Line 4830 
4830  ################ Translations of PEGsand External Protein Sequences  ##########################  ################ Translations of PEGsand External Protein Sequences  ##########################
4831    
4832    
4833  =pod  =head3 translatable
   
 =head1 translatable  
4834    
4835  usage: $fig->translatable($prot_id)  usage: $fig->translatable($prot_id)
4836    
# Line 4016  Line 4848 
4848  }  }
4849    
4850    
4851  =pod  =head3 translation_length
   
 =head1 translation_length  
4852    
4853  usage: $len = $fig->translation_length($prot_id)  usage: $len = $fig->translation_length($prot_id)
4854    
# Line 4045  Line 4875 
4875  }  }
4876    
4877    
4878  =pod  =head3 get_translation
   
 =head1 get_translation  
4879    
4880  usage: $translation = $fig->get_translation($prot_id)  usage: $translation = $fig->get_translation($prot_id)
4881    
# Line 4092  Line 4920 
4920      return '';      return '';
4921  }  }
4922    
4923  =pod  =head3 mapped_prot_ids
   
 =head1 mapped_prot_ids  
4924    
4925  usage: @mapped = $fig->mapped_prot_ids($prot)  usage: @mapped = $fig->mapped_prot_ids($prot)
4926    
# Line 4173  Line 4999 
4999      return $self->{_user};      return $self->{_user};
5000  }  }
5001    
5002  =pod  =head3 function_of
   
 =head1 function_of  
5003    
5004  usage: @functions = $fig->function_of($peg)  OR  usage: @functions = $fig->function_of($peg)  OR
5005         $function  = $fig->function_of($peg,$user)         $function  = $fig->function_of($peg,$user)
# Line 4236  Line 5060 
5060      return $wantarray ? () : "";      return $wantarray ? () : "";
5061  }  }
5062    
5063  =pod  =head3 translated_function_of
   
 =head1 translated_function_of  
5064    
5065  usage: $function  = $fig->translated_function_of($peg,$user)  usage: $function  = $fig->translated_function_of($peg,$user)
5066    
# Line 4268  Line 5090 
5090  }  }
5091    
5092    
5093  =pod  =head3 translate_function
   
 =head1 translate_function  
5094    
5095  usage: $translated_func = $fig->translate_function($func)  usage: $translated_func = $fig->translate_function($func)
5096    
# Line 4313  Line 5133 
5133      return $function;      return $function;
5134  }  }
5135    
5136  =pod  =head3 assign_function
   
 =head1 assign_function  
5137    
5138  usage: $fig->assign_function($peg,$user,$function,$confidence)  usage: $fig->assign_function($peg,$user,$function,$confidence)
5139    
# Line 4419  Line 5237 
5237      if ($x =~ /^U\d/)                     { return 1 }      if ($x =~ /^U\d/)                     { return 1 }
5238      if ($x =~ /^orf/i)                    { return 1 }      if ($x =~ /^orf/i)                    { return 1 }
5239      if ($x =~ /uncharacterized/i)         { return 1 }      if ($x =~ /uncharacterized/i)         { return 1 }
5240      if ($x =~ /pseudogene/i)              { return 1 }      if ($x =~ /psedogene/i)               { return 1 }
5241      if ($x =~ /^predicted/i)              { return 1 }      if ($x =~ /^predicted/i)              { return 1 }
5242      if ($x =~ /AGR_/)                     { return 1 }      if ($x =~ /AGR_/)                     { return 1 }
5243      if ($x =~ /similar to/i)              { return 1 }      if ($x =~ /similar to/i)              { return 1 }
# Line 4437  Line 5255 
5255          ($x =~ /Expressed/i) ||          ($x =~ /Expressed/i) ||
5256          ($x =~ /[a-zA-Z]{2,3}\|/) ||          ($x =~ /[a-zA-Z]{2,3}\|/) ||
5257          ($x =~ /predicted by Psort/) ||          ($x =~ /predicted by Psort/) ||
         ($x =~ /predicted coding region/) ||  
5258          ($x =~ /^bh\d+/i) ||          ($x =~ /^bh\d+/i) ||
5259          ($x =~ /cds_/i) ||          ($x =~ /cds_/i) ||
5260          ($x =~ /^[a-z]{2,3}\d+/i) ||          ($x =~ /^[a-z]{2,3}\d+/i) ||
# Line 4450  Line 5267 
5267    
5268  ############################  Similarities ###############################  ############################  Similarities ###############################
5269    
5270  =pod  =head3 sims
   
 =head1 sims  
5271    
5272  usage: @sims = $fig->sims($peg,$maxN,$maxP,$select)  usage: @sims = $fig->sims($peg,$maxN,$maxP,$select)
5273    
# Line 4833  Line 5648 
5648    
5649    
5650  sub bbhs {  sub bbhs {
5651      my($self,$peg,$cutoff) = @_;      my($self,$peg,$cutoff,$frac_match) = @_;
5652      my($sim,$peg2,$genome2,$i,@sims2,%seen);      my($sim,$peg2,$genome2,$i,@sims2,%seen);
5653    
5654      if ($self->is_deleted_fid($peg)) { return () }      if ($self->is_deleted_fid($peg)) { return () }
5655    
5656        $frac_match = defined($frac_match) ? $frac_match : 0;
5657    
5658      $cutoff = defined($cutoff) ? $cutoff : 1.0e-10;      $cutoff = defined($cutoff) ? $cutoff : 1.0e-10;
5659      my @bbhs = ();      my @bbhs = ();
5660        my @precomputed = ();
5661      my $rdbH = $self->db_handle;      my $rdbH = $self->db_handle;
5662      if (! $rdbH->table_exists('bbh')) { return () }      my $relational_db_response = $rdbH->SQL("SELECT seek, others FROM bbhs WHERE peg = \'$peg\' ");
5663      my $relational_db_response = $rdbH->SQL("SELECT peg2,psc  FROM bbh WHERE peg1 = \'$peg\' ");      if (@$relational_db_response == 1)
     if (@$relational_db_response > 0)  
5664      {      {
5665          return @$relational_db_response;          my($seek_set,$others) = @{$relational_db_response->[0]};
5666            if (open(CORES,"<$FIG_Config::global/bbh.cores"))
5667            {
5668                my $seek;
5669                foreach $seek (split(/,/,$seek_set))
5670                {
5671                    seek(CORES,$seek,0);
5672                    $_ = <CORES>;
5673                    chop;
5674                    push(@precomputed,split(/,/,$_));
5675      }      }
5676      else              close(CORES);
5677            }
5678            push(@precomputed,split(/,/,$others));
5679        }
5680        my %bbhs = map { $_ => 1 } @precomputed;
5681    
5682        foreach $sim ($self->sims($peg,10000,$cutoff,"fig"))
5683      {      {
5684          return ();          $peg2 = $sim->id2;
5685            my $frac = &FIG::min(($sim->e1+1 - $sim->b1) / $sim->ln1, ($sim->e2+1 - $sim->b2) / $sim->ln2);
5686            if ($bbhs{$peg2} && ($frac >= $frac_match))
5687            {
5688                push(@bbhs,[$peg2,$sim->psc]);
5689            }
5690      }      }
5691        return @bbhs;
5692  }  }
5693    
5694  #  #
# Line 4858  Line 5696 
5696  #  #
5697    
5698    
5699  sub bbh_list  sub bbh_list {
 {  
5700      my($self, $genome, $features) = @_;      my($self, $genome, $features) = @_;
5701    
5702      my $cutoff = 1.0e-10;      my $cutoff = 1.0e-10;
5703    
5704      my $out = {};      my $out = {};
5705      for my $feature (@$features)      for my $feature (@$features) {
     {  
5706          my @bbhs = $self->bbhs($feature, $cutoff);          my @bbhs = $self->bbhs($feature, $cutoff);
5707            my @featureList = grep { /fig\|$genome\.peg/ } map { $_->[0] } @bbhs;
5708          $out->{$feature} = [grep { /fig\|$genome\.peg/ } map { $_->[0] } @bbhs];          $out->{$feature} = \@featureList;
5709      }      }
5710      return $out;      return $out;
5711  }  }
5712    
5713  =pod  =head3 dsims
   
 =head1 dsims  
5714    
5715  usage: @sims = $fig->dsims($peg,$maxN,$maxP,$select)  usage: @sims = $fig->dsims($peg,$maxN,$maxP,$select)
5716    
# Line 4970  Line 5804 
5804    
5805  ################################# chromosomal clusters ####################################  ################################# chromosomal clusters ####################################
5806    
5807  =pod  =head3 in_cluster_with
   
 =head1 in_cluster_with  
5808    
5809  usage: @pegs = $fig->in_cluster_with($peg)  usage: @pegs = $fig->in_cluster_with($peg)
5810    
# Line 4988  Line 5820 
5820      return $self->in_set_with($peg,"chromosomal_clusters","cluster_id");      return $self->in_set_with($peg,"chromosomal_clusters","cluster_id");
5821  }  }
5822    
5823  =pod  =head3 add_chromosomal_clusters
   
 =head1 add_chromosomal_clusters  
5824    
5825  usage: $fig->add_chromosomal_clusters($file)  usage: $fig->add_chromosomal_clusters($file)
5826    
# Line 5026  Line 5856 
5856    
5857  #=pod  #=pod
5858  #  #
5859  #=head1 export_chromosomal_clusters  #=head3 export_chromosomal_clusters
5860  #  #
5861  #usage: $fig->export_chromosomal_clusters  #usage: $fig->export_chromosomal_clusters
5862  #  #
# Line 5085  Line 5915 
5915    
5916  ################################# PCH pins  ####################################  ################################# PCH pins  ####################################
5917    
5918  =pod  =head3 in_pch_pin_with
   
 =head1 in_pch_pin_with  
5919    
5920  usage: $fig->in_pch_pin_with($peg)  usage: $fig->in_pch_pin_with($peg)
5921    
# Line 5104  Line 5932 
5932      return $self->in_set_with($peg,"pch_pins","pin");      return $self->in_set_with($peg,"pch_pins","pin");
5933  }  }
5934    
5935  =pod  =head3 add_pch_pins
   
 =head1 add_pch_pins  
5936    
5937  usage: $fig->add_pch_pins($file)  usage: $fig->add_pch_pins($file)
5938    
# Line 5202  Line 6028 
6028    
6029  ################################# Annotations  ####################################  ################################# Annotations  ####################################
6030    
6031  =pod  =head3 add_annotation
   
 =head1 add_annotation  
6032    
6033  usage: $fig->add_annotation($fid,$user,$annotation)  usage: $fig->add_annotation($fid,$user,$annotation)
6034    
# Line 5249  Line 6073 
6073      return 0;      return 0;
6074  }  }
6075    
6076  =pod  =head3 merged_related_annotations
   
 =head1 merged_related_annotations  
6077    
6078  usage: @annotations = $fig->merged_related_annotations($