Wed Jul 2 05:54:55 2008 UTC (11 years, 6 months ago) by parrello
Branch: MAIN
New script for loading evidence codes.

#!/usr/bin/perl -w

# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.

use strict;
use Tracer;
use CustomAttributes;
use Stats;

=head1 EvCodeRefresh Script

    EvCodeRefresh [options] <filename>

Refresh evidence codes from a sequential file

=head2 Introduction

This script loads evidence codes from a tab-delimited file into the
B<IsEvidencedBy> table. The incoming file should contain two columns of data: a
feature ID followed by an evidence code. For compatibility, an intervening
third column may be present between them; it is ignored.

=head2 Positional Parameters

=over 4

=item filename

Name of the file from which the evidence codes are to be loaded. The file must
be a tab-delimited file containing two or three columns. The first column must
contain a feature ID, and the last column an evidence code to be applied to that
feature. The evidence code will automatically be split into a class (e.g.
C<ilit>, C<ff>) and a modifier.

=back

=head2 Command-Line Options

=over 4

=item trace

Specifies the tracing level. The higher the tracing level, the more messages
will appear in the trace log. Use E to specify emergency tracing.

=item append

Normally, the existing evidence codes are erased before the data from the file
is loaded. If C<append> is specified, then the erase is suppressed, and the
existing codes are kept.

=item classes

If this option is specified, it should be the name of a tab-delimited file
containing the evidence classes. The evidence class table will be deleted and
reloaded from the file, which should be a valid ERDB load file for the
B<EvidenceClass> table.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item help

Display this command's parameters and options.

=item warn

Create an event in the RSS feed when an error occurs.

=item phone

Phone number to message when the script is complete.

=back

=cut

# Get the command-line options and parameters. StandardSetup is given the
# tracing categories, the script-specific option definitions (each one is
# name => [default value, help text]), the help text describing the positional
# parameters, and the raw argument list.
my ($options, @parameters) = StandardSetup([qw(ERDB CustomAttributes) ],
                                           {
                                              trace => ["2", "tracing level"],
                                              append => ["", "if specified, the evidence codes will be appended to existing codes"],
                                              classes => ["", "evidence class file name (optional)"],
                                              phone => ["", "phone number (international format) to call when load finishes"]
                                           },
                                           "<filename>",
                                           @ARGV);
# Set a variable to contain return type information. It is used to build the
# completion message sent to the phone number, if one was specified.
my $rtype;
# Insure we catch errors. Everything that touches the database or the file
# system runs inside this eval so that a failure is traced rather than lost.
eval {
    # Get the attributes database.
    my $attr = CustomAttributes->new();
    # Check for a class file.
    if ($options->{classes}) {
        # We have one. Load it into the evidence class table, replacing the
        # existing content.
        Trace("Loading evidence classes from $options->{classes}.") if T(2);
        $attr->LoadTable($options->{classes}, 'EvidenceClass', truncate => 1);
        Trace("Evidence classes loaded.") if T(2);
    }
    # Verify that an input file name was specified. We test for definedness and
    # length rather than truth so that a file literally named "0" is accepted.
    my $fileName = $parameters[0];
    if (! defined $fileName || $fileName eq '') {
        Confess("No input file name specified.");
    } else {
        # Now we convert the input file into a load file. First, we open it.
        my $ih = Open(undef, "<$fileName");
        # We'll count the number of codes in here.
        my $stats = Stats->new();
        # Create the load file. We sort it to speed up the load.
        my $loadFileName = "$FIG_Config::temp/IsEvidencedBy$$.dtx";
        my $oh = Open(undef, "| sort >$loadFileName");
        # Finally, we use this hash to track all the evidence classes.
        my %classes = ();
        Trace("Reading evidence codes.") if T(3);
        # Loop through the input file, writing load records. Each invalid
        # record is counted as an error and skipped.
        while (! eof $ih) {
            # Read the input record.
            my @fields = Tracer::GetLine($ih);
            my $line = $.;
            Trace("$line input lines processed.") if T(3) && ($line % 10000 == 0);
            # Insure it's valid: we need two or three columns, which means the
            # index of the last column must be 1 or 2.
            my $last = $#fields;
            if ($last < 1 || $last >= 3) {
                Trace("Record $line in input file has incorrect number of columns.") if T(3);
                $stats->Add(errors => 1);
                next;
            }
            # Get the feature ID and the code. The code is always in the last
            # column; a middle column, if present, is ignored.
            my ($fid, $code) = @fields[0, $last];
            # Validate the feature ID.
            if ($fid !~ /^fig\|\d+/) {
                Trace("Record $line in input file has invalid feature ID \"$fid\".") if T(3);
                $stats->Add(errors => 1);
                next;
            }
            # Validate the evidence code and split it in a single step. The
            # leading run of lower-case letters is the class and the remainder
            # is the modifier. Capturing directly from the match avoids any
            # dependence on $1 and $2 surviving the intervening statements.
            my ($class, $modifier) = ($code =~ /^([a-z]+)(.*)/);
            if (! defined $class) {
                Trace("Record $line in input file has invalid evidence code \"$code\".") if T(3);
                $stats->Add(errors => 1);
                next;
            }
            # We have a valid input row. Produce the output line.
            Tracer::PutLine($oh, [$fid, $class, $modifier]);
            # Count this as an output row and as a member of the specified class.
            $stats->Add(rows => 1);
            $stats->Add($class => 1);
            $classes{$class} = 1;
        }
        Trace("Evidence codes reformatted.") if T(2);
        # Close the files. The output handle is a pipe into the sort command, so
        # the close is where a failed sort (or a failed write of the sorted
        # file) is reported. We must not go on to load a truncated file,
        # especially since the load may erase the existing evidence codes.
        close $oh
            or Confess("Sort into $loadFileName failed: " . ($! || "exit status " . ($? >> 8)));
        close $ih;
        Trace("Evidence codes will be loaded from $loadFileName.") if T(2);
        # Now we need to verify the incoming evidence codes against the known
        # evidence classes. We issue a message for every non-existent class. It's
        # not a serious error, but it's something the user should know.
        Trace("Verifying evidence classes.") if T(2);
        for my $class (keys %classes) {
            if (! $attr->Exists(EvidenceClass => $class)) {
                Trace("Evidence class \"$class\" not found in database!") if T(2);
                $stats->Add(bad_class => 1);
            }
        }
        # We are almost ready to reload the evidence codes. Set up the append option.
        # It's passed in as the truncate option flag on the load.
        my $truncate = ($options->{append} ? 0 : 1);
        # Now we load.
        Trace("Loading evidence codes.") if T(2);
        $attr->LoadTable($loadFileName, 'IsEvidencedBy', truncate => $truncate,
                         mode => 'concurrent');
        # Tell the user we're done, and show the statistics.
        Trace("Evidence codes loaded.\n" . $stats->Show()) if T(2);
    }
};
# Report the outcome. $@ is checked immediately after the eval so that nothing
# else has a chance to overwrite it.
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# Send the completion notice, if the user asked for one.
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "EvCodeRefresh terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}


