[Bio] / FigKernelScripts / add_dlits.pl Repository:
ViewVC logotype

View of /FigKernelScripts/add_dlits.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Jun 15 19:17:47 2011 UTC (8 years, 5 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, rast_rel_2014_0729, mgrast_release_3_1_2, mgrast_release_3_1_1, rast_rel_2011_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_10262011, HEAD
New script for adding protein-based Dlits.

#!/usr/bin/perl -w

=head1 DLIT Add Script

This is a command-line script to add direct literature references to the SEED
database. The references are added as evidence code attributes for proteins
(indicated by MD5 protein IDs).

    add_dlits [options] input_file

The single positional parameter should be the name of a tab-delimited file
containing the literature references. If it is omitted or specified as C<->,
the references are read from the standard input. Each literature reference
should have the following columns.

=over 4

=item 1

ID of the protein or feature to which the literature reference is being
attached. If a feature ID is specified, it will be converted to a protein
ID before the reference is processed.

=item 2

PUBMED ID of the literature reference.

=item 3

Curator of the literature reference.

=item 4

Publication title information. If omitted, then the publication should already
exist in the SEED data store.

=back

Command-line parameters can be specified with single or double hyphens and must use
an equal sign if a value is required. The following options are supported.

=over 4

=item override

If this option is specified, an existing literature reference will be replaced
by new information in the input; otherwise, references that duplicate existing
ones will be ignored. (A reference is considered a duplicate if it specifies the
same protein and the same PUBMED article. Specifying this option allows the
curator to be changed.)

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 3. Tracing will be directed to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item port

Database port to use for connecting to the SEED, if the default is to be
overridden. This parameter is only provided for testing purposes.

=item host

Database host to use for connecting to the SEED, if the default is to be
overridden. This parameter is only provided for testing purposes.

=item attrPort

Database port to use for connecting to the attribute server, if the default
is to be overridden. This parameter is only provided for testing purposes.

=item attrHost

Database host to use for connecting to the attribute server, if the default
is to be overridden. This parameter is only provided for testing purposes.

=item h

Display this command's parameters and options.

=back

=cut

    use strict;
    use Tracer;
    use FIG;

# Parse the command line. The script-specific options are described by a hash
# that maps each option name to a [default value, help text] pair; the final
# string describes the positional parameters.
# NOTE(review): the first argument is presumably the list of tracing
# categories to activate -- confirm against Tracer::StandardSetup.
my @setupCategories = ('SaplingDataLoader', 'ERDB');
my %optionSpecs = (
    trace    => ["3", "tracing level"],
    override => ["", "if specified, duplicate references will be replaced, otherwise they will be left unchanged"],
    port     => ["", "alternate port for connecting to the database"],
    host     => ["", "alternate host for connecting to the database"],
    attrPort => ["", "alternate port for connecting to the attributes"],
    attrHost => ["", "alternate host for connecting to the attributes"],
);
my ($options, @parameters) = StandardSetup(\@setupCategories, \%optionSpecs,
                                           "<inputFile>", @ARGV);
# Counters for everything that happens during the run; they are displayed at
# the end of the script.
# NOTE(review): Stats is not loaded explicitly in this file; presumably it is
# pulled in by Tracer or FIG -- confirm.
my $stats = Stats->new();
# Run the whole load inside an eval so that a fatal error raised while
# connecting to the database or processing the input is caught and reported
# by the code that follows this block.
#
# Each input line supplies four tab-delimited columns: the protein (MD5) or
# feature (fig|...) ID, the PUBMED ID, the curator, and an optional
# publication title. Lines that fail validation are counted in the statistics
# and skipped; valid lines are passed to FIG::add_dlit.
eval {
    # Apply any connection overrides from the command line. An empty option
    # value leaves the configured default in place.
    if ($options->{port}) {
        $FIG_Config::dbport = $options->{port};
    }
    if ($options->{host}) {
        $FIG_Config::dbhost = $options->{host};
    }
    if ($options->{attrPort}) {
        $FIG_Config::attrPort = $options->{attrPort};
    }
    if ($options->{attrHost}) {
        $FIG_Config::attrHost = $options->{attrHost};
    }
    # Get the SEED data store. This must happen after the overrides above so
    # that the connection uses them.
    my $fig = FIG->new();
    # Compute the input file. If no file name is specified, we use "-", which
    # translates to STDIN.
    my $inFileName = $parameters[0] || "-";
    if ($inFileName eq "-") {
        Trace("Literature data will be taken from standard input.") if T(2);
    } else {
        Trace("Literature data will be taken from $inFileName.") if T(2);
    }
    my $ih = Open(undef, "<$inFileName");
    # Get the override flag.
    my $override = $options->{override};
    # Loop through the input file.
    while (! eof $ih) {
        # Get the next literature reference.
        # NOTE(review): columns missing from a short line presumably come
        # back undefined -- confirm against Tracer::GetLine.
        my ($id, $pubmed, $curator, $title) = Tracer::GetLine($ih);
        Trace($stats->Ask('linesIn') . " references processed.") if $stats->Check(linesIn => 1000) && T(3);
        # Printable forms of the key fields, so that a blank or truncated
        # line does not trigger "uninitialized value" warnings in messages.
        my $idText = (defined $id && $id ne '' ? $id : '(missing ID)');
        my $pubmedText = (defined $pubmed ? $pubmed : '');
        # This will be set to TRUE if we find an error.
        my $error = 0;
        # Ensure we have a protein ID. A feature ID is converted to the MD5
        # of its protein; anything else is used as the protein ID unchanged.
        my $md5 = $id;
        if (! defined $id || $id eq '') {
            # Without an ID there is nothing to attach the reference to.
            Trace("No protein or feature ID specified for reference $pubmedText.") if T(0);
            $stats->Add(badId => 1);
            $error = 1;
        } elsif ($id =~ /^fig\|/) {
            $md5 = $fig->md5_of_peg($id);
            if (! defined $md5) {
                Trace("No protein sequence found for feature $id.") if T(0);
                $stats->Add(badFeature => 1);
                $error = 1;
            }
        }
        # The PUBMED ID must be a non-empty string of digits. A missing or
        # empty value is rejected here rather than being passed through.
        if (! defined $pubmed || $pubmed !~ /\A\d+\z/) {
            Trace("Invalid PUBMED ID $pubmedText specified for $idText.") if T(0);
            $stats->Add(badPubmed => 1);
            $error = 1;
        } elsif (! $curator) {
            Trace("No curator specified for $idText reference $pubmed.") if T(0);
            $stats->Add(noCurator => 1);
            $error = 1;
        }
        # Only proceed if there was no error.
        if (! $error) {
            # If there is a title, then ensure the literature item is in the
            # database.
            if ($title) {
                # We have a title. In override mode, we use update-title to
                # ensure an existing title is overridden. In normal mode we
                # use add-title and it only updates if the publication is new.
                if ($override) {
                    my $rc = $fig->update_title($pubmed, $title);
                    if ($rc == 0) {
                        # A title failure is logged and counted, but the
                        # reference itself is still added below.
                        Trace("Error updating title of publication $pubmed.") if T(0);
                        $stats->Add(titleError => 1);
                    } elsif ($rc == 1) {
                        $stats->Add(titleUpdate => 1);
                    } else {
                        $stats->Add(titleNoChange => 1);
                    }
                } else {
                    my $rc = $fig->add_title($pubmed, $title);
                    if ($rc == 0) {
                        $stats->Add(titleIgnore => 1);
                    } elsif ($rc == 1) {
                        $stats->Add(titleAdd => 1);
                    } else {
                        $stats->Add(titleNoChange => 1);
                    }
                }
            }
            # Now we can update the evidence codes.
            my $rc = $fig->add_dlit(-status => 'D',
                                    -md5 => $md5,
                                    -pubmed => $pubmed,
                                    -curator => $curator,
                                    -override => $override);
            if (! $rc) {
                $stats->Add(dlitAddFailed => 1);
            } else {
                $stats->Add(dlitAdded => 1);
            }
        }
    }
    Trace("Processing complete.") if T(2);
};
# Report the outcome of the eval block above, then display the statistics
# whether or not the run succeeded.
# $@ is copied into a lexical immediately: it is a global, and any eval
# executed inside T() or Trace() would reset it before the message is built.
my $evalError = $@;
if ($evalError) {
    Trace("Script failed with error: $evalError") if T(0);
} else {
    Trace("Script complete.") if T(2);
}
Trace("Statistics for this run: " . $stats->Show()) if T(2);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3