[Bio] / Sprout / SmallHarvester.pl Repository:
ViewVC logotype

View of /Sprout/SmallHarvester.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Dec 16 01:45:19 2009 UTC (9 years, 4 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_10262011, HEAD
Sapling v08 changes.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use Tracer;
use ERDB;
use FIG;
use Sapling;
use Stats;
use Time::HiRes;
use FullLocation;

=head1 Small Feature Harvester Script

    SmallHarvester [options] <genome1> <genome2> ...

Missing Feature Harvester

=head2 Introduction

This script looks for features in particular genomes that are missing from the
Sapling database and adds them. Only the basic feature and location
information is inserted. The rest is left for the next real load.

=head2 Positional Parameters

=over 4

=item genome1, genome2

IDs of the genomes whose features are to be harvested.

=back

=head2 Command-Line Options

=over 4

=item trace

Specifies the tracing level. The higher the tracing level, the more messages
will appear in the trace log. Use E to specify emergency tracing.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item help

Display this command's parameters and options.

=item warn

Create an event in the RSS feed when an error occurs.

=item phone

Phone number to message when the script is complete.

=back

=cut

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(ERDB Stats Sapling) ],
                                           {
                                              trace => ["2", "tracing level"],
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                           },
                                           "<genome1> <genome2> ...",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    # Create a statistics object to track our progress.
    my $stats = Stats->new();
    # Start a timer.
    my $totalStart = time();
    # Connect to the sapling database.
    my $sap = ERDB::GetDatabase('Sapling');
    # Get the maximum location  segment length. We'll need this later.
    my $maxLength = $sap->TuningParameter('maxLocationLength');
    # Get the SEED database.
    my $fig = FIG->new();
    # Loop through the genome list.
    for my $genome (@parameters) {
        $stats->Add(genome => 1);
        Trace("Processing genome $genome.") if T(2);
        # Get this genome's features in the Sapling into a hash. We use this
        # to see which features are missing.
        my %sapFeatures = map { $_ => 1 } $sap->GetFlat("IsOwnerOf",
                                                        'IsOwnerOf(from-link) = ?',
                                                        [$genome], 'to-link');
        # Get the features from the SEED.
        my $featureList = $fig->all_features_detailed($genome);
        # Loop through them.
        for my $featureTuple (@$featureList) {
            Trace($stats->Ask('feature') . " features processed.")
                if $stats->Check(features => 1000);
            # Get the current feature.
            my ($fid, $locs, undef, $type) = @$featureTuple;
            # Is it in the sapling?
            if ($sapFeatures{$fid}) {
                # Yes. Count it.
                $stats->Add(found => 1);
            } else {
                # No, we have to add it, so we need more information.
                # Start with the translation (if any).
                my $translation;
                # Is this a PEG?
                if ($type eq 'peg') {
                    # Yes. Get the translation of the protein sequence.
                    $translation = $fig->get_translation($fid);
                    # Compute the protein's ID.
                    my $prot_id = ERDB::DigestKey($translation);
                    # Does it already exist?
                    if (! $sap->Exists(ProteinSequence => $prot_id)) {
                        # No, we must create it.
                        $sap->InsertObject('ProteinSequence', id => $prot_id,
                                           sequence => $translation);
                    }
                    # Connect it to the new feature.
                    $sap->InsertObject('IsProteinFor', from_link => $prot_id,
                                       to_link => $fid);
                }
                # Now we create the feature itself. Get the location
                # information.
                my $loc = FullLocation->new($fig, $genome, $locs, $translation);
                # Get the sequence length.
                my $sequenceLength = $loc->Length;
                # Get the functional assignment.
                my $function = $fig->function_of($fid);
                # Create the feature.
                $sap->InsertObject('Feature', id => $fid, function => $function,
                                   locked => 0, sequence_length => $sequenceLength);
                # Connect it to the genome.
                $sap->InsertObject('IsOwnerOf', from_link => $genome,
                                   to_link => $fid);
                # Create its default identifier.
                $sap->InsertObject('Identifier', id => $fid, natural_form => $fid,
                                   source => 'SEED');
                $sap->InsertObject('Identifies', from_link => $fid,
                                   to_link => $fid);
                # Now we must create the location connections. This will track
                # the ordinal position of the current location segment.
                my $locN = 1;
                # Loop through the individual basic location objects.
                for my $loc (@{$loc->Locs}) {
                    $stats->Add(locations => 1);
                    # Extract the contig ID. Note that we need to prefix the
                    # genome ID to convert it to Sapling form.
                    my $contig = $loc->Contig();
                    my $contigID = "$genome:$contig";
                    # Now we peel off sections of the location and connect them
                    # to the feature.
                    my $peel = $loc->Peel($maxLength);
                    while (defined $peel) {
                        ConnectPeel($sap, $fid, $contigID, $loc, $locN, $peel,
                                    $stats);
                        $peel = $loc->Peel($maxLength);
                    }
                    # Output the residual. There will always be one, because of the way
                    # Peel works.
                    ConnectPeel($sap, $fid, $contigID, $loc, $locN, $peel, $stats);
                }
            }
        }
    }
    # Compute the total time.
    $stats->Add('total-time' => (time() - $totalStart));
    # Display the statistics from this run.
    Trace("Statistics for reload:\n" . $stats->Show()) if T(2);
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
} else {
    Trace("Script complete.") if T(2);
}
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "ERDBReloader completed.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

## This is a dink little subroutine that creates a location connection from a
## peeled location.
sub ConnectPeel {
    # Get the parameters.
    my ($sap, $fid, $contigID, $loc, $locN, $peel, $stats) = @_;
    # Output the connection.
    $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contigID,
                       ordinal => $locN, begin => $loc->Left(),
                       dir => $loc->Dir, len => $loc->Length());
    # Record this segment.
    $stats->Add(segments => 1);
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3