[Bio] / Sprout / LoadExpressionGenome.pl Repository:
ViewVC logotype

View of /Sprout/LoadExpressionGenome.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Jan 28 22:51:42 2011 UTC (8 years, 6 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_10262011, HEAD
Created script for loading expression data.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use Tracer;
use Stats;
use SaplingExpressionLoader;
use Sapling;
use SaplingGenomeLoader;
use File::Spec;
use FIG_Config;

=head1 LoadExpressionGenome Script

=head2 Introduction

    LoadExpressionGenome [options] genome1 genome2 ...

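This script loads the expression directory for each specified genome into a Sapling
database. If the C<genome> option is specified, each genome's own data is also loaded
before its expression data.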

=head2 Command-Line Options

=over 4

=item trace

Specifies the tracing level. The higher the tracing level, the more messages
will appear in the trace log. Use E to specify emergency tracing.

=item root

Directory in which the expression data is found. Each genome's expression data should be
in a subdirectory of this one with the same name as the genome ID. So, if the value
were C</vol/expression-data/current>, the expression data for genome C<100226.1> would
be in the directory C</vol/expression-data/current/100226.1>.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item help

Display this command's parameters and options.

=item warn

Create an event in the RSS feed when an error occurs.

=item genome

For each genome, also load the associated genome directory from the current SEED. This
will take place before loading the expression data.

=item init

If specified, the database will be initialized before loading.

=item dbName

Name of the database to contain the data, if different from the Sapling master database.

=back

=cut

# Get the command-line options and parameters. The positional parameters are
# the IDs of the genomes whose expression data is to be loaded.
my ($options, @parameters) = StandardSetup([qw(SaplingExpressionLoader SaplingGenomeLoader SaplingDataLoader) ],
                                           {
                                              trace => ["3", "tracing level"],
                                              root => ["/vol/expression/Jan5.processed/", "root directory for expression data subdirectories"],
                                              genome => ["", "if specified, the genome directory will be loaded before the expression data"],
                                              init => ["", "if specified, the database will be initialized before loading"],
                                              dbName => ["", "name of the database, if not the standard Sapling"],
                                           },
                                           "genome1 genome2 ...",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Create a statistics object. Every load and delete below rolls its counts
# into this object so they can be displayed at the end of the run.
my $stats = Stats->new();
# Ensure we catch errors.
eval {
    # Get the Sapling database. The database name is left undefined unless
    # the dbName option was specified.
    my $dbName;
    if ($options->{dbName}) {
        $dbName = $options->{dbName};
        Trace("Connecting to Sapling database $dbName.") if T(2);
    } else {
        Trace("Connecting to default Sapling database.") if T(2);
    }
    my $sap = Sapling->new(dbName => $dbName);
    # Initialize if we need to. Note that if we DO initialize, we indicate that we don't need
    # to clear genomes before loading.
    my $clearNeeded = 1;
    if ($options->{init}) {
        Trace("Initializing database.") if T(2);
        $sap->CreateTables();
        $sap->InternalizeDBD();
        $clearNeeded = 0;
    }
    # Get the expression directory.
    my $rootDirectory = $options->{root};
    if (! -d $rootDirectory) {
        Trace("Root directory $rootDirectory not found.") if T(0);
    } else {
        # Loop through the genomes.
        for my $genome (@parameters) {
            Trace("Processing genome $genome.") if T(2);
            # Compute the expression directory name. The result names a directory,
            # so the components are joined with catdir rather than catfile.
            my $expDirectory = File::Spec->catdir($rootDirectory, $genome);
            if (! -d $expDirectory) {
                # The directory is invalid, so we skip this genome.
                Trace("Expression directory $expDirectory not found.") if T(1);
                $stats->Add(missingDirectory => 1);
            } else {
                # The directory is valid. Check to see if we need to do the genome.
                if ($options->{genome}) {
                    # Here we have to load the genome first. Check to see if we have to clear.
                    if ($clearNeeded) {
                        # Clear the old genome data.
                        Trace("Deleting old data for genome $genome.") if T(3);
                        my $newStats = SaplingGenomeLoader::ClearGenome($sap, $genome);
                        # Update the delete counts.
                        AccumulateDeletions($stats, $newStats);
                    }
                    # Now load the genome from its organism directory.
                    my $newStats = SaplingGenomeLoader::Load($sap, $genome, "$FIG_Config::organisms/$genome");
                    # Roll up the statistics.
                    $stats->Accumulate($newStats);
                }
                # The genome is safe, so now we process the expression data.
                if ($clearNeeded) {
                    # Clear the old expression data.
                    Trace("Deleting old expression data for genome $genome.") if T(3);
                    my $newStats = SaplingExpressionLoader::ClearExpressionData($sap, $genome);
                    # Update the delete counts.
                    AccumulateDeletions($stats, $newStats);
                }
                # Finally, load the expression data itself.
                Trace("Loading expression data for genome $genome.") if T(3);
                my $newStats = SaplingExpressionLoader::Load($sap, $genome, $expDirectory);
                # Roll up the statistics, exactly as is done for the genome load,
                # so the expression counts appear in the final report.
                $stats->Accumulate($newStats);
            }
        }
    }
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# Display the statistics.
Trace("Statistics for this run:\n" . $stats->Show()) if T(2);

=head2 Utility Methods

=head3 AccumulateDeletions

    AccumulateDeletions($stats, $deleteStats);

Accumulate the deletions listed in a statistics object in another statistics object. As
they are transferred, the prefix C<delete> is added to each table name.

=over 4

=item stats

Target statistics object.

=item deleteStats

Statistics object containing the record-deletion counts for each table.

=back

=cut

sub AccumulateDeletions {
    # The first argument receives the counts; the second supplies them.
    my ($stats, $deleteStats) = @_;
    # Ask the source object for its hash of counts, keyed by table name.
    my $deleteCounts = $deleteStats->Map();
    # Copy each count into the target under the table name prefixed
    # with "delete-".
    $stats->Add("delete-$_", $deleteCounts->{$_}) for keys %{$deleteCounts};
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3