[Bio] / Sprout / FCupdate.pl Repository:
ViewVC logotype

View of /Sprout/FCupdate.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Wed May 8 20:28:44 2013 UTC (5 years, 9 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2014_0912, rast_rel_2014_0729, HEAD
Changes since 1.2: +2 -2 lines
Updates for new incremental load.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use Tracer;
use Stats;
use ERDB;
use Sapling;
use SeedUtils;
use FC;

=head1 FCupdate Script

=head2 Introduction

    FCupdate [options] 

Functional Coupling Update Script

This script examines the 1000 highest-scoring pair sets in the Sapling database
to identify genomes that should have their coupling data updated. It will then
look for a C<pchs.raw> file for each identified genome. If one is found, it will
be used to add new pairs and pair sets to the database. Existing pair sets may
also get split or deleted. This script should be run periodically to insure that
the coupling data is up to date.

=head2 Command-Line Options

=over 4

=item incomplete

If specified, then incomplete genomes will be included when looking for new
genomes to process; otherwise, only complete genomes will be processed.

=item trace

Specifies the tracing level. The higher the tracing level, the more messages
will appear in the trace log. Use E to specify emergency tracing.

=item genomes

If this parameter is specified, then instead of looking for new genomes, the
specific genomes listed will be processed. The genomes should be specified as a
comma-delimited list.

=item sampleSize

Rather than examine all of the pair sets, the highest-scoring sets are scanned
to look for unrepresented genomes. This parameter specifies the number of pair
sets to scan; the default is C<1000>.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item dbName

SQL name of the target database. If not specified, the default name is used.
This option allows you to specify a backup or alternate database that can
be updated without compromising the main database.

=item dbhost

Name of the MySQL database host. If not specified, the default host is used.
This option is required when the default host is restricted to read-only
database access.

=item help

Display this command's parameters and options.

=item warn

Create an event in the RSS feed when an error occurs.

=item phone

Phone number to message when the script is complete.

=back

=cut

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(FC Sapling) ],
                                           {
                                              dbName => ["", "if specified, the SQL name of the target database"],
                                              dbhost => ["", "if specified, the name of the target database"],
                                              incomplete => ["", "if specified, incomplete genomes will be processed"],
                                              trace => ["2", "tracing level"],
                                              genomes => ["", "specific genomes to process"],
                                              sampleSize => ["1000", "number of pair sets to sample when looking for new genomes"],
                                              phone => ["", "phone number (international format) to call when load finishes"]
                                           },
                                           "",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Create the statistics object.
my $stats = Stats->new();
my $myStartTime = time();
# Insure we catch errors.
eval {
    # Create the SAPLING object.
    my $sap = ERDB::GetDatabase('Sapling', undef, %$options);
    # We'll put our list of genomes to process in here.
    my @genomes;
    # Check for genomes specified on the command line.
    if ($options->{genomes}) {
        # If they're there, we extract themdirectly.
        @genomes = split m/\s*,\s*/, $options->{genomes};
    } else {
        # No command-line genomes, so we figure out the best ones by sampling
        # pair sets.
        @genomes = ComputeGenomes($sap, $options->{sampleSize},
                                  $options->{incomplete});
    }
    Trace(scalar(@genomes) . " genomes selected for re-coupling.") if T(3);
    # Loop through the genomes found.
    for my $genome (@genomes) {
        # Look for the PCH.RAW file in the organism directory.
        my $pchFile = "$FIG_Config::organisms/$genome/pchs.raw";
        if (-f $pchFile) {
            # We found one, so run the genome.
            Trace("Processing $genome PCH file $pchFile.") if T(2);
            FC::extend_pairs_and_pair_sets($sap, $pchFile, $stats);
            $stats->Add(genomes => 1);
        } else {
            # None exists, so issue a warning.
            Trace("PCH file $pchFile not found for $genome.") if T(1);
            $stats->Add(missingPCHfile => 1);
        }
    }
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# Display the run statistics.
$stats->Add(duration => (time() - $myStartTime));
Trace("Statistics for this run:\n" . $stats->Show()) if T(2);
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "FCupdate terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

=head3 ComputeGenomes

    my @genomes = ComputeGenomes($sap, $sampleSize, $incomplete);

Compute a list of genomes whose functional coupling data should be
updated. A sampling of pair sets is examined, and the return list will
contain any genomes not present in any of the pair sets.

=over 4

=item sap

L<Sapling> object to use for accessing the database.

=item sampleSize

Number of pair sets to sample. The specified number of pair sets with the
highest scores will be examined.

=item incomplete

If TRUE, then both complete and incomplete genomes will be returned; otherwise,
only complete genomes will be returned.

=item RETURN

Returns a list of the IDs of genomes whose functional coupling data probably
needs to be updated.

=back

=cut

sub ComputeGenomes {
    # Get the parameters.
    my ($sap, $sampleSize, $incomplete) = @_;
    # Validate the sample size.
    unless ($sampleSize =~ /^\d+$/) {
        Confess("Invalid sampleSize parameter.");
    }
    # We want to get a hash of all the genome IDs in the system. Initially, every
    # genome ID will map to 1. If a genome ID is found in a pair set, we map it
    # to 0. At the end, all genome IDs that still map to 1 are returned.
    # First, we filter out non-prokaryotes.
    my $filter = "Genome(prokaryotic) = ?";
    my @parms = (1);
    # Check to see if incomplete genomes are allowed. If not, we add the complete-flag
    # to the filter.
    if (! $incomplete) {
        $filter .= " AND Genome(complete) = ?";
        push @parms, 1;
    }
    # Create the genome ID hash.
    my %genomes = map { $_ => 1 } $sap->GetFlat("Genome", $filter, \@parms, "id");
    # Ask for the highest-scoring pair sets.
    my @sets = $sap->GetFlat("PairSet", "ORDER BY PairSet(score) DESC LIMIT $sampleSize", [], "id");
    # Loop through the sets.
    for my $set (@sets) {
        Trace("Processing pair set #$set for genome search.") if T(3);
        # This will count the pairings.
        my $count = 0;
        # Get all the pairs in this set.
        my $query = $sap->Get("IsDeterminedBy", "IsDeterminedBy(from-link) = ?",
                              [$set]);
        while (my $pairData = $query->Fetch()) {
            # Count this pair.
            $count++;
            # Record the genomes for the pegs in the pair. The pegs can be found
            # separated by a colon in the pairing ID.
            for my $peg (split m/:/, $pairData->PrimaryValue('to-link')) {
                $genomes{genome_of($peg)} = 0;
            }
        }
    }
    # Find the unrepresented genomes.
    my @retVal = grep { $genomes{$_} } sort keys %genomes;
    # Return them.
    return @retVal;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3