[Bio] / Sprout / SaplingLoadCheck.pl Repository:
ViewVC logotype

View of /Sprout/SaplingLoadCheck.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (download) (as text) (annotate)
Fri Jul 10 14:22:27 2015 UTC (3 years, 9 months ago) by parrello
Branch: MAIN
CVS Tags: HEAD
Changes since 1.7: +27 -13 lines
Added replaceAll option for sapling load check.

#!/usr/bin/perl -w

=head1 Sapling Incremental Load

This script performs a periodic, incremental update of a Sapling database
from the main SEED files. The update is determined by comparing the SEED files
to the current database content. In particular, the list of genomes, the list of
subsystems, and the expression data will be compared. The Taxonomy will always
be reloaded. FIGfams will be reloaded if they are a new release. It is possible
that subsystems and genomes may be deleted.

The currently-supported command-line options are as follows.

=over 4

=item create

If specified, the database is presumed to be new. The tables will be created and the DBD stored.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directed to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item dbhost

Alternate database host, if the database is located somewhere other than the
default. This is necessary on some Sapling machines to ensure we get a writable
copy of the database.

=item dbName

Name of the sapling database to update, if it is not the standard one.

=item port

Database access port. This is useful for testing.

=item notaxon

If specified, the taxonomy information will NOT be reloaded.

=item fighost

Alternate database host for the SEED. This is useful for testing.

=item figport

Alternate database access port for the SEED. This is useful for testing.

=item figdisk

Alternate directory for the SEED. This is useful for testing.

=item figdb

Name of the database for the SEED. This is useful for testing.

=item replaceAll

If specified, all genomes will be replaced. This is an expensive operation, but
occasionally necessary.

=back

=cut

use strict;
use Tracer;
use Sapling;
use SaplingDataLoader;
use Stats;

use SaplingExpressionLoader;
use SaplingFunctionLoader;
use SaplingGenomeLoader;
use SaplingSubsystemLoader;
use SaplingTaxonomyLoader;
use SaplingFamilyLoader;
use FIG;

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(SaplingDataLoader) ],
                                           { dbhost => ["", "alternate database host machine"],
                                             port => ["", "alternate database port"],
                                             notaxon => ["", "if specified, the taxonomy data will NOT be reloaded"],
                                             fighost => ["", "alternate SEED database host"],
                                             figport => ["", "alternate SEED database port"],
                                             figdisk => ["", "FIG instance directory (requires a special FIG_Config)"],
                                             figdb => ["", "name of the SEED MySQL database"],
                                             dbName => ["", "name of the Sapling database to update"],
                                             create => ["", "create the database tables"],
                                             replaceAll => ["", "replace all genomes"] },
                                           "",
                                           @ARGV);
# Create the statistics object. It lives outside the eval so that whatever
# was accumulated can still be displayed if the load fails part-way through.
my $stats = Stats->new();
# Insure we catch errors.
eval {
    # Get the Sapling database.
    my $sap = Sapling->new(dbhost => $options->{dbhost}, port => $options->{port},
                           dbName => $options->{dbName});
    # Get the SEED data. We may need to update some of the configuration
    # parameters. The overrides are applied before the source object is
    # requested below.
    if ($options->{fighost}) {
        $FIG_Config::dbhost = $options->{fighost};
    }
    if ($options->{figport}) {
        $FIG_Config::dbport = $options->{figport};
    }
    if ($options->{figdisk}) {
        $FIG_Config::fig_disk = $options->{figdisk};
        $FIG_Config::global = "$options->{figdisk}/FIG/Data/Global";
        $FIG_Config::organisms = "$options->{figdisk}/FIG/Data/Organisms";
        $FIG_Config::data = "$options->{figdisk}/FIG/Data";
    }
    if ($options->{figdb}) {
        $FIG_Config::db = "$options->{figdb}";
    }
    my $fig = $sap->GetSourceObject();
    # Check for a table-create situation.
    if ($options->{create}) {
        # Store the DBD.
        $sap->InternalizeDBD();
        Trace("DBD stored in database.") if T(2);
        # Recreate the tables.
        $sap->CreateTables();
    }
    # Update the taxonomies.
    if (! $options->{notaxon}) {
        my $setFile = "$FIG_Config::global/genome.sets";
        if (! -f $setFile) {
            # No OTU file: pass an empty name to the taxonomy loader.
            $setFile = "";
            Trace("WARNING: No OTU file found.") if T(1);
        }
        Trace("Updating taxonomy data.") if T(2);
        my $subStats = SaplingTaxonomyLoader::Process($sap, "$FIG_Config::global/Taxonomy",
                $setFile);
        # Fold the taxonomy statistics into the run totals, as is done for
        # every other loader. The guard protects against a loader that
        # returns nothing.
        $stats->Accumulate($subStats) if $subStats;
    }
    # Compute the updated subsystems. A subsystem is "changed" if it is new or
    # its version number has changed. A subsystem is "deleted" if it is in the
    # database but not the SEED. The function below returns a hash
    # reference. In the case of the changed subsystems, the hash reference
    # maps the subsystem ID to its directory name. In the case of deleted subsystems,
    # the hash reference maps the subsystem ID to an empty string.
    my $changedSubsystems = ComputeSubsystemChanges($stats,
            $sap, $fig);
    # Perform the subsystem changes. This returns a hash that maps each genome
    # to a list of the updated subsystems of which it is a direct member.
    my $subsysGenomes = UpdateSubsystems($changedSubsystems, $stats, $sap);
    # Update the genomes. This returns a reference to a list of the genomes
    # added (or replaced).
    my $newGenomes = UpdateGenomes($stats, $sap, $options->{replaceAll});
    # Loop through the genomes, applying the bindings.
    UpdateBindings($changedSubsystems, $subsysGenomes, $newGenomes, $stats, $sap);
    # Now we must compare the expression data.
    UpdateExpressionData($stats, $sap);
    # Finally, we need to update the FIGfams. First, we must find
    # the latest figfam-prod release directory.
    my $figFamRoot = "/vol/figfam-prod";
    my @releases = sort { Tracer::Cmp($a, $b) } grep { $_ =~ /^Release\d+/ } OpenDir($figFamRoot);
    # Find the first valid FIGfam directory, scanning from the end of the
    # sorted release list backward. A release is considered valid only if
    # its coupling.values file exists.
    my $figFamRel;
    for my $release (reverse @releases) {
        if (-f "$figFamRoot/$release/coupling.values") {
            $figFamRel = $release;
            last;
        }
    }
    if (! $figFamRel) {
        Confess("No FIGfam directory found.");
    } else {
        # We have a FIGfam release directory.
        my $figFamDir = "$figFamRoot/$figFamRel";
        Trace("FIGfams are currently in $figFamDir.") if T(2);
        # Get the current release from the database. GetFlat is used as a
        # list-returning method elsewhere in this script, so the result is
        # taken in list context; a scalar assignment would risk capturing
        # the number of rows rather than the version string.
        my ($dbRel) = $sap->GetFlat('FamilyType', "FamilyType(id) = ?",
                ['FIGfams'], 'version');
        if (! $dbRel || $dbRel ne $figFamRel) {
            # Here we have a new release, so we need to reload.
            Trace("Reloading FIGfams.") if T(2);
            my $subStats = SaplingFamilyLoader::Process($sap, $figFamDir);
            $stats->Accumulate($subStats);
            # Update the release information.
            if (! $dbRel) {
                Trace("Adding release marker for $figFamRel.") if T(2);
                $sap->InsertObject('FamilyType', id => 'FIGfams',
                        version => $figFamRel);
            } else {
                Trace("Updating release marker for $figFamRel.") if T(2);
                $sap->UpdateEntity('FamilyType', 'FIGfams', version => $figFamRel);
            }
        }
    }
    # All done.
    Trace("Processing complete.") if T(2);
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
} else {
    Trace("Script complete.") if T(2);
}
Trace("Statistics for this run:\n" . $stats->Show()) if T(2);

=head2 Subroutines

=head3 ComputeSubsystemChanges

    my $changedSubsystems = ComputeSubsystemChanges($stats, $sap, $fig);

This method compares the subsystems in the SEED to the subsystems in the
Sapling. If the subsystem is in both places and the version number has changed,
or if it is only in the SEED, it will be marked for reloading. If it is only
in the Sapling, it will be marked for deletion.

=over 4

=item stats

A L<Stats> object that will be used to record the method's activity.

=item sap

The L<Sapling> object used to communicate with the database.

=item fig

A L<FIG> object used to communicate with the SEED.

=item RETURN

Returns a reference to a hash keyed by subsystem ID. For subsystems to be
loaded or reloaded, it maps the ID to the subsystem's directory name. For
subsystems to be deleted, it maps the ID to an empty string.

=back

=cut

sub ComputeSubsystemChanges {
    # Unpack the statistics object, the Sapling database and the SEED object.
    my ($stats, $sap, $fig) = @_;
    Trace("Analyzing subsystems.") if T(2);
    # The SEED side needs a direct database query to obtain the version
    # numbers, and each SEED subsystem name has to be converted to a
    # Sapling subsystem ID.
    Trace("Reading subsystems from SEED.") if T(3);
    # The configured list of subsystems acts as a filter.
    my $wantedSubs = $sap->SubsystemHash();
    # Pull the (subsystem, version) rows from the SEED database.
    my $seedRows = $fig->db_handle->SQL("SELECT `subsystem`, `version` FROM subsystem_metadata");
    # Convert every row's name to a Sapling ID.
    my @idRowPairs;
    for my $row (@$seedRows) {
        push @idRowPairs, [$sap->SubsystemID($row->[0]), $row];
    }
    # Keep only the configured subsystems. Each surviving ID maps to its
    # original [name, version] row.
    my %seedSubs;
    for my $pair (@idRowPairs) {
        my ($subID, $row) = @$pair;
        $seedSubs{$subID} = $row if $wantedSubs->{$subID};
    }
    # Build the ID-to-version map for the Sapling side.
    Trace("Reading subsystems from Sapling.") if T(3);
    my %sapSubs;
    for my $tuple ($sap->GetAll("Subsystem", "", [], 'id version')) {
        $sapSubs{$tuple->[0]} = $tuple->[1];
    }
    # This will hold the changes found.
    my %retVal;
    Trace("Scanning subsystems for changes.") if T(2);
    # First pass: SEED subsystems that are new or have a higher version.
    # The key list is computed up front because entries may be deleted
    # from the hash inside the loop.
    for my $subID (sort keys %seedSubs) {
        $stats->Add(seedSubsystemsChecked => 1);
        # The row holds the SEED name (also the directory name) and version.
        my ($dirName, $seedVersion) = @{$seedSubs{$subID}};
        my $seedDirectory = "$FIG_Config::data/Subsystems/$dirName";
        # A subsystem with no data directory is treated as absent from the
        # SEED, so the second pass will see it as deleted if Sapling has it.
        if (! -d $seedDirectory) {
            Trace("Subsystem $subID is not found in the data directory.") if T(3);
            delete $seedSubs{$subID};
            next;
        }
        my $sapVersion = $sapSubs{$subID};
        if (! defined $sapVersion) {
            # Not in the Sapling at all: it is new.
            $stats->Add(seedSubsystemsNewFound => 1);
            $retVal{$subID} = $seedDirectory;
        } elsif ($seedVersion > $sapVersion) {
            # The SEED copy is more recent than the Sapling copy.
            $stats->Add(seedSubsystemsChangedFound => 1);
            $retVal{$subID} = $seedDirectory;
            Trace("Must update $subID from $sapVersion to $seedVersion.") if T(3);
        }
    }
    # Second pass: Sapling subsystems that no longer exist in the SEED.
    Trace("Scanning for deleted subsystems.") if T(2);
    for my $subID (sort keys %sapSubs) {
        $stats->Add(sapSubsystemsChecked => 1);
        next if exists $seedSubs{$subID};
        # An empty directory name marks the subsystem for deletion.
        $stats->Add(sapSubsystemDeletesFound => 1);
        $retVal{$subID} = '';
    }
    # Hand back the accumulated changes.
    return \%retVal;
}

=head3 UpdateSubsystems

    my $subsysGenomes = UpdateSubsystems($changedSubsystems, $stats, $sap);

Update the specified subsystems in the database. New subsystems will be added,
existing subsystems may be changed, and obsolete subsystems will be deleted. The
return hash will indicate which genomes are direct members of updated subsystems.
When the subsystem bindings are examined for the various genomes, the direct
members will be skipped, since they will already have been updated by this
process.

=over 4

=item changedSubsystems

Reference to a hash mapping the IDs of the modified subsystems. If a subsystem is
to be deleted, it will map to an empty string. If it is to be created or updated,
it will map to the name of the directory containing the subsystem data.

=item stats

L<Stats> object to be updated with statistics from this operation.

=item sap

L<Sapling> object for accessing the database.

=item RETURN

Returns a reference to a hash that maps each genome modified by the subsystem
changes to a list of the subsystems containing it.

=back

=cut

sub UpdateSubsystems {
    # Unpack the change hash, the statistics object and the database.
    my ($changedSubsystems, $stats, $sap) = @_;
    Trace("Processing subsystem updates.") if T(2);
    # Maps each genome ID to the list of updated subsystems it belongs to.
    my %exceptionsFor;
    # Work through the changes in subsystem ID order.
    for my $subID (sort keys %$changedSubsystems) {
        my $sourceDir = $changedSubsystems->{$subID};
        if ($sourceDir) {
            # A directory name means the subsystem is loaded or reloaded.
            Trace("Updating subsystem $subID from $sourceDir.") if T(2);
            $stats->Accumulate(SaplingSubsystemLoader::Process($sap, $subID, $sourceDir));
            $stats->Add(foundSubsystemsUpdated => 1);
            # Record the genomes reached from this subsystem so the later
            # bindings pass can skip this subsystem for them.
            for my $genomeID ($sap->GetFlat('Describes IsImplementedBy IsUsedBy',
                    'Describes(from-link) = ?', [$subID], 'IsUsedBy(to-link)')) {
                push @{$exceptionsFor{$genomeID}}, $subID;
                $stats->Add(bindingExceptions => 1);
            }
        } else {
            # An empty directory name means the subsystem is deleted. Nothing
            # is recorded in the return hash for it: deleted subsystems are
            # never passed to the bindings pass.
            Trace("Deleting subsystem $subID.") if T(2);
            $stats->Accumulate(SaplingSubsystemLoader::ClearSubsystem($sap, $subID));
            $stats->Add(foundSubsystemsDeleted => 1);
        }
    }
    # Hand back the genome-to-subsystems exception map.
    return \%exceptionsFor;
}

=head3 UpdateGenomes

    my $newGenomes = UpdateGenomes($stats, $sap, $all);

Compare the genomes in the Sapling database to the genomes in the SEED. New
genomes will be added and obsolete genomes will be deleted. A list of the
genomes added will be returned to help control the processing of the
subsystem bindings. For those genomes, the subsystem bindings will already
have been processed by this method.

=over 4

=item stats

L<Stats> object to contain statistics on this operation.

=item sap

L<Sapling> object for accessing the Sapling database.

=item all

If TRUE, then all genomes will be considered new, forcing a mass replacement.

=item RETURN

Returns a reference to a list of the IDs for the genomes added to the
Sapling by this method.

=back

=cut

sub UpdateGenomes {
    # Unpack the statistics object, the database and the replace-all flag.
    my ($stats, $sap, $all) = @_;
    # IDs of the genomes loaded by this call.
    my @loaded;
    Trace("Processing genomes.") if T(2);
    # The genomes the SEED says should exist.
    Trace("Retrieving genomes from SEED.") if T(3);
    my $seedGenomes = $sap->GenomeHash();
    # The genomes Sapling currently holds, as a lookup hash.
    my %inSapling;
    $inSapling{$_} = 1 for $sap->GetFlat('Genome', "", [], 'id');
    Trace("Scanning for new genomes.") if T(2);
    # Pass one: load every SEED genome that is missing, or every SEED genome
    # without exception when a full replacement was requested.
    for my $genomeID (sort keys %$seedGenomes) {
        $stats->Add(seedGenomesChecked => 1);
        my $alreadyLoaded = $inSapling{$genomeID};
        next if $alreadyLoaded && ! $all;
        # Count it as a replacement if it was already present, else as new.
        my $counter = ($alreadyLoaded ? 'seedGenomesReplaced' : 'seedGenomesNewFound');
        $stats->Add($counter => 1);
        $stats->Accumulate(SaplingGenomeLoader::Process($sap, $genomeID,
                "$FIG_Config::organisms/$genomeID"));
        # Remember that this genome was loaded.
        push @loaded, $genomeID;
    }
    # Pass two: remove Sapling genomes the SEED no longer lists.
    for my $genomeID (sort keys %inSapling) {
        $stats->Add(sapGenomesChecked => 1);
        next if $seedGenomes->{$genomeID};
        $stats->Add(sapGenomeDeletes => 1);
        $stats->Accumulate(SaplingGenomeLoader::ClearGenome($sap, $genomeID));
    }
    # Hand back the list of loaded genomes.
    return \@loaded;
}

=head3 UpdateBindings

    UpdateBindings($changedSubsystems, $subsysGenomes, $newGenomes, $stats, $sap);

Run through all the genomes in the database, updating the subsystem
bindings. Only the subsystems listed as changed will be processed. New genomes
will be skipped, because the bindings were processed when the genome was
loaded. Genomes that are direct subsystem members will be skipped for those
subsystems, since those bindings were processed when the subsystem was loaded.

=over 4

=item changedSubsystems

Reference to a hash whose keys are the IDs of modified subsystems. These
subsystems are the ones whose bindings will be processed.

=item subsysGenomes

Reference to a hash mapping genome IDs to the IDs of the subsystems for
which the genomes are direct members. Each genome ID is mapped to a list
of subsystem IDs.

=item newGenomes

Reference to a list of the genomes that have been added in this run.

=item stats

L<Stats> object for tracking the statistics of this operation.

=item sap

L<Sapling> object for connecting to the Sapling database.

=back

=cut

sub UpdateBindings {
    # Unpack the change hash, the exception map, the new-genome list, the
    # statistics object and the database.
    my ($changedSubsystems, $subsysGenomes, $newGenomes, $stats, $sap) = @_;
    # Only subsystems with a directory name are of interest; the ones
    # mapped to an empty string were deleted.
    my @updatedSubs;
    for my $subID (keys %$changedSubsystems) {
        push @updatedSubs, $subID if $changedSubsystems->{$subID};
    }
    # Work out which genomes were NOT loaded during this run.
    my $genomeHash = $sap->GenomeHash();
    my %isNew;
    $isNew{$_} = 1 for @$newGenomes;
    my @oldGenomes;
    for my $genomeID (sort keys %$genomeHash) {
        push @oldGenomes, $genomeID unless $isNew{$genomeID};
    }
    # Refresh the bindings for each of the older genomes.
    Trace("Processing genome bindings.") if T(2);
    for my $genomeID (@oldGenomes) {
        Trace("Processing bindings for $genomeID.") if T(3);
        # Subsystems already bound to this genome during the subsystem
        # update are excluded from the work list.
        my %skip = map { $_ => 1 } @{$subsysGenomes->{$genomeID}};
        my @workList = grep { ! $skip{$_} } @updatedSubs;
        # A per-genome loader does the actual binding work.
        my $loader = SaplingGenomeLoader->new($sap, $genomeID, "$FIG_Config::organisms/$genomeID");
        $loader->LoadSubsystems(\@workList);
        # Merge the loader's statistics into the run totals.
        $stats->Accumulate($loader->{stats});
    }
}

=head3 UpdateExpressionData

    UpdateExpressionData($stats, $sap);

Compare the expression data in the SEED and in the Sapling database. Ensure that
all genomes in the Sapling which have expression data in the SEED have expression
data in the Sapling as well.

=over 4

=item stats

L<Stats> object to track the activity of this operation.

=item sap

L<Sapling> object for connecting to the database.

=back

=cut

sub UpdateExpressionData {
    # Unpack the statistics object and the database.
    my ($stats, $sap) = @_;
    # Location of the expression data on disk.
    my $expDirectory = '/vol/expression/current';
    # Lookup hash of every genome in the Sapling.
    my %inSapling;
    $inSapling{$_} = 1 for $sap->GetFlat('Genome', '', [], 'id');
    # Lookup hash of the Sapling genomes that already have expression data.
    # Entries are removed as matching directories are seen, so whatever is
    # left at the end has no data on disk and must be purged.
    my %unmatched;
    $unmatched{$_} = 1 for $sap->GetFlat('Genome HadResultsProducedBy',
            '', [], 'id');
    # List the expression directory. Each entry name is used as a genome ID.
    my @expFiles = OpenDir($expDirectory, 1);
    Trace("Processing expression data. " . scalar(@expFiles) . " directories found.") if T(2);
    for my $genomeID (@expFiles) {
        # Entries for genomes the Sapling does not hold are ignored.
        unless ($inSapling{$genomeID}) {
            $stats->Add(expressionGenomeSkipped => 1);
            next;
        }
        # If the genome already has expression data, just protect it from
        # the purge below. The delete doubles as the membership test
        # because every stored value is 1.
        if (delete $unmatched{$genomeID}) {
            $stats->Add(expressionGenomeFound => 1);
            next;
        }
        # Otherwise the genome is in the Sapling without expression data,
        # so load it now.
        Trace("Loading expression data for $genomeID.") if T(2);
        $stats->Add(expressionGenomeNew => 1);
        my $subStats = SaplingExpressionLoader::Process($sap, $genomeID,
                "$expDirectory/$genomeID");
        $stats->Accumulate($subStats);
    }
    # Purge expression data for genomes with no directory on disk.
    for my $genomeID (sort keys %unmatched) {
        Trace("Deleting expression data for $genomeID.") if T(2);
        $stats->Add(expressionGenomeDelete => 1);
        my $subStats = SaplingExpressionLoader::ClearExpressionData($sap, $genomeID);
        $stats->Accumulate($subStats);
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3