[Bio] / Sprout / SaplingDataLoader.pm Repository:
ViewVC logotype

View of /Sprout/SaplingDataLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Mar 23 18:42:34 2011 UTC (9 years ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_04082011, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_3, mgrast_dev_03312011, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, mgrast_dev_04052011
Changes since 1.1: +135 -0 lines
More loader updates.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package SaplingDataLoader;

    use strict;
    use Tracer;
    use Stats;
    use SeedUtils;
    use SAPserver;
    use Sapling;

=head1 Sapling Data Loader

This is the base class for packages that load the Sapling database from
SEED data files.

=head2 Loader Object Methods

=head3 new

    my $loaderObject = SaplingGenomeLoader->new($sap, @stats);

Create a loader object that can be used to facilitate loading Sapling data from a
directory.

=over 4

=item sap

L<Sapling> object used to access the target database.

=item stats

List of names for statistics to be initialized in the statistics object.

=back

The object created contains the following fields.

=over 4

=item supportRecords

A hash of hashes, used to track the support records known to exist in the database.

=item sap

L<Sapling> object used to access the database.

=item stats

L<Stats> object for tracking statistical information about the load.

=back

=cut

sub new {
    # First argument is the class name, second is the Sapling database object;
    # whatever remains is passed through to the Stats constructor as the list
    # of statistic names.
    my ($class, $sap, @stats) = @_;
    # Assemble the three loader fields, bless them into the requested class,
    # and hand the new object straight back to the caller. The support-record
    # cache starts out empty.
    return bless {
        sap => $sap,
        stats => Stats->new(@stats),
        supportRecords => {},
    }, $class;
}

=head2 Internal Utility Methods

=head3 DeleteRelatedRecords

    DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);

Delete all the records in the named entity and relationship relating to the
specified genome and roll up the statistics in the specified statistics object.

=over 4

=item sap

L<Sapling> object for accessing the database.

=item genome

ID of the relevant genome.

=item stats

L<Stats> object for tracking the delete activity.

=item relName

Name of a relationship from the B<Genome> table.

=item entityName

Name of the entity on the other side of the relationship.

=back

=cut

sub DeleteRelatedRecords {
    # Unpack the database, the genome ID, the statistics object, and the
    # names of the relationship and the entity on its far side.
    my ($sap, $genome, $stats, $relName, $entityName) = @_;
    # Build the filter that selects relationship rows leaving this genome.
    my $filter = "$relName(from-link) = ?";
    # Fetch the to-link of every such row; these are the target entity IDs.
    my @targetIDs = $sap->GetFlat($relName, $filter, [$genome], "to-link");
    # For each target: remove the connecting row first, then the entity.
    foreach my $targetID (@targetIDs) {
        # Drop the relationship row and count it under the relationship name.
        $sap->DeleteRow($relName, $genome, $targetID);
        $stats->Add($relName => 1);
        # Drop the entity itself; the delete call hands back its own
        # statistics, which are merged into the caller's statistics object.
        my $deleteStats = $sap->Delete($entityName, $targetID);
        $stats->Accumulate($deleteStats);
    }
}

=head3 ExtractFields

    my %fieldHash = SaplingDataLoader::ExtractFields($tableName, $dataHash);

Extract from the incoming hash the field names and values from the specified table.

=over 4

=item tableName

Name of the table whose field names and values are desired.

=item dataHash

Reference to a hash mapping fully-qualified ERDB field names to values.

=item RETURN

Returns a hash containing only the fields from the specified table and their values.

=back

=cut

sub ExtractFields {
    # Get the parameters: the table of interest and a reference to a hash
    # keyed by field names of the form "Table(field)".
    my ($tableName, $dataHash) = @_;
    # Declare the return variable.
    my %retVal;
    # Build the matching pattern once, outside the loop. The table name is
    # quoted with \Q...\E so that any regex metacharacters in it are matched
    # literally instead of being interpreted as pattern syntax. The capture
    # group picks up the field name inside the parentheses.
    my $fieldPattern = qr/^\Q$tableName\E\(([^)]+)/;
    # Extract the desired fields.
    for my $field (keys %$dataHash) {
        # Is this a field for the specified table?
        if ($field =~ $fieldPattern) {
            # Yes, put it in the output hash under its unqualified name.
            $retVal{$1} = $dataHash->{$field};
        }
    }
    # Return the computed hash (as a list of key/value pairs).
    return %retVal;
}

=head3 InsureEntity

    my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);

Insure that the specified record exists in the database. If no record is found of the
specified type with the specified ID, one will be created with the indicated fields.

=over 4

=item entityType

Type of entity to check.

=item id

ID of the entity instance in question.

=item fields

Hash mapping field names to values for all the fields in the desired entity record except
for the ID.

=item RETURN

Returns TRUE if a new object was created, FALSE if it already existed.

=back

=cut

sub InsureEntity {
    # Unpack the loader object, the entity type and ID, and the remaining
    # field name/value pairs for a possible insert.
    my ($self, $entityType, $id, %fields) = @_;
    # The support-record cache is a hash of hashes keyed by entity type and
    # then by ID. Make sure the sub-hash for this type is present.
    my $knownTypes = $self->{supportRecords};
    $knownTypes->{$entityType} = {} if ! defined $knownTypes->{$entityType};
    my $knownIDs = $knownTypes->{$entityType};
    # If the cache already lists this ID, nothing was created and the
    # database does not need to be consulted.
    return 0 if $knownIDs->{$id};
    # Not cached: ask the database, and insert the record only if it is
    # absent there as well.
    my $sap = $self->{sap};
    my $created = 0;
    if (! $sap->Exists($entityType => $id)) {
        $sap->InsertObject($entityType, id => $id, %fields);
        # Count the insert under the "insertSupport" statistic.
        $self->{stats}->Add(insertSupport => 1);
        $created = 1;
    }
    # Either way the record is now known to exist, so remember it.
    $knownIDs->{$id} = 1;
    # Tell the caller whether a new record was made.
    return $created;
}

=head3 ConnectFunctionRoles

    $self->ConnectFunctionRoles($fid, $function);

Connect the specified feature to the roles indicated by its functional assignment.

=over 4

=item fid

ID of the feature of interest.

=item function

Functional assignment for the feature. Most of the time, this corresponds to a single role,
but that is not always the case.

=back

=cut

sub ConnectFunctionRoles {
    # Unpack the loader object, the feature ID, and its functional assignment.
    my ($self, $fid, $function) = @_;
    # Pull the database and statistics objects out of the loader together.
    my ($sap, $stats) = @{$self}{qw(sap stats)};
    # Split the function into its roles; the helper also reports an error
    # count, which is folded into the statistics regardless of the outcome.
    my ($roleList, $errorCount) = SeedUtils::roles_for_loading($function);
    $stats->Add(roleErrors => $errorCount);
    if (defined $roleList) {
        # A role list came back, so attach each role to the feature.
        foreach my $roleID (@$roleList) {
            # Make sure the role record exists, recording the hypo() result
            # for the role in its "hypothetical" field if it must be created.
            my $isHypothetical = hypo($roleID);
            $self->InsureEntity(Role => $roleID, hypothetical => $isHypothetical);
            # Link the role to the feature.
            $sap->InsertObject('IsFunctionalIn', from_link => $roleID, to_link => $fid);
        }
    } else {
        # No role list came back: count this as a bad function.
        $stats->Add(badFunction => 1);
    }
}

=head3 ComputeAnnotationID

    my $annotationID = SaplingDataLoader::ComputeAnnotationID($fid, $keyStamp);

Compute the annotation ID for the specified feature and timestamp. The annotation ID contains
an inverted timestamp (9999999999 minus the original value), so that higher (more recent)
timestamps sort earlier in the ordering.

=over 4

=item fid

Relevant feature ID.

=item keyStamp

Timestamp to be used to form the key.

=item RETURN

Returns an ID string formed from the feature ID and the inverted timestamp.

=back

=cut

sub ComputeAnnotationID {
    # Unpack the feature ID and the timestamp.
    my ($fid, $keyStamp) = @_;
    # Invert the timestamp by subtracting it from the ten-digit maximum.
    my $inverted = 9999999999 - $keyStamp;
    # Pad the inverted value via Tracer::Pad (length 10, pad character "0").
    # NOTE(review): the third argument presumably requests padding on the
    # left -- confirm against Tracer::Pad.
    my $suffix = Tracer::Pad($inverted, 10, 1, "0");
    # Join the feature ID and the padded value with a colon.
    return join(":", $fid, $suffix);
}

=head3 ComputeKeyStamp

    my $keyStamp = SaplingDataLoader::ComputeKeyStamp($annotationID, $default);

Compute the timestamp value from the specified annotation ID. The timestamp portion is
parsed out and then inverted to get the original time value.

=over 4

=item annotationID

The annotation ID to parse for the timestamp.

=item default

Default value to return if the original annotation ID is undefined or invalid.

=item RETURN

Returns the timestamp value used to compute the original annotation ID.

=back

=cut

sub ComputeKeyStamp {
    # Unpack the annotation ID and the fallback value.
    my ($annotationID, $default) = @_;
    # A missing or empty ID cannot be parsed, so fall back immediately.
    return $default unless $annotationID;
    # Look for a colon followed by digits; without one, fall back as well.
    return $default unless $annotationID =~ /:(\d+)/;
    # Undo the inversion applied when the ID was built to recover the
    # original timestamp.
    return 9999999999 - $1;
}

=head2 The Process Method

Each loader must provide a C<Process> method for processing input from the
master file of load instructions. The master file contains a load type in the
first column that indicates the relevant load class (e.g. C<Function> for
L<SaplingFunctionLoader>). The remaining columns are the parameters passed to
the load method in sequence. The load method first clears existing data (if
necessary), then loads the new data.

    my $stats = SaplingDataLoader::Process($sap, @parms);

=cut


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3