[Bio] / Sprout / SubsystemSaplingLoader.pm Repository:
ViewVC logotype

View of /Sprout/SubsystemSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Thu Apr 2 01:49:17 2009 UTC (10 years, 5 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2009_05_18
Changes since 1.3: +4 -0 lines
Fixed to clean subsystem objects.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package SubsystemSaplingLoader;

    use strict;
    use Tracer;
    use ERDB;
    use base 'BaseSaplingLoader';

=head1 Sapling Subsystem Load Group Class

=head2 Introduction

The Subsystem Load Group includes all of the major subsystem-related tables.

=head3 new

    my $sl = SubsystemSaplingLoader->new($erdb, $options, @tables);

Construct a new SubsystemSaplingLoader object.

=over 4

=item erdb

[[SaplingPm]] object for the database being loaded.

=item options

Reference to a hash of command-line options.

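=item tables

List of tables in this load group. Any tables passed by the caller are
ignored: the constructor always supplies its own fixed list of
subsystem-related tables to the base class.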

=back

=cut

sub new {
    # Unpack the class name, the database object, and the option hash. Any
    # further arguments are not read.
    my ($class, $erdb, $options) = @_;
    # These are the tables that make up the subsystem load group.
    my @groupTables = qw(Subsystem IsClassFor SubsystemClass IsSuperclassOf
                         Includes Describes Role Variant IsRoleOf
                         IsImplementedBy MachineRole IsMachineOf
                         MolecularMachine IsContainedIn Uses);
    # Let the base loader build the object, handing it the tables in sorted
    # order.
    my $loader = BaseSaplingLoader::new($class, $erdb, $options,
                                        sort @groupTables);
    # The base loader's result is our result.
    return $loader;
}

=head2 Public Methods

=head3 Generate

    $sl->Generate();

Generate the data for the subsystem-related files.

=cut

sub Generate {
    # The loader object is the only parameter.
    my ($self) = @_;
    # Fetch the database object and the source object up front; both
    # generators take them as arguments.
    my $sapling = $self->db();
    my $source = $self->source();
    # The global section gets the subsystem framework.
    return $self->GenerateSubsystems($source, $sapling) if $self->global();
    # Otherwise the section ID is used as a genome ID, and we generate the
    # subsystem data for that genome.
    my $genomeID = $self->section();
    return $self->GenerateSubsystemData($source, $sapling, $genomeID);
}

=head3 GenerateSubsystems

    $sl->GenerateSubsystems($fig, $erdb);

Generate the subsystems, variants, and roles for this database. This
method concerns itself primarily with the genome-independent part of the
subsystem framework. This includes the following tables:

    Subsystem
    Describes
    Variant
    Includes
    Role
    IsClassFor
    SubsystemClass
    IsSuperclassOf

=over 4

=item fig

Source object from which the subsystem data will be extracted.

=item erdb

Database object for the Sapling database.

=back

=cut

sub GenerateSubsystems {
    # Emit the genome-independent subsystem records: each subsystem, its
    # roles, its classification chain, and its variants.
    #   $self   loader object; records are written through PutE and PutR
    #   $fig    source object from which the subsystem data is extracted
    #   $erdb   database object that supplies the subsystem hash
    # Dies with a message naming the subsystem if the source returns no
    # object for it.
    my ($self, $fig, $erdb) = @_;
    # Get the subsystem hash for this Sapling instance. Its key list will be
    # the list of subsystems to put in the database.
    my $subHash = $erdb->SubsystemHash();
    # Loop through the subsystems.
    for my $subsystem (keys %$subHash) {
        Trace("Processing subsystem $subsystem.") if T(ERDBLoadGroup => 3);
        # Get the FIG subsystem object. Fail with a message that names the
        # subsystem rather than with an anonymous "undefined value" error
        # on the first method call below.
        my $ssData = $fig->get_subsystem($subsystem);
        die "Could not load subsystem \"$subsystem\" from the source."
            unless defined $ssData;
        # Get the subsystem properties.
        my $curator = $ssData->get_curator();
        my $description = $ssData->get_description();
        my $notes = $ssData->get_notes();
        my $version = $ssData->get_version();
        $self->PutE(Subsystem => $subsystem, curator => $curator,
                    description => $description, notes => $notes,
                    version => $version);
        # Get this subsystem's roles.
        my @roles = $ssData->get_roles();
        # This will track the column number for the role.
        my $col = 0;
        # Loop through the roles.
        for my $role (@roles) {
            # A role is flagged hypothetical when its name contains the word
            # "hypothetical" in any letter case.
            my $hypothetical = ($role =~ /hypothetical/i ? 1 : 0);
            # Create its entity.
            $self->PutE(Role => $role, hypothetical => $hypothetical);
            # Connect it to the subsystem, recording its abbreviation and
            # its zero-based position in the role list.
            $self->PutR(Includes => $subsystem, $role,
                        abbreviation => $ssData->get_abbr_for_role($role),
                        sequence => $col++);
        }
        # Put the subsystem in its classes. The last element of the
        # classification list is attached directly to the subsystem.
        my $classes = $ssData->get_classification();
        my $class = pop @$classes;
        if (defined $class) {
            # Create the class record.
            $self->CreateClass($class);
            # Connect it to the subsystem.
            $self->PutR(IsClassFor => $class, $subsystem);
            # Move up the hierarchy, working backward through the list. The
            # walk stops at the first false class name, so an empty name
            # ends it just as running out of names does.
            while (my $newClass = pop @$classes) {
                $self->CreateClass($newClass);
                $self->PutR(IsSuperclassOf => $newClass, $class);
                $class = $newClass;
            }
        }
        # Next come the variants. Variant data is sparse in the SEED. We
        # start by getting all the known variant codes, each with an empty
        # comment.
        my %variants = map { BaseSaplingLoader::Starless($_) => '' } $ssData->get_variant_codes();
        # -1 and 0 are always present.
        $variants{'0'} = 'Subsystem functionality is incomplete.';
        $variants{'-1'} = 'Subsystem is not functional.';
        # Now get notes from any variants that have them. Note that we need
        # to clean up the variant code with a call to Starless. These notes
        # replace any comment already stored for the same code.
        my $variantHash = $ssData->get_variants();
        for my $variant (keys %$variantHash) {
            my $realVariantID = BaseSaplingLoader::Starless($variant);
            $variants{$realVariantID} = $variantHash->{$variant};
        }
        # Create the variants.
        for my $variant (keys %variants) {
            # The variant key is the subsystem ID plus the variant code.
            my $variantID = "$subsystem:$variant";
            # At this time, the role rule is not available, so we only have
            # the comment.
            $self->PutE(Variant => $variantID, comment => $variants{$variant},
                        role_rule => '');
            # Link the subsystem to the variant.
            $self->PutR(Describes => $subsystem, $variantID);
        }
        # Clear the subsystem cache to keep memory under control.
        $fig->clear_subsystem_cache();
    }
}

=head3 GenerateSubsystemData

    $sl->GenerateSubsystemData($fig, $erdb, $genomeID);

Generate the molecular machines and subsystem spreadsheet cells for this
database. This method concerns itself primarily with the genome-dependent
part of the subsystem framework. This includes the following tables.

    IsImplementedBy
    MolecularMachine
    IsMachineOf
    MachineRole
    Uses
    IsContainedIn
    IsRoleOf

=over 4

=item fig

Source object from which the subsystem data will be extracted.

=item erdb

Database object for the Sapling database.

=item genomeID

ID of the relevant genome.

=back

=cut

sub GenerateSubsystemData {
    # Emit the genome-dependent subsystem records for one genome: the
    # molecular machines (spreadsheet rows), their machine roles
    # (spreadsheet cells), and the features contained in each cell.
    #   $self       loader object; records are written through PutE and PutR
    #   $fig        source object from which the subsystem data is extracted
    #   $erdb       database object that supplies the subsystem hash
    #   $genomeID   ID of the genome whose rows are wanted
    # Dies with a message naming the subsystem if the source returns no
    # object for it.
    my ($self, $fig, $erdb, $genomeID) = @_;
    # Get the subsystem hash for this Sapling instance. Its key list will be
    # the list of subsystems being put in the database.
    my $subHash = $erdb->SubsystemHash();
    # Get the list of subsystems for this genome. The "1" indicates we want
    # all of them, including the ones for 0 and -1 variants. Note we grep
    # against the subsystem hash to exclude subsystems that are not flagged
    # for Sapling.
    my @subNames = grep { exists $subHash->{$_} }
                        $fig->subsystems_for_genome($genomeID, 1);
    # Loop through the named subsystems. Each one corresponds to a molecular
    # machine.
    for my $subName (@subNames) {
        $self->Track(MolecularMachines => $subName, 100);
        # Get the subsystem object. Fail with a message that names the
        # subsystem rather than with an anonymous "undefined value" error
        # on the first method call below.
        my $ssData = $fig->get_subsystem($subName);
        die "Could not load subsystem \"$subName\" for genome $genomeID."
            unless defined $ssData;
        # Now we find the molecular machines for this subsystem/genome pair.
        # The row index is needed to look up the row's variant code.
        my @rows = $ssData->get_genomes();
        for my $gidx (0 .. $#rows) {
            # A row is a genome ID, optionally followed by a colon and a
            # region string.
            my ($rowGenome, $regionString) = split /:/, $rows[$gidx], 2;
            # Skip rows for other genomes, and rows with no genome at all.
            next unless defined $rowGenome && $rowGenome eq $genomeID;
            # Here we're positioned on a row for our genome. If it is
            # a region-restricted molecular machine, then the region
            # string will be defined. If it's global, we use an empty
            # string for the region.
            $regionString ||= "";
            # Create the molecular machine. To do that, we need the variant
            # code for this genome.
            my $raw_variant_code = $ssData->get_variant_code($gidx);
            # Check for a leading asterisk. This means the variant assignment
            # is not curated.
            my $curated = ($raw_variant_code =~ /^\s*\*/ ? 0 : 1);
            # Clear any waste from the variant code.
            my $variant_code = BaseSaplingLoader::Starless($raw_variant_code);
            # Compute its type: codes starting with 0 are incomplete, codes
            # starting with a minus sign are vacant, all others are normal.
            my $variant_type = ($variant_code =~ /^0/ ? 'incomplete' :
                                $variant_code =~ /^-/ ? 'vacant' : 'normal');
            # Create the variant and machine IDs.
            my $variantID = "$subName:$variant_code";
            my $machineID = ERDB::DigestKey("$variantID:$genomeID:$regionString");
            # Create the molecular machine and connect it to the genome and
            # subsystem.
            $self->PutE(MolecularMachine => $machineID, type => $variant_type,
                        curated => $curated, region => $regionString);
            $self->PutR(IsImplementedBy => $variantID, $machineID);
            $self->PutR(Uses => $genomeID, $machineID);
            # Now we loop through the subsystem's roles, creating the
            # MachineRoles. Molecular machines function as spreadsheet rows;
            # machine roles are spreadsheet cells.
            my @roles = $ssData->get_roles();
            for my $role (@roles) {
                # Get this role's abbreviation.
                my $ridx = $ssData->get_role_index($role);
                my $abbr = $ssData->get_role_abbr($ridx);
                # Create the machine-role ID.
                my $machineRoleID = "$machineID:$abbr";
                # Create the machine-role and connect it to the role and the
                # machine.
                $self->PutE(MachineRole => $machineRoleID);
                $self->PutR(IsMachineOf => $machineID, $machineRoleID);
                $self->PutR(IsRoleOf => $role, $machineRoleID);
                # Now get a list of the features in this cell.
                my @pegs = $ssData->get_pegs_from_cell($genomeID, $ridx);
                # Connect them to the cell.
                for my $peg (@pegs) {
                    $self->PutR(IsContainedIn => $peg, $machineRoleID);
                }
            }
        }
        # Clear the subsystem cache to save space.
        $fig->clear_subsystem_cache();
    }
}

=head3 CreateClass

    $sl->CreateClass($className);

Create a SubsystemClass record with the specified class name.

=over 4

=item className

Name of the subsystem classification to create.

=back

=cut

sub CreateClass {
    # Pull off the loader object, then the name of the class to create.
    my $self = shift;
    my $className = shift;
    # Write a SubsystemClass entity keyed by the class name; it has no
    # other fields.
    return $self->PutE('SubsystemClass', $className);
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3