[Bio] / Sprout / SubsystemSaplingLoader.pm Repository:
ViewVC logotype

View of /Sprout/SubsystemSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (download) (as text) (annotate)
Wed Nov 25 20:58:27 2009 UTC (10 years, 4 months ago) by parrello
Branch: MAIN
Changes since 1.9: +4 -0 lines
Fixed to work properly for null descriptions.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package SubsystemSaplingLoader;

    use strict;
    use Tracer;
    use ERDB;
    use base 'BaseSaplingLoader';

=head1 Sapling Subsystem Load Group Class

=head2 Introduction

The Subsystem Load Group includes all of the major subsystem-related tables.

=head3 new

    my $sl = SubsystemSaplingLoader->new($erdb, $options, @tables);

Construct a new SubsystemSaplingLoader object.

=over 4

=item erdb

L<Sapling> object for the database being loaded.

=item options

Reference to a hash of command-line options.

=item tables

List of tables in this load group.

=back

=cut

sub new {
    # Get the parameters.
    my ($class, $erdb, $options) = @_;
    # Create the table list.
    my @tables = sort qw(Subsystem IsClassFor SubsystemClass IsSuperclassOf Includes
                         Describes Role Variant IsRoleOf IsImplementedBy MachineRole
                         IsMachineOf MolecularMachine Contains Uses VariantRole);
    # Create the BaseSaplingLoader object.
    my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
    # Return it.
    return $retVal;
}

=head2 Public Methods

=head3 Generate

    $sl->Generate();

Generate the data for the subsystem-related files.

=cut

sub Generate {
    # Get the parameters.
    my ($self) = @_;
    # Get the database object.
    my $erdb = $self->db();
    # Get the source object.
    my $fig = $self->source();
    # Is this the global section?
    if ($self->global()) {
        # Yes, build the subsystem framework.
        $self->GenerateSubsystems($fig, $erdb);
    } else {
        # Get the section ID.
        my $genomeID = $self->section();
        # Generate the subsystem date for this genome.
        $self->GenerateSubsystemData($fig, $erdb, $genomeID);
    }
}

=head3 GenerateSubsystems

    $sl->GenerateSubsystems($fig, $erdb);

Generate the subsystems, variants, and roles for this database. This
method concerns itself primarily with the genome-independent part of the
subsystem framework. This includes the following tables:

    Subsystem
    Describes
    Variant
    Includes
    Role
    IsClassFor
    SubsystemClass
    IsSuperclassOf

=over 4

=item fig

Source object from which the subsystem data will be extracted.

=item erdb

Database object for the Sapling database.

=back

=cut

sub GenerateSubsystems {
    # Get the parameters.
    my ($self, $fig, $erdb) = @_;
    # Get the subsystem hash for this Sapling instance. Its key list will be
    # the list of subsystems to put in the database.
    my $subHash = $erdb->SubsystemHash();
    # We'll track the various subsystem classes in here.
    my %subClassHash = ();
    # Loop through the subsystems.
    for my $subsystem (keys %$subHash) {
        Trace("Processing subsystem $subsystem.") if T(ERDBLoadGroup => 3);
        # Get the FIG subsystem object.
        my $ssData = $fig->get_subsystem($subsystem);
        # These will be set to 1 if the subsystem has the indicated property.
        my $experimental = 0;
        my $clustered = 0;
        # Get the subsystem's classes.
        my $classes = $ssData->get_classification();
        # Only proceed if classes exist.
        if (scalar @$classes) {
            # Check for one of the special roots. If we find it, we shift it off
            # the list.
            if ($classes->[0] =~ /Clustering/) {
                $clustered = 1;
                shift @$classes;
            } elsif ($classes->[0] =~ /Experimental/) {
                $experimental = 1;
                shift @$classes;
            }
            # Loop through the remaining classes from the bottom up.
            my $class = pop @$classes;
            if (defined $class) {
                # Create the class record.
                $self->CreateClass($class);
                # Connect it to the subsystem.
                $self->PutR(IsClassFor => $class, $subsystem);
                # Is this a new class?
                if (! $subClassHash{$class}) {
                    # Yes. We need to put it in its hierarchy.
                    while (my $newClass = pop @$classes) {
                        # Create the new class's record.
                        $self->CreateClass($newClass);
                        # Put it above the previous class.
                        $self->PutR(IsSuperclassOf => $newClass, $class);
                        # Insure we know we're done with this class.
                        $subClassHash{$class} = 1;
                        # Prepare for the next class.
                        $class = $newClass;
                    }
                }
            }
        }
        # Get the subsystem properties.
        my $curator = $ssData->get_curator();
        my $description = $ssData->get_description();
        my $notes = $ssData->get_notes();
        my $version = $ssData->get_version();
        my $usable = ($fig->is_experimental_subsystem($subsystem) ? 0 : 1);
        my $private = ($fig->is_exchangable_subsystem($subsystem) ? 0 : 1);
        # Ensure we have a description.
        if (! defined $description) {
            $description = '';
        }
        # Emit the subsystem record.
        $self->PutE(Subsystem => $subsystem, curator => $curator,
                    description => $description, notes => $notes,
                    version => $version, usable => $usable,
                    private => $private, cluster_based => $clustered,
                    experimental => $experimental);
        # Get this subsystem's roles.
        my @roles = $ssData->get_roles();
        # This will track the column number for the role.
        my $col = 0;
        # Loop through the roles.
        for my $role (@roles) {
            # Compute this role's type.
            my $hypothetical = ($role =~ /hypothetical/i ? 1 : 0);
            # Create its entity.
            $self->PutE(Role => $role, hypothetical => $hypothetical);
            # Check to see if it is main or auxiliary.
            my $auxFlag = ($fig->is_aux_role_in_subsystem($subsystem, $role) ? 1 : 0);
            # Connect it to the subsystem.
            $self->PutR(Includes => $subsystem, $role,
                        abbreviation => $ssData->get_abbr_for_role($role),
                        sequence => $col++, auxiliary => $auxFlag);
        }
        # Next come the variants. Variant data is sparse in the SEED. We
        # start by getting all the known variant codes.
        my %variants = map { BaseSaplingLoader::Starless($_) => '' } $ssData->get_variant_codes();
        # -1 and 0 are always present.
        $variants{'0'} = 'Subsystem functionality is incomplete.';
        $variants{'-1'} = 'Subsystem is not functional.';
        # Now get notes from any variants that have them. Note that we need
        # to clean up the variant code with a call to Starless.
        my $variantHash = $ssData->get_variants();
        for my $variant (keys %$variantHash) {
            my $realVariantID = BaseSaplingLoader::Starless($variant);
            $variants{$realVariantID} = $variantHash->{$variant};
        }
        # Next we need to compute the role rules. For each genome in the subsystem,
        # we compute its variant code and a list of its roles. These are put
        # into the following two-dimensional hash. Each inner hash maps a role
        # rule list to 1. The keys of the inner hash become the role rules.
        my %roleRuleHash = map { $_ => {} } keys %variants;
        # Loop through the list of genomes.
        my @genomes = $ssData->get_genomes();
        for (my $i = 0; $i < scalar(@genomes); $i++) {
            # Get this genome's variant code.
            my $variantCode = BaseSaplingLoader::Starless($ssData->get_variant_code($i));
            # Get its roles.
            my @roles = $ssData->get_roles_for_genome($genomes[$i]);
            # Convert them to a role rule.
            my $rule = join(" ", sort map { $ssData->get_abbr_for_role($_) } @roles);
            # Put the role in the hash.
            $roleRuleHash{$variantCode}{$rule} = 1;
        }
        # Create the variants.
        for my $variant (keys %variants) {
            # The variant key is the subsystem ID plus the variant code.
            my $variantID = ERDB::DigestKey("$subsystem:$variant");
            # Compute its type.
            my $variant_type = ($variant =~ /^0/ ? 'incomplete' :
                                $variant =~ /^-/ ? 'vacant' : 'normal');
            # The comment is easily computed from the variant data, so
            # we now have enough data to output the variant record.
            $self->PutE(Variant => $variantID, type => $variant_type,
                        code => $variant, comment => $variants{$variant});
            # Now output the role rules.
            for my $rule (keys %{$roleRuleHash{$variant}}) {
                $self->PutE(VariantRole => $variantID, role_rule => $rule);
            }
            # Link the subsystem to the variant.
            $self->PutR(Describes => $subsystem, $variantID);
        }
        # Clear the subsystem cache to keep memory under control.
        $fig->clear_subsystem_cache();
    }
}

=head3 GenerateSubsystemData

    $sl->GenerateSubsystemData($fig, $erdb, $genomeID);

Generate the molecular machines and subsystem spreadsheet cells for this
database. This method concerns itself primarily with the genome-dependent
part of the subsystem framework. This includes the following tables.

    IsImplementedBy
    MolecularMachine
    IsMachineOf
    MachineRole
    Uses
    Contains
    IsRoleOf

=over 4

=item fig

Source object from which the subsystem data will be extracted.

=item erdb

Database object for the Sapling database.

=item genomeID

ID of the relevant genome.

=back

=cut

sub GenerateSubsystemData {
    # Get the parameters.
    my ($self, $fig, $erdb, $genomeID) = @_;
    # Get the subsystem hash for this Sapling instance. Its key list will be
    # the list of subsystems being put in the database.
    my $subHash = $erdb->SubsystemHash();    
    # Get the list of subsystems for this genome. The "1" indicates we want
    # all of them, including the ones for 0 and -1 variants. Note we have
    # to normalize the subsystem names.
    my @subName = map { $erdb->SubsystemID($_) } $fig->subsystems_for_genome($genomeID, 1);
    # Loop through the named subsystems. Each one corresponds to a molecular
    # machine.
    for my $subName (grep { exists $subHash->{$_} } @subName) {
        $self->Track(MolecularMachines => $subName, 100);
        # Compute the MD5 hash of the subsystem ID.
        my $ssMD5 = ERDB::DigestKey($subName);
        # Get the subsystem object.
        my $ssData = $fig->get_subsystem($subName);
        # Now we find the molecular machines for this subsystem/genome pair.
        my @rows = $ssData->get_genomes();
        for (my $gidx = 0; $gidx <= $#rows; $gidx++) {
            my ($rowGenome, $regionString) = split /:/, $rows[$gidx], 2;
            if ($rowGenome eq $genomeID) {
                # Here we're positioned on a row for our genome. If it is
                # a region-restricted molecular machine, then the region
                # string will be defined. If it's global, we use an empty
                # string for the region.
                $regionString ||= "";
                # Create the molecular machine. To do that, we need the variant code
                # for this genome.
                my $raw_variant_code = $ssData->get_variant_code($gidx);
                # Check for a leading asterisk. This means the variant assignment is not
                # curated.
                my $curated = ($raw_variant_code =~ /^\s*\*/ ? 0 : 1);
                # Clear any waste from the variant code.
                my $variant_code = BaseSaplingLoader::Starless($raw_variant_code);
                # Create the variant and machine IDs.
                my $variantID = ERDB::DigestKey("$subName:$variant_code");
                my $machineID = ERDB::DigestKey("$subName:$variant_code:$genomeID:$regionString");
                # Create the molecular machine and connect it to the genome and
                # subsystem.
                $self->PutE(MolecularMachine => $machineID,
                            curated => $curated, region => $regionString);
                $self->PutR(IsImplementedBy => $variantID, $machineID);
                $self->PutR(Uses => $genomeID, $machineID);
                # Now we loop through the subsystem's roles, creating the MachineRoles.
                # Molecular machines function as spreadsheet rows; machine roles are
                # spreadsheet cells.
                my @roles = $ssData->get_roles();
                for my $role (@roles) {
                    # Get this role's abbreviation.
                    my $ridx = $ssData->get_role_index($role);
                    my $abbr = $ssData->get_role_abbr($ridx);
                    # Create the machine-role ID.
                    my $machineRoleID = "$ssMD5:$genomeID:$regionString:$abbr";
                    # Create the machine-role and connect it to the role and the
                    # machine.
                    $self->PutE(MachineRole => $machineRoleID);
                    $self->PutR(IsMachineOf => $machineID, $machineRoleID);
                    $self->PutR(IsRoleOf => $role, $machineRoleID);
                    # Now get a list of the features in this cell.
                    my @pegs = $ssData->get_pegs_from_cell($genomeID, $ridx);
                    # Connect them to the cell.
                    for my $peg (@pegs) {
                        $self->PutR(Contains => $machineRoleID, $peg);
                    }
                }
            }
        }
        # Clear the subsystem cache to save space.
        $fig->clear_subsystem_cache();
    }
}

=head3 CreateClass

    $sl->CreateClass($className);

Create a SubsystemClass record with the specified class name.

=over 4

=item className

Name of the subsystem classification to create.

=back

=cut

sub CreateClass {
    # Get the parameters.
    my ($self, $className) = @_;
    # Create the subsystem class record.
    $self->PutE(SubsystemClass => $className);
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3