[Bio] / Sprout / SubsystemSproutLoader.pm Repository:
ViewVC logotype

View of /Sprout/SubsystemSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.1 - (download) (as text) (annotate)
Thu Oct 9 18:16:26 2008 UTC (11 years, 4 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2008_12_18, mgrast_rel_2008_1110_v2, mgrast_rel_2008_1110, rast_rel_2008_10_29, rast_rel_2008_11_24
New Sprout loaders.

#!/usr/bin/perl -w

# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.

package SubsystemSproutLoader;

    use strict;
    use Tracer;
    use ERDB;
    use base 'BaseSproutLoader';

=head1 Sprout Subsystem Load Group Class

=head2 Introduction

The  Load Group includes all of the major subsystem-based tables.

=head3 new

    my $sl = SproutLoader->new($erdb, $source, $options, @tables);

Construct a new SproutLoader object.

=over 4

=item erdb

[[SproutPm]] object for the database being loaded.

=item source

[[FigPm]] object used to access the source data. If this parameter is undefined,
it will be created the first time the L</source> method is called.

=item options

Reference to a hash of command-line options.

=item tables

List of tables in this load group.



sub new {
    # Get the parameters.
    my ($class, $erdb, $source, $options) = @_;
    # Create the table list.
    my @tables = sort qw(Diagram RoleOccursIn Subsystem Role RoleEC IsIdentifiedByEC Catalyzes SSCell ContainsFeature IsGenomeOf IsRoleOf OccursInSubsystem ParticipatesIn HasSSCell RoleSubset GenomeSubset ConsistsOfRoles ConsistsOfGenomes HasRoleSubset HasGenomeSubset SubsystemClass SubsystemHopeNotes);
    # Create the BaseSproutLoader object.
    my $retVal = BaseSproutLoader::new($class, $erdb, $source, $options, @tables);
    # Return it.
    return $retVal;

=head2 Public Methods

=head3 Generate


Generate the data for the subsystem-based files.


sub Generate {
    # Get the parameters.
    my ($self) = @_;
    # Get the sprout object.
    my $sprout = $self->db();
    # Get the FIG object.
    my $fig = $self->source();
    # Get the subsystem list.
    my $subHash = $self->GetSubsystems();
    if ($self->global()) {
        # In global mode, we generate the subsystem/role data.
        Trace("Generating subsystem data.") if T(2);
        # This hash will contain the roles for each EC. When we're done, this
        # information will be used to generate the Catalyzes table.
        my %ecToRoles = ();
        # Loop through the subsystems. Our first task will be to create the
        # roles. We do this by looping through the subsystems and creating a
        # role hash. The hash tracks each role ID so that we don't create
        # duplicates. As we move along, we'll connect the roles and subsystems
        # and memorize up the reactions.
        my ($genomeID, $roleID);
        my %roleData = ();
        for my $subsysID (sort keys %$subHash) {
            # Get the subsystem object.
            my $sub = $fig->get_subsystem($subsysID);
            # Only proceed if the subsystem has a spreadsheet.
            if (defined($sub) && ! $sub->{empty_ss}) {
                Trace("Creating subsystem $subsysID.") if T(3);
                # Create the subsystem record.
                $self->PutE(Subsystem => $subsysID, curator => $sub->get_curator(),
                            version => $sub->get_version(), description => $sub->get_description(),
                            notes => $sub->get_notes);
                # Add the hope notes.
                my $hopeNotes = $sub->get_hope_curation_notes();
                if ($hopeNotes) {
                    $self->PutE(SubsystemHopeNotes => $sub, 'hope-curation-notes' => $hopeNotes);
                # Now for the classification string. This comes back as a list
                # reference and we convert it to a splitter-delimited string.
                my $classList = $fig->subsystem_classification($subsysID);
                my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
                $self->PutE(SubsystemClass => $subsysID, classification => $classString);
                # Connect the subsystem to its roles. Each role is a column in the subsystem spreadsheet.
                for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
                    # Get the role's abbreviation.
                    my $abbr = $sub->get_role_abbr($col);
                    # Get its essentiality.
                    my $aux = $fig->is_aux_role_in_subsystem($subsysID, $roleID);
                    # Get its reaction note.
                    my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
                    # Connect to this role.
                    $self->Add(roleIn => 1);
                    $self->PutR(OccursInSubsystem => $roleID, $subsysID, abbr => $abbr, auxiliary => $aux,
                                'column-number' => $col, 'hope-reaction-note' => $hope_note);
                    # If it's a new role, add it to the role table.
                    if (! exists $roleData{$roleID}) {
                        # Add the role.
                        $self->Put('Role', id => $roleID);
                        $roleData{$roleID} = 1;
                        # Check for an EC number.
                        if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
                            my $ec = $1;
                            $self->PutR(IsIdentifiedByEC => $roleID, $ec);
                            # Check to see if this is our first encounter with this EC.
                            if (exists $ecToRoles{$ec}) {
                                # No, so just add this role to the EC list.
                                push @{$ecToRoles{$ec}}, $roleID;
                            } else {
                                # Output this EC.
                                $self->PutE(RoleEC => $ec);
                                # Create its role list.
                                $ecToRoles{$ec} = [$roleID];
                # Now we need to generate the subsets. The subset names must be concatenated to
                # the subsystem name to make them unique keys. There are two types of subsets:
                # genome subsets and role subsets. We do the role subsets first.
                my @subsetNames = $sub->get_subset_names();
                for my $subsetID (@subsetNames) {
                    # Create the subset record.
                    my $actualID = "$subsysID:$subsetID";
                    $self->PutE(RoleSubset => $actualID);
                    # Connect the subset to the subsystem.
                    $self->PutR(HasRoleSubset => $subsysID, $actualID);
                    # Connect the subset to its roles.
                    my @roles = $sub->get_subsetC_roles($subsetID);
                    for my $roleID (@roles) {
                        $self->PutR(ConsistsOfRoles => $actualID, $roleID);
                # Next the genome subsets.
                @subsetNames = $sub->get_subset_namesR();
                for my $subsetID (@subsetNames) {
                    # Create the subset record.
                    my $actualID = "$subsysID:$subsetID";
                    $self->PutE(GenomeSubset => $actualID);
                    # Connect the subset to the subsystem.
                    $self->PutR(HasGenomeSubset => $subsysID, $actualID);
                    # Connect the subset to its genomes.
                    my @genomes = $sub->get_subsetR($subsetID);
                    for my $genomeID (@genomes) {
                        $self->PutR(ConsistsOfGenomes => $actualID, $genomeID);
        # Now we loop through the diagrams. We need to create the diagram records
        # and link each diagram to its roles. Note that only roles which occur
        # in subsystems (and therefore appear in the %ecToRoles hash) are
        # included.
        for my $map ($fig->all_maps()) {
            Trace("Loading diagram $map.") if T(3);
            # Get the diagram's descriptive name.
            my $name = $fig->map_name($map);
            $self->PutE(Diagram => $map, name => $name);
            # Now we need to link all the map's roles to it.
            # A hash is used to prevent duplicates.
            my %roleHash = ();
            for my $ec ($fig->map_to_ecs($map)) {
                if (exists $ecToRoles{$ec}) {
                    for my $role (@{$ecToRoles{$ec}}) {
                        if (! $roleHash{$role}) {
                            $self->PutR(RoleOccursIn => $role, $map);
                            $roleHash{$role} = 1;
    } else {
        # Here we have a genome section.
        my $genomeID = $self->section();
        Trace("Connecting $genomeID to subsystems.") if T(3);
        # Get the subsystem data for this genome.
        my %subLists = $fig->get_all_subsystem_pegs($genomeID);
        # Loop through the subsystems.
        for my $subsysID (sort keys %subLists) {
            Trace("Processing subsystem $subsysID.") if T(3);
            # Get the subsystem's roles. We create a hash that maps role ID to its
            # column index.
            my $col = 0;
            my $subObject = $fig->get_subsystem($subsysID);
            my %roles = map { $_ => $col++ } $subObject->get_roles();
            # Create a list for the PEGs we find. This list will be used
            # to generate cluster numbers.
            my %pegsFound = ();
            my $aPegFound = 0;
            # Create a hash that maps spreadsheet IDs to PEGs. We will
            # use this to generate the ContainsFeature data after we have
            # the cluster numbers.
            my %cellPegs = ();
            # We'll stash the variant code in here.
            my $variant;
            # Loop through the subsystem's pegs.
            for my $pegTuple (@{$subLists{$subsysID}}) {
                my ($roleID, $pegID, $variantCode) = @$pegTuple;
                # Save the variant ID. (It should be the same for each peg.)
                $variant = $variantCode;
                # Only proceed if this feature exists and the role is valid.
                if ($fig->is_deleted($pegID)) {
                    $self->Add('deleted-pegs' => 1);
                } elsif (! exists $roles{$roleID}) {
                    $self->Add('bad-role' => 1);
                } else {
                    # Get the column number.
                    my $col = $roles{$roleID};
                    # Compute the spreadsheet cell ID.
                    my$cellID = "$subsysID:$genomeID:$col";
                    # Remember this feature and its cell. Note we put -1 in the
                    # hash. This is to help us when we generate cluster numbers.
                    $pegsFound{$pegID} = -1;
                    $aPegFound = 1;
                    push @{$cellPegs{$cellID}}, $pegID;
            # If we found some cells for this genome, we need to compute clusters and
            # denote it participates in the subsystem.
            if ($aPegFound) {
                # ConnAgree
                $self->PutR(ParticipatesIn => $genomeID, $subsysID,
                            'variant-code' => $variant);
                # Partition the PEGs found into clusters.
                my @clusters = $fig->compute_clusters([keys %pegsFound], $subObject);
                for (my $i = 0; $i <= $#clusters; $i++) {
                    my $subList = $clusters[$i];
                    for my $peg (@{$subList}) {
                        $pegsFound{$peg} = $i;
                # Create the ontainsFeature data.
                for my $cellID (keys %cellPegs) {
                    my $cellList = $cellPegs{$cellID};
                    for my $cellPeg (@$cellList) {
                        $self->PutR(ContainsFeature => $cellID, $cellPeg,
                                    'cluster-number' => $pegsFound{$cellPeg});


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3