[Bio] / Sprout / SaplingSubsystemLoader.pm Repository:
ViewVC logotype

View of /Sprout/SaplingSubsystemLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (download) (as text) (annotate)
Tue Oct 7 17:12:21 2014 UTC (4 years, 6 months ago) by parrello
Branch: MAIN
CVS Tags: HEAD
Changes since 1.8: +39 -29 lines
Miscellaneous loader fixes.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package SaplingSubsystemLoader;

    use strict;
    use Tracer;
    use Stats;
    use SeedUtils;
    use SAPserver;
    use Sapling;
    use base qw(SaplingDataLoader);

=head1 Sapling Subsystem Loader

This class loads Subsystem data into a Sapling database from a subsystem directory.
Unlike L<SaplingGenomeLoader>, this version is designed for updating a populated
database only. Links to features and genomes are put in, but not the features and
genomes themselves, which may lead to orphan links.

=head2 Main Methods

=head3 Load

    my $stats = SaplingSubsystemLoader::Load($sap, $subsystem, $directory);

Load a subsystem from a subsystem directory into the sapling database.

=over 4

=item sap

L<Sapling> object used to access the target database.

=item subsystem

ID of the subsystem being loaded.

=item directory

Name of the directory containing the subsystem information.

=back

=cut

sub Load {
    # Get the parameters.
    my ($sap, $subsystem, $directory) = @_;
    # Create the loader object.
    my $loaderObject = SaplingSubsystemLoader->new($sap, $subsystem, $directory);
    # Create the subsystem record.
    $loaderObject->CreateSubsystem();
    # Read the spreadsheet file.
    $loaderObject->ParseSpreadsheet();
    # Return the statistics.
    return $loaderObject->{stats};
}

=head3 ClearSubsystem

    my $stats = SaplingSubsystemLoader::ClearSubsystem($sap, $subsystem);

Delete the specified subsystem and all the related records from the specified sapling
database. This method can also be used to clean up after a failed or aborted load.

=over 4

=item sap

L<Sapling> object used to access the target database.

=item subsystem

ID of the subsystem to delete.

=item RETURN

Returns a statistics object counting the records deleted.

=back

=cut

sub ClearSubsystem {
    # Get the parameters.
    my ($sap, $subsystem) = @_;
    # Create the statistics object.
    my $stats = Stats->new();
    # Delete the subsystem and all its associated records.
    $stats = $sap->Delete(Subsystem => $subsystem);
    # Return the statistics object.
    return $stats;
}

=head3 Process

    my $stats = SaplingSubsystemLoader::Process($sap, $subsystem, $directory);

Load subsystem data from the specified directory. If the subsystem data already
exists in the database, it will be deleted first.

=over 4

=item sap

L</Sapling> object for accessing the database.

=item subsystem

name of the subsystem whose data is being loaded.

=item directory

Name of the directory containing the subsystem data files. If omitted,
the subsystem will be deleted from the database.

=item RETURN

Returns a statistics object describing the activity during the reload.

=back

=cut

sub Process {
    # Get the parameters.
    my ($sap, $subsystem, $directory) = @_;
    # Clear the existing data for the specified subsystem.
    my $stats = ClearSubsystem($sap, $subsystem);
    if ($subsystem) {
        # Load the new subsystem data from the specified directory.
        my $newStats = Load($sap, $subsystem, $directory);
        # Merge the statistics.
        $stats->Accumulate($newStats);
    }
    # Return the result.
    return $stats;
}


=head2 Loader Object Methods

=head3 new

    my $loaderObject = SaplingSubsystemLoader->new($sap, $subsystem, $directory);

Create a loader object that can be used to facilitate loading Sapling data from a
subsystem directory.

=over 4

=item sap

L<Sapling> object used to access the target database.

=item subsystem

ID of the subsystem being loaded.

=item directory

Name of the directory containing the subsystem data.

=back

The object created contains the following fields.

=over 4

=item supportRecords

A hash of hashes, used to track the support records known to exist in the database.

=item sap

L<Sapling> object used to access the database.

=item stats

L<Stats> object for tracking statistical information about the load.

=item subsystem

ID of the subsystem being loaded.

=item directory

Name of the directory containing the subsystem data.

=item roleList

Reference to a list of roles abbreviations, in order.

=item roleHash

Reference to a hash mapping each role abbreviation to the association role ID.

=item variants

Hash mapping variant codes to descriptions.

=back

=cut

sub new {
    # Get the parameters.
    my ($class, $sap, $subsystem, $directory) = @_;
    # Create the object.
    my $retVal = SaplingDataLoader::new($class, $sap, qw(roles));
    # Add our specialized data.
    $retVal->{subsystem} = $subsystem;
    $retVal->{directory} = $directory;
    $retVal->{variants} = {};
    # Return the result.
    return $retVal;
}

=head3 CreateSubsystem

    $loaderObject->CreateSubsystem();

Create the root record for this subsystem and connect it to the classifications. This
method also reads in the variant descriptions (if any);

=cut

sub CreateSubsystem {
    # Get the parameters.
    my ($self) = @_;
    # Get the subsystem directory.
    my $directory = $self->{directory};
    # Get the Sapling database.
    my $sap = $self->{sap};
    # Read the classification information.
    my @classes;
    my $classFile = "$directory/CLASSIFICATION";
    if (-f $classFile) {
        my $ih = Open(undef, "<$classFile");
        @classes = grep { $_ } Tracer::GetLine($ih);
    }
    # Loop through the classes from bottom to top, insuring we have them linked up
    # in the database.
    my $lastClass;
    if (@classes) {
        Trace("Processing classifications.") if T(SaplingDataLoader => 3);
        # Insure the lowest-level class is present.
        my $i = $#classes;
        $lastClass = $classes[$i];
        my $createdFlag = $self->InsureEntity(SubsystemClass => $lastClass);
        # Work up through the other classes until we find one already present or hit the top.
        my $thisClass = $lastClass;
        while ($createdFlag && $i > 1) {
            # Connect to the next class up.
            $i--;
            my $nextClass = $classes[$i];
            $sap->InsertObject('IsSuperClassOf', from_link => $nextClass, to_link => $thisClass);
            # Insure the next class is in the database.
            $createdFlag = $self->InsureEntity(SubsystemClass => $nextClass);
        }
    }
    # Get the top class, if any. We use this to do some typing.
    my $topClass = $classes[0] || ' ';
    # Compute the class-related subsystem types.
    my $clusterBased = ($topClass =~ /clustering-based/i ? 1 : 0);
    my $experimental = ($topClass =~ /experimental/i ? 1 : 0);
    my $usable = ! $experimental;
    # Check for the privacy flag.
    my $private = (-f "$directory/EXCHANGABLE" ? 0 : 1);
    # Get the version.
    my $version = "0";
    my $versionFile = "$directory/VERSION";
    if (-f $versionFile) {
        ($version) = Tracer::GetFile($versionFile);
    }
    # Get the curator. This involves finding the start line in the curator log.
    my $curator = "fig";
    my $curatorFile = "$directory/curation.log";
    if (-f $curatorFile) {
        my $ih = Open(undef, "<$curatorFile");
        while ($curator eq "fig" && ! eof $ih) {
            my $line = <$ih>;
            if ($line =~ /^\d+\t(\S+)\s+started/) {
                $curator = $1;
                $curator =~ s/^master://;
            }
        }
    }
    # Finally, we need to get the notes and description from the notes file.
    my ($description, $notes) = ("", "");
    my $notesFile = "$directory/notes";
    if (-f $notesFile) {
        Trace("Processing notes file.") if T(SaplingDataLoader => 3);
        my $ih = Open(undef, "<$notesFile");
        my $notesHash = ParseNotesFile($ih);
        if (exists $notesHash->{description}) {
            $description = $notesHash->{description};
        }
        if (exists $notesHash->{notes}) {
            $notes = $notesHash->{notes};
        }
        # Stash the variant information for later.
        if (exists $notesHash->{variants}) {
            # We need to create a hash of variant data.
            my %varHash;
            # Get the individual lines of the variant line.
            my @varLines = split /\n/, $notesHash->{variants};
            for my $varLine (@varLines) {
                # Split this line around the tab.
                my ($code, $comment) = split /\t/, $varLine;
                # Only proceed if the code is nonempty.
                if (defined $code && $code ne '') {
                    # Trim excess spaces from the code.
                    $code =~ s/\s+//g;
                    # Store the comment.
                    $varHash{$code} = $comment;
                }
            }
            $self->{variants} = \%varHash;
        }
    }
    # Create the subsystem record.
    $sap->InsertObject('Subsystem', id => $self->{subsystem}, cluster_based => $clusterBased,
                       curator => $curator, description => $description, experimental => $experimental,
                       notes => $notes, private => $private, usable => $usable, version => $version);
    # If there is a classification for it, connect it.
    if ($lastClass) {
        $sap->InsertObject('IsClassFor', from_link => $lastClass, to_link => $self->{subsystem});
    }
}


=head3 ParseSpreadsheet

    $loaderObject->ParseSpreadsheet();

Read and parse the spreadsheet file. This creates the roles, the molecular machines, and fills
in the variant table.

=cut

use constant VARIANT_TYPES => { '-1' => 'vacant', '0' => 'incomplete'};

sub ParseSpreadsheet {
    # Get the parameters.
    my ($self) = @_;
    # Get the variant hash.
    my $varHash = $self->{variants};
    # Get the sapling database.
    my $sap = $self->{sap};
    # Get the statistics object.
    my $stats = $self->{stats};
    # Get the subsystem ID.
    my $subsystem = $self->{subsystem};
    # Compute its MD5 for the machine role IDs.
    my $ssMD5 = ERDB::DigestKey($subsystem);
    # Insure the default variants are present.
    if (! exists $varHash->{'0'}) {
        $varHash->{'0'} = 'Subsystem functionality is incomplete.';
    }
    if (! exists $varHash->{'-1'}) {
        $varHash->{'-1'} = 'Subsystem is not functional.';
    }
    # Open the spreadsheet file.
    Trace("Processing spreadsheet.") if T(SaplingDataLoader => 3);
    my $ih = Open(undef, "<$self->{directory}/spreadsheet");
    my (@roleList, %roleHash);
    # Loop through the roles.
    my $done = 0;
    while (! eof $ih && ! $done) {
        my ($abbr, $role) = Tracer::GetLine($ih);
        # Is this an end marker?
        if ($abbr eq '//') {
            # Yes. Stop the loop.
            $done = 1;
        } elsif ($abbr) {
            # No, store the role.
            push @roleList, $abbr;
            $roleHash{$abbr} = $role;
        }
    }
    # The next section is the subsets. All we care about here are the auxiliary roles.
    my %auxHash;
    $done = 0;
    while (! eof $ih && ! $done) {
        my ($subset, @idxes) = Tracer::GetLine($ih);
        # Is this an end marker?
        if ($subset eq '//') {
            # Yes. Stop the loop.
            $done = 1;
        } elsif ($subset =~ /^aux/) {
            # Here we have an auxiliary subset. Mark its roles in the auxiliary-role hash.
            for my $idx (@idxes) {
                $auxHash{$roleList[$idx - 1]} = 1;
            }
        }
    }
    # We now have enough information to generate the role tables.
    my $col = 0;
    Trace("Generating roles.") if T(SaplingDataLoader => 3);
    for my $abbr (@roleList) {
        # Get the role ID.
        my $roleID = $roleHash{$abbr};
        # Determine if it's hypothetical.
        my $hypo = (hypo($roleID) ? 1 : 0);
        # Insure it's in the database.
        $self->InsureEntity(Role => $roleID, hypothetical => $hypo, role_index => -1);
        # Connect it to the subsystem
        $sap->InsertObject('Includes', from_link => $subsystem, to_link => $roleID,
                           abbreviation => $abbr, auxiliary => ($auxHash{$abbr} ? 1 : 0),
                           sequence => $col++);
        $stats->Add(roles => 1);
    }
    # The final section is the role table itself. Here we get the rest of the variant data, as well.
    # We do this in two passes. First pass accumulates the data in a hash table. The second processes
    # the data. This insures that the last version of any molecular machine is the one we keep.
    my (%varsAdded, %machines);
    $done = 0;
    Trace("Processing role table.") if T(SaplingDataLoader => 3);
    while (! eof $ih && ! $done) {
        my ($genome, $variant, @cells) = Tracer::GetLine($ih);
        # Is this the end marker?
        if ($genome eq '//') {
            # Yes. Stop the loop.
            $done = 1;
        } elsif ($genome) {
            # Compute the true variant code and the curation flag.
            my $curated = ($variant =~ /^\s*\*/ ? 0 : 1);
            my $realVariant = Starless($variant);
            # Check for a region string.
            my ($genomeID, $regionString) = split m/:/, $genome;
            $regionString ||= "";
            # Compute the variant and molecular machine IDs.
            my $variantID = ERDB::DigestKey("$subsystem:$realVariant");
            my $machineID = ERDB::DigestKey("$subsystem:$realVariant:$genomeID:$regionString");
            # Insure we have the variant in the database.
            if (! exists $varsAdded{$variantID}) {
                # Denote the variant is in this subsystem.
                $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantID);
                # Create the variant record. For now, the role-rule is kept empty. We'll add the
                # rules later as we find them.
                $sap->InsertObject('Variant', id => $variantID, code => $realVariant,
                                   comment => ($varHash->{comment} || ''),
                                   type => (VARIANT_TYPES->{$realVariant} || ''));
                # Denote we've added this variant.
                $varsAdded{$variantID} = {};
                $stats->Add(variants => 1);
            }
            # Store this machine.
            $machines{$machineID} = [$variantID, $genomeID, $curated, $regionString, @cells];
        }
    }
    for my $machineID (keys %machines) {
       	# Get this machine's data.
       	my $machineData = $machines{$machineID};
       	my ($variantID, $genomeID, $curated, $regionString, @cells) = @$machineData;
      	Trace("Processing machine $machineID for genome $genomeID/$regionString.") if T(SaplingDataLoader => 3);
        # Create the molecular machine.
        $sap->InsertObject('IsImplementedBy', from_link => $variantID, to_link => $machineID);
        $sap->InsertObject('MolecularMachine', id => $machineID, curated => $curated,
                           region => $regionString);
        # Now loop through the cells.
        my @rolesFound;
        for (my $i = 0; $i <= $#cells; $i++) {
            my $cell = $cells[$i];
            # Is this cell occupied?
            if ($cell) {
                # Yes. Get this cell's role abbreviation and add it to the list of roles found
                # in this row.
                my $abbr = $roleList[$i];
                push @rolesFound, $abbr;
                # Create the machine role.
                my $machineRoleID = "$machineID:$abbr";
                $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
                $sap->InsertObject('MachineRole', id => $machineRoleID);
                $sap->InsertObject('IsRoleOf', from_link => $roleHash{$abbr},
                                   to_link => $machineRoleID);
                # Connect the pegs in this cell to it.
                for my $pegN (split m/\s*,\s*/, $cell) {
                    $sap->InsertObject('Contains', from_link => $machineRoleID,
                                       to_link => "fig|$genomeID.peg.$pegN");
                }
            }
        }
        # Compute a role rule from this row's roles and associate it with this variant.
        my $roleRule = join(" ", @rolesFound);
        $varsAdded{$variantID}->{$roleRule} = 1;
    }
    # We've finished the spreadsheet. Now we go back and add the role rules to the variants.
    for my $variantID (keys %varsAdded) {
        my $ruleHash = $varsAdded{$variantID};
        for my $roleRule (sort keys %$ruleHash) {
            $sap->InsertValue($variantID, 'Variant(role-rule)', $roleRule);
        }
    }
}

=head2 Internal Utility Methods

=head3 ParseNotesFile

    my $notesHash = SaplingSubsystemLoader::ParseNotesFile($ih);

Read and parse the notes file from the specified file handle. The sections of the file will be
returned in a hash, keyed by section name.

=over 4

=item ih

Open handle for the notes file.

=item RETURN

Returns a reference to a hash keyed by section name, mapping each name to the text of that section.

=cut

sub ParseNotesFile {
    # Get the parameters.
    my ($ih) = @_;
    # Create the return hash.
    my $retVal = {};
    # Anything before the first separator will be classified as "notes".
    my ($section, @text) = ('notes');
    # Loop through the lines of the file.
    while (! eof $ih) {
        my $line = <$ih>;
        chomp $ih;
        if ($line =~ /^#####/) {
            # Here we have the start of a new section. If there's an old section,
            #put it in the output hash.
            if (@text) {
                $retVal->{$section} = join("\n", @text);
            }
            # Is there another section?
            if (! eof $ih) {
                # Yes. Save the new section name and clear the text array.
                my $sectionLine = <$ih>;
                $sectionLine =~ /^(\S+)/;
                $section = lc $1;
                undef @text;
            }
        } else {
            # Here we have an ordinary text line.
            push @text, $line;
        }
    }
    # Write out the last section (if any).
    if (@text) {
        $retVal->{$section} = join("\n", @text);
    }
    # Return the result hash.
    return $retVal;
}

=head3 Starless

    my $adjusted = SaplingSubsystemLoader::Starless($codeString);

Remove any spaces and leading or trailing asterisks from the incoming string and
return the result.

=over 4

=item codeString

Input string that needs to have the asterisks trimmed.

=item RETURN

Returns the incoming string with spaces and leading and trailing asterisks
removed.

=back

=cut

sub Starless {
    # Get the parameters.
    my ($codeString) = @_;
    # Declare the return variable.
    my $retVal = $codeString;
    # Remove the spaces.
    $retVal =~ s/\s+//g;
    # Trim the asterisks.
    $retVal =~ s/^\*+//;
    $retVal =~ s/\*+$//;
    # Return the result.
    return $retVal;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3