[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

View of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.1 - (download) (as text) (annotate)
Mon Jan 19 21:43:27 2009 UTC (10 years, 10 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2009_02_05
Sapling support.

#!/usr/bin/perl -w

# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.

package FeatureSaplingLoader;

    use strict;
    use Tracer;
    use ERDB;
    use CGI qw(-nosticky);
    use BasicLocation;
    use HyperLink;
    use base 'BaseSaplingLoader';

=head1 Sapling Feature Load Group Class

=head2 Introduction

The Feature Load Group includes all of the major feature-related tables.

=head3 new

    my $sl = FeatureSaplingLoader->new($erdb, $options, @tables);

Construct a new FeatureSaplingLoader object.

=over 4

=item erdb

[[SaplingPm]] object for the database being loaded.

=item options

Reference to a hash of command-line options.

=item tables

List of tables in this load group.



sub new {
    # Get the parameters.
    my ($class, $erdb, $options) = @_;
    # Create the table list.
    my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink FeatureVirulent
                        Identifier IsSequenceFor ProteinSequence Concerns Publication
                        IdentifierSet IncludesIdentifier IsLocatedIn IsOwnerOf);
    # Create the BaseSaplingLoader object.
    my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
    # Return it.
    return $retVal;

=head2 Public Methods

=head3 Generate


Generate the data for the feature-related files.


sub Generate {
    # Get the parameters.
    my ($self) = @_;
    # Get the database object.
    my $erdb = $self->db();
    # Only proceed if this is a normal section. There's no global feature data.
    if (! $self->global()) {
        # Get the section ID.
        my $genomeID = $self->section();
        # Load this genome's features.

=head3 LoadGenomeFeatures


Load the feature-related data for a single genome.

=over 4

=item genomeID

ID of the genome whose feature data is to be loaded.



sub LoadGenomeFeatures {
    # Get the parameters.
    my ($self, $genomeID) = @_;
    # Get the source object.
    my $fig = $self->source();
    # Get the database.
    my $sapling = $self->db();
    # Get the maximum location  segment length. We'll need this later.
    my $maxLength = $sapling->TuningParameter('maxLocationLength');
    # This hash will be used to track identifiers. Each identifier can only
    # be used once.
    my %identifiers;
    # Get all of this genome's features.
    my $featureList = $fig->all_features_detailed_fast($genomeID);
    # Loop through them.
    for my $feature (@$featureList) {
        # Get this feature's data.
        my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,
            $assignmentMaker, $quality) = @$feature;
        $self->Track(Features => $fid, 1000);
        # Fix the assignment for non-PEG features.
        if (! defined $assignment) {
            $assignment = $aliases;
            $assignmentMaker ||= 'master';
        # Convert the location string to a list of location objects.
        my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;
        # Now we need to run through the locations. We'll put the total sequence
        # length in here.
        my $seqLen = 0;
        # This will track the ordinal position of the current location segment.
        my $locN = 1;
        # Loop through the location objects.
        for my $loc (@locs) {
            # Add this location's length to the total length.
            $seqLen += $loc->Length();
            # Extract the contig ID. Note that we need to prefix the
            # genome ID to make it unique.
            my $contig = $loc->Contig();
            my $contigID = "$genomeID:$contig";
            # We also need the location's direction.
            my $dir = $loc->Dir();
            # Now we peel off sections of the location and connect them
            # to the feature.
            my $peel = $loc->Peel($maxLength);
            while (defined $peel) {
                $self->PutR(IsLocatedIn => $fid, $contigID, beg => $peel->Left(),
                            dir => $dir, len => $maxLength, locN => $locN++);
                $peel = $loc->Peel($maxLength);
            # Output the residual. There will always be one, because of the way
            # Peel works.
            $self->PutR(IsLocatedIn => $fid, $contigID, beg => $loc->Left(),
                        dir => $dir, len => $loc->Length(), locN => $locN);
        # Emit the feature record.
        $self->PutE(Feature => $fid, feature_type => $type,
                    sequence_length => $seqLen, function => $assignment);
        # Connect the feature to its genome.
        $self->PutR(IsOwnerOf => $genomeID, $fid);
        # Now we have a whole bunch of attribute-related stuff to store in
        # secondary Feature tables. First is the evidence codes.
        my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
        for my $evidenceTuple (@evidenceTuples) {
            my (undef, undef, $code) = @$evidenceTuple;
            $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
        # Now we have the external links. These are stored using hyperlink objects.
        my @links = $fig->fid_links($fid);
        for my $link (@links) {
            my $hl = HyperLink->newFromHtml($link);
            $self->PutE(FeatureLink => $fid, link => $hl);
        # Virulence data is next. This is also hyperlink data.
        my @virulenceTuples = $fig->get_attributes($fid, 'virulence_associated%');
        for my $virulenceTuple (@virulenceTuples) {
            my (undef, undef, $text, $url) = @$virulenceTuple;
            my $hl = HyperLink->new($text, $url);
            $self->PutE(FeatureVirulent => $fid, virulent => $hl);
        # Finally, the essentiality stuff, which is the last of the hyperlinks.
        my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);
        for my $essentialTuple (@essentials) {
            my (undef, undef, $essentialityType, $url) = @$essentialTuple;
            # Form a hyperlink from this essentiality tuple.
            my $link = HyperLink->new($essentialityType, $url);
            # Store it as essentiality data for this feature.
            $self->PutE(FeatureEssential => $fid, essential => $link);
        # If this is a PEG, we have a protein sequence.
        if ($type eq 'peg') {
            # Get the translation.
            my $proteinSequence = $fig->get_translation($fid);
            # Compute the ID.
            my $proteinID = ERDB::DigestKey($proteinSequence);
            # Create the protein record.
            $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
            # Get the publications for this PEG.
            my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');
            for my $pub (@pubs) {
                # Parse out the article title from the data.
                my (undef, undef, $data, $url) = @_;
                my @pieces = split /,/, $data, 3;
                if (defined $pieces[2]) {
                    # Create the publication record.
                    my $hl = Hyperlink->new($pieces[2], $url);
                    my $key = ERDB::DigestKey($url);
                    $self->PutE(Publication => $key, citation => $hl);
                    # Connect it to the protein.
                    $self->PutR(Concerns => $key, $proteinID);
            # Now we need to get the identifiers for this feature and put
            # them in the protein's identifier set. The "1" tells FigPm to
            # send back the database name with each identifier. Note that
            # the FIG ID will come back with this list, but there may not be
            # a list if the genome is new.
            my @idTuples = grep { $_->[0] !~ /^[A-Z][A-Z]_\d+$/ } $fig->get_corresponding_ids($fid, 1); ##HACK: grep out the contig IDs
            if (! @idTuples) {
                push @idTuples, [$fid, 'SEED'];
            # Compute the identifier set name and create the set.
            my $setID = "$proteinID:$genomeID";
            $self->PutE(IdentifierSet => $setID);
            # Create the identifiers and onnect them to the protein and
            # the set.
            for my $idTuple (@idTuples) {
                my ($id, $source) = @$idTuple;
                # Only process this identifier if it's new. An identifier
                # can only be in one identifier set. Thankfully, the
                # identifiers belong to genomes, so we don't need to worry
                # about duplicates in other sections.
                if (exists $identifiers{$id} && $identifiers{$id} ne $proteinID) {
                    $self->Add(ambiguousProtein => 1);
                } else {
                    $self->PutE(Identifier => $id, source => $source);
                    $self->PutR(IsSequenceFor => $proteinID, $id);
                    $self->PutR(IncludesIdentifier => $setID, $id);
                    $identifiers{$id} = $proteinID;


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3