[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

View of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Mar 2 22:22:11 2009 UTC (10 years, 6 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2009_05_18, rast_rel_2009_03_26
Changes since 1.1: +67 -37 lines
Changed the way identifiers are handled.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package FeatureSaplingLoader;

    use strict;
    use Tracer;
    use ERDB;
    use CGI qw(-nosticky);
    use BasicLocation;
    use HyperLink;
    use AliasAnalysis;
    use base 'BaseSaplingLoader';

=head1 Sapling Feature Load Group Class

=head2 Introduction

The Feature Load Group includes all of the major feature-related tables.

=head3 new

    my $sl = FeatureSaplingLoader->new($erdb, $options, @tables);

Construct a new FeatureSaplingLoader object.

=over 4

=item erdb

[[SaplingPm]] object for the database being loaded.

=item options

Reference to a hash of command-line options.

=item tables

List of tables in this load group.

=back

=cut

sub new {
    # Get the parameters.
    my ($class, $erdb, $options) = @_;
    # Create the table list.
    my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
                         FeatureVirulent IsOwnerOf IsLocatedIn Identifies
                         Identifier IsNamedBy ProteinSequence Concerns Publication);
    # Create the BaseSaplingLoader object.
    my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
    # Return it.
    return $retVal;
}

=head2 Public Methods

=head3 Generate

    $sl->Generate();

Generate the data for the feature-related files.

=cut

sub Generate {
    # Get the parameters.
    my ($self) = @_;
    # Get the database object.
    my $erdb = $self->db();
    # Only proceed if this is a normal section. There's no global feature data.
    if (! $self->global()) {
        # Get the section ID.
        my $genomeID = $self->section();
        # Load this genome's features.
        $self->LoadGenomeFeatures($genomeID);
    }
}

=head3 LoadGenomeFeatures

    $sl->LoadGenomeFeatures($genomeID);

Load the feature-related data for a single genome.

=over 4

=item genomeID

ID of the genome whose feature data is to be loaded.

=back

=cut

sub LoadGenomeFeatures {
    # Get the parameters.
    my ($self, $genomeID) = @_;
    # Get the source object.
    my $fig = $self->source();
    # Get the database.
    my $sapling = $self->db();
    # Get the maximum location  segment length. We'll need this later.
    my $maxLength = $sapling->TuningParameter('maxLocationLength');
    # Get all of this genome's features.
    my $featureList = $fig->all_features_detailed_fast($genomeID);
    # Loop through them.
    for my $feature (@$featureList) {
        # Get this feature's data.
        my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,
            $assignmentMaker, $quality) = @$feature;
        $self->Track(Features => $fid, 1000);
        # Fix the assignment for non-PEG features.
        if (! defined $assignment) {
            $assignment = $aliases;
            $assignmentMaker ||= 'master';
        }
        # Convert the location string to a list of location objects.
        my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;
        # Now we need to run through the locations. We'll put the total sequence
        # length in here.
        my $seqLen = 0;
        # This will track the ordinal position of the current location segment.
        my $locN = 1;
        # Loop through the location objects.
        for my $loc (@locs) {
            # Add this location's length to the total length.
            $seqLen += $loc->Length();
            # Extract the contig ID. Note that we need to prefix the
            # genome ID to make it unique.
            my $contig = $loc->Contig();
            my $contigID = "$genomeID:$contig";
            # We also need the location's direction.
            my $dir = $loc->Dir();
            # Now we peel off sections of the location and connect them
            # to the feature.
            my $peel = $loc->Peel($maxLength);
            while (defined $peel) {
                $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN++,
                            begin => $peel->Left(), len => $peel->Length(),
                            dir => $dir);
                $peel = $loc->Peel($maxLength);
            }
            # Output the residual. There will always be one, because of the way
            # Peel works.
            $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN,
                        begin => $loc->Left(), dir => $dir, len => $loc->Length());
        }
        # Emit the feature record.
        $self->PutE(Feature => $fid, feature_type => $type,
                    sequence_length => $seqLen, function => $assignment,
                    locked => $fig->is_locked_fid($fid));
        # Connect the feature to its genome.
        $self->PutR(IsOwnerOf => $genomeID, $fid);
        # Now we have a whole bunch of attribute-related stuff to store in
        # secondary Feature tables. First is the evidence codes.
        my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
        for my $evidenceTuple (@evidenceTuples) {
            my (undef, undef, $code) = @$evidenceTuple;
            $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
        }
        # Now we have the external links. These are stored using hyperlink objects.
        my @links = $fig->fid_links($fid);
        for my $link (@links) {
            my $hl = HyperLink->newFromHtml($link);
            $self->PutE(FeatureLink => $fid, link => $hl);
        }
        # Virulence data is next. This is also hyperlink data.
        my @virulenceTuples = $fig->get_attributes($fid, 'virulence_associated%');
        for my $virulenceTuple (@virulenceTuples) {
            my (undef, undef, $text, $url) = @$virulenceTuple;
            my $hl = HyperLink->new($text, $url);
            $self->PutE(FeatureVirulent => $fid, virulent => $hl);
        }
        # Finally, the essentiality stuff, which is the last of the hyperlinks.
        my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);
        for my $essentialTuple (@essentials) {
            my (undef, undef, $essentialityType, $url) = @$essentialTuple;
            # Form a hyperlink from this essentiality tuple.
            my $link = HyperLink->new($essentialityType, $url);
            # Store it as essentiality data for this feature.
            $self->PutE(FeatureEssential => $fid, essential => $link);
        }
        # If this is a PEG, we have a protein sequence.
        if ($type eq 'peg') {
            # Get the translation.
            my $proteinSequence = $fig->get_translation($fid);
            # Compute the ID.
            my $proteinID = ERDB::DigestKey($proteinSequence);
            # Create the protein record.
            $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
            # Get the publications for this PEG.
            my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');
            for my $pub (@pubs) {
                # Parse out the article title from the data.
                my (undef, undef, $data, $url) = @_;
                my @pieces = split /,/, $data, 3;
                if (defined $pieces[2]) {
                    # Create the publication record.
                    my $hl = Hyperlink->new($pieces[2], $url);
                    my $key = ERDB::DigestKey($url);
                    $self->PutE(Publication => $key, citation => $hl);
                    # Connect it to the protein.
                    $self->PutR(Concerns => $key, $proteinID);
                }
            }
        }
        # Now we need to compute the identifiers. We start with the aliases
        # We need to insure we don't put multiple copies of the same alias
        # in the keyword list, so we'll put all aliases found in this hash.
        my %aliasHash;
        # Loop through the aliases, recording the normal and natural forms/
        for my $alias (split /,/, $aliases) {
            # Compute the alias type.
            my $aliasType = AliasAnalysis::TypeOf($alias);
            # Is this alias a known type?
            if (! defined $aliasType) {
                # Check to see if it's a locus tag.
                if ($alias =~ /^[A-Z]{3}\d+$/) {
                    # It is, so convert it to internal form.
                    my $normalized = AliasAnalysis::Normalize(LocusTag => $alias);
                    $aliasHash{$normalized} = 'LocusTag';
                } else {
                    # Here it's a complete mystery. If we haven't seen it yet,
                    # mark it as being an unknown type.
                    if (! exists $aliasHash{$alias}) {
                        $aliasHash{$alias} = "";
                    }
                }
            } else {
                # Here we have a known type.
                $aliasHash{$alias} = $aliasType;
            }
        }
        # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
        my @corresponders = $fig->get_corresponding_ids($fid, 1);
        for my $tuple (@corresponders) {
            my ($id, $xdb) = @{$tuple};
            # Ignore SEED: that's us. Also ignore contig IDs. Those result from a bug
            # at PIR.
            if ($xdb ne 'SEED' && ! ($xdb eq 'RefSeq' && $id =~ /^[A-Z][A-Z]_\d+$/)) {
                # Compute the ID's normalized form.
                my $normalized = AliasAnalysis::Normalize($xdb => $id);
                # Add it to the alias hash.
                $aliasHash{$normalized} = $xdb;
            }
        }
        # Convert the aliases to feature identifiers.
        for my $alias (keys %aliasHash) {
            # Compute the identifier's natural form and the real type.
            my $type = $aliasHash{$alias};
            my $naturalForm;
            if (! $type) {
                # Here we have an unknown type. The natural form is the same
                # as the internal form.
                $naturalForm = $alias;
                $type = 'Miscellaneous';
            } else {
                # Here we have a known type.
                $naturalForm = AliasAnalysis::Type($type => $alias);
            }
            # Create the identifier and connect it to the feature.
            $self->PutE(Identifier => $alias, source => $type,
                        natural_form => $naturalForm);
            $self->PutR(Identifies => $alias, $fid);
        }
    }
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3