[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

View of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Jan 19 21:43:27 2009 UTC (10 years, 10 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2009_02_05
Sapling support.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package GenomeSaplingLoader;

    use strict;
    use Tracer;
    use ERDB;
    use base 'BaseSaplingLoader';

=head1 Sapling Genome Load Group Class

=head2 Introduction

The Genome Load Group includes all of the major genome-related tables.

=head3 new

    my $sl = GenomeSaplingLoader->new($erdb, $options);

Construct a new GenomeSaplingLoader object. The list of tables in this load
group is built by the constructor itself and passed on to the
BaseSaplingLoader constructor.

=over 4

=item erdb

[[SaplingPm]] object for the database being loaded.

=item options

Reference to a hash of command-line options.

=back

=cut

sub new {
    # Constructor for the genome load group.
    #   $class    name of the class to construct
    #   $erdb     database object, handed unchanged to the base constructor
    #   $options  reference to the options hash, also handed on unchanged
    # Returns whatever BaseSaplingLoader::new returns for these arguments.
    my $class = shift;
    my ($erdb, $options) = @_;
    # Names of the tables that make up this load group.
    my @groupTables = qw(Genome IsMadeUpOf IsTaxonomyOf TaxonomicGrouping
                         IsGroupContaining DnaSequence DnaSequenceBases);
    # The base constructor receives the names in sorted order.
    my @sortedTables = sort @groupTables;
    # Delegate the actual construction to the base loader class.
    my $loader = BaseSaplingLoader::new($class, $erdb, $options, @sortedTables);
    return $loader;
}

=head2 Public Methods

=head3 Generate

    $sl->Generate();

Generate the data for the genome-related files.

=cut

sub Generate {
    # Produce the load data for the current section of this load group.
    # The global section yields the taxonomic hierarchy; any other section
    # is treated as a genome, with the section ID used as the genome ID.
    my ($self) = @_;
    # Global section: build the taxonomy data and finish.
    return $self->CreateTaxonomies() if $self->global();
    # Genome section: the section ID is passed on as the genome ID.
    my $sectionID = $self->section();
    return $self->PlaceGenome($sectionID);
}

=head3 CreateTaxonomies

    $sl->CreateTaxonomies();

Generate the taxonomy hierarchy. This includes the TaxonomicGrouping,
IsGroupContaining, and IsTaxonomyOf tables.

=cut

sub CreateTaxonomies {
    # Generate the taxonomy data: one IsTaxonomyOf record per (class, genome)
    # pair, one TaxonomicGrouping record per distinct class, and one
    # IsGroupContaining record linking each non-domain class to its parent.
    # Get the parameters.
    my ($self) = @_;
    # Get the Sapling object.
    my $sapling = $self->db();
    # Get the source object.
    my $fig = $sapling->GetSourceObject();
    # Create the taxonomy hash. For each taxonomic grouping, the hash will map
    # to its parent grouping. If the same grouping is seen under different
    # parents, the parent from the last genome processed wins.
    my %taxTree;
    # Get the genome list.
    my @genomes = sort keys %{$sapling->GenomeHash()};
    # Loop through them, processing the taxonomy of each.
    for my $genome (@genomes) {
        $self->Track(Organisms => $genome, 100);
        # Get the name of this genome. Genome names sometimes get
        # stored incorrectly in the taxonomy. A missing name is treated as
        # empty so the comparison below does not touch an undefined value.
        my $genomeName = $fig->genus_species($genome);
        $genomeName = '' if ! defined $genomeName;
        # Get the taxonomy string. A genome without a taxonomy simply
        # contributes no classes.
        my $taxonomy = $fig->taxonomy_of($genome);
        $taxonomy = '' if ! defined $taxonomy;
        # The split pattern discards white space around the separators, so
        # discard it at the two ends of the string as well.
        $taxonomy =~ s/^\s+//;
        $taxonomy =~ s/\s+$//;
        # Get the taxonomy list. Empty class names (from a leading or doubled
        # separator) are dropped, because they would otherwise become a
        # grouping with an empty ID; the genome's own name is dropped too.
        my @taxClasses = grep { length($_) && $_ ne $genomeName }
            split /\s*;\s*/, $taxonomy;
        # Loop through the taxonomy. For each class found, we connect
        # it to the genome with a sequence number indicating its position
        # in the genome's taxonomy, and we store its parent class name
        # so we can connect the groups. As a result, the genome is in
        # every taxonomic group it belongs to, and we have enough data
        # to produce the taxonomy tree as well.
        my $parent = undef;
        my $sequence = 0;
        for my $taxClass (@taxClasses) {
            $taxTree{$taxClass} = $parent;
            $parent = $taxClass;
            $self->PutR(IsTaxonomyOf => $taxClass, $genome,
                        sequence => $sequence++);
        }
    }
    # Now we loop through the taxonomy hash, creating the TaxonomicGrouping
    # and IsGroupContaining records.
    for my $taxClass (sort keys %taxTree) {
        $self->Track(TaxonomicGroupings => $taxClass, 100);
        # Determine whether or not this is a domain. A class with no
        # recorded parent was first in its taxonomy list.
        my $parent = $taxTree{$taxClass};
        my $isDomain = (defined $parent ? 0 : 1);
        if (! $isDomain) {
            # It isn't a domain, so link it to its parent.
            $self->PutR(IsGroupContaining => $parent, $taxClass);
        }
        # Create the group record.
        $self->PutE(TaxonomicGrouping => $taxClass, domain => $isDomain);
    }
}


=head3 PlaceGenome

    $sl->PlaceGenome($genomeID);

Generate the data for a specific genome. This method generates data for
the Genome, IsMadeUpOf, DnaSequence and DnaSequenceBases
tables.

=over 4

=item genomeID

ID of the genome whose data is to be generated.

=back

=cut 

sub PlaceGenome {
    # Generate the records for one genome: the Genome record itself, then a
    # DnaSequence, IsMadeUpOf and DnaSequenceBases record for each contig.
    #   $genomeID  ID of the genome whose data is to be generated
    my ($self, $genomeID) = @_;
    # The source object is reached through the Sapling database object.
    my $fig = $self->db()->GetSourceObject();
    # Ask the source object for each genome property in turn.
    my %attr;
    $attr{complete} = $fig->is_complete($genomeID);
    $attr{dnaSize}  = $fig->genome_szdna($genomeID);
    $attr{domain}   = $fig->genome_domain($genomeID);
    $attr{fullName} = $fig->genus_species($genomeID);
    $attr{pegs}     = $fig->genome_pegs($genomeID);
    $attr{rnas}     = $fig->genome_rnas($genomeID);
    # When the source reports no true version value, the genome ID is used.
    $attr{version}  = $fig->genome_version($genomeID);
    $attr{version}  = $genomeID if ! $attr{version};
    # The contig count is derived from the list of contig IDs.
    my @contigList = $fig->contigs_of($genomeID);
    my $contigCount = @contigList;
    # Emit the genome record.
    $self->PutE(Genome => $genomeID,
                complete    => $attr{complete},
                contigs     => $contigCount,
                'dna-size'  => $attr{dnaSize},
                domain      => $attr{domain},
                'full-name' => $attr{fullName},
                pegs        => $attr{pegs},
                rnas        => $attr{rnas},
                version     => $attr{version});
    # Each contig of the genome becomes one DNA sequence.
    foreach my $contig (@contigList) {
        $self->Track(Contigs => $contig, 100);
        # Length of this contig, as reported by the source object.
        my $contigLen = $fig->contig_ln($genomeID, $contig);
        # The sequence ID is the contig ID prefixed with the genome ID,
        # because a bare contig ID would not be unique.
        my $sequenceID = join(':', $genomeID, $contig);
        $self->PutE(DnaSequence => $sequenceID, length => $contigLen);
        $self->PutR(IsMadeUpOf => $genomeID, $sequenceID);
        # Fetch the whole contig's DNA (positions 1 through its length),
        # count its letters, and write it out.
        my $bases = $fig->get_dna($genomeID, $contig, 1, $contigLen);
        $self->Add('dna-letters' => length($bases));
        $self->PutE(DnaSequenceBases => $sequenceID, bases => $bases);
    }
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3