[Bio] / Sprout / GenomeSproutLoader.pm Repository:
ViewVC logotype

View of /Sprout/GenomeSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Oct 9 18:16:26 2008 UTC (11 years, 1 month ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2008_12_18, mgrast_rel_2008_1110_v2, mgrast_rel_2008_1110, rast_rel_2008_10_29, rast_rel_2008_11_24
New Sprout loaders.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package GenomeSproutLoader;

    use strict;
    use Tracer;
    use ERDB;
    use base 'BaseSproutLoader';

=head1 Sprout Genome Load Group Class

=head2 Introduction

The Genome Load Group includes all of the major genome-related tables.

=head3 new

    my $sl = GenomeSproutLoader->new($erdb, $source, $options, @tables);

Construct a new GenomeSproutLoader object.

=over 4

=item erdb

[[SproutPm]] object for the database being loaded.

=item source

[[FigPm]] object used to access the source data. If this parameter is undefined,
it will be created the first time the L</source> method is called.

=item options

Reference to a hash of command-line options.

=item tables

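List of tables in this load group. This constructor ignores any tables passed
in by the caller and always uses the fixed genome table list (C<Contig>,
C<Genome>, C<HasContig>, C<IsMadeUpOf>, C<Sequence>).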

=back

=cut

sub new {
    # Only the class and the first three arguments are used; anything else
    # the caller passes is ignored.
    my $class = shift;
    my ($erdb, $source, $options) = @_;
    # The tables belonging to this load group, handed to the base class in
    # sorted order.
    my @groupTables = sort qw(Genome HasContig Contig IsMadeUpOf Sequence);
    # Delegate construction to the base loader. The call is made in scalar
    # context and the resulting object is handed back to the caller.
    my $loader = BaseSproutLoader::new($class, $erdb, $source, $options, @groupTables);
    return $loader;
}

=head2 Public Methods

=head3 Generate

    $sl->Generate();

Generate the data for the genome-related files.

=cut

sub Generate {
    # Generate the Genome, Contig, HasContig, IsMadeUpOf, and Sequence records
    # for the current section. The section ID is used as the genome ID. Nothing
    # is written for the global section.
    #
    # Get the parameters.
    my ($self) = @_;
    # Get the section ID.
    my $genomeID = $self->section();
    # Get the sprout object.
    my $sprout = $self->db();
    # Get the FIG object.
    my $fig = $self->source();
    # Only proceed if we're not the global section.
    if (! $self->global()) {
        # Get the genus, species, and strain from the scientific name. Everything
        # after the second word is kept as the unique characterization.
        my ($genus, $species, @extraData) = split / /, $fig->genus_species($genomeID);
        my $extra = join " ", @extraData;
        # Get the full taxonomy.
        my $taxonomy = $fig->taxonomy_of($genomeID);
        # Get the version. If no version is specified, we default to the genome ID by itself.
        my $version = $fig->genome_version($genomeID);
        if (! defined($version)) {
            $version = $genomeID;
        }
        # Get the DNA size.
        my $dnaSize = $fig->genome_szdna($genomeID);
        # Read the first line of the NMPDR group file for this genome. A lexical
        # handle and three-argument open are used so the file name can never be
        # interpreted as an open mode, and the handle is only closed if it was
        # actually opened.
        my $group;
        my $groupFile = "$FIG_Config::organisms/$genomeID/NMPDR";
        if (open(my $groupHandle, '<', $groupFile)) {
            $group = <$groupHandle>;
            close $groupHandle;
        }
        if (defined($group)) {
            # Clean the line ending.
            chomp $group;
        } else {
            # No group file (or an empty one), so use the default.
            $group = $FIG_Config::otherGroup;
        }
        # Get the contigs.
        my @contigs = $fig->all_contigs($genomeID);
        Trace(scalar(@contigs) . " contigs found for $genomeID.") if T(ERDBLoadGroup => 3);
        # Output the genome record. The version and DNA size computed above are
        # used here so that the version default is actually written out and the
        # source is not queried a second time.
        $self->PutE(Genome => $genomeID, complete => $fig->is_complete($genomeID),
                   contigs => scalar(@contigs), 'dna-size' => $dnaSize,
                   genus => $genus, pegs => $fig->genome_pegs($genomeID),
                   'primary-group' => $group, rnas => $fig->genome_rnas($genomeID),
                   species => $species, 'unique-characterization' => $extra,
                   version => $version, taxonomy => $taxonomy);
        # The maximum sequence size is a property of the Sprout object. It does
        # not depend on the contig, so we fetch it once for the whole genome.
        my $chunkSize = $sprout->MaxSequence();
        # A non-positive chunk size would keep the chunk loop below from ever
        # advancing, so fail loudly instead of looping forever.
        if (@contigs && ! (defined($chunkSize) && $chunkSize > 0)) {
            die "Invalid maximum sequence size for genome $genomeID.\n";
        }
        # Now we loop through each of the genome's contigs.
        for my $contigID (@contigs) {
            Trace("Processing contig $contigID for $genomeID.") if T(4);
            $self->Add(contigIn => 1);
            # Create the contig ID.
            my $sproutContigID = "$genomeID:$contigID";
            # Create the contig record and relate it to the genome.
            $self->PutE(Contig => $sproutContigID);
            $self->PutR(HasContig => $genomeID, $sproutContigID);
            # Now we get the sequence a chunk at a time. Positions are 1-based.
            my $contigLen = $fig->contig_ln($genomeID, $contigID);
            for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
                $self->Add(chunkIn => 1);
                # Compute the endpoint of this chunk. The last chunk may be short.
                my $end = FIG::min($i + $chunkSize - 1, $contigLen);
                # Get the actual DNA.
                my $dna = $fig->get_dna($genomeID, $contigID, $i, $end);
                # Compute the sequenceID from the contig ID and the start position.
                my $seqID = "$sproutContigID.$i";
                # Write out the data. For now, the quality vector is always "unknown".
                $self->PutR(IsMadeUpOf => $sproutContigID, $seqID, len => ($end + 1 - $i),
                           'start-position' => $i);
                $self->PutE(Sequence => $seqID, 'quality-vector' => "unknown", sequence => $dna);
                $self->Add('dna-letters' => length($dna));
            }
        }
    }
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3