[Bio] / Sprout / GenomeSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/GenomeSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package GenomeSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSproutLoader';
26 :    
27 :     =head1 Sprout Genome Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Load Group includes all of the major genome-related tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = GenomeSproutLoader->new($erdb, $source, $options);
36 :    
37 :     Construct a new GenomeSproutLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 :     [[SproutPm]] object for the database being loaded.
44 :    
45 :     =item source
46 :    
47 :     [[FigPm]] object used to access the source data. If this parameter is undefined,
48 :     it will be created the first time the L</source> method is called.
49 :    
50 :     =item options
51 :    
52 :     Reference to a hash of command-line options.
53 :    
54 :     =item tables
55 :    
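56 :     List of tables in this load group. Callers do not pass this list: the constructor builds it itself (Genome, HasContig, Contig, IsMadeUpOf, Sequence) and hands it to the base-class constructor.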
57 :    
58 :     =back
59 :    
60 :     =cut
61 :    
# Constructor for the genome load group.
#
# Delegates to BaseSproutLoader::new, supplying this group's fixed table list
# in sorted order.
#
#   $class   - name of the class being constructed
#   $erdb    - database object, passed through to the base constructor
#   $source  - source-data object, passed through to the base constructor
#   $options - reference to the hash of command-line options, passed through
#
# Returns the object produced by BaseSproutLoader::new.
sub new {
    my ($class, $erdb, $source, $options) = @_;
    # Tables owned by this load group, in string-sorted order.
    my @groupTables = sort { $a cmp $b } qw(Genome HasContig Contig IsMadeUpOf Sequence);
    # Let the base class do the real construction (called in scalar context).
    my $loader = BaseSproutLoader::new($class, $erdb, $source, $options, @groupTables);
    return $loader;
}
72 :    
73 :     =head2 Public Methods
74 :    
75 :     =head3 Generate
76 :    
77 :     $sl->Generate();
78 :    
79 :     Generate the data for the genome-related files.
80 :    
81 :     =cut
82 :    
# Generate the load data for the current section.
#
# For a genome section this writes the Genome record, then for each contig a
# Contig record, a HasContig link, and one Sequence record plus IsMadeUpOf
# link per chunk of the contig's DNA. Chunks hold at most
# $sprout->MaxSequence() letters. Nothing is written for the global section.
#
#   $self - loader object; this method uses its section(), db(), source(),
#           global(), PutE(), PutR() and Add() methods.
sub Generate {
    my ($self) = @_;
    # The section ID is used as the genome ID.
    my $genomeID = $self->section();
    # Get the sprout object.
    my $sprout = $self->db();
    # Get the FIG object.
    my $fig = $self->source();
    # Only proceed if we're not the global section.
    if (! $self->global()) {
        # Split the scientific name into genus, species, and whatever follows.
        my ($genus, $species, @extraData) = split / /, $fig->genus_species($genomeID);
        my $extra = join " ", @extraData;
        # Get the full taxonomy.
        my $taxonomy = $fig->taxonomy_of($genomeID);
        # Get the version. If no version is specified, we default to the
        # genome ID by itself.
        my $version = $fig->genome_version($genomeID);
        if (! defined($version)) {
            $version = $genomeID;
        }
        # These values are fetched in scalar context up front so that an
        # empty-list return cannot shift the key/value pairs passed to PutE.
        my $dnaSize  = $fig->genome_szdna($genomeID);
        my $complete = $fig->is_complete($genomeID);
        my $pegCount = $fig->genome_pegs($genomeID);
        my $rnaCount = $fig->genome_rnas($genomeID);
        # Read the group name from the first line of the genome's NMPDR file.
        # A lexical handle is used and only closed if the open succeeded.
        my $group;
        my $groupFile = "$FIG_Config::organisms/$genomeID/NMPDR";
        if (open(my $groupHandle, '<', $groupFile)) {
            $group = <$groupHandle>;
            close $groupHandle;
        }
        if (defined $group) {
            # Clean the line ending.
            chomp $group;
        } else {
            # Missing or empty file, so use the default group.
            $group = $FIG_Config::otherGroup;
        }
        # Get the contigs.
        my @contigs = $fig->all_contigs($genomeID);
        Trace(scalar(@contigs) . " contigs found for $genomeID.") if T(ERDBLoadGroup => 3);
        # Output the genome record, using the defaulted version and the DNA
        # size computed above.
        $self->PutE(Genome => $genomeID, complete => $complete,
                    contigs => scalar(@contigs), 'dna-size' => $dnaSize,
                    genus => $genus, pegs => $pegCount,
                    'primary-group' => $group, rnas => $rnaCount,
                    species => $species, 'unique-characterization' => $extra,
                    version => $version, taxonomy => $taxonomy);
        # The maximum sequence size is a property of the Sprout object, so it
        # is fetched once rather than once per contig.
        my $chunkSize = $sprout->MaxSequence();
        # Now we loop through each of the genome's contigs.
        for my $contigID (@contigs) {
            Trace("Processing contig $contigID for $genomeID.") if T(4);
            $self->Add(contigIn => 1);
            # Create the contig ID.
            my $sproutContigID = "$genomeID:$contigID";
            # Create the contig record and relate it to the genome.
            $self->PutE(Contig => $sproutContigID);
            $self->PutR(HasContig => $genomeID, $sproutContigID);
            # Walk the contig one chunk at a time; positions are 1-based.
            my $contigLen = $fig->contig_ln($genomeID, $contigID);
            for (my $start = 1; $start <= $contigLen; $start += $chunkSize) {
                $self->Add(chunkIn => 1);
                # The last chunk may be shorter than the chunk size.
                my $end = FIG::min($start + $chunkSize - 1, $contigLen);
                # Get the actual DNA.
                my $dna = $fig->get_dna($genomeID, $contigID, $start, $end);
                # The sequence ID is the contig ID plus the chunk's start position.
                my $seqID = "$sproutContigID.$start";
                # Write out the data. For now, the quality vector is always "unknown".
                $self->PutR(IsMadeUpOf => $sproutContigID, $seqID, len => ($end + 1 - $start),
                            'start-position' => $start);
                $self->PutE(Sequence => $seqID, 'quality-vector' => "unknown", sequence => $dna);
                $self->Add('dna-letters' => length($dna));
            }
        }
    }
}
158 :    
159 :    
160 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3