[Bio] / Sprout / GenomeSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/GenomeSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package GenomeSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSproutLoader';
26 :    
27 :     =head1 Sprout Genome Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Load Group includes all of the major genome-related tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = GenomeSproutLoader->new($erdb, $options);
36 :    
37 :     Construct a new GenomeSproutLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 :     [[SproutPm]] object for the database being loaded.
44 :    
45 :     =item options
46 :    
47 :     Reference to a hash of command-line options.
48 :    
49 :     =item tables
50 :    
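51 :     List of tables in this load group. The list is built inside the constructor and passed on to the base class; it is not supplied by the caller.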
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
sub new {
    # Construct the loader for the genome load group.
    #
    # Parameters:
    #   $class    name of the class being constructed
    #   $erdb     database object for the database being loaded
    #   $options  reference to a hash of command-line options
    #
    # Returns whatever the base-class constructor returns.
    #
    # Get the parameters.
    my ($class, $erdb, $options) = @_;
    # Create the table list. It is handed to the base class in sorted order.
    my @tables = sort qw(Genome HasContig Contig IsMadeUpOf Sequence Host IsPathogenicIn);
    # Create the underlying loader object. We go through SUPER rather than
    # naming the parent package directly, so the call follows normal method
    # resolution on the class declared via "use base".
    my $retVal = $class->SUPER::new($erdb, $options, @tables);
    # Return it.
    return $retVal;
}
67 :    
68 :     =head2 Public Methods
69 :    
70 :     =head3 Generate
71 :    
72 :     $sl->Generate();
73 :    
74 :     Generate the data for the genome-related files.
75 :    
76 :     =cut
77 :    
sub Generate {
    # Generate the load data for the current section. For every section other
    # than the global one, the section ID is used as a genome ID and the
    # following are written through PutE/PutR: the Host entities and
    # IsPathogenicIn links taken from the genome's attributes, one Contig per
    # contig with its HasContig link, the Sequence chunks of each contig with
    # their IsMadeUpOf links, and finally the Genome record itself.
    #
    # Takes no arguments beyond the invocant. Callers should not rely on the
    # return value.
    #
    # Get the parameters.
    my ($self) = @_;
    # Get the section ID.
    my $genomeID = $self->section();
    # Get the sprout object.
    my $sprout = $self->db();
    # Get the FIG object.
    my $fig = $self->source();
    # Only proceed if we're not the global section.
    if (! $self->global()) {
        # Get the genus, species, and strain from the scientific name. Anything
        # after the second space-delimited word is kept together as the strain.
        my $scientificName = $fig->genus_species($genomeID);
        my ($genus, $species, $extra) = split / /, $scientificName, 3;
        # Get the full taxonomy.
        my $taxonomy = $fig->taxonomy_of($genomeID);
        # Read the NMPDR group from the first line of the genome's NMPDR file.
        # The handle is lexical and is only closed if the open succeeded.
        my $group;
        my $groupFile = "$FIG_Config::organisms/$genomeID/NMPDR";
        if (open(my $groupHandle, '<', $groupFile)) {
            $group = <$groupHandle>;
            close $groupHandle;
        }
        if (defined $group) {
            # Clean the line ending.
            chomp $group;
        } else {
            # No file or an empty file, so use the default group.
            $group = $FIG_Config::otherGroup;
        }
        # Get the version-numbered ID. If there is none, we just use the genome.
        my $genomeVersion = $fig->genome_version($genomeID) || $genomeID;
        # Get the contigs.
        my @contigs = $fig->all_contigs($genomeID);
        Trace(scalar(@contigs) . " contigs found for $genomeID.") if T(ERDBLoadGroup => 3);
        # Now come some attribute-related values. We create a hash of the genome's
        # attributes, keyed on element 1 of each attribute tuple with element 2
        # as the value.
        my %attributes = map { $_->[1] => $_->[2] } $fig->get_attributes($genomeID);
        # The first attribute is the pathogenic host list. Note we have to get rid
        # of the value "No", which we treat as not being connected to any host.
        my @hosts = grep { $_ ne 'No' } split /\s*,\s*/, ($attributes{Pathogenic_In} || "");
        for my $host (@hosts) {
            $self->PutE(Host => $host);
            $self->PutR(IsPathogenicIn => $genomeID, $host);
        }
        # Next is the gram stain, which must be converted to semi-boolean. A
        # missing attribute is treated as an empty string so the pattern
        # matches below do not operate on an undefined value.
        my $gramAttribute = $attributes{Gram_Stain};
        $gramAttribute = '' if ! defined $gramAttribute;
        my $gram_stain = '?';
        if ($gramAttribute =~ /positive/i) {
            $gram_stain = 'Y';
        } elsif ($gramAttribute =~ /negative/i) {
            $gram_stain = 'N';
        }
        # The temperature range needs to be split in two. The default, used when
        # the attribute is missing or is neither "N" nor "N-M", is 0 to 100.
        my ($tempRangeMin, $tempRangeMax) = (0, 100);
        my $tempRange = $attributes{Temperature_Range};
        if (defined $tempRange) {
            if ($tempRange =~ /^\d+$/) {
                ($tempRangeMin, $tempRangeMax) = ($tempRange, $tempRange);
            } elsif ($tempRange =~ /^(\d+)-(\d+)$/) {
                ($tempRangeMin, $tempRangeMax) = ($1, $2);
            }
        }
        # These attributes are all simple.
        my $endospore = ERDBTypeSemiBoolean::ComputeFromString($attributes{Endospores});
        my $motility = ERDBTypeSemiBoolean::ComputeFromString($attributes{Motility});
        my $oxygen = $attributes{Oxygen_Requirement} || "unknown";
        my $optimalTempRange = $attributes{Temperature_Range} || "unknown";
        my $pathogenic = ERDBTypeSemiBoolean::ComputeFromString($attributes{Pathogenic});
        my $salinity = $attributes{Salinity} || "unknown";
        my $habitat = $attributes{Habitat} || "unknown";
        # Contigs are split into sequences. The maximum sequence size is a
        # property of the Sprout object, so we only need to ask for it once.
        my $chunkSize = $sprout->MaxSequence();
        # Now we loop through each of the genome's contigs. While doing so, we'll
        # track the number of G/C letters and the total DNA size.
        my $gcCount = 0;
        my $dnaSize = 0;
        for my $contigID (@contigs) {
            Trace("Processing contig $contigID for $genomeID.") if T(4);
            $self->Track(Contigs => $contigID, 100);
            $self->Add(contigIn => 1);
            # Create the contig ID.
            my $sproutContigID = "$genomeID:$contigID";
            # Create the contig record and relate it to the genome.
            $self->PutE(Contig => $sproutContigID);
            $self->PutR(HasContig => $genomeID, $sproutContigID);
            # Now we get the sequence a chunk at a time. Positions are 1-based.
            my $contigLen = $fig->contig_ln($genomeID, $contigID);
            for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
                $self->Add(chunkIn => 1);
                # Compute the endpoint of this chunk.
                my $end = FIG::min($i + $chunkSize - 1, $contigLen);
                # Get the actual DNA.
                my $dna = $fig->get_dna($genomeID, $contigID, $i, $end);
                # Compute the stats. tr/// with an empty replacement list counts
                # the matching letters without modifying the string; both cases
                # are counted so the result does not depend on the DNA's case.
                my $chunkLen = length($dna);
                my $chunkGC = ($dna =~ tr/GCgc//);
                $gcCount += $chunkGC;
                $dnaSize += $chunkLen;
                # Compute the sequenceID.
                my $seqID = "$sproutContigID.$i";
                # Write out the data. For now, the quality vector is always "unknown".
                $self->PutR(IsMadeUpOf => $sproutContigID, $seqID, len => ($end + 1 - $i),
                            'start-position' => $i);
                $self->PutE(Sequence => $seqID, 'quality-vector' => "unknown", sequence => $dna);
                $self->Add('dna-letters' => $chunkLen);
            }
        }
        # Finalize the GC content computation as a percentage. A genome with no
        # DNA at all gets 0 instead of triggering a division by zero.
        my $gc_content = ($dnaSize > 0 ? $gcCount * 100 / $dnaSize : 0);
        # Output the genome record.
        $self->PutE(Genome => $genomeID, complete => $fig->is_complete($genomeID),
                    contigs => scalar(@contigs), dna_size => $dnaSize,
                    genus => $genus, pegs => $fig->genome_pegs($genomeID),
                    primary_group => $group, rnas => $fig->genome_rnas($genomeID),
                    species => $species, unique_characterization => $extra,
                    version => $genomeVersion, taxonomy => $taxonomy,
                    endospore => $endospore, gc_content => $gc_content,
                    gram_stain => $gram_stain, motility => $motility,
                    oxygen => $oxygen, optimal_temperature_range => $optimalTempRange,
                    pathogenic => $pathogenic, salinity => $salinity,
                    temperature_min => $tempRangeMin, temperature_max => $tempRangeMax,
                    habitat => $habitat, scientific_name => $scientificName);
    }
}
209 :    
210 :    
211 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3