[Bio] / Sprout / SubsystemSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SubsystemSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SubsystemSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSaplingLoader';
26 :    
27 :     =head1 Sapling Subsystem Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Subsystem Load Group includes all of the major subsystem-related tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = SubsystemSaplingLoader->new($erdb, $options, @tables);
36 :    
37 :     Construct a new SubsystemSaplingLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 : parrello 1.8 L<Sapling> object for the database being loaded.
44 : parrello 1.1
45 :     =item options
46 :    
47 :     Reference to a hash of command-line options.
48 :    
49 :     =item tables
50 :    
51 :     List of tables in this load group.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
sub new {
    # Unpack the class name, the database object being loaded, and the
    # command-line option hash.
    my ($class, $erdb, $options) = @_;
    # These are the tables belonging to this load group. The list is passed
    # to the base class in sorted order.
    my @tables = sort qw(Contains Describes Includes IsClassFor IsImplementedBy
                         IsMachineOf IsRoleOf IsSuperclassOf MachineRole
                         MolecularMachine Subsystem SubsystemClass Uses
                         Variant VariantRole);
    # Let the base class build the actual loader object on our behalf.
    my $loader = BaseSaplingLoader::new($class, $erdb, $options, @tables);
    return $loader;
}
69 :    
70 :     =head2 Public Methods
71 :    
72 :     =head3 Generate
73 :    
74 :     $sl->Generate();
75 :    
76 :     Generate the data for the subsystem-related files.
77 :    
78 :     =cut
79 :    
sub Generate {
    my ($self) = @_;
    # Both generators need the target database and the source object, so
    # fetch them before deciding which generator to run.
    my $sapling = $self->db();
    my $source = $self->source();
    if (! $self->global()) {
        # A non-global section is identified by a genome ID. Generate the
        # subsystem data for that genome.
        my $genomeID = $self->section();
        $self->GenerateSubsystemData($source, $sapling, $genomeID);
    } else {
        # The global section builds the genome-independent subsystem
        # framework.
        $self->GenerateSubsystems($source, $sapling);
    }
}
98 :    
99 :     =head3 GenerateSubsystems
100 :    
101 :     $sl->GenerateSubsystems($fig, $erdb);
102 :    
103 :     Generate the subsystems, variants, and roles for this database. This
104 :     method concerns itself primarily with the genome-independent part of the
105 :     subsystem framework. This includes the following tables:
106 :    
107 :     Subsystem
108 :     Describes
109 :     Variant
110 :     Includes
111 :     IsClassFor
112 :     SubsystemClass
113 :     IsSuperclassOf
114 :    
115 :     =over 4
116 :    
117 :     =item fig
118 :    
119 :     Source object from which the subsystem data will be extracted.
120 :    
121 :     =item erdb
122 :    
123 :     Database object for the Sapling database.
124 :    
125 :     =back
126 :    
127 :     =cut
128 :    
sub GenerateSubsystems {
    # Unpack the loader, the source object, and the Sapling database.
    my ($self, $fig, $erdb) = @_;
    # The keys of the Sapling subsystem hash are the subsystems to load.
    my $subHash = $erdb->SubsystemHash();
    # Classes whose superclass links have already been written are
    # recorded here so the hierarchy walk can be skipped next time.
    my %subClassHash = ();
    for my $subsystem (keys %$subHash) {
        Trace("Processing subsystem $subsystem.") if T(ERDBLoadGroup => 3);
        # Ask the source for the subsystem object. A missing subsystem is
        # counted and skipped.
        my $ssData = $fig->get_subsystem($subsystem);
        if (! defined $ssData) {
            $self->Add(missingSubsystem => 1);
            Trace("Subsystem $subsystem not found.") if T(ERDBLoadGroup => 1);
            next;
        }
        # These flags become 1 if the classification starts with the
        # corresponding special root.
        my $experimental = 0;
        my $clustered = 0;
        # The classification is a list of class names, outermost first.
        my $classes = $ssData->get_classification();
        if (scalar @$classes) {
            # A special root only sets a flag; it is removed from the list
            # so that it does not become a class.
            if ($classes->[0] =~ /Clustering/) {
                $clustered = 1;
                shift @$classes;
            } elsif ($classes->[0] =~ /Experimental/) {
                $experimental = 1;
                shift @$classes;
            }
            # Work from the innermost class outward. The innermost class is
            # the one attached directly to the subsystem.
            my $child = pop @$classes;
            if (defined $child) {
                $self->CreateClass($child);
                $self->PutR(IsClassFor => $child, $subsystem);
                # Only walk up the hierarchy the first time we see a class.
                if (! $subClassHash{$child}) {
                    while (my $parent = pop @$classes) {
                        # Emit the parent and place it above the child.
                        $self->CreateClass($parent);
                        $self->PutR(IsSuperclassOf => $parent, $child);
                        # Ensure we know the child has been linked.
                        $subClassHash{$child} = 1;
                        # Move one level up.
                        $child = $parent;
                    }
                }
            }
        }
        # Collect the scalar properties of the subsystem.
        my $curator = $ssData->get_curator();
        my $description = $ssData->get_description();
        my $notes = $ssData->get_notes();
        my $version = $ssData->get_version();
        my $usable = ($fig->is_experimental_subsystem($subsystem) ? 0 : 1);
        my $private = ($fig->is_exchangable_subsystem($subsystem) ? 0 : 1);
        # Strip the "master:" prefix from the curator name.
        $curator =~ s/^master://;
        # A missing description is stored as an empty string.
        $description = '' if ! defined $description;
        # Write the subsystem record itself.
        $self->PutE(Subsystem => $subsystem, curator => $curator,
                    description => $description, notes => $notes,
                    version => $version, usable => $usable,
                    private => $private, cluster_based => $clustered,
                    experimental => $experimental);
        # Connect each role to the subsystem. The role's position in the
        # list is stored as its sequence number.
        my @roles = $ssData->get_roles();
        for my $col (0 .. $#roles) {
            my $role = $roles[$col];
            # Distinguish auxiliary roles from main ones.
            my $auxFlag = ($fig->is_aux_role_in_subsystem($subsystem, $role) ? 1 : 0);
            $self->PutR(Includes => $subsystem, $role,
                        abbreviation => $ssData->get_abbr_for_role($role),
                        sequence => $col, auxiliary => $auxFlag);
        }
        # Variant data is sparse in the SEED. Start with every known variant
        # code (cleaned by Starless) mapped to an empty comment.
        my %variants = map { BaseSaplingLoader::Starless($_) => '' } $ssData->get_variant_codes();
        # The 0 and -1 variants always exist and get fixed comments.
        $variants{'0'} = 'Subsystem functionality is incomplete.';
        $variants{'-1'} = 'Subsystem is not functional.';
        # Overlay the notes of any variants that have them, again cleaning
        # the code with Starless.
        my $variantHash = $ssData->get_variants();
        for my $rawCode (keys %$variantHash) {
            my $cleanCode = BaseSaplingLoader::Starless($rawCode);
            $variants{$cleanCode} = $variantHash->{$rawCode};
        }
        # Build the role rules. This is a two-level hash: variant code, then
        # role rule string, mapped to 1. The inner keys become the rules.
        my %roleRuleHash;
        $roleRuleHash{$_} = {} for keys %variants;
        # Each genome row contributes one rule under its variant code.
        my @genomes = $ssData->get_genomes();
        for my $i (0 .. $#genomes) {
            my $variantCode = BaseSaplingLoader::Starless($ssData->get_variant_code($i));
            # The rule is the sorted, space-delimited list of abbreviations
            # for the roles this genome has.
            my @genomeRoles = $ssData->get_roles_for_genome($genomes[$i]);
            my @abbrs = map { $ssData->get_abbr_for_role($_) } @genomeRoles;
            my $rule = join(" ", sort @abbrs);
            $roleRuleHash{$variantCode}{$rule} = 1;
        }
        # Emit every variant with its role rules and its subsystem link.
        for my $variant (keys %variants) {
            # The variant ID is a digest of the subsystem ID and the code.
            my $variantID = ERDB::DigestKey("$subsystem:$variant");
            # Codes starting with 0 are incomplete, codes starting with a
            # minus sign are vacant, and everything else is normal.
            my $variant_type;
            if ($variant =~ /^0/) {
                $variant_type = 'incomplete';
            } elsif ($variant =~ /^-/) {
                $variant_type = 'vacant';
            } else {
                $variant_type = 'normal';
            }
            $self->PutE(Variant => $variantID, type => $variant_type,
                        code => $variant, comment => $variants{$variant});
            # Write one VariantRole record per distinct role rule.
            for my $rule (keys %{$roleRuleHash{$variant}}) {
                $self->PutE(VariantRole => $variantID, role_rule => $rule);
            }
            # Link the subsystem to the variant.
            $self->PutR(Describes => $subsystem, $variantID);
        }
        # Clear the subsystem cache to keep memory under control.
        $fig->clear_subsystem_cache();
    }
}
271 :    
272 :     =head3 GenerateSubsystemData
273 :    
274 :     $sl->GenerateSubsystemData($fig, $erdb, $genomeID);
275 :    
276 :     Generate the molecular machines and subsystem spreadsheet cells for this
277 :     database. This method concerns itself primarily with the genome-dependent
278 :     part of the subsystem framework. This includes the following tables.
279 :    
280 :     IsImplementedBy
281 :     MolecularMachine
282 :     IsMachineOf
283 :     MachineRole
284 :     Uses
285 : parrello 1.9 Contains
286 : parrello 1.1 IsRoleOf
287 :    
288 :     =over 4
289 :    
290 :     =item fig
291 :    
292 :     Source object from which the subsystem data will be extracted.
293 :    
294 :     =item erdb
295 :    
296 :     Database object for the Sapling database.
297 :    
298 :     =item genomeID
299 :    
300 :     ID of the relevant genome.
301 :    
302 :     =back
303 :    
304 :     =cut
305 :    
sub GenerateSubsystemData {
    # Get the parameters: the loader, the source object, the Sapling
    # database, and the ID of the genome whose section is being loaded.
    my ($self, $fig, $erdb, $genomeID) = @_;
    # Get the subsystem hash for this Sapling instance. Its key list will be
    # the list of subsystems being put in the database.
    my $subHash = $erdb->SubsystemHash();
    # Get the list of subsystems for this genome. The "1" indicates we want
    # all of them, including the ones for 0 and -1 variants. Note we have
    # to normalize the subsystem names.
    my @subName = map { $erdb->SubsystemID($_) } $fig->subsystems_for_genome($genomeID, 1);
    # Loop through the named subsystems, keeping only the ones that are in
    # the Sapling subsystem hash. Each one corresponds to a molecular machine.
    for my $subName (grep { exists $subHash->{$_} } @subName) {
        $self->Track(MolecularMachines => $subName, 100);
        # Compute the MD5 hash of the subsystem ID.
        my $ssMD5 = ERDB::DigestKey($subName);
        # Get the subsystem object.
        my $ssData = $fig->get_subsystem($subName);
        # Only proceed if we found it. Without this check a missing
        # subsystem would kill the whole section with a method call on an
        # undefined value. GenerateSubsystems counts and traces this case
        # the same way.
        if (! defined $ssData) {
            $self->Add(missingSubsystem => 1);
            Trace("Subsystem $subName not found.") if T(ERDBLoadGroup => 1);
            next;
        }
        # Now we find the molecular machines for this subsystem/genome pair.
        # Each row is a genome ID optionally followed by a colon and a
        # region string.
        my @rows = $ssData->get_genomes();
        for my $gidx (0 .. $#rows) {
            my ($rowGenome, $regionString) = split /:/, $rows[$gidx], 2;
            # Rows belonging to other genomes are ignored.
            next if $rowGenome ne $genomeID;
            # Here we're positioned on a row for our genome. If it is
            # a region-restricted molecular machine, then the region
            # string will be defined. If it's global, we use an empty
            # string for the region.
            $regionString ||= "";
            # Create the molecular machine. To do that, we need the variant
            # code for this genome.
            my $raw_variant_code = $ssData->get_variant_code($gidx);
            # Check for a leading asterisk. This means the variant assignment
            # is not curated.
            my $curated = ($raw_variant_code =~ /^\s*\*/ ? 0 : 1);
            # Clear any waste from the variant code.
            my $variant_code = BaseSaplingLoader::Starless($raw_variant_code);
            # Create the variant and machine IDs. The variant ID is built
            # from the same "subsystem:code" string used by
            # GenerateSubsystems.
            my $variantID = ERDB::DigestKey("$subName:$variant_code");
            my $machineID = ERDB::DigestKey("$subName:$variant_code:$genomeID:$regionString");
            # Create the molecular machine and connect it to the genome and
            # subsystem.
            $self->PutE(MolecularMachine => $machineID,
                        curated => $curated, region => $regionString);
            $self->PutR(IsImplementedBy => $variantID, $machineID);
            $self->PutR(Uses => $genomeID, $machineID);
            # Now we loop through the subsystem's roles, creating the
            # MachineRoles. Molecular machines function as spreadsheet rows;
            # machine roles are spreadsheet cells.
            my @roles = $ssData->get_roles();
            for my $role (@roles) {
                # Get this role's abbreviation.
                my $ridx = $ssData->get_role_index($role);
                my $abbr = $ssData->get_role_abbr($ridx);
                # Create the machine-role ID.
                my $machineRoleID = "$ssMD5:$genomeID:$regionString:$abbr";
                # Create the machine-role and connect it to the role and the
                # machine.
                $self->PutE(MachineRole => $machineRoleID);
                $self->PutR(IsMachineOf => $machineID, $machineRoleID);
                $self->PutR(IsRoleOf => $role, $machineRoleID);
                # Now get a list of the features in this cell.
                # NOTE(review): the cell is looked up by genome ID, not by
                # the full row string; confirm this picks the right row for
                # region-restricted machines.
                my @pegs = $ssData->get_pegs_from_cell($genomeID, $ridx);
                # Connect them to the cell.
                for my $peg (@pegs) {
                    $self->PutR(Contains => $machineRoleID, $peg);
                }
            }
        }
        # Clear the subsystem cache to save space.
        $fig->clear_subsystem_cache();
    }
}
379 :    
380 :     =head3 CreateClass
381 :    
382 :     $sl->CreateClass($className);
383 :    
384 :     Create a SubsystemClass record with the specified class name.
385 :    
386 :     =over 4
387 :    
388 :     =item className
389 :    
390 :     Name of the subsystem classification to create.
391 :    
392 :     =back
393 :    
394 :     =cut
395 :    
sub CreateClass {
    my $self = shift;
    my ($className) = @_;
    # The class name is the only value supplied for the SubsystemClass
    # record.
    return $self->PutE(SubsystemClass => $className);
}
402 :    
403 :    
404 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3