[Bio] / Sprout / SubsystemSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SubsystemSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SubsystemSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 : parrello 1.15 use LoaderUtils;
26 : parrello 1.1 use base 'BaseSaplingLoader';
27 :    
28 :     =head1 Sapling Subsystem Load Group Class
29 :    
30 :     =head2 Introduction
31 :    
32 :     The Subsystem Load Group includes all of the major subsystem-related tables.
33 :    
=head3 new

    my $sl = SubsystemSaplingLoader->new($erdb, $options);

Construct a new SubsystemSaplingLoader object. The list of tables belonging
to this load group is fixed inside the constructor; it is not taken from the
argument list.

=over 4

=item erdb

L<Sapling> object for the database being loaded.

=item options

Reference to a hash of command-line options.

=item RETURN

Returns whatever C<BaseSaplingLoader::new> produces for this class, database,
option hash, and table list.

=back

=cut

sub new {
    my ($class, $erdb, $options) = @_;
    # Every table owned by this load group, handed to the base class in
    # sorted order.
    my @groupTables = sort qw(Subsystem IsClassFor SubsystemClass IsSuperclassOf Includes
                              Describes Variant IsRoleOf IsImplementedBy MachineRole
                              IsMachineOf MolecularMachine Contains Uses VariantRole);
    # The base-class constructor does the real work of building the object.
    my $loader = BaseSaplingLoader::new($class, $erdb, $options, @groupTables);
    return $loader;
}
70 :    
71 :     =head2 Public Methods
72 :    
=head3 Generate

    $sl->Generate();

Generate the data for the subsystem-related files. The global section
produces the genome-independent subsystem framework; every other section is
treated as a genome ID and produces that genome's subsystem data.

=cut

sub Generate {
    my ($self) = @_;
    # Fetch the target database and the data source from the loader.
    my $sapling = $self->db();
    my $source = $self->source();
    if (! $self->global()) {
        # A non-global section: the section ID is the genome to process.
        my $sectionGenome = $self->section();
        # Generate the subsystem data for this genome.
        $self->GenerateSubsystemData($source, $sapling, $sectionGenome);
    } else {
        # The global section builds the subsystem framework itself.
        $self->GenerateSubsystems($source, $sapling);
    }
}
99 :    
=head3 GenerateSubsystems

    $sl->GenerateSubsystems($fig, $erdb);

Generate the subsystems, variants, and roles for this database. This
method concerns itself primarily with the genome-independent part of the
subsystem framework. It emits records for the following tables:

    Subsystem
    Describes
    Variant
    VariantRole
    Includes
    IsClassFor
    SubsystemClass
    IsSuperclassOf

Subsystems that cannot be found in the source are counted under
C<missingSubsystem>, and subsystems flagged as empty are counted under
C<emptySubsystem>; neither kind produces any records.

=over 4

=item fig

Source object from which the subsystem data will be extracted.

=item erdb

Database object for the Sapling database. Its subsystem hash supplies the
list of subsystems to process.

=back

=cut

sub GenerateSubsystems {
    my ($self, $fig, $erdb) = @_;
    # The keys of the Sapling subsystem hash are the subsystems to load.
    my $wantedSubs = $erdb->SubsystemHash();
    # Classes whose link to a superclass has already been written.
    my %placedClasses = ();
    SUBSYSTEM: for my $subsystem (keys %$wantedSubs) {
        Trace("Processing subsystem $subsystem.") if T(ERDBLoadGroup => 3);
        # Ask the source for the subsystem object; skip it if it is
        # unavailable or flagged empty.
        my $ssData = $fig->get_subsystem($subsystem);
        if (! defined $ssData) {
            $self->Add(missingSubsystem => 1);
            Trace("Subsystem $subsystem not found.") if T(ERDBLoadGroup => 1);
            next SUBSYSTEM;
        }
        if ($ssData->{empty_ss}) {
            $self->Add(emptySubsystem => 1);
            Trace("Subsystem $subsystem is empty.") if T(ERDBLoadGroup => 1);
            next SUBSYSTEM;
        }
        # Flags derived from a special root in the classification list.
        my ($experimental, $clustered) = (0, 0);
        my $classes = $ssData->get_classification();
        if (scalar @$classes) {
            # A special root is recorded as a flag and removed from the list.
            if ($classes->[0] =~ /Clustering/) {
                $clustered = 1;
                shift @$classes;
            } elsif ($classes->[0] =~ /Experimental/) {
                $experimental = 1;
                shift @$classes;
            }
            # The last remaining entry is the class attached to the subsystem.
            my $child = pop @$classes;
            if (defined $child) {
                $self->CreateClass($child);
                $self->PutR(IsClassFor => $child, $subsystem);
                # Only walk up the hierarchy the first time we meet this class.
                if (! $placedClasses{$child}) {
                    # Note the loop stops at the first false class name.
                    while (my $parent = pop @$classes) {
                        $self->CreateClass($parent);
                        # The parent sits directly above the current class.
                        $self->PutR(IsSuperclassOf => $parent, $child);
                        # Remember that the lower class has been placed.
                        $placedClasses{$child} = 1;
                        # Climb one level.
                        $child = $parent;
                    }
                }
            }
        }
        # Collect the subsystem-level attributes.
        my $curator = $ssData->get_curator();
        my $description = $ssData->get_description();
        my $notes = $ssData->get_notes();
        my $version = $ssData->get_version();
        my $usable = ($fig->is_experimental_subsystem($subsystem) ? 0 : 1);
        my $private = ($fig->is_exchangable_subsystem($subsystem) ? 0 : 1);
        # Strip the "master:" prefix from the curator, or supply a default.
        if (defined $curator) {
            $curator =~ s/^master://;
        } else {
            $curator = "unknown";
        }
        # A missing description becomes an empty string.
        $description = '' if ! defined $description;
        # Write the subsystem record.
        $self->PutE(Subsystem => $subsystem, curator => $curator,
                    description => $description, notes => $notes,
                    version => $version, usable => $usable,
                    private => $private, cluster_based => $clustered,
                    experimental => $experimental);
        # Connect each role to the subsystem; its position in the role list
        # is its column number.
        my @ssRoles = $ssData->get_roles();
        for my $col (0 .. $#ssRoles) {
            my $role = $ssRoles[$col];
            # Distinguish auxiliary roles from main ones.
            my $auxFlag = ($fig->is_aux_role_in_subsystem($subsystem, $role) ? 1 : 0);
            $self->PutR(Includes => $subsystem, $role,
                        abbreviation => $ssData->get_abbr_for_role($role),
                        sequence => $col, auxiliary => $auxFlag);
        }
        # Start the variant table with every known variant code (cleaned by
        # Starless) and an empty comment.
        my %variants = map { BaseSaplingLoader::Starless($_) => '' } $ssData->get_variant_codes();
        # Codes 0 and -1 always exist.
        $variants{'0'} = 'Subsystem functionality is incomplete.';
        $variants{'-1'} = 'Subsystem is not functional.';
        # Overlay the notes of any variants that have them, again cleaning
        # the code with Starless.
        my $notedVariants = $ssData->get_variants();
        for my $rawCode (keys %$notedVariants) {
            my $cleanCode = BaseSaplingLoader::Starless($rawCode);
            $variants{$cleanCode} = $notedVariants->{$rawCode};
        }
        # Role rules: for each variant code, a hash whose keys are the
        # distinct space-joined, sorted role-abbreviation lists of the
        # genomes carrying that variant.
        my %roleRuleHash = map { $_ => {} } keys %variants;
        my @genomes = $ssData->get_genomes();
        for my $i (0 .. $#genomes) {
            # The variant code is looked up by row index.
            my $variantCode = BaseSaplingLoader::Starless($ssData->get_variant_code($i));
            # The roles are looked up by genome.
            my @genomeRoles = $ssData->get_roles_for_genome($genomes[$i]);
            my $rule = join(" ", sort map { $ssData->get_abbr_for_role($_) } @genomeRoles);
            $roleRuleHash{$variantCode}{$rule} = 1;
        }
        # Emit each variant with its role rules and its link to the subsystem.
        for my $variant (keys %variants) {
            # The variant key digests the subsystem ID plus the variant code.
            my $variantID = ERDB::DigestKey("$subsystem:$variant");
            # Codes starting with 0 are incomplete, codes starting with a
            # minus sign are vacant, everything else is normal.
            my $variant_type;
            if ($variant =~ /^0/) {
                $variant_type = 'incomplete';
            } elsif ($variant =~ /^-/) {
                $variant_type = 'vacant';
            } else {
                $variant_type = 'normal';
            }
            $self->PutE(Variant => $variantID, type => $variant_type,
                        code => $variant, comment => $variants{$variant});
            for my $rule (keys %{$roleRuleHash{$variant}}) {
                $self->PutE(VariantRole => $variantID, role_rule => $rule);
            }
            $self->PutR(Describes => $subsystem, $variantID);
        }
        # Clear the subsystem cache to keep memory under control.
        $fig->clear_subsystem_cache();
    }
}
279 :    
=head3 GenerateSubsystemData

    $sl->GenerateSubsystemData($fig, $erdb, $genomeID);

Generate the molecular machines and subsystem spreadsheet cells for this
database. This method concerns itself primarily with the genome-dependent
part of the subsystem framework. It emits records for the following tables:

    IsImplementedBy
    MolecularMachine
    IsMachineOf
    MachineRole
    Uses
    Contains
    IsRoleOf

Only subsystems present in the Sapling subsystem hash are processed. A
subsystem whose object cannot be retrieved from the source is counted under
C<missingSubsystem> and skipped; one flagged as empty is counted under
C<emptySubsystem> and skipped. This mirrors L</GenerateSubsystems>, which
emits no Subsystem or Variant records in those two cases. A feature found in
a spreadsheet cell is connected to the cell only if its current functional
assignment contains the cell's role; otherwise it is counted under
C<disconnectedPeg>.

=over 4

=item fig

Source object from which the subsystem data will be extracted.

=item erdb

Database object for the Sapling database.

=item genomeID

ID of the relevant genome.

=back

=cut

sub GenerateSubsystemData {
    # Get the parameters.
    my ($self, $fig, $erdb, $genomeID) = @_;
    # Get the subsystem hash for this Sapling instance. Its key list will be
    # the list of subsystems being put in the database.
    my $subHash = $erdb->SubsystemHash();
    # Get the list of subsystems for this genome. The "1" indicates we want
    # all of them, including the ones for 0 and -1 variants. Note we have
    # to normalize the subsystem names.
    my @subName = map { $erdb->SubsystemID($_) } $fig->subsystems_for_genome($genomeID, 1);
    # Get the functional assignments for the features in this genome. We'll need
    # this later when we're connecting them to subsystem cells.
    my %fidHash = map { $_->[0] => $_->[1] } @{$fig->get_genome_assignment_data($genomeID)};
    # Loop through the named subsystems that belong in this database. Each
    # one corresponds to a molecular machine.
    for my $subName (grep { exists $subHash->{$_} } @subName) {
        $self->Track(MolecularMachines => $subName, 100);
        # Get the subsystem object.
        my $ssData = $fig->get_subsystem($subName);
        # Skip subsystems we cannot load. Without this check a missing
        # subsystem kills the whole section on the first method call, and an
        # empty one would yield machines pointing at variants that
        # GenerateSubsystems never creates.
        if (! defined $ssData) {
            $self->Add(missingSubsystem => 1);
            Trace("Subsystem $subName not found for genome $genomeID.") if T(ERDBLoadGroup => 1);
            next;
        }
        if ($ssData->{empty_ss}) {
            $self->Add(emptySubsystem => 1);
            Trace("Subsystem $subName is empty; skipped for genome $genomeID.") if T(ERDBLoadGroup => 1);
            next;
        }
        # Compute the MD5 hash of the subsystem ID.
        my $ssMD5 = ERDB::DigestKey($subName);
        # Now we find the molecular machines for this subsystem/genome pair.
        # Each spreadsheet row is keyed by a genome ID, optionally followed
        # by a colon and a region string.
        my @rows = $ssData->get_genomes();
        for my $gidx (0 .. $#rows) {
            my ($rowGenome, $regionString) = split /:/, $rows[$gidx], 2;
            # Only rows for our genome are of interest.
            next if $rowGenome ne $genomeID;
            # Here we're positioned on a row for our genome. If it is
            # a region-restricted molecular machine, then the region
            # string will be defined. If it's global, we use an empty
            # string for the region.
            $regionString ||= "";
            # Create the molecular machine. To do that, we need the variant code
            # for this row.
            my $raw_variant_code = $ssData->get_variant_code($gidx);
            # Check for a leading asterisk. This means the variant assignment is not
            # curated.
            my $curated = ($raw_variant_code =~ /^\s*\*/ ? 0 : 1);
            # Clear any waste from the variant code.
            my $variant_code = BaseSaplingLoader::Starless($raw_variant_code);
            # Create the variant and machine IDs.
            my $variantID = ERDB::DigestKey("$subName:$variant_code");
            my $machineID = ERDB::DigestKey("$subName:$variant_code:$genomeID:$regionString");
            # Create the molecular machine and connect it to the genome and
            # the subsystem variant.
            $self->PutE(MolecularMachine => $machineID,
                        curated => $curated, region => $regionString);
            $self->PutR(IsImplementedBy => $variantID, $machineID);
            $self->PutR(Uses => $genomeID, $machineID);
            # Now we loop through the subsystem's roles, creating the MachineRoles.
            # Molecular machines function as spreadsheet rows; machine roles are
            # spreadsheet cells.
            my @roles = $ssData->get_roles();
            for my $role (@roles) {
                # Get this role's abbreviation.
                my $ridx = $ssData->get_role_index($role);
                my $abbr = $ssData->get_role_abbr($ridx);
                # Create the machine-role ID.
                my $machineRoleID = "$ssMD5:$genomeID:$regionString:$abbr";
                # Create the machine-role and connect it to the role and the
                # machine.
                $self->PutE(MachineRole => $machineRoleID);
                $self->PutR(IsMachineOf => $machineID, $machineRoleID);
                $self->PutR(IsRoleOf => $role, $machineRoleID);
                # Now get a list of the features in this cell.
                my @pegs = $ssData->get_pegs_from_cell($genomeID, $ridx);
                # Connect them to the cell. We need to check the roles,
                # however.
                for my $peg (@pegs) {
                    # Get this PEG's functional assignment.
                    my $function = $fidHash{$peg};
                    # Extract its roles.
                    my ($roles, $errors) = LoaderUtils::RolesForLoading($function);
                    # If one of the roles matches this subsystem role, we
                    # will connect the peg to the cell. Otherwise, we
                    # count it as disconnected.
                    if (defined $roles && grep { $_ eq $role } @$roles) {
                        $self->PutR(Contains => $machineRoleID, $peg);
                    } else {
                        $self->Add(disconnectedPeg => 1);
                    }
                }
            }
        }
        # Clear the subsystem cache to save space.
        $fig->clear_subsystem_cache();
    }
}
402 :    
=head3 CreateClass

    $sl->CreateClass($className);

Emit a SubsystemClass entity record keyed by the specified class name.

=over 4

=item className

Name of the subsystem classification to create.

=back

=cut

sub CreateClass {
    my ($self, $className) = @_;
    # The class name is the only data passed for the record.
    return $self->PutE('SubsystemClass', $className);
}
425 :    
426 :    
427 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3