[Bio] / Sprout / SubsystemSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SubsystemSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.21 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SubsystemSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 : parrello 1.15 use LoaderUtils;
26 : parrello 1.18 use SeedUtils;
27 : parrello 1.1 use base 'BaseSaplingLoader';
28 :    
29 :     =head1 Sapling Subsystem Load Group Class
30 :    
31 :     =head2 Introduction
32 :    
33 :     The Subsystem Load Group includes all of the major subsystem-related tables.
34 :    
35 :     =head3 new
36 :    
37 :     my $sl = SubsystemSaplingLoader->new($erdb, $options, @tables);
38 :    
39 :     Construct a new SubsystemSaplingLoader object.
40 :    
41 :     =over 4
42 :    
43 :     =item erdb
44 :    
45 : parrello 1.8 L<Sapling> object for the database being loaded.
46 : parrello 1.1
47 :     =item options
48 :    
49 :     Reference to a hash of command-line options.
50 :    
51 :     =item tables
52 :    
53 :     List of tables in this load group.
54 :    
55 :     =back
56 :    
57 :     =cut
58 :    
59 :     sub new {
60 :     # Get the parameters.
61 :     my ($class, $erdb, $options) = @_;
62 :     # Create the table list.
63 : parrello 1.3 my @tables = sort qw(Subsystem IsClassFor SubsystemClass IsSuperclassOf Includes
64 : parrello 1.13 Describes Variant IsRoleOf IsImplementedBy MachineRole
65 : parrello 1.8 IsMachineOf MolecularMachine Contains Uses VariantRole);
66 : parrello 1.1 # Create the BaseSaplingLoader object.
67 :     my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
68 :     # Return it.
69 :     return $retVal;
70 :     }
71 :    
72 :     =head2 Public Methods
73 :    
74 :     =head3 Generate
75 :    
76 :     $sl->Generate();
77 :    
78 :     Generate the data for the subsystem-related files.
79 :    
80 :     =cut
81 :    
82 :     sub Generate {
83 :     # Get the parameters.
84 :     my ($self) = @_;
85 :     # Get the database object.
86 :     my $erdb = $self->db();
87 :     # Get the source object.
88 :     my $fig = $self->source();
89 :     # Is this the global section?
90 :     if ($self->global()) {
91 :     # Yes, build the subsystem framework.
92 :     $self->GenerateSubsystems($fig, $erdb);
93 :     } else {
94 :     # Get the section ID.
95 :     my $genomeID = $self->section();
96 :     # Generate the subsystem date for this genome.
97 :     $self->GenerateSubsystemData($fig, $erdb, $genomeID);
98 :     }
99 :     }
100 :    
101 :     =head3 GenerateSubsystems
102 :    
103 :     $sl->GenerateSubsystems($fig, $erdb);
104 :    
105 :     Generate the subsystems, variants, and roles for this database. This
106 :     method concerns itself primarily with the genome-independent part of the
107 :     subsystem framework. This includes the following tables:
108 :    
109 :     Subsystem
110 :     Describes
111 :     Variant
112 :     Includes
113 :     IsClassFor
114 :     SubsystemClass
115 :     IsSuperclassOf
116 :    
117 :     =over 4
118 :    
119 :     =item fig
120 :    
121 :     Source object from which the subsystem data will be extracted.
122 :    
123 :     =item erdb
124 :    
125 :     Database object for the Sapling database.
126 :    
127 :     =back
128 :    
129 :     =cut
130 :    
131 :     sub GenerateSubsystems {
132 :     # Get the parameters.
133 :     my ($self, $fig, $erdb) = @_;
134 :     # Get the subsystem hash for this Sapling instance. Its key list will be
135 :     # the list of subsystems to put in the database.
136 :     my $subHash = $erdb->SubsystemHash();
137 :     # We'll track the various subsystem classes in here.
138 :     my %subClassHash = ();
139 :     # Loop through the subsystems.
140 :     for my $subsystem (keys %$subHash) {
141 : parrello 1.3 Trace("Processing subsystem $subsystem.") if T(ERDBLoadGroup => 3);
142 : parrello 1.1 # Get the FIG subsystem object.
143 :     my $ssData = $fig->get_subsystem($subsystem);
144 : parrello 1.12 # Only proceed if we found it.
145 :     if (! defined $ssData) {
146 :     $self->Add(missingSubsystem => 1);
147 :     Trace("Subsystem $subsystem not found.") if T(ERDBLoadGroup => 1);
148 : parrello 1.14 } elsif ($ssData->{empty_ss}) {
149 :     $self->Add(emptySubsystem => 1);
150 :     Trace("Subsystem $subsystem is empty.") if T(ERDBLoadGroup => 1);
151 : parrello 1.12 } else {
152 :     # These will be set to 1 if the subsystem has the indicated property.
153 :     my $experimental = 0;
154 :     my $clustered = 0;
155 :     # Get the subsystem's classes.
156 :     my $classes = $ssData->get_classification();
157 :     # Only proceed if classes exist.
158 :     if (scalar @$classes) {
159 :     # Check for one of the special roots. If we find it, we shift it off
160 :     # the list.
161 :     if ($classes->[0] =~ /Clustering/) {
162 :     $clustered = 1;
163 :     shift @$classes;
164 :     } elsif ($classes->[0] =~ /Experimental/) {
165 :     $experimental = 1;
166 :     shift @$classes;
167 :     }
168 :     # Loop through the remaining classes from the bottom up.
169 :     my $class = pop @$classes;
170 : parrello 1.17 if ($class) {
171 : parrello 1.12 # Create the class record.
172 :     $self->CreateClass($class);
173 :     # Connect it to the subsystem.
174 :     $self->PutR(IsClassFor => $class, $subsystem);
175 :     # Is this a new class?
176 :     if (! $subClassHash{$class}) {
177 :     # Yes. We need to put it in its hierarchy.
178 :     while (my $newClass = pop @$classes) {
179 :     # Create the new class's record.
180 :     $self->CreateClass($newClass);
181 :     # Put it above the previous class.
182 :     $self->PutR(IsSuperclassOf => $newClass, $class);
183 :     # Insure we know we're done with this class.
184 :     $subClassHash{$class} = 1;
185 :     # Prepare for the next class.
186 :     $class = $newClass;
187 :     }
188 :     }
189 :     }
190 :     }
191 :     # Get the subsystem properties.
192 :     my $curator = $ssData->get_curator();
193 :     my $description = $ssData->get_description();
194 :     my $notes = $ssData->get_notes();
195 :     my $version = $ssData->get_version();
196 :     my $usable = ($fig->is_experimental_subsystem($subsystem) ? 0 : 1);
197 :     my $private = ($fig->is_exchangable_subsystem($subsystem) ? 0 : 1);
198 :     # Fix the curator.
199 : parrello 1.16 if (! defined $curator) {
200 :     $curator = "unknown";
201 :     } else {
202 :     $curator =~ s/^master://;
203 :     }
204 : parrello 1.12 # Ensure we have a description.
205 :     if (! defined $description) {
206 :     $description = '';
207 : parrello 1.9 }
208 : parrello 1.12 # Emit the subsystem record.
209 :     $self->PutE(Subsystem => $subsystem, curator => $curator,
210 :     description => $description, notes => $notes,
211 :     version => $version, usable => $usable,
212 :     private => $private, cluster_based => $clustered,
213 :     experimental => $experimental);
214 :     # Get this subsystem's roles.
215 :     my @roles = $ssData->get_roles();
216 :     # This will track the column number for the role.
217 :     my $col = 0;
218 :     # Loop through the roles.
219 :     for my $role (@roles) {
220 :     # Check to see if this role is main or auxiliary.
221 :     my $auxFlag = ($fig->is_aux_role_in_subsystem($subsystem, $role) ? 1 : 0);
222 : parrello 1.9 # Connect it to the subsystem.
223 : parrello 1.12 $self->PutR(Includes => $subsystem, $role,
224 :     abbreviation => $ssData->get_abbr_for_role($role),
225 :     sequence => $col++, auxiliary => $auxFlag);
226 :     }
227 :     # Next come the variants. Variant data is sparse in the SEED. We
228 :     # start by getting all the known variant codes.
229 :     my %variants = map { BaseSaplingLoader::Starless($_) => '' } $ssData->get_variant_codes();
230 :     # -1 and 0 are always present.
231 :     $variants{'0'} = 'Subsystem functionality is incomplete.';
232 :     $variants{'-1'} = 'Subsystem is not functional.';
233 :     # Now get notes from any variants that have them. Note that we need
234 :     # to clean up the variant code with a call to Starless.
235 :     my $variantHash = $ssData->get_variants();
236 :     for my $variant (keys %$variantHash) {
237 :     my $realVariantID = BaseSaplingLoader::Starless($variant);
238 :     $variants{$realVariantID} = $variantHash->{$variant};
239 :     }
240 :     # Next we need to compute the role rules. For each genome in the subsystem,
241 :     # we compute its variant code and a list of its roles. These are put
242 :     # into the following two-dimensional hash. Each inner hash maps a role
243 :     # rule list to 1. The keys of the inner hash become the role rules.
244 :     my %roleRuleHash = map { $_ => {} } keys %variants;
245 :     # Loop through the list of genomes.
246 :     my @genomes = $ssData->get_genomes();
247 :     for (my $i = 0; $i < scalar(@genomes); $i++) {
248 :     # Get this genome's variant code.
249 :     my $variantCode = BaseSaplingLoader::Starless($ssData->get_variant_code($i));
250 :     # Get its roles.
251 :     my @roles = $ssData->get_roles_for_genome($genomes[$i]);
252 :     # Convert them to a role rule.
253 :     my $rule = join(" ", sort map { $ssData->get_abbr_for_role($_) } @roles);
254 :     # Put the role in the hash.
255 :     $roleRuleHash{$variantCode}{$rule} = 1;
256 :     }
257 :     # Create the variants.
258 :     for my $variant (keys %variants) {
259 :     # The variant key is the subsystem ID plus the variant code.
260 :     my $variantID = ERDB::DigestKey("$subsystem:$variant");
261 :     # Compute its type.
262 :     my $variant_type = ($variant =~ /^0/ ? 'incomplete' :
263 :     $variant =~ /^-/ ? 'vacant' : 'normal');
264 :     # The comment is easily computed from the variant data, so
265 :     # we now have enough data to output the variant record.
266 :     $self->PutE(Variant => $variantID, type => $variant_type,
267 :     code => $variant, comment => $variants{$variant});
268 :     # Now output the role rules.
269 :     for my $rule (keys %{$roleRuleHash{$variant}}) {
270 :     $self->PutE(VariantRole => $variantID, role_rule => $rule);
271 : parrello 1.9 }
272 : parrello 1.12 # Link the subsystem to the variant.
273 :     $self->PutR(Describes => $subsystem, $variantID);
274 : parrello 1.5 }
275 : parrello 1.12 # Clear the subsystem cache to keep memory under control.
276 :     $fig->clear_subsystem_cache();
277 : parrello 1.5 }
278 : parrello 1.1 }
279 :     }
280 :    
281 :     =head3 GenerateSubsystemData
282 :    
283 :     $sl->GenerateSubsystemData($fig, $erdb, $genomeID);
284 :    
285 :     Generate the molecular machines and subsystem spreadsheet cells for this
286 :     database. This method concerns itself primarily with the genome-dependent
287 :     part of the subsystem framework. This includes the following tables.
288 :    
289 :     IsImplementedBy
290 :     MolecularMachine
291 :     IsMachineOf
292 :     MachineRole
293 :     Uses
294 : parrello 1.9 Contains
295 : parrello 1.1 IsRoleOf
296 :    
297 :     =over 4
298 :    
299 :     =item fig
300 :    
301 :     Source object from which the subsystem data will be extracted.
302 :    
303 :     =item erdb
304 :    
305 :     Database object for the Sapling database.
306 :    
307 :     =item genomeID
308 :    
309 :     ID of the relevant genome.
310 :    
311 :     =back
312 :    
313 :     =cut
314 :    
315 :     sub GenerateSubsystemData {
316 :     # Get the parameters.
317 :     my ($self, $fig, $erdb, $genomeID) = @_;
318 :     # Get the subsystem hash for this Sapling instance. Its key list will be
319 :     # the list of subsystems being put in the database.
320 :     my $subHash = $erdb->SubsystemHash();
321 :     # Get the list of subsystems for this genome. The "1" indicates we want
322 : parrello 1.5 # all of them, including the ones for 0 and -1 variants. Note we have
323 :     # to normalize the subsystem names.
324 :     my @subName = map { $erdb->SubsystemID($_) } $fig->subsystems_for_genome($genomeID, 1);
325 : parrello 1.15 # Get the functional assignments for the features in this genome. We'll need
326 :     # this later when we're connecting them to subsystem cells.
327 :     my %fidHash = map { $_->[0] => $_->[1] } @{$fig->get_genome_assignment_data($genomeID)};
328 : parrello 1.1 # Loop through the named subsystems. Each one corresponds to a molecular
329 :     # machine.
330 : parrello 1.5 for my $subName (grep { exists $subHash->{$_} } @subName) {
331 : parrello 1.1 $self->Track(MolecularMachines => $subName, 100);
332 : parrello 1.7 # Compute the MD5 hash of the subsystem ID.
333 :     my $ssMD5 = ERDB::DigestKey($subName);
334 : parrello 1.1 # Get the subsystem object.
335 :     my $ssData = $fig->get_subsystem($subName);
336 : parrello 1.19 if (! defined $ssData) {
337 : parrello 1.20 Trace("Subsystem $subName has been deleted.") if T(ERDBLoadGroup => 2);
338 : parrello 1.19 $self->Add(missingSubsystem => 1);
339 :     } else {
340 :     # Now we find the molecular machines for this subsystem/genome pair.
341 :     my @rows = $ssData->get_genomes();
342 :     for (my $gidx = 0; $gidx <= $#rows; $gidx++) {
343 : parrello 1.21 my ($rowGenome, $regionString) = split m/:/, $rows[$gidx], 2;
344 : parrello 1.19 if ($rowGenome eq $genomeID) {
345 :     # Here we're positioned on a row for our genome. If it is
346 :     # a region-restricted molecular machine, then the region
347 :     # string will be defined. If it's global, we use an empty
348 :     # string for the region.
349 :     $regionString ||= "";
350 :     # Create the molecular machine. To do that, we need the variant code
351 :     # for this genome.
352 :     my $raw_variant_code = $ssData->get_variant_code($gidx);
353 :     # Check for a leading asterisk. This means the variant assignment is not
354 :     # curated.
355 :     my $curated = ($raw_variant_code =~ /^\s*\*/ ? 0 : 1);
356 :     # Clear any waste from the variant code.
357 :     my $variant_code = BaseSaplingLoader::Starless($raw_variant_code);
358 :     # Create the variant and machine IDs.
359 :     my $variantID = ERDB::DigestKey("$subName:$variant_code");
360 :     my $machineID = ERDB::DigestKey("$subName:$variant_code:$genomeID:$regionString");
361 :     # Create the molecular machine and connect it to the genome and
362 :     # subsystem.
363 :     $self->PutE(MolecularMachine => $machineID,
364 :     curated => $curated, region => $regionString);
365 :     $self->PutR(IsImplementedBy => $variantID, $machineID);
366 :     $self->PutR(Uses => $genomeID, $machineID);
367 :     # Now we loop through the subsystem's roles, creating the MachineRoles.
368 :     # Molecular machines function as spreadsheet rows; machine roles are
369 :     # spreadsheet cells.
370 :     my @roles = $ssData->get_roles();
371 :     for my $role (@roles) {
372 :     # Get this role's abbreviation.
373 :     my $ridx = $ssData->get_role_index($role);
374 :     my $abbr = $ssData->get_role_abbr($ridx);
375 :     # Create the machine-role ID.
376 :     my $machineRoleID = "$ssMD5:$genomeID:$regionString:$abbr";
377 :     # Create the machine-role and connect it to the role and the
378 :     # machine.
379 :     $self->PutE(MachineRole => $machineRoleID);
380 :     $self->PutR(IsMachineOf => $machineID, $machineRoleID);
381 :     $self->PutR(IsRoleOf => $role, $machineRoleID);
382 :     # Now get a list of the features in this cell.
383 :     my @pegs = $ssData->get_pegs_from_cell($genomeID, $ridx);
384 :     # Connect them to the cell. We need to check the roles,
385 :     # however.
386 :     for my $peg (@pegs) {
387 :     # Get this PEG's functional assignment.
388 :     my $function = $fidHash{$peg};
389 :     # Extract its roles.
390 :     my ($roles, $errors) = SeedUtils::roles_for_loading($function);
391 :     # If one of the roles matches this subsystem role, we
392 :     # will connect the peg to the cell. Otherwise, we
393 :     # count it as disconnected.
394 :     if (defined $roles && grep { $_ eq $role } @$roles) {
395 :     $self->PutR(Contains => $machineRoleID, $peg);
396 :     } else {
397 :     $self->Add(disconnectedPeg => 1);
398 :     }
399 : parrello 1.15 }
400 : parrello 1.3 }
401 :     }
402 : parrello 1.1 }
403 :     }
404 : parrello 1.4 # Clear the subsystem cache to save space.
405 :     $fig->clear_subsystem_cache();
406 : parrello 1.1 }
407 :     }
408 :    
409 :     =head3 CreateClass
410 :    
411 :     $sl->CreateClass($className);
412 :    
413 :     Create a SubsystemClass record with the specified class name.
414 :    
415 :     =over 4
416 :    
417 :     =item className
418 :    
419 :     Name of the subsystem classification to create.
420 :    
421 :     =back
422 :    
423 :     =cut
424 :    
425 :     sub CreateClass {
426 :     # Get the parameters.
427 :     my ($self, $className) = @_;
428 :     # Create the subsystem class record.
429 :     $self->PutE(SubsystemClass => $className);
430 :     }
431 :    
432 :    
433 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3