[Bio] / Sprout / SaplingSubsystemLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingSubsystemLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingSubsystemLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 :     use base qw(SaplingDataLoader);
29 :    
30 :     =head1 Sapling Subsystem Loader
31 :    
32 :     This class loads Subsystem data into a Sapling database from a subsystem directory.
33 :     Unlike L<SaplingGenomeLoader>, this version is designed for updating a populated
34 :     database only. Links to features and genomes are put in, but not the features and
35 :     genomes themselves, which may lead to orphan links.
36 :    
37 :     =head2 Main Methods
38 :    
39 :     =head3 Load
40 :    
41 :     my $stats = SaplingSubsystemLoader::Load($sap, $subsystem, $directory);
42 :    
43 :     Load a subsystem from a subsystem directory into the sapling database.
44 :    
45 :     =over 4
46 :    
47 :     =item sap
48 :    
49 :     L<Sapling> object used to access the target database.
50 :    
51 :     =item subsystem
52 :    
53 :     ID of the subsystem being loaded.
54 :    
55 :     =item directory
56 :    
57 :     Name of the directory containing the subsystem information.
58 :    
59 :     =back
60 :    
61 :     =cut
62 :    
63 :     sub Load {
64 :     # Get the parameters.
65 :     my ($sap, $subsystem, $directory) = @_;
66 :     # Create the loader object.
67 :     my $loaderObject = SaplingSubsystemLoader->new($sap, $subsystem, $directory);
68 : parrello 1.2 # Create the subsystem record.
69 :     $loaderObject->CreateSubsystem();
70 :     # Read the spreadsheet file.
71 :     $loaderObject->ParseSpreadsheet();
72 : parrello 1.1 ##TODO load subsystem
73 :     # Return the statistics.
74 :     return $loaderObject->{stats};
75 :     }
76 :    
77 :     =head3 ClearSubsystem
78 :    
79 :     my $stats = SaplingSubsystemLoader::ClearSubsystem($sap, $subsystem);
80 :    
81 :     Delete the specified subsystem and all the related records from the specified sapling
82 :     database. This method can also be used to clean up after a failed or aborted load.
83 :    
84 :     =over 4
85 :    
86 :     =item sap
87 :    
88 :     L<Sapling> object used to access the target database.
89 :    
90 :     =item subsystem
91 :    
92 :     ID of the subsystem to delete.
93 :    
94 :     =item RETURN
95 :    
96 :     Returns a statistics object counting the records deleted.
97 :    
98 :     =back
99 :    
100 :     =cut
101 :    
102 :     sub ClearSubsystem {
103 :     # Get the parameters.
104 :     my ($sap, $subsystem) = @_;
105 :     # Create the statistics object.
106 :     my $stats = Stats->new();
107 : parrello 1.2 # Delete the subsystem and all its associated records.
108 :     $stats = $sap->Delete(Subsystem => $subsystem);
109 : parrello 1.1 # Return the statistics object.
110 :     return $stats;
111 :     }
112 :    
113 :     =head2 Loader Object Methods
114 :    
115 :     =head3 new
116 :    
117 :     my $loaderObject = SaplingSubsystemLoader->new($sap, $subsystem, $directory);
118 :    
119 :     Create a loader object that can be used to facilitate loading Sapling data from a
120 :     subsystem directory.
121 :    
122 :     =over 4
123 :    
124 :     =item sap
125 :    
126 :     L<Sapling> object used to access the target database.
127 :    
128 :     =item subsystem
129 :    
130 :     ID of the subsystem being loaded.
131 :    
132 :     =item directory
133 :    
134 :     Name of the directory containing the subsystem data.
135 :    
136 :     =back
137 :    
138 :     The object created contains the following fields.
139 :    
140 :     =over 4
141 :    
142 :     =item supportRecords
143 :    
144 :     A hash of hashes, used to track the support records known to exist in the database.
145 :    
146 :     =item sap
147 :    
148 :     L<Sapling> object used to access the database.
149 :    
150 :     =item stats
151 :    
152 :     L<Stats> object for tracking statistical information about the load.
153 :    
154 :     =item subsystem
155 :    
156 :     ID of the subsystem being loaded.
157 :    
158 :     =item directory
159 :    
160 :     Name of the directory containing the subsystem data.
161 :    
162 : parrello 1.2 =item roleList
163 :    
164 :     Reference to a list of roles abbreviations, in order.
165 :    
166 :     =item roleHash
167 :    
168 :     Reference to a hash mapping each role abbreviation to the association role ID.
169 :    
170 :     =item variants
171 :    
172 :     Hash mapping variant codes to descriptions.
173 :    
174 : parrello 1.1 =back
175 :    
176 :     =cut
177 :    
178 :     sub new {
179 :     # Get the parameters.
180 :     my ($class, $sap, $subsystem, $directory) = @_;
181 :     # Create the object.
182 :     my $retVal = SaplingDataLoader::new($class, $sap, qw(roles));
183 :     # Add our specialized data.
184 :     $retVal->{subsystem} = $subsystem;
185 :     $retVal->{directory} = $directory;
186 : parrello 1.2 $retVal->{variants} = {};
187 : parrello 1.1 # Return the result.
188 :     return $retVal;
189 :     }
190 :    
191 : parrello 1.2 =head3 CreateSubsystem
192 :    
193 :     $loaderObject->CreateSubsystem();
194 :    
195 :     Create the root record for this subsystem and connect it to the classifications. This
196 :     method also reads in the variant descriptions (if any);
197 :    
198 :     =cut
199 :    
200 :     sub CreateSubsystem {
201 :     # Get the parameters.
202 :     my ($self) = @_;
203 :     # Get the subsystem directory.
204 :     my $directory = $self->{directory};
205 :     # Get the Sapling database.
206 :     my $sap = $self->{sap};
207 :     # Read the classification information.
208 :     my @classes;
209 :     my $classFile = "$directory/CLASSIFICATION";
210 :     if (-f $classFile) {
211 :     my $ih = Open(undef, "<$classFile");
212 :     @classes = grep { $_ } Tracer::GetLine($ih);
213 :     }
214 :     # Loop through the classes from bottom to top, insuring we have them linked up
215 :     # in the database.
216 :     my $lastClass;
217 :     if (@classes) {
218 :     # Insure the lowest-level class is present.
219 :     my $i = $#classes;
220 :     $lastClass = $classes[$i];
221 :     my $createdFlag = $self->InsureEntity(SubsystemClass => $lastClass);
222 :     # Work up through the other classes until we find one already present or hit the top.
223 :     my $thisClass = $lastClass;
224 :     while ($createdFlag && $i > 1) {
225 :     # Connect to the next class up.
226 :     $i--;
227 :     my $nextClass = $classes[$i];
228 :     $sap->InsertObject('IsSuperClassOf', from_link => $nextClass, to_link => $thisClass);
229 :     # Insure the next class is in the database.
230 :     $createdFlag = $self->InsureEntity(SubsystemClass => $nextClass);
231 :     }
232 :     }
233 :     # Get the top class, if any. We use this to do some typing.
234 :     my $topClass = $classes[0] || ' ';
235 :     # Compute the class-related subsystem types.
236 :     my $clusterBased = ($topClass =~ /clustering-based/i ? 1 : 0);
237 :     my $experimental = ($topClass =~ /experimental/i ? 1 : 0);
238 :     my $usable = ! $experimental;
239 :     # Check for the privacy flag.
240 :     my $private = (-f "$directory/EXCHANGABLE" ? 0 : 1);
241 :     # Get the version.
242 :     my $version = "0";
243 :     my $versionFile = "$directory/VERSION";
244 :     if (-f $versionFile) {
245 :     ($version) = Tracer::GetFile($versionFile);
246 :     }
247 :     # Get the curator. This involves finding the start line in the curator log.
248 :     my $curator = "fig";
249 :     my $curatorFile = "$directory/curation.log";
250 :     if (-f $curatorFile) {
251 :     my $ih = Open(undef, "<$curatorFile");
252 :     while ($curator eq "fig" && ! eof $ih) {
253 :     my $line = <$ih>;
254 :     if ($line =~ /^\d+\t(\S+)\s+started/) {
255 :     $curator = $1;
256 :     $curator =~ s/^master://;
257 :     }
258 :     }
259 :     }
260 :     # Finally, we need to get the notes and description from the notes file.
261 :     my ($description, $notes) = ("", "");
262 :     my $notesFile = "$directory/notes";
263 :     if (-f $notesFile) {
264 :     my $ih = Open(undef, "<$notesFile");
265 :     my $notesHash = ParseNotesFile($ih);
266 :     if (exists $notesHash->{description}) {
267 :     $description = $notesHash->{description};
268 :     }
269 :     if (exists $notesHash->{notes}) {
270 :     $notes = $notesHash->{notes};
271 :     }
272 :     # Stash the variant information for later.
273 :     if (exists $notesHash->{variants}) {
274 :     # We need to create a hash of variant data.
275 :     my %varHash;
276 :     # Get the individual lines of the variant line.
277 :     my @varLines = split /\n/, $notesHash->{variants};
278 :     for my $varLine (@varLines) {
279 :     # Split this line around the tab.
280 :     my ($code, $comment) = split /\t/, $varLine;
281 :     # Only proceed if the code is nonempty.
282 :     if (defined $code && $code ne '') {
283 :     # Trim excess spaces from the code.
284 :     $code =~ s/\s+//g;
285 :     # Store the comment.
286 :     $varHash{$code} = $comment;
287 :     }
288 :     }
289 : parrello 1.4 $self->{variants} = \%varHash;
290 : parrello 1.2 }
291 :     }
292 :     # Create the subsystem record.
293 :     $sap->InsertObject('Subsystem', id => $self->{subsystem}, cluster_based => $clusterBased,
294 :     curator => $curator, description => $description, experimental => $experimental,
295 :     notes => $notes, private => $private, usable => $usable, version => $version);
296 :     # If there is a classification for it, connect it.
297 :     if ($lastClass) {
298 :     $sap->InsertObject('IsClassFor', from_link => $lastClass, to_link => $self->{subsystem});
299 :     }
300 :     }
301 :    
302 :    
303 :     =head3 ParseSpreadsheet
304 :    
305 :     $loaderObject->ParseSpreadsheet();
306 :    
307 :     Read and parse the spreadsheet file. This creates the roles, the molecular machines, and fills
308 :     in the variant table.
309 :    
310 :     =cut
311 :    
312 :     use constant VARIANT_TYPES => { '-1' => 'vacant', '0' => 'incomplete'};
313 :    
314 :     sub ParseSpreadsheet {
315 :     # Get the parameters.
316 :     my ($self) = @_;
317 :     # Get the variant hash.
318 :     my $varHash = $self->{variants};
319 :     # Get the sapling database.
320 :     my $sap = $self->{sap};
321 : parrello 1.4 # Get the statistics object.
322 :     my $stats = $self->{stats};
323 : parrello 1.2 # Get the subsystem ID.
324 :     my $subsystem = $self->{subsystem};
325 :     # Compute its MD5 for the machine role IDs.
326 :     my $ssMD5 = ERDB::DigestKey($subsystem);
327 :     # Insure the default variants are present.
328 :     if (! exists $varHash->{'0'}) {
329 :     $varHash->{'0'} = 'Subsystem functionality is incomplete.';
330 :     }
331 :     if (! exists $varHash->{'-1'}) {
332 :     $varHash->{'-1'} = 'Subsystem is not functional.';
333 :     }
334 :     # Open the spreadsheet file.
335 :     my $ih = Open(undef, "<$self->{directory}/spreadsheet");
336 :     my (@roleList, %roleHash);
337 :     # Loop through the roles.
338 :     my $done = 0;
339 :     while (! eof $ih && ! $done) {
340 :     my ($abbr, $role) = Tracer::GetLine($ih);
341 :     # Is this an end marker?
342 :     if ($abbr eq '//') {
343 :     # Yes. Stop the loop.
344 :     $done = 1;
345 :     } elsif ($abbr) {
346 :     # No, store the role.
347 :     push @roleList, $abbr;
348 :     $roleHash{$abbr} = $role;
349 :     }
350 :     }
351 :     # The next section is the subsets. All we care about here are the auxiliary roles.
352 :     my %auxHash;
353 :     $done = 0;
354 :     while (! eof $ih && ! $done) {
355 :     my ($subset, @idxes) = Tracer::GetLine($ih);
356 :     # Is this an end marker?
357 :     if ($subset eq '//') {
358 :     # Yes. Stop the loop.
359 :     $done = 1;
360 :     } elsif ($subset =~ /^aux/) {
361 :     # Here we have an auxiliary subset. Mark its roles in the auxiliary-role hash.
362 :     for my $idx (@idxes) {
363 :     $auxHash{$roleList[$idx - 1]} = 1;
364 :     }
365 :     }
366 :     }
367 :     # We now have enough information to generate the role tables.
368 :     my $col = 0;
369 :     for my $abbr (@roleList) {
370 :     # Get the role ID.
371 :     my $roleID = $roleHash{$abbr};
372 :     # Determine if it's hypothetical.
373 :     my $hypo = (hypo($roleID) ? 1 : 0);
374 :     # Insure it's in the database.
375 :     $self->InsureEntity(Role => $roleID, hypothetical => $hypo, role_index => -1);
376 :     # Connect it to the subsystem
377 :     $sap->InsertObject('Includes', from_link => $subsystem, to_link => $roleID,
378 :     abbreviation => $abbr, auxiliary => ($auxHash{$abbr} ? 1 : 0),
379 :     sequence => $col++);
380 : parrello 1.4 $stats->Add(roles => 1);
381 : parrello 1.2 }
382 :     # The final section is the role table itself. Here we get the rest of the variant data, as well.
383 :     my %varsAdded;
384 : parrello 1.3 $done = 0;
385 : parrello 1.2 while (! eof $ih && ! $done) {
386 : parrello 1.4 my ($genome, $variant, @cells) = Tracer::GetLine($ih);
387 : parrello 1.2 # Is this the end marker?
388 :     if ($genome eq '//') {
389 :     # Yes. Stop the loop.
390 :     $done = 1;
391 :     } elsif ($genome) {
392 :     # Compute the true variant code and the curation flag.
393 :     my $curated = ($variant =~ /^\s*\*/ ? 0 : 1);
394 :     my $realVariant = Starless($variant);
395 :     # Check for a region string.
396 :     my ($genomeID, $regionString) = split /:/, $genome;
397 :     $regionString ||= "";
398 :     # Compute the variant and molecular machine IDs.
399 :     my $variantID = ERDB::DigestKey("$subsystem:$realVariant");
400 :     my $machineID = ERDB::DigestKey("$subsystem:$realVariant:$genomeID:$regionString");
401 :     # Insure we have the variant in the database.
402 :     if (! exists $varsAdded{$variantID}) {
403 :     # Denote the variant is in this subsystem.
404 : parrello 1.4 $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantID);
405 : parrello 1.2 # Create the variant record. For now, the role-rule is kept empty. We'll add the
406 :     # rules later as we find them.
407 :     $sap->InsertObject('Variant', id => $variantID, code => $realVariant,
408 :     comment => ($varHash->{comment} || ''),
409 :     type => (VARIANT_TYPES->{$realVariant} || ''));
410 :     # Denote we've added this variant.
411 :     $varsAdded{$variantID} = {};
412 : parrello 1.4 $stats->Add(variants => 1);
413 : parrello 1.2 }
414 :     # Create the molecular machine.
415 :     $sap->InsertObject('IsImplementedBy', from_link => $variantID, to_link => $machineID);
416 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => $curated,
417 :     region => $regionString);
418 :     # Now loop through the cells.
419 :     my @rolesFound;
420 :     for (my $i = 0; $i <= $#cells; $i++) {
421 :     my $cell = $cells[$i];
422 :     # Is this cell occupied?
423 :     if ($cell) {
424 :     # Yes. Get this cell's role abbreviation and add it to the list of roles found
425 :     # in this row.
426 :     my $abbr = $roleList[$i];
427 :     push @rolesFound, $abbr;
428 :     # Create the machine role.
429 :     my $machineRoleID = "$ssMD5:$genomeID:$regionString:$abbr";
430 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
431 :     $sap->InsertObject('MachineRole', id => $machineRoleID);
432 :     $sap->InsertObject('IsRoleOf', from_link => $roleHash{$abbr},
433 :     to_link => $machineRoleID);
434 :     # Connect the pegs in this cell to it.
435 :     for my $pegN (split /\s*,\s*/, $cell) {
436 :     $sap->InsertObject('Contains', from_link => $machineRoleID,
437 :     to_link => "fig|$genomeID.peg.$pegN");
438 :     }
439 :     }
440 :     }
441 :     # Compute a role rule from this row's roles and associate it with this variant.
442 :     my $roleRule = join(" ", @rolesFound);
443 :     $varsAdded{$variantID}->{$roleRule} = 1;
444 :     }
445 :     }
446 :     # We've finished the spreadsheet. Now we go back and add the role rules to the variants.
447 :     for my $variantID (keys %varsAdded) {
448 :     my $ruleHash = $varsAdded{$variantID};
449 :     for my $roleRule (sort keys %$ruleHash) {
450 : parrello 1.4 $sap->InsertValue($variantID, 'Variant(role-rule)', $roleRule);
451 : parrello 1.2 }
452 :     }
453 :     }
454 :    
455 : parrello 1.1 =head2 Internal Utility Methods
456 :    
457 : parrello 1.2 =head3 ParseNotesFile
458 :    
459 :     my $notesHash = SaplingSubsystemLoader::ParseNotesFile($ih);
460 :    
461 :     Read and parse the notes file from the specified file handle. The sections of the file will be
462 :     returned in a hash, keyed by section name.
463 :    
464 :     =over 4
465 :    
466 :     =item ih
467 :    
468 :     Open handle for the notes file.
469 :    
470 :     =item RETURN
471 :    
472 :     Returns a reference to a hash keyed by section name, mapping each name to the text of that section.
473 :    
474 : parrello 1.1 =cut
475 :    
476 : parrello 1.2 sub ParseNotesFile {
477 :     # Get the parameters.
478 :     my ($ih) = @_;
479 :     # Create the return hash.
480 :     my $retVal = {};
481 :     # Anything before the first separator will be classified as "notes".
482 :     my ($section, @text) = ('notes');
483 :     # Loop through the lines of the file.
484 :     while (! eof $ih) {
485 : parrello 1.4 my $line = <$ih>;
486 :     chomp $ih;
487 :     if ($line =~ /^#####/) {
488 : parrello 1.2 # Here we have the start of a new section. If there's an old section,
489 :     #put it in the output hash.
490 :     if (@text) {
491 :     $retVal->{$section} = join("\n", @text);
492 :     }
493 :     # Is there another section?
494 :     if (! eof $ih) {
495 :     # Yes. Save the new section name and clear the text array.
496 : parrello 1.4 my $sectionLine = <$ih>;
497 :     $sectionLine =~ /^(\S+)/;
498 :     $section = lc $1;
499 : parrello 1.2 undef @text;
500 :     }
501 :     } else {
502 :     # Here we have an ordinary text line.
503 :     push @text, $line;
504 :     }
505 :     }
506 :     # Write out the last section (if any).
507 :     if (@text) {
508 :     $retVal->{$section} = join("\n", @text);
509 :     }
510 :     # Return the result hash.
511 :     return $retVal;
512 :     }
513 :    
514 :     =head3 Starless
515 :    
516 :     my $adjusted = SaplingSubsystemLoader::Starless($codeString);
517 :    
518 :     Remove any spaces and leading or trailing asterisks from the incoming string and
519 :     return the result.
520 :    
521 :     =over 4
522 :    
523 :     =item codeString
524 :    
525 :     Input string that needs to have the asterisks trimmed.
526 :    
527 :     =item RETURN
528 :    
529 :     Returns the incoming string with spaces and leading and trailing asterisks
530 :     removed.
531 :    
532 :     =back
533 :    
534 :     =cut
535 :    
536 :     sub Starless {
537 :     # Get the parameters.
538 :     my ($codeString) = @_;
539 :     # Declare the return variable.
540 :     my $retVal = $codeString;
541 :     # Remove the spaces.
542 :     $retVal =~ s/\s+//g;
543 :     # Trim the asterisks.
544 :     $retVal =~ s/^\*+//;
545 :     $retVal =~ s/\*+$//;
546 :     # Return the result.
547 :     return $retVal;
548 :     }
549 :    
550 :    
551 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3