[Bio] / Sprout / NewStuffCheck.pl Repository:
ViewVC logotype

Annotation of /Sprout/NewStuffCheck.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     =head1 New Stuff Checker
4 :    
5 :     This script compares the genomes, features, and annotations in
6 :     the old and new sprouts and lists the differences.
7 :    
8 :     The currently-supported command-line options are as follows.
9 :    
10 :     =over 4
11 :    
12 : parrello 1.2 =item summary
13 :    
14 :     Do not display details, only difference summaries.
15 :    
16 : parrello 1.8 =item nofeats
17 :    
18 :     Do not process features.
19 :    
20 : parrello 1.1 =item user
21 :    
22 :     Name suffix to be used for log files. If omitted, the PID is used.
23 :    
24 :     =item trace
25 :    
26 :     Numeric trace level. A higher trace level causes more messages to appear. The
27 :     default trace level is 2. Tracing will be directly to the standard output
28 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
29 :     where I<User> is the value of the B<user> option above.
30 :    
31 :     =item sql
32 :    
33 :     If specified, turns on tracing of SQL activity.
34 :    
35 :     =item background
36 :    
37 :     Save the standard and error output to files. The files will be created
38 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
39 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
40 :     B<user> option above.
41 :    
42 :     =item h
43 :    
44 :     Display this command's parameters and options.
45 :    
46 :     =item phone
47 :    
48 :     Phone number to message when the script is complete.
49 :    
50 :     =back
51 :    
52 :     =cut
53 :    
54 :     use strict;
55 :     use Tracer;
56 :     use DocUtils;
57 :     use TestUtils;
58 :     use Cwd;
59 :     use File::Copy;
60 :     use File::Path;
61 :     use FIG;
62 :     use SFXlate;
63 :     use Sprout;
64 :    
65 :     # Get the command-line options and parameters.
66 :     my ($options, @parameters) = StandardSetup([qw(Sprout) ],
67 :     {
68 : parrello 1.8 nofeats => ["", "if specified, only genome changes will be displayed; otherwise, genome features will be compared and differences shown"],
69 : parrello 1.5 trace => ["2-", "tracing level; use a minus to prevent tracing to standard output"],
70 : parrello 1.2 summary => ["", "if specified, detailed lists of the different items will not be displayed"],
71 : parrello 1.1 phone => ["", "phone number (international format) to call when load finishes"],
72 :     },
73 :     "",
74 :     @ARGV);
75 :     # Set a variable to contain return type information.
76 :     my $rtype;
77 :     # Insure we catch errors.
78 :     eval {
79 : parrello 1.2 Trace("Processing genomes.") if T(2);
80 : parrello 1.8 # Get the current SEED.
81 :     my $fig = FIG->new();
82 : parrello 1.1 # Get the old Sprout.
83 :     my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);
84 :     # Get its genomes in alphabetical order.
85 : parrello 1.8 my @oldGenomes = GetGenomes($oldSprout);
86 : parrello 1.1 # Get the new Sprout.
87 :     my $newSprout = SFXlate->new_sprout_only();
88 :     # Get its genomes in alphabetical order.
89 : parrello 1.8 my @newGenomes = GetGenomes($newSprout);
90 : parrello 1.7 # Compare the two genomes lists.
91 : parrello 1.1 my ($insertedGenomes, $deletedGenomes) = Tracer::CompareLists(\@newGenomes, \@oldGenomes);
92 : parrello 1.7 # Add feature counts to the new genomes.
93 :     for my $insertedGenome (@{$insertedGenomes}) {
94 :     my $genomeID = $insertedGenome->[0];
95 :     # For a new genome, display the feature count.
96 :     my $count = $newSprout->GetCount(['HasFeature'], "HasFeature(from-link) = ?",
97 :     [$genomeID]);
98 :     my $suffix = ($count == 1 ? " one feature" : "$count features");
99 :     $insertedGenome->[1] .= "($suffix)";
100 :     }
101 : parrello 1.8 # Add information about SEED status to the deleted genomes.
102 :     for my $deletedGenome (@{$deletedGenomes}) {
103 :     my $genomeID = $deletedGenome->[0];
104 :     if ($fig->is_genome($genomeID)) {
105 :     my $complete = ($fig->is_complete($genomeID) ? "complete" : "incomplete");
106 : parrello 1.9 $deletedGenome->[1] .= "(still in SEED, $complete)";
107 : parrello 1.8 }
108 :     }
109 : parrello 1.1 # Display the lists.
110 : parrello 1.2 ShowLists(! $options->{summary},
111 :     'New Genomes' => $insertedGenomes,
112 :     'Deleted Genomes' => $deletedGenomes);
113 : parrello 1.8 # Now the groups.
114 :     Trace("Comparing groups.") if T(2);
115 :     my %oldGroups = $oldSprout->GetGroups();
116 :     my %newGroups = $newSprout->GetGroups();
117 :     # Loop through the new groups.
118 :     for my $newGroup (sort keys %newGroups) {
119 :     Trace("Processing group $newGroup.") if T(3);
120 :     # Find out if this group is new to this version.
121 :     if (! exists $oldGroups{$newGroup}) {
122 :     # Construct a list of this group's genes.
123 :     my @groupGenomes = NameGenomes($newSprout, $newGroups{$newGroup});
124 :     ShowLists(! $options->{summary}, "Genomes in new group $newGroup" => \@groupGenomes);
125 :     } else {
126 :     # Here the group is in both versions. Fix the lists and compare them.
127 :     my @newGroupList = NameGenomes($newSprout, $newGroups{$newGroup});
128 :     my @oldGroupList = NameGenomes($oldSprout, $oldGroups{$newGroup});
129 : parrello 1.10 Trace("Comparing lists for $newGroup.") if T(4);
130 : parrello 1.8 my ($insertedGroupGenomes, $deletedGroupGenomes) = Tracer::CompareLists(\@newGroupList, \@oldGroupList);
131 : parrello 1.10 Trace("Comparison complete.") if T(4);
132 : parrello 1.8 # Delete the old group data. When we're done, this means we'll have a list of the deleted
133 :     # groups.
134 :     delete $oldGroups{$newGroup};
135 :     # Show the lists. Empty lists will not be shown.
136 : parrello 1.10 Trace("Displaying group lists.") if T(4);
137 : parrello 1.8 ShowLists(! $options->{summary},
138 :     "Genomes new to $newGroup" => $insertedGroupGenomes,
139 :     "Genomes no longer in $newGroup" => $deletedGroupGenomes);
140 :     }
141 :     }
142 : parrello 1.10 Trace("Processing deleted groups.") if T(4);
143 : parrello 1.8 # Now list the deleted groups.
144 :     for my $oldGroup (sort keys %oldGroups) {
145 :     Trace("Processing deleted group $oldGroup.") if T(3);
146 :     my @groupGenomes = NameGenomes($oldSprout, $oldGroups{$oldGroup});
147 :     ShowLists(! $options->{summary}, "Genomes in deleted group $oldGroup" => \@groupGenomes);
148 :     }
149 : parrello 1.1 # Next, we get the subsystems.
150 : parrello 1.2 Trace("Processing subsystems.") if T(2);
151 : parrello 1.1 my @oldSubsystems = GetSubsystems($oldSprout);
152 :     my @newSubsystems = GetSubsystems($newSprout);
153 :     # Compare and display the subsystem lists.
154 :     my ($insertedSubs, $deletedSubs) = Tracer::CompareLists(\@newSubsystems, \@oldSubsystems);
155 : parrello 1.8 # Check the deleted subsystems to see if they're in SEED.
156 :     if (scalar @{$deletedSubs} > 0) {
157 :     my %subChecker = map { $_ => 1 } $fig->all_subsystems();
158 :     for my $deletedSub (@{$deletedSubs}) {
159 :     my $subID = $deletedSub->[0];
160 :     if ($subChecker{$subID}) {
161 :     my $trusted = ($fig->usable_subsystem($subID) ? "usable" : "not usable");
162 : parrello 1.9 $deletedSub->[1] .= " (still in SEED, $trusted)";
163 : parrello 1.8 }
164 :     }
165 :     }
166 : parrello 1.2 ShowLists(! $options->{summary},
167 :     'New Subsystems' => $insertedSubs,
168 :     'Deleted Subsystems' => $deletedSubs);
169 : parrello 1.8 # Now we process the features of the common genes.
170 :     if (! $options->{nofeats}) {
171 :     # First we need a hash of the inserted stuff so we know to skip it.
172 :     my %skipGenomes = map { $_->[0] => 1 } @{$insertedGenomes};
173 :     # Loop through the genomees.
174 :     for my $genome (@newGenomes) {
175 :     # Get the ID and name.
176 :     my ($genomeID, $genomeName) = @{$genome};
177 :     Trace("Processing $genomeID.") if T(3);
178 :     # Only process the common genes.
179 :     if (! $skipGenomes{$genomeID}) {
180 :     # Compare the genome group information.
181 :     # Get the new and old features. This will be very stressful to the machine,
182 :     # because there are lots of features.
183 :     my @oldFeatures = GetFeatures($oldSprout, $genomeID);
184 :     my @newFeatures = GetFeatures($newSprout, $genomeID);
185 :     Trace("Comparing features for $genomeID.") if T(3);
186 :     # Compare the lists.
187 :     my ($insertedFeatures, $deletedFeatures) = Tracer::CompareLists(\@newFeatures, \@oldFeatures);
188 :     # Display the lists. Only nonempty lists are displayed; however, we do a check
189 :     # first anyway so the trace tells us what's happening.
190 :     if (scalar @{$insertedFeatures} + scalar @{$deletedFeatures} > 0) {
191 :     Trace("Displaying feature differences.") if T(3);
192 :     ShowLists(! $options->{summary},
193 :     "New Features for $genomeID" => $insertedFeatures,
194 :     "Features Deleted from $genomeID" => $deletedFeatures);
195 :     }
196 : parrello 1.1 }
197 :     }
198 :     }
199 :     };
200 :     if ($@) {
201 :     Trace("Script failed with error: $@") if T(0);
202 :     $rtype = "error";
203 :     } else {
204 :     Trace("Script complete.") if T(2);
205 :     $rtype = "no error";
206 :     }
207 :     if ($options->{phone}) {
208 :     my $msgID = Tracer::SendSMS($options->{phone}, "Subsystem Checker terminated with $rtype.");
209 :     if ($msgID) {
210 :     Trace("Phone message sent with ID $msgID.") if T(2);
211 :     } else {
212 :     Trace("Phone message not sent.") if T(2);
213 :     }
214 :     }
215 :    
216 : parrello 1.8 =head3 GetGenomes
217 : parrello 1.1
218 : parrello 1.8 C<< my @geneList = GetGenomes($sprout); >>
219 : parrello 1.1
220 :     Return a list of the genomes in the specified Sprout instance. The genomes
221 :     are returned in alphabetical order by genome ID.
222 :    
223 :     =over 4
224 :    
225 :     =item sprout
226 :    
227 :     Sprout instance whose gene list is desired.
228 :    
229 :     =item RETURN
230 :    
231 :     Returns a list of two-tuples. The first element in each tuple is the genome ID,
232 :     and the second is the genome name (genus, species, strain).
233 :    
234 :     =back
235 :    
236 :     =cut
237 :    
238 : parrello 1.8 sub GetGenomes {
239 : parrello 1.1 # Get the parameters.
240 :     my ($sprout) = @_;
241 :     # Get the desired data.
242 :     my @genomes = $sprout->GetAll(['Genome'], "ORDER BY Genome(id)", [], ['Genome(id)',
243 :     'Genome(genus)',
244 :     'Genome(species)',
245 :     'Genome(unique-characterization)']);
246 :     # Create the genome names from the three pieces of the name.
247 :     my @retVal = map { [$_->[0], join(" ", @{$_}[1..3])] } @genomes;
248 :     # Return the result.
249 :     return @retVal;
250 :     }
251 :    
252 : parrello 1.8 =head3 NameGenomes
253 :    
254 :     C<< my $newList = NameGenomes($sprout, \@genomes); >>
255 :    
256 :     Convert a list of genome IDs to a list of genome IDs with names.
257 :    
258 :     =over 4
259 :    
260 :     =item sprout
261 :    
262 :     The relevant sprout instance.
263 :    
264 :     =item genomes
265 :    
266 :     Reference to a list of genome IDs
267 :    
268 :     =item RETURN
269 :    
270 :     Returns a list of 2-tuples, each tuple consisting of a genome ID followed by a
271 :     genome name.
272 :    
273 :     =back
274 :    
275 :     =cut
276 :    
277 :     sub NameGenomes {
278 :     # Get the parameters.
279 :     my ($sprout, $genomes) = @_;
280 :     # Attach the names.
281 :     my @retVal = map { [$_, $sprout->GenusSpecies($_) ] } @{$genomes};
282 :     # Return the result.
283 :     return @retVal;
284 :     }
285 :    
286 : parrello 1.1 =head3 GetSubsystems
287 :    
288 :     C<< my @subsystems = GetSubsystems($sprout); >>
289 :    
290 :     Get a list of the subsystems in the specified Sprout instance.
291 :    
292 :     =over 4
293 :    
294 :     =item sprout
295 :    
296 :     Sprout instance whose subsystems are desired.
297 :    
298 :     =item RETURN
299 :    
300 :     Returns a list of 2-tuples, each consisting of the subsystem name followed by
301 :     the name of the curator.
302 :    
303 :     =back
304 :    
305 :     =cut
306 :    
307 :     sub GetSubsystems {
308 :     # Get the parameters.
309 :     my ($sprout) = @_;
310 :     # Declare the return variable.
311 :     my @retVal = $sprout->GetAll(['Subsystem'], "ORDER BY Subsystem(id)",
312 :     [], ['Subsystem(id)', 'Subsystem(curator)']);
313 :     # Return the result.
314 :     return @retVal;
315 :     }
316 :    
317 :     =head3 GetFeatures
318 :    
319 :     C<< my @features = GetFeatures($sprout, $genomeID); >>
320 :    
321 :     Return the features of the specified genome in the specified Sprout instance.
322 :    
323 :     =over 4
324 :    
325 :     =item sprout
326 :    
327 :     Sprout instance to use to get the features.
328 :    
329 :     =item genomeID
330 :    
331 :     ID of the genome in question.
332 :    
333 :     =item RETURN
334 :    
335 :     Returns a list of 2-tuples, the first element being the feature ID and the second its
336 :     functional assignment (if any).
337 :    
338 :     =back
339 :    
340 :     =cut
341 :    
342 :     sub GetFeatures {
343 :     # Get the parameters.
344 :     my ($sprout, $genomeID) = @_;
345 :     # Get a list of the feature IDs and map them to their functional assignments.
346 :     my @retVal = map { [$_, $sprout->FunctionOf($_)] } $sprout->GetFlat(['HasFeature'],
347 :     "HasFeature(from-link) = ? ORDER BY HasFeature(to-link)",
348 :     [$genomeID], 'HasFeature(to-link)');
349 :     # Return the result.
350 :     return @retVal;
351 :     }
352 :    
353 :     =head3 ShowLists
354 :    
355 : parrello 1.2 C<< ShowLists($all, %lists); >>
356 : parrello 1.1
357 : parrello 1.2 Display a set of lists. Each list should consist of 2-tuples.
358 : parrello 1.1
359 :     =over 4
360 :    
361 : parrello 1.2 =item all
362 :    
363 :     TRUE if details should be displayed; FALSE if only summaries should be displayed.
364 :    
365 : parrello 1.1 =item lists
366 :    
367 :     A hash mapping list names to list references.
368 :    
369 :     =cut
370 :    
371 :     sub ShowLists {
372 :     # Get the parameters.
373 : parrello 1.2 my $all = shift @_;
374 : parrello 1.1 my %lists = @_;
375 :     # Loop through the lists in alphabetical order by list name.
376 : parrello 1.6 for my $listName (sort keys %lists) {
377 : parrello 1.1 # Get the list itself.
378 :     my $list = $lists{$listName};
379 :     # Get the number of list items.
380 :     my $listSize = scalar @{$list};
381 : parrello 1.6 # Only proceed if the list is nonempty.
382 :     if ($listSize > 0) {
383 : parrello 1.7 my $header = ShowHeader($listName, $listSize);
384 : parrello 1.6 print "$header\n";
385 :     Trace($header) if T(3);
386 :     # If we're at trace level 3, display the list.
387 :     if ($all) {
388 :     # Put a spacer under the title.
389 :     print "\n";
390 :     # Get the width of the name column.
391 :     my $width = 0;
392 :     for my $entryLen (map { length $_->[0] } @{$list}) {
393 :     $width = $entryLen if $entryLen > $width;
394 :     }
395 :     # Now display the list.
396 :     for my $entry (@{$list}) {
397 :     my ($name, $data) = @{$entry};
398 :     print " $name" . (" " x ($width - length $name)) . " $data\n";
399 :     }
400 :     print "\n\n";
401 : parrello 1.1 }
402 :     }
403 :     }
404 :     }
405 :    
406 : parrello 1.7 =head3 ShowHeader
407 :    
408 :     C<< my $header = ShowHeader($name, $count); >>
409 :    
410 :     Return a list header for a list of the specified length.
411 :    
412 :     =over 4
413 :    
414 :     =item name
415 :    
416 :     Name of the list.
417 :    
418 :     =item count
419 :    
420 :     Number of entries in the list.
421 :    
422 :     =item RETURN
423 :    
424 :     Returns a list header that shows the name of the list and the number of entries.
425 :    
426 :     =back
427 :    
428 :     =cut
429 :    
430 :     sub ShowHeader {
431 :     # Get the parameters.
432 :     my ($name, $count) = @_;
433 :     # Declare the return variable.
434 :     my $retVal;
435 :     if ($count == 0) {
436 :     $retVal = "*** $name: none";
437 :     } elsif ($count == 1) {
438 :     $retVal = "*** $name: one";
439 :     } else {
440 :     $retVal = "*** $name: $count";
441 :     }
442 :     # Return the result.
443 :     return $retVal;
444 :     }
445 :    
446 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3