[Bio] / Sprout / NewStuffCheck.pl Repository:
ViewVC logotype

Diff of /Sprout/NewStuffCheck.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10, Thu Aug 24 21:17:08 2006 UTC revision 1.11, Sat Sep 2 07:01:26 2006 UTC
# Line 5  Line 5 
5  This script compares the genomes, features, and annotations in  This script compares the genomes, features, and annotations in
6  the old and new sprouts and lists the differences.  the old and new sprouts and lists the differences.
7    
8  The currently-supported command-line options are as follows.  The currently-supported command-line options for NewStuffCheck are as follows.
9    
10  =over 4  =over 4
11    
# Line 47  Line 47 
47    
48  Phone number to message when the script is complete.  Phone number to message when the script is complete.
49    
50    =item groupFile
51    
52    Name of the group file (described below). The default is C<groups.tbl>
53    in the Sprout data directory.
54    
55    =back
56    
57    =head2 The Group File
58    
59    A key data file for this process is C<groups.tbl>. This file is kept in the
60    Sprout Data directory, and contains the following columns:
61    
62    =over 4
63    
64    =item name
65    
66    Name of the group.
67    
68    =item page
69    
70    Name of the group's page on the web site (e.g. C<campy.php> for
71    Campylobacter)
72    
73    =item genus
74    
75    Genus of the group
76    
77    =item species
78    
79    Species of the group, or an empty string if the group is for an entire
80    genus.
81    
82  =back  =back
83    
84  =cut  =cut
# Line 65  Line 97 
97  # Get the command-line options and parameters.  # Get the command-line options and parameters.
98  my ($options, @parameters) = StandardSetup([qw(Sprout) ],  my ($options, @parameters) = StandardSetup([qw(Sprout) ],
99                                             {                                             {
100                                                  groupFile => ["$FIG_Config::sproutData/groups.tbl", "location of the NMPDR group description file"],
101                                                nofeats => ["", "if specified, only genome changes will be displayed; otherwise, genome features will be compared and differences shown"],                                                nofeats => ["", "if specified, only genome changes will be displayed; otherwise, genome features will be compared and differences shown"],
102                                                trace => ["2-", "tracing level; use a minus to prevent tracing to standard output"],                                                trace => ["2-", "tracing level; use a minus to prevent tracing to standard output"],
103                                                summary => ["", "if specified, detailed lists of the different items will not be displayed"],                                                summary => ["", "if specified, detailed lists of the different items will not be displayed"],
# Line 76  Line 109 
109  my $rtype;  my $rtype;
110  # Insure we catch errors.  # Insure we catch errors.
111  eval {  eval {
112        # Get the group file.
113        my $groupFileName = $options->{groupFile};
114        Trace("Reading group file $groupFileName.") if T(2);
115        my @groupLines = Tracer::GetFile($groupFileName);
116        # Well put each group's data into a hash, keyed by group
117        # name, each entry being a 3-tuple of page name, genus,
118        # and species
119        my %groups;
120        for my $groupLine (@groupLines) {
121            my ($name, $page, $genus, $species) = split(/\t/, $groupLine);
122            $groups{$name} = [$page, $genus, $species];
123        }
124      Trace("Processing genomes.") if T(2);      Trace("Processing genomes.") if T(2);
125      # Get the current SEED.      # Get the current SEED.
126      my $fig = FIG->new();      my $fig = FIG->new();
# Line 166  Line 211 
211      ShowLists(! $options->{summary},      ShowLists(! $options->{summary},
212                'New Subsystems'     => $insertedSubs,                'New Subsystems'     => $insertedSubs,
213                'Deleted Subsystems' => $deletedSubs);                'Deleted Subsystems' => $deletedSubs);
214        # Next, we show some basic counts.
215        print "\nStatistics for old Sprout\n\n";
216        ShowCounts($oldSprout);
217        print "\nStatistics for new Sprout\n\n";
218        ShowCounts($newSprout);
219        # Now we show the genomes that are not in groups but could be. First, we convert
220        # our group hash from the new Sprout into the form used on the web site.
221        Trace("Examining possible missing genomes in groups.") if T(2);
222        my %fixedGroups = Sprout::Fix(%newGroups);
223        for my $group (sort keys %groups) {
224            Trace("Checking group $group.");
225            # Get this group's genus and species.
226            my $genus = $groups{$group}->[1];
227            my $species = $groups{$group}->[2];
228            # Get a hash of the genomes already in it.
229            my %inGroup = map { $_ => 1 } @{$fixedGroups{$group}};
230            # Get a list of its possible genomes.
231            my $filter = 'Genome(genus) = ?';
232            my @parms = ($genus);
233            if ($species) {
234                $filter .= ' AND Genome(species) = ?';
235                push @parms, $species;
236            }
237            my @possibles = $newSprout->GetFlat(['Genome'], $filter, \@parms, 'Genome(id)');
238            # Get the possibles that aren't in the group and add identifying information.
239            my @leftOut = NameGenomes($newSprout, [ grep { $inGroup{$_} } @possibles ]);
240            # If anything survived, show the list.
241            if (@leftOut) {
242                ShowLists(! $options->{summary}, "Candidates for $group" => \@leftOut);
243            }
244        }
245        # Compare the property tables.
246        Trace("Comparing properties.") if T(2);
247        # Set up lists of all the properties in each sprout.
248        my @oldProps = GetProperties($oldSprout);
249        my @newProps = GetProperties($newSprout);
250        # Compare the lists.
251        my ($insertedProps, $deletedProps) = Tracer::CompareLists(\@oldProps, \@newProps);
252        # Now get all the properties in the new Sprout without any features.
253        my @emptyProps = grep { $_->[1] == 0 } @newProps;
254        # Show what we've found.
255        ShowLists(! $options->{summary}, 'New Properties' => $insertedProps,
256                                         'Deleted Properties' => $deletedProps,
257                                         'Empty Properties' => \@emptyProps);
258      # Now we process the features of the common genes.      # Now we process the features of the common genes.
259      if (! $options->{nofeats}) {      if (! $options->{nofeats}) {
260          # First we need a hash of the inserted stuff so we know to skip it.          # First we need a hash of the inserted stuff so we know to skip it.
# Line 205  Line 294 
294      $rtype = "no error";      $rtype = "no error";
295  }  }
296  if ($options->{phone}) {  if ($options->{phone}) {
297      my $msgID = Tracer::SendSMS($options->{phone}, "Subsystem Checker terminated with $rtype.");      my $msgID = Tracer::SendSMS($options->{phone}, "New Stuff Checker terminated with $rtype.");
298      if ($msgID) {      if ($msgID) {
299          Trace("Phone message sent with ID $msgID.") if T(2);          Trace("Phone message sent with ID $msgID.") if T(2);
300      } else {      } else {
# Line 251  Line 340 
340    
341  =head3 NameGenomes  =head3 NameGenomes
342    
343  C<< my $newList = NameGenomes($sprout, \@genomes); >>  C<< my @newList = NameGenomes($sprout, \@genomes); >>
344    
345  Convert a list of genome IDs to a list of genome IDs with names.  Convert a list of genome IDs to a list of genome IDs with names.
346    
# Line 314  Line 403 
403      return @retVal;      return @retVal;
404  }  }
405    
406    =head3 GetProperties
407    
408    C<< my @propertyList = GetProperties($sprout); >>
409    
410    Return a list of properties. Each element in the list will be a 2-tuple containing
411    the property name and value in the first column and its ID in the second column.
412    
413    =over 4
414    
415    =item sprout
416    
417    Sprout instance to be used to retrieve the properties.
418    
419    =item RETURN
420    
421    Returns a list of 2-tuples. The first element in each 2-tuple will be a string
422    in the form of an assignment of the property value to the property name. The second
423    element will be the number of features possessing the property. The list will be
424    sorted in ascending alphabetical order.
425    
426    =back
427    
428    =cut
429    
430    sub GetProperties {
431        # Get the parameters.
432        my ($sprout) = @_;
433        # Get the properties.
434        my @props = $sprout->GetAll(['Property'],
435                                    "ORDER BY Property(property-name), Property(property-value)", [],
436                                    ['Property(property-name)', 'Property(property-value)', 'Property(id)']);
437        # Combine the property names and values and replace each property ID by a feature count.
438        my @retVal;
439        for my $propItem (@props) {
440            my $label = $propItem->[0] . " = " . $propItem->[1];
441            my $count = $sprout->GetCount(['Feature', 'HasProperty'], "HasProperty(to-link) = ?",
442                                          [$propItem->[2]]);
443            push @retVal, [$label, $count];
444        }
445        # Return the result.
446        return @retVal;
447    }
448    
449  =head3 GetFeatures  =head3 GetFeatures
450    
451  C<< my @features = GetFeatures($sprout, $genomeID); >>  C<< my @features = GetFeatures($sprout, $genomeID); >>
# Line 443  Line 575 
575      return $retVal;      return $retVal;
576  }  }
577    
578    =head3 ShowCounts
579    
580    C<< ShowCounts($sprout); >>
581    
582    Display general counts for the specified sprout instance. These counts are
583    used in progress reports.
584    
585    =over 4
586    
587    =item sprout
588    
589    Sprout instance for which counts are to be produced.
590    
591    =back
592    
593    =cut
594    
595    sub ShowCounts {
596        # Get the parameters.
597        my ($sprout) = @_;
598        # Count genomes and subsystems.
599        my $genomes = $sprout->GetCount(['Genome']);
600        my $subsystems = $sprout->GetCount(['Subsystem']);
601        # Count roles, external functional assignments, and functional couplings.
602        my $roles = $sprout->GetCount(['OccursInSubsystem']);
603        my $funcs = $sprout->GetCount(['ExternalAliasFunc']);
604        my $couples = $sprout->GetCount(['Coupling']);
605        # Count features and BBHs.
606        my $bbhs = int ($sprout->GetCount(['IsBidirectionalBestHitOf']) / 2);
607        my $features = $sprout->GetCount(['Feature']);
608        # Display the counts.
609        print "Genomes = $genomes.\n";
610        print "Subsystems = $subsystems.\n";
611        print "Roles = $roles.\n";
612        print "External function assignments = $funcs.\n";
613        print "Features = $features.\n";
614        print "Bidirectional Best Hits = $bbhs.\n";
615        print "Functional couplings = $couples.\n";
616        print "\n";
617    }
618  1;  1;

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.11

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3