[Bio] / Sprout / NewStuffCheck.pl Repository:
ViewVC logotype

View of /Sprout/NewStuffCheck.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (download) (as text) (annotate)
Tue Sep 19 00:26:21 2006 UTC (13 years, 1 month ago) by parrello
Branch: MAIN
Changes since 1.14: +3 -1 lines
Fixed a comment.

#!/usr/bin/perl -w

=head1 New Stuff Checker

This script compares the genomes, features, and annotations in
the old and new sprouts and lists the differences.

The currently-supported command-line options for NewStuffCheck are as follows.

=over 4

=item summary

Do not display details, only difference summaries.

=item nofeats

Do not process features.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item phone

Phone number to message when the script is complete.

=item groupFile

Name of the group file (described below). The default is C<groups.tbl>
in the Sprout data directory.

=back

=head2 The Group File

A key data file for this process is C<groups.tbl>. This file is kept in the
Sprout Data directory, and contains the following columns:

=over 4

=item name

Name of the group.

=item page

Name of the group's page on the web site (e.g. C<campy.php> for
Campylobacter)

=item genus

Genus of the group

=item species

Species of the group, or an empty string if the group is for an entire
genus. If the group contains more than one species, the species names
should be separated by commas.

=back

=cut

use strict;
use Tracer;
use DocUtils;
use TestUtils;
use Cwd;
use File::Copy;
use File::Path;
use FIG;
use SFXlate;
use Sprout;

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(Sprout) ],
                                           {
                                              groupFile => ["$FIG_Config::sproutData/groups.tbl", "location of the NMPDR group description file"],
                                              nofeats => ["", "if specified, only genome changes will be displayed; otherwise, genome features will be compared and differences shown"],
                                              trace => ["2-", "tracing level; use a minus to prevent tracing to standard output"],
                                              summary => ["", "if specified, detailed lists of the different items will not be displayed"],
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                           },
                                           "",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    # Get the group file.
    my $groupFileName = $options->{groupFile};
    Trace("Reading group file $groupFileName.") if T(2);
    # Well put each group's data into a hash, keyed by group
    # name, each entry being a 3-tuple of page name, genus,
    # and species
    my %groups = Sprout::ReadGroupFile($groupFileName);
    Trace("Processing genomes.") if T(2);
    # Get the current SEED.
    my $fig = FIG->new();
    # Get the old Sprout.
    my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);
    # Get its genomes in alphabetical order.
    my @oldGenomes = GetGenomes($oldSprout);
    # Get the new Sprout.
    my $newSprout = SFXlate->new_sprout_only();
    # Get its genomes in alphabetical order.
    my @newGenomes = GetGenomes($newSprout);
    # Compare the two genomes lists.
    my ($insertedGenomes, $deletedGenomes) = Tracer::CompareLists(\@newGenomes, \@oldGenomes);
    # Add feature counts to the new genomes.
    for my $insertedGenome (@{$insertedGenomes}) {
        my $genomeID = $insertedGenome->[0];
        # For a new genome, display the feature count.
        my $count = $newSprout->GetCount(['HasFeature'], "HasFeature(from-link) = ?",
                                         [$genomeID]);
        my $suffix = ($count == 1 ? " one feature" : "$count features");
        $insertedGenome->[1] .= "($suffix)";
    }
    # Add information about SEED status to the deleted genomes.
    for my $deletedGenome (@{$deletedGenomes}) {
        my $genomeID = $deletedGenome->[0];
        if ($fig->is_genome($genomeID)) {
            my $complete = ($fig->is_complete($genomeID) ? "complete" : "incomplete");
            $deletedGenome->[1] .= "(still in SEED, $complete)";
        }
    }
    # Display the lists.
    ShowLists(! $options->{summary},
              'New Genomes'     => $insertedGenomes,
              'Deleted Genomes' => $deletedGenomes);
    # Now the groups.
    Trace("Comparing groups.") if T(2);
    my %oldGroups = $oldSprout->GetGroups();
    my %newGroups = $newSprout->GetGroups();
    # Loop through the new groups.
    for my $newGroup (sort keys %newGroups) {
        Trace("Processing group $newGroup.") if T(3);
        # Find out if this group is new to this version.
        if (! exists $oldGroups{$newGroup}) {
            # Construct a list of this group's genes.
            my @groupGenomes = NameGenomes($newSprout, $newGroups{$newGroup});
            ShowLists(! $options->{summary}, "Genomes in new group $newGroup" => \@groupGenomes);
        } else {
            # Here the group is in both versions. Fix the lists and compare them.
            my @newGroupList = NameGenomes($newSprout, $newGroups{$newGroup});
            my @oldGroupList = NameGenomes($oldSprout, $oldGroups{$newGroup});
            Trace("Comparing lists for $newGroup.") if T(4);
            my ($insertedGroupGenomes, $deletedGroupGenomes) = Tracer::CompareLists(\@newGroupList, \@oldGroupList);
            Trace("Comparison complete.") if T(4);
            # Delete the old group data. When we're done, this means we'll have a list of the deleted
            # groups.
            delete $oldGroups{$newGroup};
            # Show the lists. Empty lists will not be shown.
            Trace("Displaying group lists.") if T(4);
            ShowLists(! $options->{summary},
                      "Genomes new to $newGroup" => $insertedGroupGenomes,
                      "Genomes no longer in $newGroup" => $deletedGroupGenomes);
        }
    }
    Trace("Processing deleted groups.") if T(4);
    # Now list the deleted groups.
    for my $oldGroup (sort keys %oldGroups) {
        Trace("Processing deleted group $oldGroup.") if T(3);
        my @groupGenomes = NameGenomes($oldSprout, $oldGroups{$oldGroup});
        ShowLists(! $options->{summary}, "Genomes in deleted group $oldGroup" => \@groupGenomes);
    }
    # Next, we get the subsystems.
    Trace("Processing subsystems.") if T(2);
    my @oldSubsystems = GetSubsystems($oldSprout);
    my @newSubsystems = GetSubsystems($newSprout);
    # Compare and display the subsystem lists.
    my ($insertedSubs, $deletedSubs) = Tracer::CompareLists(\@newSubsystems, \@oldSubsystems);
    # Check the deleted subsystems to see if they're in SEED.
    if (scalar @{$deletedSubs} > 0) {
        my %subChecker = map { $_ => 1 } $fig->all_subsystems();
        for my $deletedSub (@{$deletedSubs}) {
            my $subID = $deletedSub->[0];
            if ($subChecker{$subID}) {
                my $trusted = ($fig->usable_subsystem($subID) ? "usable" : "not usable");
                $deletedSub->[1] .= " (still in SEED, $trusted)";
            }
        }
    }
    ShowLists(! $options->{summary},
              'New Subsystems'     => $insertedSubs,
              'Deleted Subsystems' => $deletedSubs);
    # Next, we show some basic counts.
    print "\nStatistics for old Sprout\n\n";
    ShowCounts($oldSprout);
    print "\nStatistics for new Sprout\n\n";
    ShowCounts($newSprout);
    # Now we show the genomes that are not in groups but could be. First, we convert
    # our group hash from the new Sprout into the form used on the web site.
    Trace("Examining possible missing genomes in groups.") if T(2);
    my %fixedGroups = Sprout::Fix(%newGroups);
    for my $group (sort keys %groups) {
        Trace("Checking group $group.");
        # Get this group's genus and species.
        my $genus = $groups{$group}->[1];
        my $species = $groups{$group}->[2];
        # Get a hash of the genomes already in it.
        my %inGroup = map { $_ => 1 } @{$fixedGroups{$group}};
        # Get a list of its possible genomes.
        my $filter = 'Genome(genus) = ?';
        my @parms = ($genus);
        # The species list is tricky because a given group may involve more than
        # one target species. The species names will be comma-separated, and
        # we use some PERL trickiness to generate an OR filter for them.
        if ($species) {
            # Get the individual species.
            my @speciesList = split /\s*,\s*/, $species;
            # Create one species filter per species.
            my @filterClauses = map { 'Genome(species) = ?' } @speciesList;
            # OR the filter clauses together to get a real filter.
            $filter .= " AND (" . (join " OR ", @filterClauses) . ")";
            # Add the specieis names to the SQL parameter list.
            push @parms, @speciesList;
        }
        my @possibles = $newSprout->GetFlat(['Genome'], $filter, \@parms, 'Genome(id)');
        # Get the possibles that aren't in the group and add identifying information.
        my @leftOut = NameGenomes($newSprout, [ grep { ! exists $inGroup{$_} } @possibles ]);
        # If anything survived, show the list.
        if (@leftOut) {
            ShowLists(! $options->{summary}, "Candidates for $group" => \@leftOut);
        }
    }
    # Compare the property tables.
    Trace("Comparing properties.") if T(2);
    # Set up lists of all the properties in each sprout.
    my @oldProps = GetProperties($oldSprout);
    my @newProps = GetProperties($newSprout);
    # Compare the lists.
    my ($insertedProps, $deletedProps) = Tracer::CompareLists(\@oldProps, \@newProps);
    # Now get all the properties in the new Sprout without any features.
    my @emptyProps = grep { $_->[1] == 0 } @newProps;
    # Show what we've found.
    ShowLists(! $options->{summary}, 'New Properties' => $insertedProps,
                                     'Deleted Properties' => $deletedProps,
                                     'Empty Properties' => \@emptyProps);
    # Now we process the features of the common genes.
    if (! $options->{nofeats}) {
        # First we need a hash of the inserted stuff so we know to skip it.
        my %skipGenomes = map { $_->[0] => 1 } @{$insertedGenomes};
        # Loop through the genomees.
        for my $genome (@newGenomes) {
            # Get the ID and name.
            my ($genomeID, $genomeName) = @{$genome};
            Trace("Processing $genomeID.") if T(3);
            # Only process the common genes.
            if (! $skipGenomes{$genomeID}) {
                # Compare the genome group information.
                # Get the new and old features. This will be very stressful to the machine,
                # because there are lots of features.
                my @oldFeatures = GetFeatures($oldSprout, $genomeID);
                my @newFeatures = GetFeatures($newSprout, $genomeID);
                Trace("Comparing features for $genomeID.") if T(3);
                # Compare the lists.
                my ($insertedFeatures, $deletedFeatures) = Tracer::CompareLists(\@newFeatures, \@oldFeatures);
                # Display the lists. Only nonempty lists are displayed; however, we do a check
                # first anyway so the trace tells us what's happening.
                if (scalar @{$insertedFeatures} + scalar @{$deletedFeatures} > 0) {
                    Trace("Displaying feature differences.") if T(3);
                    ShowLists(! $options->{summary},
                              "New Features for $genomeID"      => $insertedFeatures,
                              "Features Deleted from $genomeID" => $deletedFeatures);
                }
            }
        }
    }
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "New Stuff Checker terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

=head3 GetGenomes

C<< my @geneList = GetGenomes($sprout); >>

Return a list of the genomes in the specified Sprout instance. The genomes
are returned in alphabetical order by genome ID.

=over 4

=item sprout

Sprout instance whose gene list is desired.

=item RETURN

Returns a list of two-tuples. The first element in each tuple is the genome ID,
and the second is the genome name (genus, species, strain).

=back

=cut

sub GetGenomes {
    # Get the parameters.
    my ($sprout) = @_;
    # Get the desired data.
    my @genomes = $sprout->GetAll(['Genome'], "ORDER BY Genome(id)", [], ['Genome(id)',
                                                                          'Genome(genus)',
                                                                          'Genome(species)',
                                                                          'Genome(unique-characterization)']);
    # Create the genome names from the three pieces of the name.
    my @retVal = map { [$_->[0], join(" ", @{$_}[1..3])] } @genomes;
    # Return the result.
    return @retVal;
}

=head3 NameGenomes

C<< my @newList = NameGenomes($sprout, \@genomes); >>

Convert a list of genome IDs to a list of genome IDs with names.

=over 4

=item sprout

The relevant sprout instance.

=item genomes

Reference to a list of genome IDs

=item RETURN

Returns a list of 2-tuples, each tuple consisting of a genome ID followed by a
genome name.

=back

=cut

sub NameGenomes {
    # Get the parameters.
    my ($sprout, $genomes) = @_;
    # Attach the names.
    my @retVal = map { [$_, $sprout->GenusSpecies($_) ] } @{$genomes};
    # Return the result.
    return @retVal;
}

=head3 GetSubsystems

C<< my @subsystems = GetSubsystems($sprout); >>

Get a list of the subsystems in the specified Sprout instance.

=over 4

=item sprout

Sprout instance whose subsystems are desired.

=item RETURN

Returns a list of 2-tuples, each consisting of the subsystem name followed by
the name of the curator.

=back

=cut

sub GetSubsystems {
    # Get the parameters.
    my ($sprout) = @_;
    # Declare the return variable.
    my @retVal = $sprout->GetAll(['Subsystem'], "ORDER BY Subsystem(id)",
                                 [], ['Subsystem(id)', 'Subsystem(curator)']);
    # Return the result.
    return @retVal;
}

=head3 GetProperties

C<< my @propertyList = GetProperties($sprout); >>

Return a list of properties. Each element in the list will be a 2-tuple containing
the property name and value in the first column and its ID in the second column.

=over 4

=item sprout

Sprout instance to be used to retrieve the properties.

=item RETURN

Returns a list of 2-tuples. The first element in each 2-tuple will be a string
in the form of an assignment of the property value to the property name. The second
element will be the number of features possessing the property. The list will be
sorted in ascending alphabetical order.

=back

=cut

sub GetProperties {
    # Get the parameters.
    my ($sprout) = @_;
    # Get the properties.
    my @props = $sprout->GetAll(['Property'],
                                "ORDER BY Property(property-name), Property(property-value)", [],
                                ['Property(property-name)', 'Property(property-value)', 'Property(id)']);
    # Combine the property names and values and replace each property ID by a feature count.
    my @retVal;
    for my $propItem (@props) {
        my $label = $propItem->[0] . " = " . $propItem->[1];
        my $count = $sprout->GetCount(['Feature', 'HasProperty'], "HasProperty(to-link) = ?",
                                      [$propItem->[2]]);
        push @retVal, [$label, $count];
    }
    # Return the result.
    return @retVal;
}

=head3 GetFeatures

C<< my @features = GetFeatures($sprout, $genomeID); >>

Return the features of the specified genome in the specified Sprout instance.

=over 4

=item sprout

Sprout instance to use to get the features.

=item genomeID

ID of the genome in question.

=item RETURN

Returns a list of 2-tuples, the first element being the feature ID and the second its
functional assignment (if any).

=back

=cut

sub GetFeatures {
    # Get the parameters.
    my ($sprout, $genomeID) = @_;
    # Get a list of the feature IDs and map them to their functional assignments.
    my @retVal = map { [$_, $sprout->FunctionOf($_)] } $sprout->GetFlat(['HasFeature'],
                                                                        "HasFeature(from-link) = ? ORDER BY HasFeature(to-link)",
                                                                        [$genomeID], 'HasFeature(to-link)');
    # Return the result.
    return @retVal;
}

=head3 ShowLists

C<< ShowLists($all, %lists); >>

Display a set of lists. Each list should consist of 2-tuples.

=over 4

=item all

TRUE if details should be displayed; FALSE if only summaries should be displayed.

=item lists

A hash mapping list names to list references.

=back

=cut

sub ShowLists {
    # Get the parameters.
    my $all = shift @_;
    my %lists = @_;
    # Loop through the lists in alphabetical order by list name.
    for my $listName (sort keys %lists) {
        # Get the list itself.
        my $list = $lists{$listName};
        # Get the number of list items.
        my $listSize = scalar @{$list};
        # Only proceed if the list is nonempty.
        if ($listSize > 0) {
            my $header = ShowHeader($listName, $listSize);
            print "$header\n";
            Trace($header) if T(3);
            # If we're at trace level 3, display the list.
            if ($all) {
                # Put a spacer under the title.
                print "\n";
                # Get the width of the name column.
                my $width = 0;
                for my $entryLen (map { length $_->[0] } @{$list}) {
                    $width = $entryLen if $entryLen > $width;
                }
                # Now display the list.
                for my $entry (@{$list}) {
                    my ($name, $data) = @{$entry};
                    print "    $name" . (" " x ($width - length $name)) . " $data\n";
                }
                print "\n\n";
            }
        }
    }
}

=head3 ShowHeader

C<< my $header = ShowHeader($name, $count); >>

Return a list header for a list of the specified length.

=over 4

=item name

Name of the list.

=item count

Number of entries in the list.

=item RETURN

Returns a list header that shows the name of the list and the number of entries.

=back

=cut

sub ShowHeader {
    # Get the parameters.
    my ($name, $count) = @_;
    # Declare the return variable.
    my $retVal;
    if ($count == 0) {
        $retVal = "*** $name: none";
    } elsif ($count == 1) {
        $retVal = "*** $name: one";
    } else {
        $retVal = "*** $name: $count";
    }
    # Return the result.
    return $retVal;
}

=head3 ShowCounts

C<< ShowCounts($sprout); >>

Display general counts for the specified sprout instance. These counts are
used in progress reports.

=over 4

=item sprout

Sprout instance for which counts are to be produced.

=back

=cut

sub ShowCounts {
    # Get the parameters.
    my ($sprout) = @_;
    # Count genomes and subsystems.
    my $genomes = $sprout->GetCount(['Genome']);
    my $subsystems = $sprout->GetCount(['Subsystem']);
    # Count roles, external functional assignments, and functional couplings.
    my $roles = $sprout->GetCount(['OccursInSubsystem']);
    my $funcs = $sprout->GetCount(['ExternalAliasFunc']);
    my $couples = $sprout->GetCount(['Coupling']);
    # Count features and BBHs.
    my $features = $sprout->GetCount(['Feature']);
    # Display the counts.
    print "Genomes = $genomes.\n";
    print "Subsystems = $subsystems.\n";
    print "Roles = $roles.\n";
    print "External function assignments = $funcs.\n";
    print "Features = $features.\n";
    print "Functional couplings = $couples.\n";
    print "\n";
}
1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3