[Bio] / Sprout / NewStuffCheck.pl Repository:
ViewVC logotype

View of /Sprout/NewStuffCheck.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (download) (as text) (annotate)
Thu Aug 24 21:17:08 2006 UTC (13 years, 2 months ago) by parrello
Branch: MAIN
Changes since 1.9: +4 -0 lines
More tracing.

#!/usr/bin/perl -w

=head1 New Stuff Checker

This script compares the genomes, features, and annotations in
the old and new sprouts and lists the differences.

The currently-supported command-line options are as follows.

=over 4

=item summary

Do not display details, only difference summaries.

=item nofeats

Do not process features.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item phone

Phone number to message when the script is complete.

=back

=cut

use strict;
use Tracer;
use DocUtils;
use TestUtils;
use Cwd;
use File::Copy;
use File::Path;
use FIG;
use SFXlate;
use Sprout;

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(Sprout) ],
                                           {
                                              nofeats => ["", "if specified, only genome changes will be displayed; otherwise, genome features will be compared and differences shown"],
                                              trace => ["2-", "tracing level; use a minus to prevent tracing to standard output"],
                                              summary => ["", "if specified, detailed lists of the different items will not be displayed"],
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                           },
                                           "",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    Trace("Processing genomes.") if T(2);
    # Get the current SEED.
    my $fig = FIG->new();
    # Get the old Sprout.
    my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);
    # Get its genomes in alphabetical order.
    my @oldGenomes = GetGenomes($oldSprout);
    # Get the new Sprout.
    my $newSprout = SFXlate->new_sprout_only();
    # Get its genomes in alphabetical order.
    my @newGenomes = GetGenomes($newSprout);
    # Compare the two genomes lists.
    my ($insertedGenomes, $deletedGenomes) = Tracer::CompareLists(\@newGenomes, \@oldGenomes);
    # Add feature counts to the new genomes.
    for my $insertedGenome (@{$insertedGenomes}) {
        my $genomeID = $insertedGenome->[0];
        # For a new genome, display the feature count.
        my $count = $newSprout->GetCount(['HasFeature'], "HasFeature(from-link) = ?",
                                         [$genomeID]);
        my $suffix = ($count == 1 ? " one feature" : "$count features");
        $insertedGenome->[1] .= "($suffix)";
    }
    # Add information about SEED status to the deleted genomes.
    for my $deletedGenome (@{$deletedGenomes}) {
        my $genomeID = $deletedGenome->[0];
        if ($fig->is_genome($genomeID)) {
            my $complete = ($fig->is_complete($genomeID) ? "complete" : "incomplete");
            $deletedGenome->[1] .= "(still in SEED, $complete)";
        }
    }
    # Display the lists.
    ShowLists(! $options->{summary},
              'New Genomes'     => $insertedGenomes,
              'Deleted Genomes' => $deletedGenomes);
    # Now the groups.
    Trace("Comparing groups.") if T(2);
    my %oldGroups = $oldSprout->GetGroups();
    my %newGroups = $newSprout->GetGroups();
    # Loop through the new groups.
    for my $newGroup (sort keys %newGroups) {
        Trace("Processing group $newGroup.") if T(3);
        # Find out if this group is new to this version.
        if (! exists $oldGroups{$newGroup}) {
            # Construct a list of this group's genes.
            my @groupGenomes = NameGenomes($newSprout, $newGroups{$newGroup});
            ShowLists(! $options->{summary}, "Genomes in new group $newGroup" => \@groupGenomes);
        } else {
            # Here the group is in both versions. Fix the lists and compare them.
            my @newGroupList = NameGenomes($newSprout, $newGroups{$newGroup});
            my @oldGroupList = NameGenomes($oldSprout, $oldGroups{$newGroup});
            Trace("Comparing lists for $newGroup.") if T(4);
            my ($insertedGroupGenomes, $deletedGroupGenomes) = Tracer::CompareLists(\@newGroupList, \@oldGroupList);
            Trace("Comparison complete.") if T(4);
            # Delete the old group data. When we're done, this means we'll have a list of the deleted
            # groups.
            delete $oldGroups{$newGroup};
            # Show the lists. Empty lists will not be shown.
            Trace("Displaying group lists.") if T(4);
            ShowLists(! $options->{summary},
                      "Genomes new to $newGroup" => $insertedGroupGenomes,
                      "Genomes no longer in $newGroup" => $deletedGroupGenomes);
        }
    }
    Trace("Processing deleted groups.") if T(4);
    # Now list the deleted groups.
    for my $oldGroup (sort keys %oldGroups) {
        Trace("Processing deleted group $oldGroup.") if T(3);
        my @groupGenomes = NameGenomes($oldSprout, $oldGroups{$oldGroup});
        ShowLists(! $options->{summary}, "Genomes in deleted group $oldGroup" => \@groupGenomes);
    }
    # Next, we get the subsystems.
    Trace("Processing subsystems.") if T(2);
    my @oldSubsystems = GetSubsystems($oldSprout);
    my @newSubsystems = GetSubsystems($newSprout);
    # Compare and display the subsystem lists.
    my ($insertedSubs, $deletedSubs) = Tracer::CompareLists(\@newSubsystems, \@oldSubsystems);
    # Check the deleted subsystems to see if they're in SEED.
    if (scalar @{$deletedSubs} > 0) {
        my %subChecker = map { $_ => 1 } $fig->all_subsystems();
        for my $deletedSub (@{$deletedSubs}) {
            my $subID = $deletedSub->[0];
            if ($subChecker{$subID}) {
                my $trusted = ($fig->usable_subsystem($subID) ? "usable" : "not usable");
                $deletedSub->[1] .= " (still in SEED, $trusted)";
            }
        }
    }
    ShowLists(! $options->{summary},
              'New Subsystems'     => $insertedSubs,
              'Deleted Subsystems' => $deletedSubs);
    # Now we process the features of the common genes.
    if (! $options->{nofeats}) {
        # First we need a hash of the inserted stuff so we know to skip it.
        my %skipGenomes = map { $_->[0] => 1 } @{$insertedGenomes};
        # Loop through the genomees.
        for my $genome (@newGenomes) {
            # Get the ID and name.
            my ($genomeID, $genomeName) = @{$genome};
            Trace("Processing $genomeID.") if T(3);
            # Only process the common genes.
            if (! $skipGenomes{$genomeID}) {
                # Compare the genome group information.
                # Get the new and old features. This will be very stressful to the machine,
                # because there are lots of features.
                my @oldFeatures = GetFeatures($oldSprout, $genomeID);
                my @newFeatures = GetFeatures($newSprout, $genomeID);
                Trace("Comparing features for $genomeID.") if T(3);
                # Compare the lists.
                my ($insertedFeatures, $deletedFeatures) = Tracer::CompareLists(\@newFeatures, \@oldFeatures);
                # Display the lists. Only nonempty lists are displayed; however, we do a check
                # first anyway so the trace tells us what's happening.
                if (scalar @{$insertedFeatures} + scalar @{$deletedFeatures} > 0) {
                    Trace("Displaying feature differences.") if T(3);
                    ShowLists(! $options->{summary},
                              "New Features for $genomeID"      => $insertedFeatures,
                              "Features Deleted from $genomeID" => $deletedFeatures);
                }
            }
        }
    }
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "Subsystem Checker terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

=head3 GetGenomes

C<< my @geneList = GetGenomes($sprout); >>

Return a list of the genomes in the specified Sprout instance. The genomes
are returned in alphabetical order by genome ID.

=over 4

=item sprout

Sprout instance whose gene list is desired.

=item RETURN

Returns a list of two-tuples. The first element in each tuple is the genome ID,
and the second is the genome name (genus, species, strain).

=back

=cut

sub GetGenomes {
    # Get the parameters.
    my ($sprout) = @_;
    # Get the desired data.
    my @genomes = $sprout->GetAll(['Genome'], "ORDER BY Genome(id)", [], ['Genome(id)',
                                                                          'Genome(genus)',
                                                                          'Genome(species)',
                                                                          'Genome(unique-characterization)']);
    # Create the genome names from the three pieces of the name.
    my @retVal = map { [$_->[0], join(" ", @{$_}[1..3])] } @genomes;
    # Return the result.
    return @retVal;
}

=head3 NameGenomes

C<< my $newList = NameGenomes($sprout, \@genomes); >>

Convert a list of genome IDs to a list of genome IDs with names.

=over 4

=item sprout

The relevant sprout instance.

=item genomes

Reference to a list of genome IDs

=item RETURN

Returns a list of 2-tuples, each tuple consisting of a genome ID followed by a
genome name.

=back

=cut

sub NameGenomes {
    # Get the parameters.
    my ($sprout, $genomes) = @_;
    # Attach the names.
    my @retVal = map { [$_, $sprout->GenusSpecies($_) ] } @{$genomes};
    # Return the result.
    return @retVal;
}

=head3 GetSubsystems

C<< my @subsystems = GetSubsystems($sprout); >>

Get a list of the subsystems in the specified Sprout instance.

=over 4

=item sprout

Sprout instance whose subsystems are desired.

=item RETURN

Returns a list of 2-tuples, each consisting of the subsystem name followed by
the name of the curator.

=back

=cut

sub GetSubsystems {
    # Get the parameters.
    my ($sprout) = @_;
    # Declare the return variable.
    my @retVal = $sprout->GetAll(['Subsystem'], "ORDER BY Subsystem(id)",
                                 [], ['Subsystem(id)', 'Subsystem(curator)']);
    # Return the result.
    return @retVal;
}

=head3 GetFeatures

C<< my @features = GetFeatures($sprout, $genomeID); >>

Return the features of the specified genome in the specified Sprout instance.

=over 4

=item sprout

Sprout instance to use to get the features.

=item genomeID

ID of the genome in question.

=item RETURN

Returns a list of 2-tuples, the first element being the feature ID and the second its
functional assignment (if any).

=back

=cut

sub GetFeatures {
    # Get the parameters.
    my ($sprout, $genomeID) = @_;
    # Get a list of the feature IDs and map them to their functional assignments.
    my @retVal = map { [$_, $sprout->FunctionOf($_)] } $sprout->GetFlat(['HasFeature'],
                                                                        "HasFeature(from-link) = ? ORDER BY HasFeature(to-link)",
                                                                        [$genomeID], 'HasFeature(to-link)');
    # Return the result.
    return @retVal;
}

=head3 ShowLists

C<< ShowLists($all, %lists); >>

Display a set of lists. Each list should consist of 2-tuples.

=over 4

=item all

TRUE if details should be displayed; FALSE if only summaries should be displayed.

=item lists

A hash mapping list names to list references.

=cut

sub ShowLists {
    # Get the parameters.
    my $all = shift @_;
    my %lists = @_;
    # Loop through the lists in alphabetical order by list name.
    for my $listName (sort keys %lists) {
        # Get the list itself.
        my $list = $lists{$listName};
        # Get the number of list items.
        my $listSize = scalar @{$list};
        # Only proceed if the list is nonempty.
        if ($listSize > 0) {
            my $header = ShowHeader($listName, $listSize);
            print "$header\n";
            Trace($header) if T(3);
            # If we're at trace level 3, display the list.
            if ($all) {
                # Put a spacer under the title.
                print "\n";
                # Get the width of the name column.
                my $width = 0;
                for my $entryLen (map { length $_->[0] } @{$list}) {
                    $width = $entryLen if $entryLen > $width;
                }
                # Now display the list.
                for my $entry (@{$list}) {
                    my ($name, $data) = @{$entry};
                    print "    $name" . (" " x ($width - length $name)) . " $data\n";
                }
                print "\n\n";
            }
        }
    }
}

=head3 ShowHeader

C<< my $header = ShowHeader($name, $count); >>

Return a list header for a list of the specified length.

=over 4

=item name

Name of the list.

=item count

Number of entries in the list.

=item RETURN

Returns a list header that shows the name of the list and the number of entries.

=back

=cut

sub ShowHeader {
    # Get the parameters.
    my ($name, $count) = @_;
    # Declare the return variable.
    my $retVal;   
    if ($count == 0) {
        $retVal = "*** $name: none";
    } elsif ($count == 1) {
        $retVal = "*** $name: one";
    } else {
        $retVal = "*** $name: $count";
    }
    # Return the result.
    return $retVal;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3