[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

View of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.28 - (download) (as text) (annotate)
Tue Apr 10 06:01:56 2007 UTC (12 years, 11 months ago) by parrello
Branch: MAIN
Changes since 1.27: +37 -21 lines
Upgraded to include serotype support. Enhanced for better handling of old versions with different DBDs.

#!/usr/bin/perl -w

=head1 Genome Data Generator

This script creates a set of HTML include files that list the statistics for
the genomes in each of the genome groups. Genomes that are new to this version
of the Sprout will be specially marked. In order for this to work, both the
current and previous Sprout databases must be available on this machine.

The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item strict

If specified, strict groups will be used; otherwise, groups with a common primary name
will be combined into a single group. (The primary name of the group is the first
capitalized word.)

=item oddStyle

Style to use for odd rows of the table.

=item evenStyle

Style to use for even rows of the table.

=item tableStyle

Style to use for the table itself.

=item markerStyle

Style to use for small-text markers (e.g. NEW!)

=item numStyle

Style to use for numeric cells.

=item counterStyle

Style to use for counter cells.

=item linkCGI

Path to the CGI script for displaying detailed statistics.

=item noNewCheck

If specified, the check for new genomes in the group is suppressed. This
may need to be done if there's been a change in the database definition. Note
that all this really does is keep the B<NEW!> symbol from showing. It does
not affect which genomes show up in the table.



use strict;
use Tracer;
use DocUtils;
use TestUtils;
use Cwd;
use File::Copy;
use File::Path;
use Sprout;
use SFXlate;
use CGI qw(:standard);
use FIG;
no warnings 'once'; # only when coding

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
                                            strict => [0, 'keep related groups separate'],
                                            oddStyle => ['odd', 'style for odd rows'],
                                            trace => [2, 'tracing level'],
                                            evenStyle => ['even', 'style for even rows'],
                                            tableStyle => ['genomestats', 'style for whole table'],
                                            markerStyle => ['tinytext', 'style for markers'],
                                            numStyle => ['numcell', 'style for cells with numeric values'],
                                            counterStyle => ['countercell', 'style for cells with counter values'],
                                            linkCGI => ['../FIG/genome_statistics.cgi',
                                                        'path to CGI script for detailed statistics'],
                                            groupFile => ["$FIG_Config::sproutData/groups.tbl",
                                                          'location of the NMPDR group description file'],
                                            noNewCheck => [0, 'if specified, skips the check for new genomes'],
                                            targetDir => ["$FIG_Config::nmpdr_base/next/html/includes",
                                                          'target directory'],
# Verify the directory name.
my $targetDir = $options->{targetDir};
if (! $targetDir) {
    Confess("No target directory specified.");
} elsif (! -d $targetDir) {
    Confess("Target directory $targetDir not found.");
} else {
    # Get the new Sprout.
    my $sprout = SFXlate->new_sprout_only();
    my %newGroupHash = $sprout->GetGroups();
    # Extract the genome group data from the new Sprout.
    if (! $options->{strict}) {
        %newGroupHash = Sprout::Fix(%newGroupHash);
    # This hash will be used to determine which genomes are new.
    my %oldGroupHash = ();
    if ($options->{noNewCheck}) {
        # Here we can't look at the old Sprout. Set up the hash
        # so it looks like the old Sprout's data is the same as ours.
        %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
    } else {
        # Get the old Sprout.
        my $oldSprout = SFXlate->old_sprout_only();
        # Extract the genome group data from the old Sprout.
        %oldGroupHash = $oldSprout->GetGroups();
        if (! $options->{strict}) {
            %oldGroupHash = Sprout::Fix(%oldGroupHash);
    # Get a FIG object for computing attributes.
    my $fig = FIG->new();
    # Read the group file.
    my %groupData = Sprout::ReadGroupFile($options->{groupFile});
    # Set up some useful stuff for the four count columns.
    my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
                      s1 => "hypo_sub", n1 => "hypo_nosub" );
    my @columnTypes = ('s0', 'n0', 's1', 'n1');
    # Get the styles.
    my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},
                                                 $options->{evenStyle}, $options->{oddStyle});
    my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});
    # Prepare a hash for the summary counters. These will be used on the organism summary page.
    my %summaries = ();
    # Loop through the groups.
    for my $groupID (keys %newGroupHash) {
        Trace("Processing group $groupID.") if T(2);
        # Create a hash for summarizing the counters.
        my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
                           map { $_ => } @columnTypes, features => 0 );
        # Get the genomes from the new hash.
        my @newGenomes = @{$newGroupHash{$groupID}};
        # Create a hash for finding if a genome is in the old group. If the entire group is
        # new, we just use an empty hash.
        my %oldGenomes = ();
        if (exists $oldGroupHash{$groupID}) {
            %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
        # Create the output file.
        my $outFileName = "stats-" . lc($groupID) . ".inc";
        Open(\*GROUPFILE, ">$targetDir/$outFileName");
        # Get the serotypes.
        my %serotypes = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, "Serotype_code");
        my $hasSeroData = (scalar(keys %serotypes) > 0);
        # If we have serotypes, we add an extra column.
        my @columnNames = "Strain annotated in NMPDR";
        if ($hasSeroData) {
            push @columnNames, "Serotype";
        push @columnNames,  "Genome size, bp",
                            "Protein Encoding Genes (PEGs)",
                            "Named genes in subsystems",            # s0
                            "Named genes not in subsystems",        # n0
                            "Hypothetical genes in subsystems",     # s1
                            "Hypothetical genes not in subsystems", # n1
        # Start the table.
        print GROUPFILE "<table class=\"$tableStyle\">\n";
        # Create the header row.
        print GROUPFILE Tr( { class => 'odd' }, th(\@columnNames)) . "\n";
        # The data rows will be built next. We'll be putting them into a hash keyed by
        # organism name. The hash enables us to spit them out sorted by name.
        my %rows = ();
        # This variable is used to hold the counts.
        my $num;
        # Loop through the genomes in the new group.
        for my $genomeID (@newGenomes) {
            # Count this genome.
            # Check to see if this genome is new.
            my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
            Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
            # Get the strain name.
            my $genomeName = $sprout->GenusSpecies($genomeID);
            # Apply a link.
            my $genomeText = CGI::a({ href => "../FIG/genome_statistics.cgi?genome=$genomeID;SPROUT=1" }, $genomeName);
            # If this is a new strain, build the HTML for the NEW! mark.
            if ($new) {
                $new = " <span class=\"$markerStyle\">NEW!</span>";
            # Get the genome length.
            $num = $sprout->GenomeLength($genomeID);
            my $genomeLen = Tracer::CommaFormat($num);
            # Get the number of PEGs.
            $num = $sprout->FeatureCount($genomeID, 'peg');
            my $pegCount = Tracer::CommaFormat($num);
            $groupTotals{pegs} += $num;
            # Get the number of RNAs.
            $num = $sprout->FeatureCount($genomeID, 'rna');
            my $rnaCount = Tracer::CommaFormat($num);
            $groupTotals{RNAs} += $num;
            # If there are no RNAs, we say we don't know the number, since we know there
            # must be RNAs somewhere.
            if (! $rnaCount) {
                $rnaCount = "n/d";
            # Now we have four categories of features to work with, for each
            # combination of named or hypothetical vs. in-subsystem or
            # not-in-subsystem. First, we get all of the feature assignments for
            # the genome.
            my $assignHash = $sprout->GenomeAssignments($genomeID);
            # Next, we get all of the features in the genome that belong to a
            # subsystem.
            my %ssHash = $sprout->GenomeSubsystemData($genomeID);
            # Create a hash to track the four categories. "s" or "n" indicates
            # in or out of a subsystem. "1" or "0" indicates hypothetical or
            # real.
            my %counters = ( s0 => 0, n0 => 0, s1 => 0, n1 => 0 );
            # Also keep a count of all the features.
            my $totalFeatures = 0;
            # Loop through the assignments.
            for my $fid (keys %{$assignHash}) {
                # Form the counter key.
                my $ss = ( exists $ssHash{$fid} ? "s" : "n" ) . FIG::hypo($assignHash->{$fid} );
                # Record this feature.
                $counters{$ss} += 1;
            Trace("$totalFeatures total features found for $genomeID.") if T(3);
            for my $counterKey (@columnTypes) {
                $groupTotals{$counterKey} += $counters{$counterKey};
            $groupTotals{features} += $totalFeatures;
            # We have all our data. Next we need to compute the percentages and the links.
            # First, the link stuff.
            my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";
            # Now format the counters and percentages.
            for my $type (keys %linkParms) {
                $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },
                                     sprintf("%d(%.1f%%)", $counters{$type},
                                             Tracer::Percent($counters{$type}, $totalFeatures)));
            my @counterValues = map { $counters{$_} } @columnTypes;
            # The last link is a button to look at the subsystem summaries.
            my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
            my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";
            my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";
            # Start creating the table cells.
            my $rowHtml = td("$genomeText$new");
            # Check for a serotype.
            if ($hasSeroData) {
                my $seroType = $serotypes{$genomeID} || "&nbsp;";
                $rowHtml .= td($seroType);
            # Now add the data columns.
            $rowHtml .= join("",
                            td({ class => $numStyle }, $genomeLen),
                            td({ class => $numStyle }, $pegCount),
                            td({ class => $counterStyle }, \@counterValues),
                            td({ class => $numStyle }, $ssCol),
                            td({ class => $numStyle }, $rnaCount),
            # Put it in the row hash.
            $rows{$genomeName} = $rowHtml;
        # Set up a counter.
        my $rowCount = 0;
        # Now we have all the rows set up. We want to sort them and then
        # write them to the output file. First, we create a variable
        # we can use to select the even or odd style.
        my $rowType = 0;
        # Loop through the rows.
        for my $rowID (sort keys %rows) {
            # Format the row.
            print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";
            # Flip the row type.
            $rowType = 1 - $rowType;
            # Count the row.
        # All done, terminate the table and close the file.
        print GROUPFILE "</table>\n";
        close GROUPFILE;
        Trace("$rowCount genomes processed.") if T(2);
        # Now save the group totals.
        $summaries{$groupID} = \%groupTotals;
    # Now produce the summary table.
    my $sumFileName = "stats-groups.inc";
    Open(\*SUMFILE, ">$targetDir/$sumFileName");
    # Start the table.
    print SUMFILE "<table class=\"$tableStyle\">\n";
    # Create the header row.
    print SUMFILE Tr( { class => 'odd' }, th(["Group name",
                                             "Protein Encoding Genes (PEGs)",
                                             "Named genes in subsystems",            # s0
                                             "Named genes not in subsystems",        # n0
                                             "Hypothetical genes in subsystems",     # s1
                                             "Hypothetical genes not in subsystems", # n1
                                               ])) . "\n";
    # Set up a flag for the odd-even styling.
    my $rowFlag = 0;
    # Put in the data rows.
    for my $groupName (sort keys %summaries) {
        my $group = $summaries{$groupName};
        # Compute the link for the current group.
        my $groupLink = a({ href => $groupData{$groupName}->[0] }, $groupName);
        # Create the table row.
        my $rowHtml = join("",
                           td({ class => $numStyle }, Tracer::CommaFormat($group->{genomes})),
                           td({ class => $numStyle }, Tracer::CommaFormat($group->{pegs})),
                           td({ class => $counterStyle }, [ map { Tracer::CommaFormat($group->{$_}) } @columnTypes ]),
                           td({ class => $numStyle }, Tracer::CommaFormat($group->{RNAs})),
        print SUMFILE Tr( { class => $rowStyle[$rowFlag] }, $rowHtml ) . "\n";
        # Flip the row style.
        $rowFlag = 1 - $rowFlag;
    # Terminate the table and close the file.
    print SUMFILE "</table>\n";
    close SUMFILE;
    # We're all done.
    Trace("Processing complete.") if T(2);


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3