[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

View of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (download) (as text) (annotate)
Fri Jun 23 23:12:43 2006 UTC (13 years, 3 months ago) by parrello
Branch: MAIN
Changes since 1.14: +1 -1 lines
*** empty log message ***

#!/usr/bin/perl -w

=head1 Genome Data Generator

This script creates a set of HTML include files that list the statistics for
the genomes in each of the genome groups. Genomes that are new to this version
of the Sprout will be specially marked. In order for this to work, both the
current and previous Sprout databases must be available on this machine.
This is one positional parameter: the name of a directory in which to place
the include files.

The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item strict

If specified, strict groups will be used; otherwise, groups with a common primary name
will be combined into a single group. (The primary name of the group is the first
capitalized word.)

=item oddStyle

Style to use for odd rows of the table.

=item evenStyle

Style to use for even rows of the table.

=item tableStyle

Style to use for the table itself.

=item markerStyle

Style to use for small-text markers (e.g. NEW!)

=item numStyle

Style to use for numeric cells.

=item counterStyle

Style to use for counter cells.

=item linkCGI

Path to the CGI script for displaying detailed statistics.

=back

=cut

use strict;
use Tracer;
use DocUtils;
use TestUtils;
use Cwd;
use File::Copy;
use File::Path;
use Sprout;
use SFXlate;
use CGI qw(:standard);
use FIG;
no warnings 'once'; # only when coding

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
                                           {
                                            strict => [0, 'keep related groups separate'],
                                            oddStyle => ['odd', 'style for odd rows'],
                                            evenStyle => ['even', 'style for even rows'],
                                            tableStyle => ['genomestats', 'style for whole table'],
                                            markerStyle => ['tinytext', 'style for markers'],
                                            numStyle => ['numcell', 'style for cells with numeric values'],
                                            counterStyle => ['countercell', 'style for cells with counter values'],
                                            linkCGI => ['../FIG/genome_statistics.cgi',
                                                        'path to CGI script for detailed statistics'],
                                           },
                                           "<targetDir>",
                                           @ARGV);
# Verify the directory name.
my $targetDir = $parameters[0];
if (! $targetDir) {
    Confess("No target directory specified.");
} elsif (! -d $targetDir) {
    Confess("Target directory $targetDir not found.");
} else {
    # *Get the old Sprout.
    my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);
    # Extract the genome group data from the old Sprout.
    my %oldGroupHash = $oldSprout->GetGroups();
    if (! $options->{strict}) {
        %oldGroupHash = Fix(%oldGroupHash);
    }
    # Get the new Sprout.
    my $sprout = SFXlate->new_sprout_only();
    my %newGroupHash = $sprout->GetGroups();
    if (! $options->{strict}) {
        %newGroupHash = Fix(%newGroupHash);
    }
    # Loop through the groups.
    for my $groupID (keys %newGroupHash) {
        Trace("Processing group $groupID.") if T(2);
        # Get the genomes from the new hash.
        my @newGenomes = @{$newGroupHash{$groupID}};
        # Create a hash for finding if a genome is in the old group. If the entire group is
        # new, we just use an empty hash.
        my %oldGenomes = ();
        if (exists $oldGroupHash{$groupID}) {
            %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
        }
        # Create the output file.
        my $outFileName = "stats-" . lc($groupID) . ".inc";
        Open(\*GROUPFILE, ">$targetDir/$outFileName");
        # Get the styles.
        my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},
                                                     $options->{evenStyle}, $options->{oddStyle});
        my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});
        # Start the table.
        print GROUPFILE "<table class=\"$tableStyle\">\n";
        # Create the header row.
        print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR",
                                                 "Genome size, bp",
                                                 "Protein Encoding Genes (PEGs)",
                                                 "Named genes in subsystems",            # s0
                                                 "Named genes not in subsystems",        # n0
                                                 "Hypothetical genes in subsystems",     # s1
                                                 "Hypothetical genes not in subsystems", # n1
                                                 "RNAs",
                                                   ])) . "\n";
        # Set up some useful stuff for the four count columns.
        my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
                          s1 => "hypo_sub", n1 => "hypo_nosub" );
        my @columnTypes = ('s0', 'n0', 's1', 'n1');
        # The data rows will be built next. We'll be putting them into a hash keyed by
        # organism name. The hash enables us to spit them out sorted by name.
        my %rows = ();
        # Loop through the genomes in the new group.
        for my $genomeID (@newGenomes) {
            # Check to see if this genome is new.
            my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
            Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
            # Get the strain name.
            my $genomeName = $sprout->GenusSpecies($genomeID);
            # If this is a new strain, build the HTML for the NEW! mark.
            if ($new) {
                $new = " <span class=\"$markerStyle\">NEW!</span>";
            }
            # Get the genome length.
            my $genomeLen = Tracer::CommaFormat($sprout->GenomeLength($genomeID));
            # Get the number of PEGs.
            my $pegCount = Tracer::CommaFormat($sprout->FeatureCount($genomeID, 'peg'));
            # Get the number of RNAs.
            my $rnaCount = Tracer::CommaFormat($sprout->FeatureCount($genomeID, 'rna'));
            # Now we have four categories of features to work with, for each
            # combination of named or hypothetical vs. in-subsystem or
            # not-in-subsystem. First, we get all of the feature assignments for
            # the genome.
            my $assignHash = $sprout->GenomeAssignments($genomeID);
            # Next, we get all of the features in the genome that belong to a
            # subsystem. This involves a query via the subsystem spreadsheet.
            my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'],
                                                    "IsGenomeOf(from-link) = ?",
                                                    [$genomeID], 'ContainsFeature(to-link)');
            # Create a hash to track the four categories. "s" or "n" indicates
            # in or out of a subsystem. "1" or "0" indicates hypothetical or
            # real.
            my %counters = ( s0 => 0, n0 => 0, s1 => 0, n1 => 0 );
            # Also keep a count of all the features.
            my $totalFeatures = 0;
            # Loop through the assignments.
            for my $fid (keys %{$assignHash}) {
                # Form the counter key.
                my $ss = ( exists $ssHash{$fid} ? "s" : "n" ) . FIG::hypo($assignHash->{$fid} );
                # Record this feature.
                $counters{$ss} += 1;
                $totalFeatures++;
            }
            Trace("$totalFeatures total features found for $genomeID.") if T(3);
            # We have all our data. Next we need to compute the percentages and the links.
            # First, the link stuff.
            my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";
            # Now format the counters and percentages.
            for my $type (keys %linkParms) {
                $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },
                                     sprintf("%d(%.1f%%)", $counters{$type},
                                             Tracer::Percent($counters{$type}, $totalFeatures)));
            }
            my @counterValues = map { $counters{$_} } @columnTypes;
            # Create the row text. Note that we use the distributive capability of the TD
            # function to apply the same style to each one.
            my $rowHtml = join("",
                               td("$genomeName$new"),
                               td({ class => $numStyle }, $genomeLen),
                               td({ class => $numStyle }, $pegCount),
                               td({ class => $counterStyle }, \@counterValues),
                               td({ class => $numStyle }, $rnaCount),
                              );
            # Put it in the row hash.
            $rows{$genomeName} = $rowHtml;
        }
        # Set up a counter.
        my $rowCount = 0;
        # Now we have all the rows set up. We want to sort them and then
        # write them to the output file. First, we create a variable
        # we can use to select the even or odd style.
        my $rowType = 0;
        # Loop through the rows.
        for my $rowID (sort keys %rows) {
            # Format the row.
            print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";
            # Flip the row type.
            $rowType = 1 - $rowType;
            # Count the row.
            $rowCount++;
        }
        # All done, terminate the table and close the file.
        print GROUPFILE "</table>\n";
        close GROUPFILE;
        Trace("$rowCount genomes processed.") if T(2);
    }
    # We're all done.
    Trace("Processing complete.") if T(2);
}

=head3 Fix

C<< my %fixedHash = Fix(%groupHash); >>

Prepare a genome group hash for processing. Groups with the same primary name will be combined.
The primary name is the first capitalized word in the group name.

=over 4

=item groupHash

Hash to be fixed up.

=item RETURN

Returns a fixed-up version of the hash.

=back

=cut

sub Fix {
    # Get the parameters.
    my (%groupHash) = @_;
    # Create the result hash.
    my %retVal = ();
    # Copy over the genomes.
    for my $groupID (keys %groupHash) {
        # Make a safety copy of the group ID.
        my $realGroupID = $groupID;
        # Yank the primary name.
        if ($groupID =~ /([A-Z]\w+)/) {
            $realGroupID = $1;
        }
        # Append this group's genomes into the result hash.
        Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});
    }
    # Return the result hash.
    return %retVal;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3