[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

View of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.36 - (download) (as text) (annotate)
Mon Mar 2 22:25:08 2009 UTC (10 years, 9 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, rast_rel_2009_05_18, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, rast_rel_2009_03_26, mgrast_dev_10262011, HEAD
Changes since 1.35: +2 -0 lines
Made organism stats tables sortable.

#!/usr/bin/perl -w

=head1 Genome Data Generator

This script creates a set of Wiki pages that list the statistics for
the genomes in each of the genome groups. Genomes that are new to this version
of the Sprout will be specially marked. In order for this to work, both the
current and previous Sprout databases must be available on this machine.

The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item test

If specified, the output pages will be put into the sandbox instead of the main web.

=item noNewCheck

If specified, the check for new genomes in the group is suppressed. This
may need to be done if there's been a change in the database definition. Note
that all this really does is keep the B<NEW!> symbol from showing. It does
not affect which genomes show up in the table.

=back

=cut

use strict;
use Tracer;
use Cwd;
use File::Copy;
use File::Path;
use Sprout;
use SFXlate;
use CGI qw(-nosticky);
use FIG;
use WikiTools;

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
                                           {
                                            test => [0, 'if specified, publishes to the wiki sandbox instead of the main web'],
                                            noNewCheck => [0, 'if specified, skips the check for new genomes'],
                                           },
                                           "",
                                           @ARGV);
# The return type (error/no error) goes in here.
my $rtype;
eval {
    # This table controls the special attribute columns. For each we need to know the attribute name and the
    # column title. If any genomes in a group have a value for one of the special columns, that column is
    # displayed along with the attribute values.
    my %specialCols = (Serotype => 'Serotype_code',
                       Phenotype => 'Phenotype');
    my $outputWeb = ($options->{test} ? 'Sandbox' : 'Main');
    # Get the new Sprout.
    my $sprout = SFXlate->new_sprout_only();
    my %newGroupHash = $sprout->GetGroups();
    # Get a wiki helper.
    my $wiki = WikiTools->new();
    # Extract the genome group data from the new Sprout.
    %newGroupHash = $sprout->Fix(%newGroupHash);
    # This hash will be used to determine which genomes are new.
    my %oldGroupHash = ();
    if ($options->{noNewCheck}) {
        # Here we can't look at the old Sprout. Set up the hash
        # so it looks like the old Sprout's data is the same as ours.
        %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
    } else {
        # Get the old Sprout.
        my $oldSprout = SFXlate->old_sprout_only();
        # Extract the genome group data from the old Sprout.
        %oldGroupHash = $oldSprout->GetGroups();
        %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
    }
    # Get a FIG object for computing attributes.
    my $fig = FIG->new();
    # Get the super-group list.
    my @superGroups = sort keys %newGroupHash;
    # Set up some useful stuff for the four count columns.
    my $url = "%SCRIPTURL{rest}%/NmpdrPlugin/search?Class=OrgSumSearch;Search=Go";
    my %linkParms = ( s0 => "$url;hypothetical=named;insubsystem=in;genome=",
                      n0 => "$url;hypothetical=named;insubsystem=out;genome=",
                      s1 => "$url;hypothetical=hypo;insubsystem=in;genome=",
                      n1 => "$url;hypothetical=hypo;insubsystem=out;genome=" );
    my @columnTypes = ('s0', 'n0', 's1', 'n1');
    # Prepare a hash for the summary counters. These will be used on the organism summary page.
    my %summaries = ();
    # Loop through the groups.
    for my $groupID (@superGroups) {
        Trace("Processing group $groupID.") if T(2);
        # Create a hash for summarizing the counters.
        my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
                           map { $_ => } @columnTypes, features => 0 );
        # Get the genomes from the new hash.
        my @newGenomes = @{$newGroupHash{$groupID}};
        # Create a hash for finding if a genome is in the old group. If the entire group is
        # new, we just use an empty hash.
        my %oldGenomes = ();
        if (exists $oldGroupHash{$groupID}) {
            %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
        }
        # Compute the name of the wiki page we're building.
        my $outPageName = "${groupID}Stats";
        # We'll put the data for the page in here.
        my @outputLines = ();
        # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
        # a sub-hash that translates each genome ID to its applicable attribute value (if any).
        my %specialData = ();
        for my $specialColumn (keys %specialCols) {
            # Get the attribute mapping.
            my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
            # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
            # for columns that exist in the output.
            if (scalar(keys %specialDataList)) {
                $specialData{$specialColumn} = \%specialDataList;
            }
        }
        # Set up the column names. Note that an extra space in front of a name will be interpreted by
        # the Wiki markup as an order to right-justify the text.
        my @columnNames = "*Strain annotated in NMPDR*";
        push @columnNames, map { "*$_*" } sort keys %specialData;
        push @columnNames,  "*Genome size, bp*",
                            " *%FIG{Protein Encoding Genes}% (PEGs)*",
                            " *Named genes in subsystems*",            # s0
                            " *Named genes not in subsystems*",        # n0
                            " *Hypothetical genes in subsystems*",     # s1
                            " *Hypothetical genes not in subsystems*", # n1
                            " *Subsystems*",
                            " *RNAs*";
        # Make the table sortable.
        push @outputLines, '%TABLE{sort="on"}%';
        # Create the header row.
        push @outputLines, "| " . join(" | ", @columnNames) . " |";
        # The data rows will be built next. We'll be putting them into a hash keyed by
        # organism name. The hash enables us to spit them out sorted by name.
        my %rows = ();
        # This variable is used to hold the counts.
        my $num;
        # Loop through the genomes in the new group.
        for my $genomeID (@newGenomes) {
            # Count this genome.
            $groupTotals{genomes}++;
            # Check to see if this genome is new.
            my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
            Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
            # Get the strain name.
            my $genomeName = $sprout->GenusSpecies($genomeID);
            # Apply a link.
            my $genomeText = "%SV{\"$genomeName\" id=\"$genomeID\"}%";
            # If this is a new strain, add the NEW! mark.
            if ($new) {
                $genomeText .= " %N%";
            }
            # Get the genome length.
            $num = $sprout->GenomeLength($genomeID);
            my $genomeLen = Tracer::CommaFormat($num);
            # Get the number of PEGs.
            $num = $sprout->FeatureCount($genomeID, 'peg');
            my $pegCount = Tracer::CommaFormat($num);
            $groupTotals{pegs} += $num;
            # Get the number of RNAs.
            $num = $sprout->FeatureCount($genomeID, 'rna');
            my $rnaCount = Tracer::CommaFormat($num);
            $groupTotals{RNAs} += $num;
            # If there are no RNAs, we say we don't know the number, since we know there
            # must be RNAs somewhere.
            if (! $rnaCount) {
                $rnaCount = "n/d";
            }
            # Now we have four categories of features to work with, for each
            # combination of named or hypothetical vs. in-subsystem or
            # not-in-subsystem. First, we get all of the feature assignments for
            # the genome.
            my $assignHash = $sprout->GenomeAssignments($genomeID);
            # Next, we get all of the features in the genome that belong to a
            # subsystem.
            my %ssHash = $sprout->GenomeSubsystemData($genomeID);
            # Create a hash to track the four categories. "s" or "n" indicates
            # in or out of a subsystem. "1" or "0" indicates hypothetical or
            # real.
            my %counters = ( s0 => 0, n0 => 0, s1 => 0, n1 => 0 );
            # Also keep a count of all the features.
            my $totalFeatures = 0;
            # Loop through the assignments.
            for my $fid (keys %{$assignHash}) {
                # Form the counter key.
                my $ss = ( exists $ssHash{$fid} ? "s" : "n" ) . FIG::hypo($assignHash->{$fid} );
                # Record this feature.
                $counters{$ss} += 1;
                $totalFeatures++;
            }
            Trace("$totalFeatures total features found for $genomeID.") if T(3);
            for my $counterKey (@columnTypes) {
                $groupTotals{$counterKey} += $counters{$counterKey};
            }
            $groupTotals{features} += $totalFeatures;
            # We have all our data. Next we need to compute the percentages.
            for my $type (keys %linkParms) {
                my $counterData = sprintf("%d(%.1f%%)", $counters{$type},
                                          Tracer::Percent($counters{$type}, $totalFeatures));
                $counters{$type} = CGI::a({ href => $linkParms{$type} . $genomeID }, $counterData);
            }
            my @counterValues = map { $counters{$_} } @columnTypes;
            # The last column is a subsystem count.
            my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
                                               [$genomeID]);
            my $ssCol = $ssCount;
            # Start creating the table cells.
            my $rowHtml = "| $genomeText |";
            # Add any special columns.
            for my $specialCol (keys %specialData) {
                # Here we get the attribute value. If there is none, we leave the column blank.
                my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
                $rowHtml .= " $attribute |";
            }
            # Now add the data columns.
            $rowHtml .= join("", map { "  $_ |" } ($genomeLen, $pegCount, @counterValues, $ssCol, $rnaCount));
            # Put it in the row hash.
            $rows{$genomeName} = $rowHtml;
        }
        # Set up a counter.
        my $rowCount = 0;
        # Now we have all the rows set up. We want to sort them and then
        # write them to the output file. First, we create a variable
        # we can use to select the even or odd style.
        my $rowType = 0;
        # Loop through the rows.
        for my $rowID (sort keys %rows) {
            # Format the row.
            push @outputLines, $rows{$rowID};
            # Count the row.
            $rowCount++;
        }
        # All done, write the Wiki Page.
        my $rc = $wiki->Save($outPageName, $outputWeb, 'OrganismDataSummariesStats', join("\n", @outputLines));
        Trace("$rowCount genomes processed.") if T(2);
        if (! $rc) {
            Confess("Error saving $outPageName: $wiki->{error}");
        }
        # Now save the group totals.
        $summaries{$groupID} = \%groupTotals;
    }
    # Now produce the summary table.
    my $sumPageName = "OrganismDataSummariesStats";
    my @sumLines = ();
    # Start the table.  Asterisks make a cell a column header. An extra space at the front right-justifies it.
    push @sumLines, "| " . join("", map { " *$_* |" } ("Group name", "Genomes")) .
                           join("", map { "  *$_* |"} ("[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)",
                                                       "Named genes in subsystems",            # s0
                                                       "Named genes not in subsystems",        # n0
                                                       "Hypothetical genes in subsystems",     # s1
                                                       "Hypothetical genes not in subsystems", # n1
                                                       "RNAs"));
    # Put in the data rows.
    for my $groupName (sort keys %summaries) {
        my $group = $summaries{$groupName};
        # Create the table row.
        my $rowHtml = "| [[$groupName]] |" . join("", map { "  " . Tracer::CommaFormat($group->{$_}) . " |" }
                                                  ('genomes', 'pegs', @columnTypes, 'RNAs'));
        push @sumLines, $rowHtml;
    }
    # Write the page.
    my $rc = $wiki->Save($sumPageName, $outputWeb, 'WebHome', join("\n", @sumLines));
    # We're all done.
    Trace("Processing complete.") if T(2);
};
if ($@) {
    Trace("Stats failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Stats complete.") if T(2);
    $rtype = "no error";
}
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "GenomeStats terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3