[Bio] / Sprout / AttributeReport.pl Repository:
ViewVC logotype

View of /Sprout/AttributeReport.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Mon Jan 19 21:46:21 2009 UTC (10 years, 3 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, rast_rel_2009_05_18, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, rast_rel_2009_03_26, mgrast_dev_10262011, HEAD
Changes since 1.4: +19 -19 lines
ERDB 2.0 support

#!/usr/bin/perl -w

=head1 AttributeReport

Display the attribute report for the current version of NMPDR. For
each attribute in the named group, the description of the attribute will
be displayed along with a sampling of the attribute values. If the
attribute is in one of the entity type groups (Feature, Genome,
Role, etc.), a report will be generated indicating how many distinct
NMPDR entity instances have values for the attribute. So, if the attribute
is in the Genome group, a report will be generated indicating how many distinct
NMPDR genomes have values for the attribute. The output will be in HTML format.

The positional parameters should be the names of the groups for which
reports are desired.

The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is C<2>. Tracing will be to the C<trace>I<User>C<.log>
file in the FIG temporary directory, where I<User> is the value of the B<user>
option above as well as to the standard output.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item phone

Phone number to message when the script is complete.

=item noCover

If specified, suppresses the coverage report output, which greatly
improves the speed.

=item sample

Number of sample attribute values to display. The default is C<50>.

=item output

Output file in which the web page will be built.

=back

=cut

use strict;
use Tracer;
use Cwd;
use File::Copy;
use File::Path;
use FIG;
use Sprout;
use SFXlate;
use CustomAttributes;

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
                                           {
                                              output => ["/disks/nmpdr/next/html/Blog/attributes.inc", "output file to contain the HTML"],
                                              trace => ["2", "tracing level"],
                                              noCover => ["", "if specified, no coverage information will be displayed, making the report much faster"],
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                              sample => [50, "number of sample attribute values to display"]
                                           },
                                           "<group1> <group2> ... <groupN>",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    # Get the FIG and Sprout objects. The FIG object communicates with the attribute
    # server, the Sprout object with the current NMPDR version.
    my $fig = FIG->new();
    my $sprout = SFXlate->new_sprout_only();
    # Get the CGI query object so we can generate HTML.
    my $cgi = CGI->new();
    # The output is divided into sections by group name. This hash will tell us which section
    # contains the first occurrence of each attribute. If an attribute is found in multiple
    # groups, we simply hyperlink to its original report.
    my %attrSection = ();
    # Get the entity hash. This is used to collect the IDs of the entity instances used
    # in the coverage reports. For each entity type, it maps the entity type name to a
    # hash of NMPDR instance keys.
    my %entityHash = ();
    # This hash maps each entity type to the number of its IDs and the number of IDs related to core genomes.
    my %typeCounts = ();
    # Get a list of NMPDR core genome IDs, too.
    my %coreGenomes = map { $_ => 1 } $sprout->GetFlat(['Genome'], 'Genome(primary-group) <> ?',
                                                       [$FIG_Config::otherGroup], 'Genome(id)');
    # We also need a list of entity types, so we know which ones to care about.
    my %typeHash = map { $_ => 1 } $sprout->GetEntityTypes();
    # Get the list of groups, sorted by name. This will drive the main loop.
    my @groups = sort @parameters;
    # The HTML will be output to this handle.
    my $ofh = Open(undef, ">$options->{output}");
    # Get the tuning parameters. $cover is TRUE iff we're generating a coverage
    # report, and $sampleSize is the number of sample attribute values to display.
    my $cover = ! $options->{noCover};
    my $sampleSize = $options->{sample};
    # The coverage tables will be built in here. It will be a hash where the key
    # is the sort key for the table and the data is the row itself.
    my %coverageRows = ();
    if ($cover) {
        $coverageRows{" "} = CGI::Tr(CGI::th(['Key', 'Type']),
                                      CGI::th({align => 'right'}, ['All IDs', 'Core Covered', 'Core Total', 'Core Percent',
                                                                    'NMPDR Covered', 'NMPDR Total', 'Percent']));
    }
    # If there's more than one group or there's a coverage report, start with a table of contents.
    if (@groups > 1 || $cover) {
        Output($ofh, CGI::h1("Contents"));
        Output($ofh, CGI::start_ol());
        for my $group (@groups) {
            Output($ofh, CGI::li(CGI::a({ href => "#$group" }, "$group group")));
        }
        Output($ofh, CGI::li(CGI::a({ href => "#CoverageReport" }, "Coverage report")));
        Output($ofh, CGI::end_ol());
    }
    Output($ofh, CGI::h1("Attribute Report"));
    # Loop through the groups. Each group is treated as a section.
    for my $group (@groups) {
        # Create the section header.
        Output($ofh, CGI::a({ name => $group }, CGI::h2("$group Group")));
        # Get the attribute data for this group.
        my %keys = $fig->get_group_key_info($group);
        my @keyList = sort keys %keys;
        Output($ofh, CGI::p(scalar(@keyList) . " keys found in group $group."));
        # Loop through the attributes.
        for my $key (@keyList) {
            # Create the header HTML for the key.
            my $keyHeader = CGI::h3($key);
            # Find out if this is the key's first appearance.
            my $newKey = ! exists $attrSection{$key};
            # If this is the key's first appearance, put a name anchor around
            # the header.
            if ($newKey) { $keyHeader = CGI::a({ name => $key }, $keyHeader); }
            # Now push out the header and the description. The description is the second value
            # in the key's n-tuple from the %keys hash. The first value is the data type, which
            # is not used.
            Output($ofh, $keyHeader);
            Output($ofh, CGI::p(Tracer::Clean($keys{$key}->[1])));
            # If this is not the first appearance of the key, we need to hyperlink back to
            # its original location.
            if (! $newKey) {
                Output($ofh, CGI::p(CGI::a({ href => "#$key" }, "Key details already reported in the $attrSection{$key} group.")));
            } else {
                # Save the name of the section containing this key's data.
                $attrSection{$key} = $group;
                # Display the data sample. We do this only if the sample size is 1 or more.
                if ($sampleSize > 0) {
                    my @sample = $fig->query_attributes("\$key = ? LIMIT $sampleSize", [$key]);
                    if (! @sample) {
                        Output($ofh, CGI::p("$key attributes has no values."));
                    } else {
                        Output($ofh, CustomAttributes::AttributeTable($cgi, @sample));
                    }
                }
                # Unless there is a coverage report, we are done.
                if ($cover) {
                    # Our procedure will be to loop through all the values for the specified
                    # key, tracking the objects to which the keys are attached. Once we
                    # have that information, we can find out how many objects in the database
                    # have matching values. This is, unfortunately, a slow, brutal process.
                    # We get a little benefit from the fact that we only care about object
                    # IDs. The technique used will be to grab 1000 attribute values at
                    # a time, ordered by object ID within key. Since we ONLY care about the
                    # object IDs, we can request each time we go back only values for object IDs
                    # greater than the greatest we've yet seen. This may cause us to miss rows
                    # of the table, but they are rows that yield us no useful information.
                    # First, we need some variables. The following variable contains the
                    # last object ID processed.
                    my $lastObjectId = "";
                    # The next variable contains a hash of all the object IDs found. It is
                    # a two-level hash, the first being a map of object types to sub-hashes,
                    # each sub-hash recording all the object IDs for that type.
                    my %objects = ();
                    # We loop until we get no results back.
                    my $done = 0;
                    my $totalRows = 0;
                    Trace("Starting retrieval for key $key.") if T(3);
                    while (! $done) {
                        # Ask for a bunch of attribute rows.
                        my @rows = $fig->query_attributes('$key = ? AND $object > ? ORDER BY $object LIMIT 10000',
                                                          [$key, $lastObjectId]);
                        if (! @rows) {
                            # None found, so end the loop.
                            $done = 1;
                        } else {
                            # Save the last object ID found.
                            $lastObjectId = $rows[$#rows]->[0];
                            # Store all the object IDs found in the hash.
                            for my $row (@rows) {
                                my ($type, $id) = CustomAttributes::ParseID($row->[0]);
                                # Fix the ZINC type.
                                if ($type =~ /^zinc/i) { $type = "Ligand"; }
                                # Only proceed if this type exists in the NMPDR.
                                if ($typeHash{$type}) {
                                    # Put the ID into the sub-hash for this object type.
                                    if (! exists $objects{$type}) {
                                        $objects{$type} = { $id => 1 };
                                    } else {
                                        $objects{$type}->{$id} = 1;
                                    }
                                }
                            }
                            $totalRows += scalar(@rows);
                            Trace("$totalRows rows processed for key $key.") if T(3);
                        }
                    }
                    # Now we can compute the coverage. For each object type we get a list of all
                    # the object IDs in NMPDR, and record how many are present or absent in the
                    # relevant object type ID hash.
                    Trace("Processing object types for key $key.") if T(3);
                    for my $type (sort keys %objects) {
                        my $idHash = $objects{$type};
                        # Check to see if we already know the keys for this object type.
                        if (! exists $entityHash{$type}) {
                            # We don't know, so we need to ask. Note that above we've
                            # deliberately excluded types that don't exist in NMPDR,
                            # so this is a safe request.
                            Trace("Retrieving IDs for entity type $type.") if T(3);
                            my @idList = $sprout->GetFlat([$type], "", [], "$type(id)");
                            Trace(scalar(@idList) . " IDs found for entity $type in NMPDR database.") if T(3);
                            $entityHash{$type} = { map { $_ => 1 } @idList };
                            if ($type eq 'Feature' || $type eq 'Genome') {
                                # If we are a feature or a genome, we need to also check for core genome
                                # coverage.
                                my ($coreCount, $totalCount) = 0;
                                for my $id (@idList) {
                                    # Pull out the genome ID and test it.
                                    if ($id =~ /(\d+\.\d+)/ && $coreGenomes{$1}) {
                                        $entityHash{$type}->{$id} = 2;
                                        $coreCount++;
                                    }
                                    $totalCount++;
                                }
                                $typeCounts{$type} = [$coreCount, $totalCount];
                            } else {
                                # We are not a feature or a genome, so there are no core IDs
                                # to store in the counter hash.
                                $typeCounts{$type} = [0, scalar(@idList)];
                            }
                        }
                        # Loop through the list of IDs, checking to see which are covered. Because IDs
                        # are unique, we don't need to worry about duplicates when we make our counts.
                        my ($covered, $coreCovered, $total) = (0, 0, 0);
                        my $typeIdHash = $entityHash{$type};
                        for my $id (keys %{$idHash}) {
                            $total++;
                            if (exists $typeIdHash->{$id}) {
                                $covered++;
                                if ($typeIdHash->{$id} == 2) {
                                    $coreCovered++;
                                }
                            }
                        }
                        Trace("Retrieving type counts for $type.") if T(3);
                        my ($coreTotal, $nmpdrTotal) = @{$typeCounts{$type}};
                        Trace("$coreTotal core objects, $nmpdrTotal NMPDR objects.") if T(3);
                        # Compute the percent.
                        my $percent = sprintf('%3.2f', $covered * 100 / $nmpdrTotal);
                        # Coverage of core genomes is only displayed for certain object types.
                        my ($coreCoverDisplay, $coreTotalDisplay, $corePercentDisplay) = ("","","");
                        if ($coreTotal) {
                            $coreCoverDisplay = $coreCovered;
                            $coreTotalDisplay = $coreTotal;
                            $corePercentDisplay = sprintf('%3.2f', $coreCovered * 100 / $coreTotal);
                        }
                        # Output the coverage data.
                        $coverageRows{"$type/$key"} = CGI::Tr(CGI::td($key), CGI::td($type),
                                                               CGI::td({ align => 'right' }, [$total, $coreCoverDisplay, $coreTotalDisplay,
                                                                                               $corePercentDisplay, $covered, $nmpdrTotal, $percent]));
                    }
                    Trace("Coverage data computed for $key.") if T(3);
                }
            }
        }
    }
    # If there's a coverage table, write it out.
    if ($cover) {
        Output($ofh, CGI::a({ name => "CoverageReport" }, CGI::h2("Coverage Report")));
        Output($ofh, CGI::table({border => 2}, map { $coverageRows{$_} } sort keys %coverageRows));
    }
    # Close the output file.
    close $ofh;
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "AttributeCoverage terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

=head3 Output

    Output($fh, $data);

Output the specified HTML data line using the specified output handle. All this
method really does is a C<print> statement with a new-line code on the end.

=over 4

=item fh

Open output file handle.

=item data

Data line to write out.

=back

=cut

sub Output {
    my ($fh, $data) = @_;
    print $fh "$data\n";
}



1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3