--- GenomeStats.pl 2006/06/18 07:33:37 1.6 +++ GenomeStats.pl 2006/10/03 02:48:59 1.24 @@ -61,6 +61,14 @@ Style to use for small-text markers (e.g. NEW!) +=item numStyle + +Style to use for numeric cells. + +=item counterStyle + +Style to use for counter cells. + =item linkCGI Path to the CGI script for displaying detailed statistics. @@ -87,12 +95,17 @@ { strict => [0, 'keep related groups separate'], oddStyle => ['odd', 'style for odd rows'], + trace => [2, 'tracing level'], evenStyle => ['even', 'style for even rows'], tableStyle => ['genomestats', 'style for whole table'], markerStyle => ['tinytext', 'style for markers'], + numStyle => ['numcell', 'style for cells with numeric values'], + counterStyle => ['countercell', 'style for cells with counter values'], linkCGI => ['../FIG/genome_statistics.cgi', 'path to CGI script for detailed statistics'], - }, + groupFile => ["$FIG_Config::sproutData/groups.tbl", + "location of the NMPDR group description file"], + }, "", @ARGV); # Verify the directory name. @@ -102,22 +115,37 @@ } elsif (! -d $targetDir) { Confess("Target directory $targetDir not found."); } else { - # *Get the old Sprout. + # Get the old Sprout. my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB); # Extract the genome group data from the old Sprout. my %oldGroupHash = $oldSprout->GetGroups(); if (! $options->{strict}) { - %oldGroupHash = Fix(%oldGroupHash); + %oldGroupHash = Sprout::Fix(%oldGroupHash); } # Get the new Sprout. my $sprout = SFXlate->new_sprout_only(); my %newGroupHash = $sprout->GetGroups(); if (! $options->{strict}) { - %newGroupHash = Fix(%newGroupHash); + %newGroupHash = Sprout::Fix(%newGroupHash); } + # Read the group file. + my %groupData = Sprout::ReadGroupFile($options->{groupFile}); + # Set up some useful stuff for the four count columns. + my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub", + s1 => "hypo_sub", n1 => "hypo_nosub" ); + my @columnTypes = ('s0', 'n0', 's1', 'n1'); + # Get the styles. + my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle}, + $options->{evenStyle}, $options->{oddStyle}); + my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle}); + # Prepare a hash for the summary counters. These will be used on the organism summary page. + my %summaries = (); # Loop through the groups. for my $groupID (keys %newGroupHash) { Trace("Processing group $groupID.") if T(2); + # Create a hash for summarizing the counters. + my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0, + map { $_ => } @columnTypes, features => 0 ); # Get the genomes from the new hash. my @newGenomes = @{$newGroupHash{$groupID}}; # Create a hash for finding if a genome is in the old group. If the entire group is @@ -127,30 +155,30 @@ %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}}; } # Create the output file. - Open(\*GROUPFILE, ">$targetDir/$groupID.inc"); - # Get the styles. - my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle}, - $options->{evenStyle}, $options->{oddStyle}); + my $outFileName = "stats-" . lc($groupID) . ".inc"; + Open(\*GROUPFILE, ">$targetDir/$outFileName"); # Start the table. print GROUPFILE "\n"; # Create the header row. - print GROUPFILE Tr( { class => 'odd' }, th("Strain annotated in NMPDR", + print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR", "Genome size, bp", "Protein Encoding Genes (PEGs)", "Named genes in subsystems", # s0 "Named genes not in subsystems", # n0 "Hypothetical genes in subsystems", # s1 "Hypothetical genes not in subsystems", # n1 - "RNAs")) . "\n"; - # Set up some useful stuff for the four count columns. - my %linkParms = ( s0 => "nohypo_sub", n0 => "nohypo_nosub", - s1 => "hypo_sub", n1 => "hypo_nosub" ); - my @columnTypes = ('s0', 'n0', 's1', 'n1'); + "Subsystems", + "RNAs", + ])) . "\n"; # The data rows will be built next. We'll be putting them into a hash keyed by # organism name. The hash enables us to spit them out sorted by name. my %rows = (); + # This variable is used to hold the counts. + my $num; # Loop through the genomes in the new group. for my $genomeID (@newGenomes) { + # Count this genome. + $groupTotals{genomes}++; # Check to see if this genome is new. my $new = (! exists $oldGenomes{$genomeID} ? "new " : ""); Trace("Processing ${new}genome $genomeID for $groupID.") if T(3); @@ -161,21 +189,29 @@ $new = " NEW!"; } # Get the genome length. - my $genomeLen = $sprout->GenomeLength($genomeID); + $num = $sprout->GenomeLength($genomeID); + my $genomeLen = Tracer::CommaFormat($num); # Get the number of PEGs. - my $pegCount = $sprout->FeatureCount($genomeID, 'peg'); + $num = $sprout->FeatureCount($genomeID, 'peg'); + my $pegCount = Tracer::CommaFormat($num); + $groupTotals{pegs} += $num; # Get the number of RNAs. - my $rnaCount = $sprout->FeatureCount($genomeID, 'rna'); + $num = $sprout->FeatureCount($genomeID, 'rna'); + my $rnaCount = Tracer::CommaFormat($num); + $groupTotals{RNAs} += $num; + # If there are no RNAs, we say we don't know the number, since we know there + # must be RNAs somewhere. + if (! $rnaCount) { + $rnaCount = "n/d"; + } # Now we have four categories of features to work with, for each # combination of named or hypothetical vs. in-subsystem or # not-in-subsystem. First, we get all of the feature assignments for # the genome. my $assignHash = $sprout->GenomeAssignments($genomeID); # Next, we get all of the features in the genome that belong to a - # subsystem. This involves a query via the subsystem spreadsheet. - my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'], - "IsGenomeOf(from-link) = ?", - [$genomeID], 'ContainsFeature(to-link)'); + # subsystem. + my %ssHash = $sprout->GenomeSubsystemData($genomeID); # Create a hash to track the four categories. "s" or "n" indicates # in or out of a subsystem. "1" or "0" indicates hypothetical or # real. @@ -191,6 +227,10 @@ $totalFeatures++; } Trace("$totalFeatures total features found for $genomeID.") if T(3); + for my $counterKey (@columnTypes) { + $groupTotals{$counterKey} += $counters{$counterKey}; + } + $groupTotals{features} += $totalFeatures; # We have all our data. Next we need to compute the percentages and the links. # First, the link stuff. my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request="; @@ -198,12 +238,24 @@ for my $type (keys %linkParms) { $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" }, sprintf("%d(%.1f%%)", $counters{$type}, - $counters{$type} * 100 / $totalFeatures)); + Tracer::Percent($counters{$type}, $totalFeatures))); } - # Create the row text. - my $rowHtml = td( "$genomeName$new", $genomeLen, $pegCount, - map { $counters{$_} } @columnTypes, - $rnaCount ); + my @counterValues = map { $counters{$_} } @columnTypes; + # The last link is a button to look at the subsystem summaries. + my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?', + [$genomeID]); + my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1"; + my $ssCol = "$ssCount"; + # Create the row text. Note that we use the distributive capability of the TD + # function to apply the same style to each one. + my $rowHtml = join("", + td("$genomeName$new"), + td({ class => $numStyle }, $genomeLen), + td({ class => $numStyle }, $pegCount), + td({ class => $counterStyle }, \@counterValues), + td({ class => $numStyle }, $ssCol), + td({ class => $numStyle }, $rnaCount), + ); # Put it in the row hash. $rows{$genomeName} = $rowHtml; } @@ -222,54 +274,52 @@ # Count the row. $rowCount++; } - # All done, close the file. + # All done, terminate the table and close the file. + print GROUPFILE "
\n"; close GROUPFILE; Trace("$rowCount genomes processed.") if T(2); + # Now save the group totals. + $summaries{$groupID} = \%groupTotals; } + # Now produce the summary table. + my $sumFileName = "stats-groups.inc"; + Open(\*SUMFILE, ">$targetDir/$sumFileName"); + # Start the table. + print SUMFILE "\n"; + # Create the header row. + print SUMFILE Tr( { class => 'odd' }, th(["Group name", + "Genomes", + "Protein Encoding Genes (PEGs)", + "Named genes in subsystems", # s0 + "Named genes not in subsystems", # n0 + "Hypothetical genes in subsystems", # s1 + "Hypothetical genes not in subsystems", # n1 + "RNAs", + ])) . "\n"; + # Set up a flag for the odd-even styling. + my $rowFlag = 0; + # Put in the data rows. + for my $groupName (sort keys %summaries) { + my $group = $summaries{$groupName}; + # Compute the link for the current group. + my $groupLink = a({ href => $groupData{$groupName}->[0] }, $groupName); + # Create the table row. + my $rowHtml = join("", + td($groupLink), + td({ class => $numStyle }, Tracer::CommaFormat($group->{genomes})), + td({ class => $numStyle }, Tracer::CommaFormat($group->{pegs})), + td({ class => $counterStyle }, [ map { Tracer::CommaFormat($group->{$_}) } @columnTypes ]), + td({ class => $numStyle }, Tracer::CommaFormat($group->{RNAs})), + ); + print SUMFILE Tr( { class => $rowStyle[$rowFlag] }, $rowHtml ) . "\n"; + # Flip the row style. + $rowFlag = 1 - $rowFlag; + } + # Terminate the table and close the file. + print SUMFILE "
\n"; + close SUMFILE; # We're all done. Trace("Processing complete.") if T(2); } -=head3 Fix - -C<< my %fixedHash = Fix(%groupHash); >> - -Prepare a genome group hash for processing. Groups with the same primary name will be combined. -The primary name is the first capitalized word in the group name. - -=over 4 - -=item groupHash - -Hash to be fixed up. - -=item RETURN - -Returns a fixed-up version of the hash. - -=back - -=cut - -sub Fix { - # Get the parameters. - my (%groupHash) = @_; - # Create the result hash. - my %retVal = (); - # Copy over the genomes. - for my $groupID (keys %groupHash) { - # Make a safety copy of the group ID. - my $realGroupID = $groupID; - # Yank the primary name. - if ($groupID =~ /([A-Z]\w+)/) { - $realGroupID = $1; - } - # Append this group's genomes into the result hash. - Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}}); - } - # Return the result hash. - return %retVal; -} - - 1; \ No newline at end of file