[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory | Revision Log | View Patch

revision 1.31, Tue Feb 5 05:47:32 2008 UTC | revision 1.36, Mon Mar 2 22:25:08 2009 UTC
# Line 2  Line 2 
2    
3  =head1 Genome Data Generator  =head1 Genome Data Generator
4    
5  This script creates a set of HTML include files that list the statistics for  This script creates a set of Wiki pages that list the statistics for
6  the genomes in each of the genome groups. Genomes that are new to this version  the genomes in each of the genome groups. Genomes that are new to this version
7  of the Sprout will be specially marked. In order for this to work, both the  of the Sprout will be specially marked. In order for this to work, both the
8  current and previous Sprout databases must be available on this machine.  current and previous Sprout databases must be available on this machine.
# Line 37  Line 37 
37    
38  Display this command's parameters and options.  Display this command's parameters and options.
39    
40  =item strict  =item test
41    
42  If specified, strict groups will be used; otherwise, groups with a common primary name  If specified, the output pages will be put into the sandbox instead of the main web.
 will be combined into a single group. (The primary name of the group is the first  
 capitalized word.)  
   
 =item oddStyle  
   
 Style to use for odd rows of the table.  
   
 =item evenStyle  
   
 Style to use for even rows of the table.  
   
 =item tableStyle  
   
 Style to use for the table itself.  
   
 =item markerStyle  
   
 Style to use for small-text markers (e.g. NEW!)  
   
 =item numStyle  
   
 Style to use for numeric cells.  
   
 =item counterStyle  
   
 Style to use for counter cells.  
   
 =item linkCGI  
   
 Path to the CGI script for displaying detailed statistics.  
43    
44  =item noNewCheck  =item noNewCheck
45    
# Line 89  Line 59 
59  use File::Path;  use File::Path;
60  use Sprout;  use Sprout;
61  use SFXlate;  use SFXlate;
62  use CGI qw(:standard);  use CGI qw(-nosticky);
63  use FIG;  use FIG;
64    use WikiTools;
65    
66  # Get the command-line options and parameters.  # Get the command-line options and parameters.
67  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
68                                             {                                             {
69                                              strict => [0, 'keep related groups separate'],                                              test => [0, 'if specified, publishes to the wiki sandbox instead of the main web'],
                                             oddStyle => ['odd', 'style for odd rows'],  
                                             trace => [2, 'tracing level'],  
                                             evenStyle => ['even', 'style for even rows'],  
                                             tableStyle => ['genomestats', 'style for whole table'],  
                                             markerStyle => ['tinytext', 'style for markers'],  
                                             numStyle => ['numcell', 'style for cells with numeric values'],  
                                             counterStyle => ['countercell', 'style for cells with counter values'],  
                                             linkCGI => ['../FIG/genome_statistics.cgi',  
                                                         'path to CGI script for detailed statistics'],  
70                                              noNewCheck => [0, 'if specified, skips the check for new genomes'],                                              noNewCheck => [0, 'if specified, skips the check for new genomes'],
                                             targetDir => ["$FIG_Config::nmpdr_base/next/html/includes",  
                                                           'target directory'],  
71                                              },                                              },
72                                             "",                                             "",
73                                             @ARGV);                                             @ARGV);
# Line 119  Line 79 
79      # displayed along with the attribute values.      # displayed along with the attribute values.
80      my %specialCols = (Serotype => 'Serotype_code',      my %specialCols = (Serotype => 'Serotype_code',
81                         Phenotype => 'Phenotype');                         Phenotype => 'Phenotype');
82      # Verify the directory name.      my $outputWeb = ($options->{test} ? 'Sandbox' : 'Main');
     my $targetDir = $options->{targetDir};  
     if (! $targetDir) {  
         Confess("No target directory specified.");  
     } elsif (! -d $targetDir) {  
         Confess("Target directory $targetDir not found.");  
     } else {  
83          # Get the new Sprout.          # Get the new Sprout.
84          my $sprout = SFXlate->new_sprout_only();          my $sprout = SFXlate->new_sprout_only();
85          my %newGroupHash = $sprout->GetGroups();          my %newGroupHash = $sprout->GetGroups();
86        # Get a wiki helper.
87        my $wiki = WikiTools->new();
88          # Extract the genome group data from the new Sprout.          # Extract the genome group data from the new Sprout.
         if (! $options->{strict}) {  
89              %newGroupHash = $sprout->Fix(%newGroupHash);              %newGroupHash = $sprout->Fix(%newGroupHash);
         }  
90          # This hash will be used to determine which genomes are new.          # This hash will be used to determine which genomes are new.
91          my %oldGroupHash = ();          my %oldGroupHash = ();
92          if ($options->{noNewCheck}) {          if ($options->{noNewCheck}) {
# Line 144  Line 98 
98              my $oldSprout = SFXlate->old_sprout_only();              my $oldSprout = SFXlate->old_sprout_only();
99              # Extract the genome group data from the old Sprout.              # Extract the genome group data from the old Sprout.
100              %oldGroupHash = $oldSprout->GetGroups();              %oldGroupHash = $oldSprout->GetGroups();
             if (! $options->{strict}) {  
101                  %oldGroupHash = $oldSprout->Fix(%oldGroupHash);                  %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
102              }              }
         }  
103          # Get a FIG object for computing attributes.          # Get a FIG object for computing attributes.
104          my $fig = FIG->new();          my $fig = FIG->new();
105          # Get the super-group list.          # Get the super-group list.
106          my @superGroups = sort keys %newGroupHash;          my @superGroups = sort keys %newGroupHash;
107          # Set up some useful stuff for the four count columns.          # Set up some useful stuff for the four count columns.
108          my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",      my $url = "%SCRIPTURL{rest}%/NmpdrPlugin/search?Class=OrgSumSearch;Search=Go";
109                            s1 => "hypo_sub", n1 => "hypo_nosub" );      my %linkParms = ( s0 => "$url;hypothetical=named;insubsystem=in;genome=",
110                          n0 => "$url;hypothetical=named;insubsystem=out;genome=",
111                          s1 => "$url;hypothetical=hypo;insubsystem=in;genome=",
112                          n1 => "$url;hypothetical=hypo;insubsystem=out;genome=" );
113          my @columnTypes = ('s0', 'n0', 's1', 'n1');          my @columnTypes = ('s0', 'n0', 's1', 'n1');
         # Get the styles.  
         my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},  
                                                      $options->{evenStyle}, $options->{oddStyle});  
         my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});  
114          # Prepare a hash for the summary counters. These will be used on the organism summary page.          # Prepare a hash for the summary counters. These will be used on the organism summary page.
115          my %summaries = ();          my %summaries = ();
116          # Loop through the groups.          # Loop through the groups.
# Line 176  Line 127 
127              if (exists $oldGroupHash{$groupID}) {              if (exists $oldGroupHash{$groupID}) {
128                  %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};                  %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
129              }              }
130              # Create the output file.          # Compute the name of the wiki page we're building.
131              my $outFileName = "stats-" . lc($groupID) . ".inc";          my $outPageName = "${groupID}Stats";
132              Open(\*GROUPFILE, ">$targetDir/$outFileName");          # We'll put the data for the page in here.
133            my @outputLines = ();
134              # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain              # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
135              # a sub-hash that translates each genome ID to its applicable attribute value (if any).              # a sub-hash that translates each genome ID to its applicable attribute value (if any).
136              my %specialData = ();              my %specialData = ();
# Line 191  Line 143 
143                      $specialData{$specialColumn} = \%specialDataList;                      $specialData{$specialColumn} = \%specialDataList;
144                  }                  }
145              }              }
146              # Set up the column names.          # Set up the column names. Note that an extra space in front of a name will be interpreted by
147              my @columnNames = "Strain annotated in NMPDR";          # the Wiki markup as an order to right-justify the text.
148              push @columnNames, sort keys %specialData;          my @columnNames = "*Strain annotated in NMPDR*";
149              push @columnNames,  "Genome size, bp",          push @columnNames, map { "*$_*" } sort keys %specialData;
150                                  "Protein Encoding Genes (PEGs)",          push @columnNames,  "*Genome size, bp*",
151                                  "Named genes in subsystems",            # s0                              " *%FIG{Protein Encoding Genes}% (PEGs)*",
152                                  "Named genes not in subsystems",        # n0                              " *Named genes in subsystems*",            # s0
153                                  "Hypothetical genes in subsystems",     # s1                              " *Named genes not in subsystems*",        # n0
154                                  "Hypothetical genes not in subsystems", # n1                              " *Hypothetical genes in subsystems*",     # s1
155                                  "Subsystems",                              " *Hypothetical genes not in subsystems*", # n1
156                                  "RNAs";                              " *Subsystems*",
157              # Start the table.                              " *RNAs*";
158              print GROUPFILE "<table class=\"$tableStyle\">\n";          # Make the table sortable.
159            push @outputLines, '%TABLE{sort="on"}%';
160              # Create the header row.              # Create the header row.
161              print GROUPFILE Tr( { class => 'odd' }, th(\@columnNames)) . "\n";          push @outputLines, "| " . join(" | ", @columnNames) . " |";
162              # The data rows will be built next. We'll be putting them into a hash keyed by              # The data rows will be built next. We'll be putting them into a hash keyed by
163              # organism name. The hash enables us to spit them out sorted by name.              # organism name. The hash enables us to spit them out sorted by name.
164              my %rows = ();              my %rows = ();
# Line 221  Line 174 
174                  # Get the strain name.                  # Get the strain name.
175                  my $genomeName = $sprout->GenusSpecies($genomeID);                  my $genomeName = $sprout->GenusSpecies($genomeID);
176                  # Apply a link.                  # Apply a link.
177                  my $genomeText = CGI::a({ href => "../FIG/genome_statistics.cgi?genome=$genomeID;SPROUT=1" }, $genomeName);              my $genomeText = "%SV{\"$genomeName\" id=\"$genomeID\"}%";
178                  # If this is a new strain, build the HTML for the NEW! mark.              # If this is a new strain, add the NEW! mark.
179                  if ($new) {                  if ($new) {
180                      $new = " <span class=\"$markerStyle\">NEW!</span>";                  $genomeText .= " %N%";
181                  }                  }
182                  # Get the genome length.                  # Get the genome length.
183                  $num = $sprout->GenomeLength($genomeID);                  $num = $sprout->GenomeLength($genomeID);
# Line 269  Line 222 
222                      $groupTotals{$counterKey} += $counters{$counterKey};                      $groupTotals{$counterKey} += $counters{$counterKey};
223                  }                  }
224                  $groupTotals{features} += $totalFeatures;                  $groupTotals{features} += $totalFeatures;
225                  # We have all our data. Next we need to compute the percentages and the links.              # We have all our data. Next we need to compute the percentages.
                 # First, the link stuff.  
                 my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";  
                 # Now format the counters and percentages.  
226                  for my $type (keys %linkParms) {                  for my $type (keys %linkParms) {
227                      $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },                  my $counterData = sprintf("%d(%.1f%%)", $counters{$type},
228                                           sprintf("%d(%.1f%%)", $counters{$type},                                            Tracer::Percent($counters{$type}, $totalFeatures));
229                                                   Tracer::Percent($counters{$type}, $totalFeatures)));                  $counters{$type} = CGI::a({ href => $linkParms{$type} . $genomeID }, $counterData);
230                  }                  }
231                  my @counterValues = map { $counters{$_} } @columnTypes;                  my @counterValues = map { $counters{$_} } @columnTypes;
232                  # The last link is a button to look at the subsystem summaries.              # The last column is a subsystem count.
233                  my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',                  my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
234                                                     [$genomeID]);                                                     [$genomeID]);
235                  my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";              my $ssCol = $ssCount;
                 my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";  
236                  # Start creating the table cells.                  # Start creating the table cells.
237                  my $rowHtml = td("$genomeText$new");              my $rowHtml = "| $genomeText |";
238                  # Add any special columns.                  # Add any special columns.
239                  for my $specialCol (keys %specialData) {                  for my $specialCol (keys %specialData) {
240                      # Here we get the attribute value. If there is none, we leave the column blank.                      # Here we get the attribute value. If there is none, we leave the column blank.
241                      my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";                      my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
242                      $rowHtml .= td($attribute);                  $rowHtml .= " $attribute |";
243                  }                  }
244                  # Now add the data columns.                  # Now add the data columns.
245                  $rowHtml .= join("",              $rowHtml .= join("", map { "  $_ |" } ($genomeLen, $pegCount, @counterValues, $ssCol, $rnaCount));
                                 td({ class => $numStyle }, $genomeLen),  
                                 td({ class => $numStyle }, $pegCount),  
                                 td({ class => $counterStyle }, \@counterValues),  
                                 td({ class => $numStyle }, $ssCol),  
                                 td({ class => $numStyle }, $rnaCount),  
                                 );  
246                  # Put it in the row hash.                  # Put it in the row hash.
247                  $rows{$genomeName} = $rowHtml;                  $rows{$genomeName} = $rowHtml;
248              }              }
# Line 312  Line 255 
255              # Loop through the rows.              # Loop through the rows.
256              for my $rowID (sort keys %rows) {              for my $rowID (sort keys %rows) {
257                  # Format the row.                  # Format the row.
258                  print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";              push @outputLines, $rows{$rowID};
                 # Flip the row type.  
                 $rowType = 1 - $rowType;  
259                  # Count the row.                  # Count the row.
260                  $rowCount++;                  $rowCount++;
261              }              }
262              # All done, terminate the table and close the file.          # All done, write the Wiki Page.
263              print GROUPFILE "</table>\n";          my $rc = $wiki->Save($outPageName, $outputWeb, 'OrganismDataSummariesStats', join("\n", @outputLines));
             close GROUPFILE;  
264              Trace("$rowCount genomes processed.") if T(2);              Trace("$rowCount genomes processed.") if T(2);
265            if (! $rc) {
266                Confess("Error saving $outPageName: $wiki->{error}");
267            }
268              # Now save the group totals.              # Now save the group totals.
269              $summaries{$groupID} = \%groupTotals;              $summaries{$groupID} = \%groupTotals;
270          }          }
271          # Now produce the summary table.          # Now produce the summary table.
272          my $sumFileName = "stats-groups.inc";      my $sumPageName = "OrganismDataSummariesStats";
273          Open(\*SUMFILE, ">$targetDir/$sumFileName");      my @sumLines = ();
274          # Start the table.      # Start the table.  Asterisks make a cell a column header. An extra space at the front right-justifies it.
275          print SUMFILE "<table class=\"$tableStyle\">\n";      push @sumLines, "| " . join("", map { " *$_* |" } ("Group name", "Genomes")) .
276          # Create the header row.                             join("", map { "  *$_* |"} ("[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)",
         print SUMFILE Tr( { class => 'odd' }, th(["Group name",  
                                                  "Genomes",  
                                                  "Protein Encoding Genes (PEGs)",  
277                                                   "Named genes in subsystems",            # s0                                                   "Named genes in subsystems",            # s0
278                                                   "Named genes not in subsystems",        # n0                                                   "Named genes not in subsystems",        # n0
279                                                   "Hypothetical genes in subsystems",     # s1                                                   "Hypothetical genes in subsystems",     # s1
280                                                   "Hypothetical genes not in subsystems", # n1                                                   "Hypothetical genes not in subsystems", # n1
281                                                   "RNAs",                                                         "RNAs"));
                                                    ])) . "\n";  
         # Set up a flag for the odd-even styling.  
         my $rowFlag = 0;  
282          # Put in the data rows.          # Put in the data rows.
283          for my $groupName (sort keys %summaries) {          for my $groupName (sort keys %summaries) {
284              my $group = $summaries{$groupName};              my $group = $summaries{$groupName};
             # Compute the link for the current group.  
             my $groupLink = a({ href => $sprout->GroupPageName($groupName) }, $groupName);  
285              # Create the table row.              # Create the table row.
286              my $rowHtml = join("",          my $rowHtml = "| [[$groupName]] |" . join("", map { "  " . Tracer::CommaFormat($group->{$_}) . " |" }
287                                 td($groupLink),                                                    ('genomes', 'pegs', @columnTypes, 'RNAs'));
288                                 td({ class => $numStyle }, Tracer::CommaFormat($group->{genomes})),          push @sumLines, $rowHtml;
289                                 td({ class => $numStyle }, Tracer::CommaFormat($group->{pegs})),      }
290                                 td({ class => $counterStyle }, [ map { Tracer::CommaFormat($group->{$_}) } @columnTypes ]),      # Write the page.
291                                 td({ class => $numStyle }, Tracer::CommaFormat($group->{RNAs})),      my $rc = $wiki->Save($sumPageName, $outputWeb, 'WebHome', join("\n", @sumLines));
                               );  
             print SUMFILE Tr( { class => $rowStyle[$rowFlag] }, $rowHtml ) . "\n";  
             # Flip the row style.  
             $rowFlag = 1 - $rowFlag;  
         }  
         # Terminate the table and close the file.  
         print SUMFILE "</table>\n";  
         close SUMFILE;  
292          # We're all done.          # We're all done.
293          Trace("Processing complete.") if T(2);          Trace("Processing complete.") if T(2);
     }  
294  };  };
295  if ($@) {  if ($@) {
296      Trace("Stats failed with error: $@") if T(0);      Trace("Stats failed with error: $@") if T(0);

Legend:
Removed from v.1.31  
changed lines
  Added in v.1.36

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3