[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.27, Sun Oct 8 20:55:06 2006 UTC revision 1.34, Sat Sep 20 14:31:22 2008 UTC
# Line 2  Line 2 
2    
3  =head1 Genome Data Generator  =head1 Genome Data Generator
4    
5  This script creates a set of HTML include files that list the statistics for  This script creates a set of Wiki pages that list the statistics for
6  the genomes in each of the genome groups. Genomes that are new to this version  the genomes in each of the genome groups. Genomes that are new to this version
7  of the Sprout will be specially marked. In order for this to work, both the  of the Sprout will be specially marked. In order for this to work, both the
8  current and previous Sprout databases must be available on this machine.  current and previous Sprout databases must be available on this machine.
# Line 37  Line 37 
37    
38  Display this command's parameters and options.  Display this command's parameters and options.
39    
40  =item strict  =item test
41    
42  If specified, strict groups will be used; otherwise, groups with a common primary name  If specified, the output pages will be put into the sandbox instead of the main web.
 will be combined into a single group. (The primary name of the group is the first  
 capitalized word.)  
   
 =item oddStyle  
   
 Style to use for odd rows of the table.  
   
 =item evenStyle  
   
 Style to use for even rows of the table.  
   
 =item tableStyle  
   
 Style to use for the table itself.  
   
 =item markerStyle  
   
 Style to use for small-text markers (e.g. NEW!)  
   
 =item numStyle  
   
 Style to use for numeric cells.  
   
 =item counterStyle  
   
 Style to use for counter cells.  
   
 =item linkCGI  
   
 Path to the CGI script for displaying detailed statistics.  
43    
44  =item noNewCheck  =item noNewCheck
45    
# Line 84  Line 54 
54    
55  use strict;  use strict;
56  use Tracer;  use Tracer;
 use DocUtils;  
 use TestUtils;  
57  use Cwd;  use Cwd;
58  use File::Copy;  use File::Copy;
59  use File::Path;  use File::Path;
# Line 93  Line 61 
61  use SFXlate;  use SFXlate;
62  use CGI qw(:standard);  use CGI qw(:standard);
63  use FIG;  use FIG;
64  no warnings 'once'; # only when coding  use WikiTools;
65    
66  # Get the command-line options and parameters.  # Get the command-line options and parameters.
67  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
68                                             {                                             {
69                                              strict => [0, 'keep related groups separate'],                                              test => [0, 'if specified, publishes to the wiki sandbox instead of the main web'],
                                             oddStyle => ['odd', 'style for odd rows'],  
                                             trace => [2, 'tracing level'],  
                                             evenStyle => ['even', 'style for even rows'],  
                                             tableStyle => ['genomestats', 'style for whole table'],  
                                             markerStyle => ['tinytext', 'style for markers'],  
                                             numStyle => ['numcell', 'style for cells with numeric values'],  
                                             counterStyle => ['countercell', 'style for cells with counter values'],  
                                             linkCGI => ['../FIG/genome_statistics.cgi',  
                                                         'path to CGI script for detailed statistics'],  
                                             groupFile => ["$FIG_Config::sproutData/groups.tbl",  
                                                           'location of the NMPDR group description file'],  
70                                              noNewCheck => [0, 'if specified, skips the check for new genomes'],                                              noNewCheck => [0, 'if specified, skips the check for new genomes'],
                                             targetDir => ["$FIG_Config::nmpdr_base/next/html/includes",  
                                                           'target directory'],  
71                                              },                                              },
72                                             "",                                             "",
73                                             @ARGV);                                             @ARGV);
74  # Verify the directory name.  # The return type (error/no error) goes in here.
75  my $targetDir = $options->{targetDir};  my $rtype;
76  if (! $targetDir) {  eval {
77      Confess("No target directory specified.");      # This table controls the special attribute columns. For each we need to know the attribute name and the
78  } elsif (! -d $targetDir) {      # column title. If any genomes in a group have a value for one of the special columns, that column is
79      Confess("Target directory $targetDir not found.");      # displayed along with the attribute values.
80  } else {      my %specialCols = (Serotype => 'Serotype_code',
81                           Phenotype => 'Phenotype');
82        my $outputWeb = ($options->{test} ? 'Sandbox' : 'Main');
83      # Get the new Sprout.      # Get the new Sprout.
84      my $sprout = SFXlate->new_sprout_only();      my $sprout = SFXlate->new_sprout_only();
85      my %newGroupHash = $sprout->GetGroups();      my %newGroupHash = $sprout->GetGroups();
86        # Get a wiki helper.
87        my $wiki = WikiTools->new();
88      # Extract the genome group data from the new Sprout.      # Extract the genome group data from the new Sprout.
89      if (! $options->{strict}) {      %newGroupHash = $sprout->Fix(%newGroupHash);
         %newGroupHash = Sprout::Fix(%newGroupHash);  
     }  
90      # This hash will be used to determine which genomes are new.      # This hash will be used to determine which genomes are new.
91      my %oldGroupHash = ();      my %oldGroupHash = ();
92      if ($options->{noNewCheck}) {      if ($options->{noNewCheck}) {
# Line 138  Line 95 
95          %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;          %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
96      } else {      } else {
97          # Get the old Sprout.          # Get the old Sprout.
98          my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);          my $oldSprout = SFXlate->old_sprout_only();
99          # Extract the genome group data from the old Sprout.          # Extract the genome group data from the old Sprout.
100          %oldGroupHash = $oldSprout->GetGroups();          %oldGroupHash = $oldSprout->GetGroups();
101          if (! $options->{strict}) {          %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
             %oldGroupHash = Sprout::Fix(%oldGroupHash);  
         }  
102      }      }
103      # Read the group file.      # Get a FIG object for computing attributes.
104      my %groupData = Sprout::ReadGroupFile($options->{groupFile});      my $fig = FIG->new();
105        # Get the super-group list.
106        my @superGroups = sort keys %newGroupHash;
107      # Set up some useful stuff for the four count columns.      # Set up some useful stuff for the four count columns.
108      my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",      my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
109                        s1 => "hypo_sub", n1 => "hypo_nosub" );                        s1 => "hypo_sub", n1 => "hypo_nosub" );
110      my @columnTypes = ('s0', 'n0', 's1', 'n1');      my @columnTypes = ('s0', 'n0', 's1', 'n1');
     # Get the styles.  
     my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},  
                                                  $options->{evenStyle}, $options->{oddStyle});  
     my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});  
111      # Prepare a hash for the summary counters. These will be used on the organism summary page.      # Prepare a hash for the summary counters. These will be used on the organism summary page.
112      my %summaries = ();      my %summaries = ();
113      # Loop through the groups.      # Loop through the groups.
114      for my $groupID (keys %newGroupHash) {      for my $groupID (@superGroups) {
115          Trace("Processing group $groupID.") if T(2);          Trace("Processing group $groupID.") if T(2);
116          # Create a hash for summarizing the counters.          # Create a hash for summarizing the counters.
117          my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,          my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
# Line 171  Line 124 
124          if (exists $oldGroupHash{$groupID}) {          if (exists $oldGroupHash{$groupID}) {
125              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
126          }          }
127          # Create the output file.          # Compute the name of the wiki page we're building.
128          my $outFileName = "stats-" . lc($groupID) . ".inc";          my $outPageName = "${groupID}Stats";
129          Open(\*GROUPFILE, ">$targetDir/$outFileName");          # We'll put the data for the page in here.
130          # Start the table.          my @outputLines = ();
131          print GROUPFILE "<table class=\"$tableStyle\">\n";          # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
132            # a sub-hash that translates each genome ID to its applicable attribute value (if any).
133            my %specialData = ();
134            for my $specialColumn (keys %specialCols) {
135                # Get the attribute mapping.
136                my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
137                # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
138                # for columns that exist in the output.
139                if (scalar(keys %specialDataList)) {
140                    $specialData{$specialColumn} = \%specialDataList;
141                }
142            }
143            # Set up the column names. Note that an extra space in front of a name will be interpreted by
144            # the Wiki markup as an order to right-justify the text.
145            my @columnNames = "*Strain annotated in NMPDR*";
146            push @columnNames, map { "*$_*" } sort keys %specialData;
147            push @columnNames,  "*Genome size, bp*",
148                                " *%FIG{Protein Encoding Genes}% (PEGs)*",
149                                " *Named genes in subsystems*",            # s0
150                                " *Named genes not in subsystems*",        # n0
151                                " *Hypothetical genes in subsystems*",     # s1
152                                " *Hypothetical genes not in subsystems*", # n1
153                                " *Subsystems*",
154                                " *RNAs*";
155          # Create the header row.          # Create the header row.
156          print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR",          push @outputLines, "| " . join(" | ", @columnNames) . " |";
                                                  "Genome size, bp",  
                                                  "Protein Encoding Genes (PEGs)",  
                                                  "Named genes in subsystems",            # s0  
                                                  "Named genes not in subsystems",        # n0  
                                                  "Hypothetical genes in subsystems",     # s1  
                                                  "Hypothetical genes not in subsystems", # n1  
                                                  "Subsystems",  
                                                  "RNAs",  
                                                    ])) . "\n";  
157          # The data rows will be built next. We'll be putting them into a hash keyed by          # The data rows will be built next. We'll be putting them into a hash keyed by
158          # organism name. The hash enables us to spit them out sorted by name.          # organism name. The hash enables us to spit them out sorted by name.
159          my %rows = ();          my %rows = ();
# Line 201  Line 168 
168              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
169              # Get the strain name.              # Get the strain name.
170              my $genomeName = $sprout->GenusSpecies($genomeID);              my $genomeName = $sprout->GenusSpecies($genomeID);
171              # If this is a new strain, build the HTML for the NEW! mark.              # Apply a link.
172                my $genomeText = "%SV{\"$genomeName\" id=\"$genomeID\"}%";
173                # If this is a new strain, add the NEW! mark.
174              if ($new) {              if ($new) {
175                  $new = " <span class=\"$markerStyle\">NEW!</span>";                  $genomeText .= " %N%";
176              }              }
177              # Get the genome length.              # Get the genome length.
178              $num = $sprout->GenomeLength($genomeID);              $num = $sprout->GenomeLength($genomeID);
# Line 248  Line 217 
217                  $groupTotals{$counterKey} += $counters{$counterKey};                  $groupTotals{$counterKey} += $counters{$counterKey};
218              }              }
219              $groupTotals{features} += $totalFeatures;              $groupTotals{features} += $totalFeatures;
220              # We have all our data. Next we need to compute the percentages and the links.              # We have all our data. Next we need to compute the percentages.
             # First, the link stuff.  
             my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";  
             # Now format the counters and percentages.  
221              for my $type (keys %linkParms) {              for my $type (keys %linkParms) {
222                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },                  $counters{$type} = sprintf("%d(%.1f%%)", $counters{$type},
223                                       sprintf("%d(%.1f%%)", $counters{$type},                                             Tracer::Percent($counters{$type}, $totalFeatures));
                                              Tracer::Percent($counters{$type}, $totalFeatures)));  
224              }              }
225              my @counterValues = map { $counters{$_} } @columnTypes;              my @counterValues = map { $counters{$_} } @columnTypes;
226              # The last link is a button to look at the subsystem summaries.              # The last column is a subsystem count.
227              my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',              my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
228                                                 [$genomeID]);                                                 [$genomeID]);
229              my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";              my $ssCol = $ssCount;
230              my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";              # Start creating the table cells.
231              # Create the row text. Note that we use the distributive capability of the TD              my $rowHtml = "| $genomeText |";
232              # function to apply the same style to each one.              # Add any special columns.
233              my $rowHtml = join("",              for my $specialCol (keys %specialData) {
234                                 td("$genomeName$new"),                  # Here we get the attribute value. If there is none, we leave the column blank.
235                                 td({ class => $numStyle }, $genomeLen),                  my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
236                                 td({ class => $numStyle }, $pegCount),                  $rowHtml .= " $attribute |";
237                                 td({ class => $counterStyle }, \@counterValues),              }
238                                 td({ class => $numStyle }, $ssCol),              # Now add the data columns.
239                                 td({ class => $numStyle }, $rnaCount),              $rowHtml .= join("", map { "  $_ |" } ($genomeLen, $pegCount, @counterValues, $ssCol, $rnaCount));
                               );  
240              # Put it in the row hash.              # Put it in the row hash.
241              $rows{$genomeName} = $rowHtml;              $rows{$genomeName} = $rowHtml;
242          }          }
# Line 285  Line 249 
249          # Loop through the rows.          # Loop through the rows.
250          for my $rowID (sort keys %rows) {          for my $rowID (sort keys %rows) {
251              # Format the row.              # Format the row.
252              print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";              push @outputLines, $rows{$rowID};
             # Flip the row type.  
             $rowType = 1 - $rowType;  
253              # Count the row.              # Count the row.
254              $rowCount++;              $rowCount++;
255          }          }
256          # All done, terminate the table and close the file.          # All done, write the Wiki Page.
257          print GROUPFILE "</table>\n";          my $rc = $wiki->Save($outPageName, $outputWeb, 'OrganismDataSummariesStats', join("\n", @outputLines));
         close GROUPFILE;  
258          Trace("$rowCount genomes processed.") if T(2);          Trace("$rowCount genomes processed.") if T(2);
259            if (! $rc) {
260                Confess("Error saving $outPageName: $wiki->{error}");
261            }
262          # Now save the group totals.          # Now save the group totals.
263          $summaries{$groupID} = \%groupTotals;          $summaries{$groupID} = \%groupTotals;
264      }      }
265      # Now produce the summary table.      # Now produce the summary table.
266      my $sumFileName = "stats-groups.inc";      my $sumPageName = "OrganismDataSummariesStats";
267      Open(\*SUMFILE, ">$targetDir/$sumFileName");      my @sumLines = ();
268      # Start the table.      # Start the table.  Asterisks make a cell a column header. An extra space at the front right-justifies it.
269      print SUMFILE "<table class=\"$tableStyle\">\n";      push @sumLines, "| " . join("", map { " *$_* |" } ("Group name", "Genomes")) .
270      # Create the header row.                             join("", map { "  *$_* |"} ("[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)",
     print SUMFILE Tr( { class => 'odd' }, th(["Group name",  
                                              "Genomes",  
                                              "Protein Encoding Genes (PEGs)",  
271                                               "Named genes in subsystems",            # s0                                               "Named genes in subsystems",            # s0
272                                               "Named genes not in subsystems",        # n0                                               "Named genes not in subsystems",        # n0
273                                               "Hypothetical genes in subsystems",     # s1                                               "Hypothetical genes in subsystems",     # s1
274                                               "Hypothetical genes not in subsystems", # n1                                               "Hypothetical genes not in subsystems", # n1
275                                               "RNAs",                                                         "RNAs"));
                                                ])) . "\n";  
     # Set up a flag for the odd-even styling.  
     my $rowFlag = 0;  
276      # Put in the data rows.      # Put in the data rows.
277      for my $groupName (sort keys %summaries) {      for my $groupName (sort keys %summaries) {
278          my $group = $summaries{$groupName};          my $group = $summaries{$groupName};
         # Compute the link for the current group.  
         my $groupLink = a({ href => $groupData{$groupName}->[0] }, $groupName);  
279          # Create the table row.          # Create the table row.
280          my $rowHtml = join("",          my $rowHtml = "| [[$groupName]] |" . join("", map { "  " . Tracer::CommaFormat($group->{$_}) . " |" }
281                             td($groupLink),                                                    ('genomes', 'pegs', @columnTypes, 'RNAs'));
282                             td({ class => $numStyle }, Tracer::CommaFormat($group->{genomes})),          push @sumLines, $rowHtml;
283                             td({ class => $numStyle }, Tracer::CommaFormat($group->{pegs})),      }
284                             td({ class => $counterStyle }, [ map { Tracer::CommaFormat($group->{$_}) } @columnTypes ]),      # Write the page.
285                             td({ class => $numStyle }, Tracer::CommaFormat($group->{RNAs})),      my $rc = $wiki->Save($sumPageName, $outputWeb, 'WebHome', join("\n", @sumLines));
                           );  
         print SUMFILE Tr( { class => $rowStyle[$rowFlag] }, $rowHtml ) . "\n";  
         # Flip the row style.  
         $rowFlag = 1 - $rowFlag;  
     }  
     # Terminate the table and close the file.  
     print SUMFILE "</table>\n";  
     close SUMFILE;  
286      # We're all done.      # We're all done.
287      Trace("Processing complete.") if T(2);      Trace("Processing complete.") if T(2);
288    };
289    if ($@) {
290        Trace("Stats failed with error: $@") if T(0);
291        $rtype = "error";
292    } else {
293        Trace("Stats complete.") if T(2);
294        $rtype = "no error";
295    }
296    if ($options->{phone}) {
297        my $msgID = Tracer::SendSMS($options->{phone}, "GenomeStats terminated with $rtype.");
298        if ($msgID) {
299            Trace("Phone message sent with ID $msgID.") if T(2);
300        } else {
301            Trace("Phone message not sent.") if T(2);
302        }
303  }  }
304    
305  1;  1;

Legend:
Removed from v.1.27  
changed lines
  Added in v.1.34

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3