[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory | Revision Log | View Patch

revision 1.23, Wed Sep 13 04:26:44 2006 UTC revision 1.34, Sat Sep 20 14:31:22 2008 UTC
# Line 2  Line 2 
2    
3  =head1 Genome Data Generator  =head1 Genome Data Generator
4    
5  This script creates a set of HTML include files that list the statistics for  This script creates a set of Wiki pages that list the statistics for
6  the genomes in each of the genome groups. Genomes that are new to this version  the genomes in each of the genome groups. Genomes that are new to this version
7  of the Sprout will be specially marked. In order for this to work, both the  of the Sprout will be specially marked. In order for this to work, both the
8  current and previous Sprout databases must be available on this machine.  current and previous Sprout databases must be available on this machine.
 There is one positional parameter: the name of a directory in which to place  
 the include files.  
9    
10  The currently-supported command-line options are as follows.  The currently-supported command-line options are as follows.
11    
# Line 39  Line 37 
37    
38  Display this command's parameters and options.  Display this command's parameters and options.
39    
40  =item strict  =item test
41    
42  If specified, strict groups will be used; otherwise, groups with a common primary name  If specified, the output pages will be put into the sandbox instead of the main web.
 will be combined into a single group. (The primary name of the group is the first  
 capitalized word.)  
43    
44  =item oddStyle  =item noNewCheck
45    
46  Style to use for odd rows of the table.  If specified, the check for new genomes in the group is suppressed. This
47    may need to be done if there's been a change in the database definition. Note
48  =item evenStyle  that all this really does is keep the B<NEW!> symbol from showing. It does
49    not affect which genomes show up in the table.
 Style to use for even rows of the table.  
   
 =item tableStyle  
   
 Style to use for the table itself.  
   
 =item markerStyle  
   
 Style to use for small-text markers (e.g. NEW!)  
   
 =item numStyle  
   
 Style to use for numeric cells.  
   
 =item counterStyle  
   
 Style to use for counter cells.  
   
 =item linkCGI  
   
 Path to the CGI script for displaying detailed statistics.  
50    
51  =back  =back
52    
# Line 79  Line 54 
54    
55  use strict;  use strict;
56  use Tracer;  use Tracer;
 use DocUtils;  
 use TestUtils;  
57  use Cwd;  use Cwd;
58  use File::Copy;  use File::Copy;
59  use File::Path;  use File::Path;
# Line 88  Line 61 
61  use SFXlate;  use SFXlate;
62  use CGI qw(:standard);  use CGI qw(:standard);
63  use FIG;  use FIG;
64  no warnings 'once'; # only when coding  use WikiTools;
65    
66  # Get the command-line options and parameters.  # Get the command-line options and parameters.
67  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
68                                             {                                             {
69                                              strict => [0, 'keep related groups separate'],                                              test => [0, 'if specified, publishes to the wiki sandbox instead of the main web'],
70                                              oddStyle => ['odd', 'style for odd rows'],                                              noNewCheck => [0, 'if specified, skips the check for new genomes'],
                                             evenStyle => ['even', 'style for even rows'],  
                                             tableStyle => ['genomestats', 'style for whole table'],  
                                             markerStyle => ['tinytext', 'style for markers'],  
                                             numStyle => ['numcell', 'style for cells with numeric values'],  
                                             counterStyle => ['countercell', 'style for cells with counter values'],  
                                             linkCGI => ['../FIG/genome_statistics.cgi',  
                                                         'path to CGI script for detailed statistics'],  
                                             groupFile => ["$FIG_Config::sproutData/groups.tbl",  
                                                           "location of the NMPDR group description file"],  
71                                              },                                              },
72                                             "<targetDir>",                                             "",
73                                             @ARGV);                                             @ARGV);
74  # Verify the directory name.  # The return type (error/no error) goes in here.
75  my $targetDir = $parameters[0];  my $rtype;
76  if (! $targetDir) {  eval {
77      Confess("No target directory specified.");      # This table controls the special attribute columns. For each we need to know the attribute name and the
78  } elsif (! -d $targetDir) {      # column title. If any genomes in a group have a value for one of the special columns, that column is
79      Confess("Target directory $targetDir not found.");      # displayed along with the attribute values.
80  } else {      my %specialCols = (Serotype => 'Serotype_code',
81      # Get the old Sprout.                         Phenotype => 'Phenotype');
82      my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);      my $outputWeb = ($options->{test} ? 'Sandbox' : 'Main');
     # Extract the genome group data from the old Sprout.  
     my %oldGroupHash = $oldSprout->GetGroups();  
     if (! $options->{strict}) {  
         %oldGroupHash = Sprout::Fix(%oldGroupHash);  
     }  
83      # Get the new Sprout.      # Get the new Sprout.
84      my $sprout = SFXlate->new_sprout_only();      my $sprout = SFXlate->new_sprout_only();
85      my %newGroupHash = $sprout->GetGroups();      my %newGroupHash = $sprout->GetGroups();
86      if (! $options->{strict}) {      # Get a wiki helper.
87          %newGroupHash = Sprout::Fix(%newGroupHash);      my $wiki = WikiTools->new();
88        # Extract the genome group data from the new Sprout.
89        %newGroupHash = $sprout->Fix(%newGroupHash);
90        # This hash will be used to determine which genomes are new.
91        my %oldGroupHash = ();
92        if ($options->{noNewCheck}) {
93            # Here we can't look at the old Sprout. Set up the hash
94            # so it looks like the old Sprout's data is the same as ours.
95            %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
96        } else {
97            # Get the old Sprout.
98            my $oldSprout = SFXlate->old_sprout_only();
99            # Extract the genome group data from the old Sprout.
100            %oldGroupHash = $oldSprout->GetGroups();
101            %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
102      }      }
103      # Read the group file.      # Get a FIG object for computing attributes.
104      my %groupData = Sprout::ReadGroupFile($options->{groupFile});      my $fig = FIG->new();
105        # Get the super-group list.
106        my @superGroups = sort keys %newGroupHash;
107      # Set up some useful stuff for the four count columns.      # Set up some useful stuff for the four count columns.
108      my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",      my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
109                        s1 => "hypo_sub", n1 => "hypo_nosub" );                        s1 => "hypo_sub", n1 => "hypo_nosub" );
110      my @columnTypes = ('s0', 'n0', 's1', 'n1');      my @columnTypes = ('s0', 'n0', 's1', 'n1');
     # Get the styles.  
     my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},  
                                                  $options->{evenStyle}, $options->{oddStyle});  
     my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});  
111      # Prepare a hash for the summary counters. These will be used on the organism summary page.      # Prepare a hash for the summary counters. These will be used on the organism summary page.
112      my %summaries = ();      my %summaries = ();
113      # Loop through the groups.      # Loop through the groups.
114      for my $groupID (keys %newGroupHash) {      for my $groupID (@superGroups) {
115          Trace("Processing group $groupID.") if T(2);          Trace("Processing group $groupID.") if T(2);
116          # Create a hash for summarizing the counters.          # Create a hash for summarizing the counters.
117          my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,          my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
# Line 153  Line 124 
124          if (exists $oldGroupHash{$groupID}) {          if (exists $oldGroupHash{$groupID}) {
125              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
126          }          }
127          # Create the output file.          # Compute the name of the wiki page we're building.
128          my $outFileName = "stats-" . lc($groupID) . ".inc";          my $outPageName = "${groupID}Stats";
129          Open(\*GROUPFILE, ">$targetDir/$outFileName");          # We'll put the data for the page in here.
130          # Start the table.          my @outputLines = ();
131          print GROUPFILE "<table class=\"$tableStyle\">\n";          # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
132            # a sub-hash that translates each genome ID to its applicable attribute value (if any).
133            my %specialData = ();
134            for my $specialColumn (keys %specialCols) {
135                # Get the attribute mapping.
136                my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
137                # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
138                # for columns that exist in the output.
139                if (scalar(keys %specialDataList)) {
140                    $specialData{$specialColumn} = \%specialDataList;
141                }
142            }
143            # Set up the column names. Note that an extra space in front of a name will be interpreted by
144            # the Wiki markup as an order to right-justify the text.
145            my @columnNames = "*Strain annotated in NMPDR*";
146            push @columnNames, map { "*$_*" } sort keys %specialData;
147            push @columnNames,  "*Genome size, bp*",
148                                " *%FIG{Protein Encoding Genes}% (PEGs)*",
149                                " *Named genes in subsystems*",            # s0
150                                " *Named genes not in subsystems*",        # n0
151                                " *Hypothetical genes in subsystems*",     # s1
152                                " *Hypothetical genes not in subsystems*", # n1
153                                " *Subsystems*",
154                                " *RNAs*";
155          # Create the header row.          # Create the header row.
156          print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR",          push @outputLines, "| " . join(" | ", @columnNames) . " |";
                                                  "Genome size, bp",  
                                                  "Protein Encoding Genes (PEGs)",  
                                                  "Named genes in subsystems",            # s0  
                                                  "Named genes not in subsystems",        # n0  
                                                  "Hypothetical genes in subsystems",     # s1  
                                                  "Hypothetical genes not in subsystems", # n1  
                                                  "Subsystems",  
                                                  "RNAs",  
                                                    ])) . "\n";  
157          # The data rows will be built next. We'll be putting them into a hash keyed by          # The data rows will be built next. We'll be putting them into a hash keyed by
158          # organism name. The hash enables us to spit them out sorted by name.          # organism name. The hash enables us to spit them out sorted by name.
159          my %rows = ();          my %rows = ();
# Line 183  Line 168 
168              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
169              # Get the strain name.              # Get the strain name.
170              my $genomeName = $sprout->GenusSpecies($genomeID);              my $genomeName = $sprout->GenusSpecies($genomeID);
171              # If this is a new strain, build the HTML for the NEW! mark.              # Apply a link.
172                my $genomeText = "%SV{\"$genomeName\" id=\"$genomeID\"}%";
173                # If this is a new strain, add the NEW! mark.
174              if ($new) {              if ($new) {
175                  $new = " <span class=\"$markerStyle\">NEW!</span>";                  $genomeText .= " %N%";
176              }              }
177              # Get the genome length.              # Get the genome length.
178              $num = $sprout->GenomeLength($genomeID);              $num = $sprout->GenomeLength($genomeID);
# Line 230  Line 217 
217                  $groupTotals{$counterKey} += $counters{$counterKey};                  $groupTotals{$counterKey} += $counters{$counterKey};
218              }              }
219              $groupTotals{features} += $totalFeatures;              $groupTotals{features} += $totalFeatures;
220              # We have all our data. Next we need to compute the percentages and the links.              # We have all our data. Next we need to compute the percentages.
             # First, the link stuff.  
             my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";  
             # Now format the counters and percentages.  
221              for my $type (keys %linkParms) {              for my $type (keys %linkParms) {
222                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },                  $counters{$type} = sprintf("%d(%.1f%%)", $counters{$type},
223                                       sprintf("%d(%.1f%%)", $counters{$type},                                             Tracer::Percent($counters{$type}, $totalFeatures));
                                              Tracer::Percent($counters{$type}, $totalFeatures)));  
224              }              }
225              my @counterValues = map { $counters{$_} } @columnTypes;              my @counterValues = map { $counters{$_} } @columnTypes;
226              # The last link is a button to look at the subsystem summaries.              # The last column is a subsystem count.
227              my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',              my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
228                                                 [$genomeID]);                                                 [$genomeID]);
229              my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";              my $ssCol = $ssCount;
230              my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";              # Start creating the table cells.
231              # Create the row text. Note that we use the distributive capability of the TD              my $rowHtml = "| $genomeText |";
232              # function to apply the same style to each one.              # Add any special columns.
233              my $rowHtml = join("",              for my $specialCol (keys %specialData) {
234                                 td("$genomeName$new"),                  # Here we get the attribute value. If there is none, we leave the column blank.
235                                 td({ class => $numStyle }, $genomeLen),                  my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
236                                 td({ class => $numStyle }, $pegCount),                  $rowHtml .= " $attribute |";
237                                 td({ class => $counterStyle }, \@counterValues),              }
238                                 td({ class => $numStyle }, $ssCol),              # Now add the data columns.
239                                 td({ class => $numStyle }, $rnaCount),              $rowHtml .= join("", map { "  $_ |" } ($genomeLen, $pegCount, @counterValues, $ssCol, $rnaCount));
                               );  
240              # Put it in the row hash.              # Put it in the row hash.
241              $rows{$genomeName} = $rowHtml;              $rows{$genomeName} = $rowHtml;
242          }          }
# Line 267  Line 249 
249          # Loop through the rows.          # Loop through the rows.
250          for my $rowID (sort keys %rows) {          for my $rowID (sort keys %rows) {
251              # Format the row.              # Format the row.
252              print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";              push @outputLines, $rows{$rowID};
             # Flip the row type.  
             $rowType = 1 - $rowType;  
253              # Count the row.              # Count the row.
254              $rowCount++;              $rowCount++;
255          }          }
256          # All done, terminate the table and close the file.          # All done, write the Wiki Page.
257          print GROUPFILE "</table>\n";          my $rc = $wiki->Save($outPageName, $outputWeb, 'OrganismDataSummariesStats', join("\n", @outputLines));
         close GROUPFILE;  
258          Trace("$rowCount genomes processed.") if T(2);          Trace("$rowCount genomes processed.") if T(2);
259            if (! $rc) {
260                Confess("Error saving $outPageName: $wiki->{error}");
261            }
262          # Now save the group totals.          # Now save the group totals.
263          $summaries{$groupID} = \%groupTotals;          $summaries{$groupID} = \%groupTotals;
264      }      }
265      # Now produce the summary table.      # Now produce the summary table.
266      my $sumFileName = "stats-groups.inc";      my $sumPageName = "OrganismDataSummariesStats";
267      Open(\*SUMFILE, ">$targetDir/$sumFileName");      my @sumLines = ();
268      # Start the table.      # Start the table.  Asterisks make a cell a column header. An extra space at the front right-justifies it.
269      print SUMFILE "<table class=\"$tableStyle\">\n";      push @sumLines, "| " . join("", map { " *$_* |" } ("Group name", "Genomes")) .
270      # Create the header row.                             join("", map { "  *$_* |"} ("[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)",
     print SUMFILE Tr( { class => 'odd' }, th(["Group name",  
                                              "Genomes",  
                                              "Protein Encoding Genes (PEGs)",  
271                                               "Named genes in subsystems",            # s0                                               "Named genes in subsystems",            # s0
272                                               "Named genes not in subsystems",        # n0                                               "Named genes not in subsystems",        # n0
273                                               "Hypothetical genes in subsystems",     # s1                                               "Hypothetical genes in subsystems",     # s1
274                                               "Hypothetical genes not in subsystems", # n1                                               "Hypothetical genes not in subsystems", # n1
275                                               "RNAs",                                                         "RNAs"));
                                                ])) . "\n";  
     # Set up a flag for the odd-even styling.  
     my $rowFlag = 0;  
276      # Put in the data rows.      # Put in the data rows.
277      for my $groupName (sort keys %summaries) {      for my $groupName (sort keys %summaries) {
278          my $group = $summaries{$groupName};          my $group = $summaries{$groupName};
         # Compute the link for the current group.  
         my $groupLink = a({ href => $groupData{$groupName}->[0] }, $groupName);  
279          # Create the table row.          # Create the table row.
280          my $rowHtml = join("",          my $rowHtml = "| [[$groupName]] |" . join("", map { "  " . Tracer::CommaFormat($group->{$_}) . " |" }
281                             td($groupLink),                                                    ('genomes', 'pegs', @columnTypes, 'RNAs'));
282                             td({ class => $numStyle }, Tracer::CommaFormat($group->{genomes})),          push @sumLines, $rowHtml;
283                             td({ class => $numStyle }, Tracer::CommaFormat($group->{pegs})),      }
284                             td({ class => $counterStyle }, [ map { Tracer::CommaFormat($group->{$_}) } @columnTypes ]),      # Write the page.
285                             td({ class => $numStyle }, Tracer::CommaFormat($group->{RNAs})),      my $rc = $wiki->Save($sumPageName, $outputWeb, 'WebHome', join("\n", @sumLines));
                           );  
         print SUMFILE Tr( { class => $rowStyle[$rowFlag] }, $rowHtml ) . "\n";  
         # Flip the row style.  
         $rowFlag = 1 - $rowFlag;  
     }  
     # Terminate the table and close the file.  
     print SUMFILE "</table>\n";  
     close SUMFILE;  
286      # We're all done.      # We're all done.
287      Trace("Processing complete.") if T(2);      Trace("Processing complete.") if T(2);
288    };
289    if ($@) {
290        Trace("Stats failed with error: $@") if T(0);
291        $rtype = "error";
292    } else {
293        Trace("Stats complete.") if T(2);
294        $rtype = "no error";
295    }
296    if ($options->{phone}) {
297        my $msgID = Tracer::SendSMS($options->{phone}, "GenomeStats terminated with $rtype.");
298        if ($msgID) {
299            Trace("Phone message sent with ID $msgID.") if T(2);
300        } else {
301            Trace("Phone message not sent.") if T(2);
302        }
303  }  }
304    
305  1;  1;

Legend:
Removed from v.1.23  
changed lines
  Added in v.1.34

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3