[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.18, Thu Aug 10 07:34:13 2006 UTC revision 1.32, Sun Mar 23 16:10:47 2008 UTC
# Line 2  Line 2 
2    
3  =head1 Genome Data Generator  =head1 Genome Data Generator
4    
5  This script creates a set of HTML include files that list the statistics for  This script creates a set of Wiki pages that list the statistics for
6  the genomes in each of the genome groups. Genomes that are new to this version  the genomes in each of the genome groups. Genomes that are new to this version
7  of the Sprout will be specially marked. In order for this to work, both the  of the Sprout will be specially marked. In order for this to work, both the
8  current and previous Sprout databases must be available on this machine.  current and previous Sprout databases must be available on this machine.
 This is one positional parameter: the name of a directory in which to place  
 the include files.  
9    
10  The currently-supported command-line options are as follows.  The currently-supported command-line options are as follows.
11    
# Line 39  Line 37 
37    
38  Display this command's parameters and options.  Display this command's parameters and options.
39    
40  =item strict  =item test
41    
42  If specified, strict groups will be used; otherwise, groups with a common primary name  If specified, the output pages will be put into the sandbox instead of the main web.
 will be combined into a single group. (The primary name of the group is the first  
 capitalized word.)  
43    
44  =item oddStyle  =item noNewCheck
45    
46  Style to use for odd rows of the table.  If specified, the check for new genomes in the group is suppressed. This
47    may need to be done if there's been a change in the database definition. Note
48  =item evenStyle  that all this really does is keep the B<NEW!> symbol from showing. It does
49    not affect which genomes show up in the table.
 Style to use for even rows of the table.  
   
 =item tableStyle  
   
 Style to use for the table itself.  
   
 =item markerStyle  
   
 Style to use for small-text markers (e.g. NEW!)  
   
 =item numStyle  
   
 Style to use for numeric cells.  
   
 =item counterStyle  
   
 Style to use for counter cells.  
   
 =item linkCGI  
   
 Path to the CGI script for displaying detailed statistics.  
50    
51  =back  =back
52    
# Line 79  Line 54 
54    
55  use strict;  use strict;
56  use Tracer;  use Tracer;
 use DocUtils;  
 use TestUtils;  
57  use Cwd;  use Cwd;
58  use File::Copy;  use File::Copy;
59  use File::Path;  use File::Path;
# Line 88  Line 61 
61  use SFXlate;  use SFXlate;
62  use CGI qw(:standard);  use CGI qw(:standard);
63  use FIG;  use FIG;
64  no warnings 'once'; # only when coding  use WikiTools;
65    
66  # Get the command-line options and parameters.  # Get the command-line options and parameters.
67  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
68                                             {                                             {
69                                              strict => [0, 'keep related groups separate'],                                              test => [0, 'if specified, publishes to the wiki sandbox instead of the main web'],
70                                              oddStyle => ['odd', 'style for odd rows'],                                              noNewCheck => [0, 'if specified, skips the check for new genomes'],
                                             evenStyle => ['even', 'style for even rows'],  
                                             tableStyle => ['genomestats', 'style for whole table'],  
                                             markerStyle => ['tinytext', 'style for markers'],  
                                             numStyle => ['numcell', 'style for cells with numeric values'],  
                                             counterStyle => ['countercell', 'style for cells with counter values'],  
                                             linkCGI => ['../FIG/genome_statistics.cgi',  
                                                         'path to CGI script for detailed statistics'],  
71                                             },                                             },
72                                             "<targetDir>",                                             "",
73                                             @ARGV);                                             @ARGV);
74  # Verify the directory name.  # The return type (error/no error) goes in here.
75  my $targetDir = $parameters[0];  my $rtype;
76  if (! $targetDir) {  eval {
77      Confess("No target directory specified.");      # This table controls the special attribute columns. For each we need to know the attribute name and the
78  } elsif (! -d $targetDir) {      # column title. If any genomes in a group have a value for one of the special columns, that column is
79      Confess("Target directory $targetDir not found.");      # displayed along with the attribute values.
80  } else {      my %specialCols = (Serotype => 'Serotype_code',
81      # Get the old Sprout.                         Phenotype => 'Phenotype');
82      my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);      my $outputWeb = ($options->{test} ? 'Sandbox' : 'Main');
     # Extract the genome group data from the old Sprout.  
     my %oldGroupHash = $oldSprout->GetGroups();  
     if (! $options->{strict}) {  
         %oldGroupHash = Fix(%oldGroupHash);  
     }  
83      # Get the new Sprout.      # Get the new Sprout.
84      my $sprout = SFXlate->new_sprout_only();      my $sprout = SFXlate->new_sprout_only();
85      my %newGroupHash = $sprout->GetGroups();      my %newGroupHash = $sprout->GetGroups();
86      if (! $options->{strict}) {      # Get a wiki helper.
87          %newGroupHash = Fix(%newGroupHash);      my $wiki = WikiTools->new();
88        # Extract the genome group data from the new Sprout.
89        %newGroupHash = $sprout->Fix(%newGroupHash);
90        # This hash will be used to determine which genomes are new.
91        my %oldGroupHash = ();
92        if ($options->{noNewCheck}) {
93            # Here we can't look at the old Sprout. Set up the hash
94            # so it looks like the old Sprout's data is the same as ours.
95            %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
96        } else {
97            # Get the old Sprout.
98            my $oldSprout = SFXlate->old_sprout_only();
99            # Extract the genome group data from the old Sprout.
100            %oldGroupHash = $oldSprout->GetGroups();
101            %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
102      }      }
103        # Get a FIG object for computing attributes.
104        my $fig = FIG->new();
105        # Get the super-group list.
106        my @superGroups = sort keys %newGroupHash;
107        # Set up some useful stuff for the four count columns.
108        my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
109                          s1 => "hypo_sub", n1 => "hypo_nosub" );
110        my @columnTypes = ('s0', 'n0', 's1', 'n1');
111        # Prepare a hash for the summary counters. These will be used on the organism summary page.
112        my %summaries = ();
113      # Loop through the groups.      # Loop through the groups.
114      for my $groupID (keys %newGroupHash) {      for my $groupID (@superGroups) {
115          Trace("Processing group $groupID.") if T(2);          Trace("Processing group $groupID.") if T(2);
116            # Create a hash for summarizing the counters.
117            my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
118                               map { $_ => } @columnTypes, features => 0 );
119          # Get the genomes from the new hash.          # Get the genomes from the new hash.
120          my @newGenomes = @{$newGroupHash{$groupID}};          my @newGenomes = @{$newGroupHash{$groupID}};
121          # Create a hash for finding if a genome is in the old group. If the entire group is          # Create a hash for finding if a genome is in the old group. If the entire group is
# Line 136  Line 124 
124          if (exists $oldGroupHash{$groupID}) {          if (exists $oldGroupHash{$groupID}) {
125              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
126          }          }
127          # Create the output file.          # Compute the name of the wiki page we're building.
128          my $outFileName = "stats-" . lc($groupID) . ".inc";          my $outPageName = "${groupID}Stats";
129          Open(\*GROUPFILE, ">$targetDir/$outFileName");          # We'll put the data for the page in here.
130          # Get the styles.          my @outputLines = ();
131          my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},          # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
132                                                       $options->{evenStyle}, $options->{oddStyle});          # a sub-hash that translates each genome ID to its applicable attribute value (if any).
133          my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});          my %specialData = ();
134          # Start the table.          for my $specialColumn (keys %specialCols) {
135          print GROUPFILE "<table class=\"$tableStyle\">\n";              # Get the attribute mapping.
136                my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
137                # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
138                # for columns that exist in the output.
139                if (scalar(keys %specialDataList)) {
140                    $specialData{$specialColumn} = \%specialDataList;
141                }
142            }
143            # Set up the column names. Note that an extra space in front of a name will be interpreted by
144            # the Wiki markup as an order to right-justify the text.
145            my @columnNames = "*Strain annotated in NMPDR*";
146            push @columnNames, map { "*$_*" } sort keys %specialData;
147            push @columnNames,  "*Genome size, bp*",
148                                " *[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)*",
149                                " *Named genes in subsystems*",            # s0
150                                " *Named genes not in subsystems*",        # n0
151                                " *Hypothetical genes in subsystems*",     # s1
152                                " *Hypothetical genes not in subsystems*", # n1
153                                " *Subsystems*",
154                                " *RNAs*";
155          # Create the header row.          # Create the header row.
156          print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR",          push @outputLines, "| " . join(" | ", @columnNames) . " |";
                                                  "Genome size, bp",  
                                                  "Protein Encoding Genes (PEGs)",  
                                                  "Named genes in subsystems",            # s0  
                                                  "Named genes not in subsystems",        # n0  
                                                  "Hypothetical genes in subsystems",     # s1  
                                                  "Hypothetical genes not in subsystems", # n1  
                                                  "RNAs",  
                                                    ])) . "\n";  
         # Set up some useful stuff for the four count columns.  
         my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",  
                           s1 => "hypo_sub", n1 => "hypo_nosub" );  
         my @columnTypes = ('s0', 'n0', 's1', 'n1');  
157          # The data rows will be built next. We'll be putting them into a hash keyed by          # The data rows will be built next. We'll be putting them into a hash keyed by
158          # organism name. The hash enables us to spit them out sorted by name.          # organism name. The hash enables us to spit them out sorted by name.
159          my %rows = ();          my %rows = ();
160            # This variable is used to hold the counts.
161            my $num;
162          # Loop through the genomes in the new group.          # Loop through the genomes in the new group.
163          for my $genomeID (@newGenomes) {          for my $genomeID (@newGenomes) {
164                # Count this genome.
165                $groupTotals{genomes}++;
166              # Check to see if this genome is new.              # Check to see if this genome is new.
167              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
168              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
169              # Get the strain name.              # Get the strain name.
170              my $genomeName = $sprout->GenusSpecies($genomeID);              my $genomeName = $sprout->GenusSpecies($genomeID);
171              # If this is a new strain, build the HTML for the NEW! mark.              # Apply a link.
172                my $genomeText = "%SV{\"$genomeName\" id=\"$genomeID\"}%";
173                # If this is a new strain, add the NEW! mark.
174              if ($new) {              if ($new) {
175                  $new = " <span class=\"$markerStyle\">NEW!</span>";                  $genomeText .= " %N%";
176              }              }
177              # Get the genome length.              # Get the genome length.
178              my $genomeLen = Tracer::CommaFormat($sprout->GenomeLength($genomeID));              $num = $sprout->GenomeLength($genomeID);
179                my $genomeLen = Tracer::CommaFormat($num);
180              # Get the number of PEGs.              # Get the number of PEGs.
181              my $pegCount = Tracer::CommaFormat($sprout->FeatureCount($genomeID, 'peg'));              $num = $sprout->FeatureCount($genomeID, 'peg');
182                my $pegCount = Tracer::CommaFormat($num);
183                $groupTotals{pegs} += $num;
184              # Get the number of RNAs.              # Get the number of RNAs.
185              my $rnaCount = Tracer::CommaFormat($sprout->FeatureCount($genomeID, 'rna'));              $num = $sprout->FeatureCount($genomeID, 'rna');
186                my $rnaCount = Tracer::CommaFormat($num);
187                $groupTotals{RNAs} += $num;
188              # If there are no RNAs, we say we don't know the number, since we know there              # If there are no RNAs, we say we don't know the number, since we know there
189              # must be RNAs somewhere.              # must be RNAs somewhere.
190              if (! $rnaCount) {              if (! $rnaCount) {
# Line 207  Line 213 
213                  $totalFeatures++;                  $totalFeatures++;
214              }              }
215              Trace("$totalFeatures total features found for $genomeID.") if T(3);              Trace("$totalFeatures total features found for $genomeID.") if T(3);
216                for my $counterKey (@columnTypes) {
217                    $groupTotals{$counterKey} += $counters{$counterKey};
218                }
219                $groupTotals{features} += $totalFeatures;
220              # We have all our data. Next we need to compute the percentages and the links.              # We have all our data. Next we need to compute the percentages and the links.
221              # First, the link stuff.              # First, the link stuff.
222              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";              my $linkPrefix = "%NMPDR{FIG/genome_statistics.cgi}%?user=;genome=$genomeID;SPROUT=1;request=";
223              # Now format the counters and percentages.              # Now format the counters and percentages.
224              for my $type (keys %linkParms) {              for my $type (keys %linkParms) {
225                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },
# Line 217  Line 227 
227                                               Tracer::Percent($counters{$type}, $totalFeatures)));                                               Tracer::Percent($counters{$type}, $totalFeatures)));
228              }              }
229              my @counterValues = map { $counters{$_} } @columnTypes;              my @counterValues = map { $counters{$_} } @columnTypes;
230              # Create the row text. Note that we use the distributive capability of the TD              # The last link is a button to look at the subsystem summaries.
231              # function to apply the same style to each one.              my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
232              my $rowHtml = join("",                                                 [$genomeID]);
233                                 td("$genomeName$new"),              my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";
234                                 td({ class => $numStyle }, $genomeLen),              my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";
235                                 td({ class => $numStyle }, $pegCount),              # Start creating the table cells.
236                                 td({ class => $counterStyle }, \@counterValues),              my $rowHtml = "| $genomeText |";
237                                 td({ class => $numStyle }, $rnaCount),              # Add any special columns.
238                                );              for my $specialCol (keys %specialData) {
239                    # Here we get the attribute value. If there is none, we leave the column blank.
240                    my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
241                    $rowHtml .= " $attribute |";
242                }
243                # Now add the data columns.
244                $rowHtml .= join("", map { "  $_ |" } ($genomeLen, $pegCount, @counterValues, $ssCol, $rnaCount));
245              # Put it in the row hash.              # Put it in the row hash.
246              $rows{$genomeName} = $rowHtml;              $rows{$genomeName} = $rowHtml;
247          }          }
# Line 238  Line 254 
254          # Loop through the rows.          # Loop through the rows.
255          for my $rowID (sort keys %rows) {          for my $rowID (sort keys %rows) {
256              # Format the row.              # Format the row.
257              print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";              push @outputLines, $rows{$rowID};
             # Flip the row type.  
             $rowType = 1 - $rowType;  
258              # Count the row.              # Count the row.
259              $rowCount++;              $rowCount++;
260          }          }
261          # All done, terminate the table and close the file.          # All done, write the Wiki Page.
262          print GROUPFILE "</table>\n";          my $rc = $wiki->Save($outPageName, $outputWeb, 'OrganismDataSummariesStats', join("\n", @outputLines));
         close GROUPFILE;  
263          Trace("$rowCount genomes processed.") if T(2);          Trace("$rowCount genomes processed.") if T(2);
264            if (! $rc) {
265                Confess("Error saving $outPageName: $wiki->{error}");
266            }
267            # Now save the group totals.
268            $summaries{$groupID} = \%groupTotals;
269        }
270        # Now produce the summary table.
271        my $sumPageName = "OrganismDataSummariesStats";
272        my @sumLines = ();
273        # Start the table.  Asterisks make a cell a column header. An extra space at the front right-justifies it.
274        push @sumLines, "| " . join("", map { " *$_* |" } ("Group name", "Genomes")) .
275                               join("", map { "  *$_* |"} ("[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)",
276                                                           "Named genes in subsystems",            # s0
277                                                           "Named genes not in subsystems",        # n0
278                                                           "Hypothetical genes in subsystems",     # s1
279                                                           "Hypothetical genes not in subsystems", # n1
280                                                           "RNAs"));
281        # Put in the data rows.
282        for my $groupName (sort keys %summaries) {
283            my $group = $summaries{$groupName};
284            # Create the table row.
285            my $rowHtml = "| [[$groupName]] |" . join("", map { "  " . Tracer::CommaFormat($group->{$_}) . " |" }
286                                                      ('genomes', 'pegs', @columnTypes, 'RNAs'));
287            push @sumLines, $rowHtml;
288      }      }
289        # Write the page.
290        my $rc = $wiki->Save($sumPageName, $outputWeb, 'WebHome', join("\n", @sumLines));
291      # We're all done.      # We're all done.
292      Trace("Processing complete.") if T(2);      Trace("Processing complete.") if T(2);
293    };
294    if ($@) {
295        Trace("Stats failed with error: $@") if T(0);
296        $rtype = "error";
297    } else {
298        Trace("Stats complete.") if T(2);
299        $rtype = "no error";
300  }  }
301    if ($options->{phone}) {
302  =head3 Fix      my $msgID = Tracer::SendSMS($options->{phone}, "GenomeStats terminated with $rtype.");
303        if ($msgID) {
304  C<< my %fixedHash = Fix(%groupHash); >>          Trace("Phone message sent with ID $msgID.") if T(2);
305        } else {
306  Prepare a genome group hash for processing. Groups with the same primary name will be combined.          Trace("Phone message not sent.") if T(2);
 The primary name is the first capitalized word in the group name.  
   
 =over 4  
   
 =item groupHash  
   
 Hash to be fixed up.  
   
 =item RETURN  
   
 Returns a fixed-up version of the hash.  
   
 =back  
   
 =cut  
   
 sub Fix {  
     # Get the parameters.  
     my (%groupHash) = @_;  
     # Create the result hash.  
     my %retVal = ();  
     # Copy over the genomes.  
     for my $groupID (keys %groupHash) {  
         # Make a safety copy of the group ID.  
         my $realGroupID = $groupID;  
         # Yank the primary name.  
         if ($groupID =~ /([A-Z]\w+)/) {  
             $realGroupID = $1;  
         }  
         # Append this group's genomes into the result hash.  
         Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});  
307      }      }
     # Return the result hash.  
     return %retVal;  
308  }  }
309    
   
310  1;  1;

Legend:
Removed from v.1.18  
changed lines
  Added in v.1.32

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3