[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10, Sun Jun 18 08:52:31 2006 UTC revision 1.34, Sat Sep 20 14:31:22 2008 UTC
# Line 2  Line 2 
2    
3  =head1 Genome Data Generator  =head1 Genome Data Generator
4    
5  This script creates a set of HTML include files that list the statistics for  This script creates a set of Wiki pages that list the statistics for
6  the genomes in each of the genome groups. Genomes that are new to this version  the genomes in each of the genome groups. Genomes that are new to this version
7  of the Sprout will be specially marked. In order for this to work, both the  of the Sprout will be specially marked. In order for this to work, both the
8  current and previous Sprout databases must be available on this machine.  current and previous Sprout databases must be available on this machine.
 This is one positional parameter: the name of a directory in which to place  
 the include files.  
9    
10  The currently-supported command-line options are as follows.  The currently-supported command-line options are as follows.
11    
# Line 39  Line 37 
37    
38  Display this command's parameters and options.  Display this command's parameters and options.
39    
40  =item strict  =item test
41    
42  If specified, strict groups will be used; otherwise, groups with a common primary name  If specified, the output pages will be put into the sandbox instead of the main web.
 will be combined into a single group. (The primary name of the group is the first  
 capitalized word.)  
43    
44  =item oddStyle  =item noNewCheck
45    
46  Style to use for odd rows of the table.  If specified, the check for new genomes in the group is suppressed. This
47    may need to be done if there's been a change in the database definition. Note
48  =item evenStyle  that all this really does is keep the B<NEW!> symbol from showing. It does
49    not affect which genomes show up in the table.
 Style to use for even rows of the table.  
   
 =item tableStyle  
   
 Style to use for the table itself.  
   
 =item markerStyle  
   
 Style to use for small-text markers (e.g. NEW!)  
   
 =item numStyle  
   
 Style to use for numeric cells.  
   
 =item counterStyle  
   
 Style to use for counter cells.  
   
 =item linkCGI  
   
 Path to the CGI script for displaying detailed statistics.  
50    
51  =back  =back
52    
# Line 79  Line 54 
54    
55  use strict;  use strict;
56  use Tracer;  use Tracer;
 use DocUtils;  
 use TestUtils;  
57  use Cwd;  use Cwd;
58  use File::Copy;  use File::Copy;
59  use File::Path;  use File::Path;
# Line 88  Line 61 
61  use SFXlate;  use SFXlate;
62  use CGI qw(:standard);  use CGI qw(:standard);
63  use FIG;  use FIG;
64  no warnings 'once'; # only when coding  use WikiTools;
65    
66  # Get the command-line options and parameters.  # Get the command-line options and parameters.
67  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
68                                             {                                             {
69                                              strict => [0, 'keep related groups separate'],                                              test => [0, 'if specified, publishes to the wiki sandbox instead of the main web'],
70                                              oddStyle => ['odd', 'style for odd rows'],                                              noNewCheck => [0, 'if specified, skips the check for new genomes'],
                                             evenStyle => ['even', 'style for even rows'],  
                                             tableStyle => ['genomestats', 'style for whole table'],  
                                             markerStyle => ['tinytext', 'style for markers'],  
                                             numStyle => ['numcell', 'style for cells with numeric values'],  
                                             counterStyle => ['countercell', 'style for cells with counter values'],  
                                             linkCGI => ['../FIG/genome_statistics.cgi',  
                                                         'path to CGI script for detailed statistics'],  
71                                             },                                             },
72                                             "<targetDir>",                                             "",
73                                             @ARGV);                                             @ARGV);
74  # Verify the directory name.  # The return type (error/no error) goes in here.
75  my $targetDir = $parameters[0];  my $rtype;
76  if (! $targetDir) {  eval {
77      Confess("No target directory specified.");      # This table controls the special attribute columns. For each we need to know the attribute name and the
78  } elsif (! -d $targetDir) {      # column title. If any genomes in a group have a value for one of the special columns, that column is
79      Confess("Target directory $targetDir not found.");      # displayed along with the attribute values.
80  } else {      my %specialCols = (Serotype => 'Serotype_code',
81      # *Get the old Sprout.                         Phenotype => 'Phenotype');
82      my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);      my $outputWeb = ($options->{test} ? 'Sandbox' : 'Main');
     # Extract the genome group data from the old Sprout.  
     my %oldGroupHash = $oldSprout->GetGroups();  
     if (! $options->{strict}) {  
         %oldGroupHash = Fix(%oldGroupHash);  
     }  
83      # Get the new Sprout.      # Get the new Sprout.
84      my $sprout = SFXlate->new_sprout_only();      my $sprout = SFXlate->new_sprout_only();
85      my %newGroupHash = $sprout->GetGroups();      my %newGroupHash = $sprout->GetGroups();
86      if (! $options->{strict}) {      # Get a wiki helper.
87          %newGroupHash = Fix(%newGroupHash);      my $wiki = WikiTools->new();
88        # Extract the genome group data from the new Sprout.
89        %newGroupHash = $sprout->Fix(%newGroupHash);
90        # This hash will be used to determine which genomes are new.
91        my %oldGroupHash = ();
92        if ($options->{noNewCheck}) {
93            # Here we can't look at the old Sprout. Set up the hash
94            # so it looks like the old Sprout's data is the same as ours.
95            %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
96        } else {
97            # Get the old Sprout.
98            my $oldSprout = SFXlate->old_sprout_only();
99            # Extract the genome group data from the old Sprout.
100            %oldGroupHash = $oldSprout->GetGroups();
101            %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
102      }      }
103        # Get a FIG object for computing attributes.
104        my $fig = FIG->new();
105        # Get the super-group list.
106        my @superGroups = sort keys %newGroupHash;
107        # Set up some useful stuff for the four count columns.
108        my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
109                          s1 => "hypo_sub", n1 => "hypo_nosub" );
110        my @columnTypes = ('s0', 'n0', 's1', 'n1');
111        # Prepare a hash for the summary counters. These will be used on the organism summary page.
112        my %summaries = ();
113      # Loop through the groups.      # Loop through the groups.
114      for my $groupID (keys %newGroupHash) {      for my $groupID (@superGroups) {
115          Trace("Processing group $groupID.") if T(2);          Trace("Processing group $groupID.") if T(2);
116            # Create a hash for summarizing the counters.
117            my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
118                               map { $_ => } @columnTypes, features => 0 );
119          # Get the genomes from the new hash.          # Get the genomes from the new hash.
120          my @newGenomes = @{$newGroupHash{$groupID}};          my @newGenomes = @{$newGroupHash{$groupID}};
121          # Create a hash for finding if a genome is in the old group. If the entire group is          # Create a hash for finding if a genome is in the old group. If the entire group is
# Line 136  Line 124 
124          if (exists $oldGroupHash{$groupID}) {          if (exists $oldGroupHash{$groupID}) {
125              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
126          }          }
127          # Create the output file.          # Compute the name of the wiki page we're building.
128          Open(\*GROUPFILE, ">$targetDir/$groupID.inc");          my $outPageName = "${groupID}Stats";
129          # Get the styles.          # We'll put the data for the page in here.
130          my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},          my @outputLines = ();
131                                                       $options->{evenStyle}, $options->{oddStyle});          # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
132          my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});          # a sub-hash that translates each genome ID to its applicable attribute value (if any).
133          # Start the table.          my %specialData = ();
134          print GROUPFILE "<table class=\"$tableStyle\">\n";          for my $specialColumn (keys %specialCols) {
135                # Get the attribute mapping.
136                my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
137                # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
138                # for columns that exist in the output.
139                if (scalar(keys %specialDataList)) {
140                    $specialData{$specialColumn} = \%specialDataList;
141                }
142            }
143            # Set up the column names. Note that an extra space in front of a name will be interpreted by
144            # the Wiki markup as an order to right-justify the text.
145            my @columnNames = "*Strain annotated in NMPDR*";
146            push @columnNames, map { "*$_*" } sort keys %specialData;
147            push @columnNames,  "*Genome size, bp*",
148                                " *%FIG{Protein Encoding Genes}% (PEGs)*",
149                                " *Named genes in subsystems*",            # s0
150                                " *Named genes not in subsystems*",        # n0
151                                " *Hypothetical genes in subsystems*",     # s1
152                                " *Hypothetical genes not in subsystems*", # n1
153                                " *Subsystems*",
154                                " *RNAs*";
155          # Create the header row.          # Create the header row.
156          print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR",          push @outputLines, "| " . join(" | ", @columnNames) . " |";
                                                  "Genome size, bp",  
                                                  "Protein Encoding Genes (PEGs)",  
                                                  "Named genes in subsystems",            # s0  
                                                  "Named genes not in subsystems",        # n0  
                                                  "Hypothetical genes in subsystems",     # s1  
                                                  "Hypothetical genes not in subsystems", # n1  
                                                  "RNAs",  
                                                    ])) . "\n";  
         # Set up some useful stuff for the four count columns.  
         my %linkParms = ( s0 => "nohypo_sub", n0 => "nohypo_nosub",  
                           s1 => "hypo_sub", n1 => "hypo_nosub" );  
         my @columnTypes = ('s0', 'n0', 's1', 'n1');  
157          # The data rows will be built next. We'll be putting them into a hash keyed by          # The data rows will be built next. We'll be putting them into a hash keyed by
158          # organism name. The hash enables us to spit them out sorted by name.          # organism name. The hash enables us to spit them out sorted by name.
159          my %rows = ();          my %rows = ();
160            # This variable is used to hold the counts.
161            my $num;
162          # Loop through the genomes in the new group.          # Loop through the genomes in the new group.
163          for my $genomeID (@newGenomes) {          for my $genomeID (@newGenomes) {
164                # Count this genome.
165                $groupTotals{genomes}++;
166              # Check to see if this genome is new.              # Check to see if this genome is new.
167              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
168              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
169              # Get the strain name.              # Get the strain name.
170              my $genomeName = $sprout->GenusSpecies($genomeID);              my $genomeName = $sprout->GenusSpecies($genomeID);
171              # If this is a new strain, build the HTML for the NEW! mark.              # Apply a link.
172                my $genomeText = "%SV{\"$genomeName\" id=\"$genomeID\"}%";
173                # If this is a new strain, add the NEW! mark.
174              if ($new) {              if ($new) {
175                  $new = " <span class=\"$markerStyle\">NEW!</span>";                  $genomeText .= " %N%";
176              }              }
177              # Get the genome length.              # Get the genome length.
178              my $genomeLen = $sprout->GenomeLength($genomeID);              $num = $sprout->GenomeLength($genomeID);
179                my $genomeLen = Tracer::CommaFormat($num);
180              # Get the number of PEGs.              # Get the number of PEGs.
181              my $pegCount = $sprout->FeatureCount($genomeID, 'peg');              $num = $sprout->FeatureCount($genomeID, 'peg');
182                my $pegCount = Tracer::CommaFormat($num);
183                $groupTotals{pegs} += $num;
184              # Get the number of RNAs.              # Get the number of RNAs.
185              my $rnaCount = $sprout->FeatureCount($genomeID, 'rna');              $num = $sprout->FeatureCount($genomeID, 'rna');
186                my $rnaCount = Tracer::CommaFormat($num);
187                $groupTotals{RNAs} += $num;
188                # If there are no RNAs, we say we don't know the number, since we know there
189                # must be RNAs somewhere.
190                if (! $rnaCount) {
191                    $rnaCount = "n/d";
192                }
193              # Now we have four categories of features to work with, for each              # Now we have four categories of features to work with, for each
194              # combination of named or hypothetical vs. in-subsystem or              # combination of named or hypothetical vs. in-subsystem or
195              # not-in-subsystem. First, we get all of the feature assignments for              # not-in-subsystem. First, we get all of the feature assignments for
196              # the genome.              # the genome.
197              my $assignHash = $sprout->GenomeAssignments($genomeID);              my $assignHash = $sprout->GenomeAssignments($genomeID);
198              # Next, we get all of the features in the genome that belong to a              # Next, we get all of the features in the genome that belong to a
199              # subsystem. This involves a query via the subsystem spreadsheet.              # subsystem.
200              my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'],              my %ssHash = $sprout->GenomeSubsystemData($genomeID);
                                                     "IsGenomeOf(from-link) = ?",  
                                                     [$genomeID], 'ContainsFeature(to-link)');  
201              # Create a hash to track the four categories. "s" or "n" indicates              # Create a hash to track the four categories. "s" or "n" indicates
202              # in or out of a subsystem. "1" or "0" indicates hypothetical or              # in or out of a subsystem. "1" or "0" indicates hypothetical or
203              # real.              # real.
# Line 203  Line 213 
213                  $totalFeatures++;                  $totalFeatures++;
214              }              }
215              Trace("$totalFeatures total features found for $genomeID.") if T(3);              Trace("$totalFeatures total features found for $genomeID.") if T(3);
216              # We have all our data. Next we need to compute the percentages and the links.              for my $counterKey (@columnTypes) {
217              # First, the link stuff.                  $groupTotals{$counterKey} += $counters{$counterKey};
218              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";              }
219              # Now format the counters and percentages.              $groupTotals{features} += $totalFeatures;
220                # We have all our data. Next we need to compute the percentages.
221              for my $type (keys %linkParms) {              for my $type (keys %linkParms) {
222                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },                  $counters{$type} = sprintf("%d(%.1f%%)", $counters{$type},
223                                       sprintf("%d(%.1f%%)", $counters{$type},                                             Tracer::Percent($counters{$type}, $totalFeatures));
                                              Tracer::Percent($counters{$type}, $totalFeatures)));  
224              }              }
225              my @counterValues = map { $counters{$_} } @columnTypes;              my @counterValues = map { $counters{$_} } @columnTypes;
226              # Create the row text. Note that we use the distributive capability of the TD              # The last column is a subsystem count.
227              # function to apply the same style to each one.              my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
228              my $rowHtml = join("",                                                 [$genomeID]);
229                                 td("$genomeName$new"),              my $ssCol = $ssCount;
230                                 td({ class => $numStyle }, $genomeLen),              # Start creating the table cells.
231                                 td({ class => $numStyle }, $pegCount),              my $rowHtml = "| $genomeText |";
232                                 td({ class => $counterStyle }, \@counterValues),              # Add any special columns.
233                                 td({ class => $numStyle }, $rnaCount),              for my $specialCol (keys %specialData) {
234                                );                  # Here we get the attribute value. If there is none, we leave the column blank.
235                    my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
236                    $rowHtml .= " $attribute |";
237                }
238                # Now add the data columns.
239                $rowHtml .= join("", map { "  $_ |" } ($genomeLen, $pegCount, @counterValues, $ssCol, $rnaCount));
240              # Put it in the row hash.              # Put it in the row hash.
241              $rows{$genomeName} = $rowHtml;              $rows{$genomeName} = $rowHtml;
242          }          }
# Line 234  Line 249 
249          # Loop through the rows.          # Loop through the rows.
250          for my $rowID (sort keys %rows) {          for my $rowID (sort keys %rows) {
251              # Format the row.              # Format the row.
252              print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";              push @outputLines, $rows{$rowID};
             # Flip the row type.  
             $rowType = 1 - $rowType;  
253              # Count the row.              # Count the row.
254              $rowCount++;              $rowCount++;
255          }          }
256          # All done, terminate the table and close the file.          # All done, write the Wiki Page.
257          print GROUPFILE "</table>\n";          my $rc = $wiki->Save($outPageName, $outputWeb, 'OrganismDataSummariesStats', join("\n", @outputLines));
         close GROUPFILE;  
258          Trace("$rowCount genomes processed.") if T(2);          Trace("$rowCount genomes processed.") if T(2);
259            if (! $rc) {
260                Confess("Error saving $outPageName: $wiki->{error}");
261            }
262            # Now save the group totals.
263            $summaries{$groupID} = \%groupTotals;
264        }
265        # Now produce the summary table.
266        my $sumPageName = "OrganismDataSummariesStats";
267        my @sumLines = ();
268        # Start the table.  Asterisks make a cell a column header. An extra space at the front right-justifies it.
269        push @sumLines, "| " . join("", map { " *$_* |" } ("Group name", "Genomes")) .
270                               join("", map { "  *$_* |"} ("[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)",
271                                                           "Named genes in subsystems",            # s0
272                                                           "Named genes not in subsystems",        # n0
273                                                           "Hypothetical genes in subsystems",     # s1
274                                                           "Hypothetical genes not in subsystems", # n1
275                                                           "RNAs"));
276        # Put in the data rows.
277        for my $groupName (sort keys %summaries) {
278            my $group = $summaries{$groupName};
279            # Create the table row.
280            my $rowHtml = "| [[$groupName]] |" . join("", map { "  " . Tracer::CommaFormat($group->{$_}) . " |" }
281                                                      ('genomes', 'pegs', @columnTypes, 'RNAs'));
282            push @sumLines, $rowHtml;
283      }      }
284        # Write the page.
285        my $rc = $wiki->Save($sumPageName, $outputWeb, 'WebHome', join("\n", @sumLines));
286      # We're all done.      # We're all done.
287      Trace("Processing complete.") if T(2);      Trace("Processing complete.") if T(2);
288    };
289    if ($@) {
290        Trace("Stats failed with error: $@") if T(0);
291        $rtype = "error";
292    } else {
293        Trace("Stats complete.") if T(2);
294        $rtype = "no error";
295  }  }
296    if ($options->{phone}) {
297  =head3 Fix      my $msgID = Tracer::SendSMS($options->{phone}, "GenomeStats terminated with $rtype.");
298        if ($msgID) {
299  C<< my %fixedHash = Fix(%groupHash); >>          Trace("Phone message sent with ID $msgID.") if T(2);
300        } else {
301  Prepare a genome group hash for processing. Groups with the same primary name will be combined.          Trace("Phone message not sent.") if T(2);
 The primary name is the first capitalized word in the group name.  
   
 =over 4  
   
 =item groupHash  
   
 Hash to be fixed up.  
   
 =item RETURN  
   
 Returns a fixed-up version of the hash.  
   
 =back  
   
 =cut  
   
 sub Fix {  
     # Get the parameters.  
     my (%groupHash) = @_;  
     # Create the result hash.  
     my %retVal = ();  
     # Copy over the genomes.  
     for my $groupID (keys %groupHash) {  
         # Make a safety copy of the group ID.  
         my $realGroupID = $groupID;  
         # Yank the primary name.  
         if ($groupID =~ /([A-Z]\w+)/) {  
             $realGroupID = $1;  
         }  
         # Append this group's genomes into the result hash.  
         Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});  
302      }      }
     # Return the result hash.  
     return %retVal;  
303  }  }
304    
   
305  1;  1;

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.34

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3