[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.14, Fri Jun 23 23:09:04 2006 UTC revision 1.36, Mon Mar 2 22:25:08 2009 UTC
# Line 2  Line 2 
2    
3  =head1 Genome Data Generator  =head1 Genome Data Generator
4    
5  This script creates a set of HTML include files that list the statistics for  This script creates a set of Wiki pages that list the statistics for
6  the genomes in each of the genome groups. Genomes that are new to this version  the genomes in each of the genome groups. Genomes that are new to this version
7  of the Sprout will be specially marked. In order for this to work, both the  of the Sprout will be specially marked. In order for this to work, both the
8  current and previous Sprout databases must be available on this machine.  current and previous Sprout databases must be available on this machine.
 This is one positional parameter: the name of a directory in which to place  
 the include files.  
9    
10  The currently-supported command-line options are as follows.  The currently-supported command-line options are as follows.
11    
# Line 39  Line 37 
37    
38  Display this command's parameters and options.  Display this command's parameters and options.
39    
40  =item strict  =item test
41    
42  If specified, strict groups will be used; otherwise, groups with a common primary name  If specified, the output pages will be put into the sandbox instead of the main web.
 will be combined into a single group. (The primary name of the group is the first  
 capitalized word.)  
43    
44  =item oddStyle  =item noNewCheck
45    
46  Style to use for odd rows of the table.  If specified, the check for new genomes in the group is suppressed. This
47    may need to be done if there's been a change in the database definition. Note
48  =item evenStyle  that all this really does is keep the B<NEW!> symbol from showing. It does
49    not affect which genomes show up in the table.
 Style to use for even rows of the table.  
   
 =item tableStyle  
   
 Style to use for the table itself.  
   
 =item markerStyle  
   
 Style to use for small-text markers (e.g. NEW!)  
   
 =item numStyle  
   
 Style to use for numeric cells.  
   
 =item counterStyle  
   
 Style to use for counter cells.  
   
 =item linkCGI  
   
 Path to the CGI script for displaying detailed statistics.  
50    
51  =back  =back
52    
# Line 79  Line 54 
54    
55  use strict;  use strict;
56  use Tracer;  use Tracer;
 use DocUtils;  
 use TestUtils;  
57  use Cwd;  use Cwd;
58  use File::Copy;  use File::Copy;
59  use File::Path;  use File::Path;
60  use Sprout;  use Sprout;
61  use SFXlate;  use SFXlate;
62  use CGI qw(:standard);  use CGI qw(-nosticky);
63  use FIG;  use FIG;
64  no warnings 'once'; # only when coding  use WikiTools;
65    
66  # Get the command-line options and parameters.  # Get the command-line options and parameters.
67  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
68                                             {                                             {
69                                              strict => [0, 'keep related groups separate'],                                              test => [0, 'if specified, publishes to the wiki sandbox instead of the main web'],
70                                              oddStyle => ['odd', 'style for odd rows'],                                              noNewCheck => [0, 'if specified, skips the check for new genomes'],
                                             evenStyle => ['even', 'style for even rows'],  
                                             tableStyle => ['genomestats', 'style for whole table'],  
                                             markerStyle => ['tinytext', 'style for markers'],  
                                             numStyle => ['numcell', 'style for cells with numeric values'],  
                                             counterStyle => ['countercell', 'style for cells with counter values'],  
                                             linkCGI => ['../FIG/genome_statistics.cgi',  
                                                         'path to CGI script for detailed statistics'],  
71                                             },                                             },
72                                             "<targetDir>",                                             "",
73                                             @ARGV);                                             @ARGV);
74  # Verify the directory name.  # The return type (error/no error) goes in here.
75  my $targetDir = $parameters[0];  my $rtype;
76  if (! $targetDir) {  eval {
77      Confess("No target directory specified.");      # This table controls the special attribute columns. For each we need to know the attribute name and the
78  } elsif (! -d $targetDir) {      # column title. If any genomes in a group have a value for one of the special columns, that column is
79      Confess("Target directory $targetDir not found.");      # displayed along with the attribute values.
80  } else {      my %specialCols = (Serotype => 'Serotype_code',
81      # *Get the old Sprout.                         Phenotype => 'Phenotype');
82      my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);      my $outputWeb = ($options->{test} ? 'Sandbox' : 'Main');
     # Extract the genome group data from the old Sprout.  
     my %oldGroupHash = $oldSprout->GetGroups();  
     if (! $options->{strict}) {  
         %oldGroupHash = Fix(%oldGroupHash);  
     }  
83      # Get the new Sprout.      # Get the new Sprout.
84      my $sprout = SFXlate->new_sprout_only();      my $sprout = SFXlate->new_sprout_only();
85      my %newGroupHash = $sprout->GetGroups();      my %newGroupHash = $sprout->GetGroups();
86      if (! $options->{strict}) {      # Get a wiki helper.
87          %newGroupHash = Fix(%newGroupHash);      my $wiki = WikiTools->new();
88        # Extract the genome group data from the new Sprout.
89        %newGroupHash = $sprout->Fix(%newGroupHash);
90        # This hash will be used to determine which genomes are new.
91        my %oldGroupHash = ();
92        if ($options->{noNewCheck}) {
93            # Here we can't look at the old Sprout. Set up the hash
94            # so it looks like the old Sprout's data is the same as ours.
95            %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
96        } else {
97            # Get the old Sprout.
98            my $oldSprout = SFXlate->old_sprout_only();
99            # Extract the genome group data from the old Sprout.
100            %oldGroupHash = $oldSprout->GetGroups();
101            %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
102      }      }
103        # Get a FIG object for computing attributes.
104        my $fig = FIG->new();
105        # Get the super-group list.
106        my @superGroups = sort keys %newGroupHash;
107        # Set up some useful stuff for the four count columns.
108        my $url = "%SCRIPTURL{rest}%/NmpdrPlugin/search?Class=OrgSumSearch;Search=Go";
109        my %linkParms = ( s0 => "$url;hypothetical=named;insubsystem=in;genome=",
110                          n0 => "$url;hypothetical=named;insubsystem=out;genome=",
111                          s1 => "$url;hypothetical=hypo;insubsystem=in;genome=",
112                          n1 => "$url;hypothetical=hypo;insubsystem=out;genome=" );
113        my @columnTypes = ('s0', 'n0', 's1', 'n1');
114        # Prepare a hash for the summary counters. These will be used on the organism summary page.
115        my %summaries = ();
116      # Loop through the groups.      # Loop through the groups.
117      for my $groupID (keys %newGroupHash) {      for my $groupID (@superGroups) {
118          Trace("Processing group $groupID.") if T(2);          Trace("Processing group $groupID.") if T(2);
119            # Create a hash for summarizing the counters.
120            my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
121                               map { $_ => } @columnTypes, features => 0 );
122          # Get the genomes from the new hash.          # Get the genomes from the new hash.
123          my @newGenomes = @{$newGroupHash{$groupID}};          my @newGenomes = @{$newGroupHash{$groupID}};
124          # Create a hash for finding if a genome is in the old group. If the entire group is          # Create a hash for finding if a genome is in the old group. If the entire group is
# Line 136  Line 127 
127          if (exists $oldGroupHash{$groupID}) {          if (exists $oldGroupHash{$groupID}) {
128              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
129          }          }
130          # Create the output file.          # Compute the name of the wiki page we're building.
131          my $outFileName = "stats-" . lc($groupID) . ".inc";          my $outPageName = "${groupID}Stats";
132          Open(\*GROUPFILE, ">$targetDir/outFileName");          # We'll put the data for the page in here.
133          # Get the styles.          my @outputLines = ();
134          my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},          # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
135                                                       $options->{evenStyle}, $options->{oddStyle});          # a sub-hash that translates each genome ID to its applicable attribute value (if any).
136          my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});          my %specialData = ();
137          # Start the table.          for my $specialColumn (keys %specialCols) {
138          print GROUPFILE "<table class=\"$tableStyle\">\n";              # Get the attribute mapping.
139                my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
140                # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
141                # for columns that exist in the output.
142                if (scalar(keys %specialDataList)) {
143                    $specialData{$specialColumn} = \%specialDataList;
144                }
145            }
146            # Set up the column names. Note that an extra space in front of a name will be interpreted by
147            # the Wiki markup as an order to right-justify the text.
148            my @columnNames = "*Strain annotated in NMPDR*";
149            push @columnNames, map { "*$_*" } sort keys %specialData;
150            push @columnNames,  "*Genome size, bp*",
151                                " *%FIG{Protein Encoding Genes}% (PEGs)*",
152                                " *Named genes in subsystems*",            # s0
153                                " *Named genes not in subsystems*",        # n0
154                                " *Hypothetical genes in subsystems*",     # s1
155                                " *Hypothetical genes not in subsystems*", # n1
156                                " *Subsystems*",
157                                " *RNAs*";
158            # Make the table sortable.
159            push @outputLines, '%TABLE{sort="on"}%';
160          # Create the header row.          # Create the header row.
161          print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR",          push @outputLines, "| " . join(" | ", @columnNames) . " |";
                                                  "Genome size, bp",  
                                                  "Protein Encoding Genes (PEGs)",  
                                                  "Named genes in subsystems",            # s0  
                                                  "Named genes not in subsystems",        # n0  
                                                  "Hypothetical genes in subsystems",     # s1  
                                                  "Hypothetical genes not in subsystems", # n1  
                                                  "RNAs",  
                                                    ])) . "\n";  
         # Set up some useful stuff for the four count columns.  
         my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",  
                           s1 => "hypo_sub", n1 => "hypo_nosub" );  
         my @columnTypes = ('s0', 'n0', 's1', 'n1');  
162          # The data rows will be built next. We'll be putting them into a hash keyed by          # The data rows will be built next. We'll be putting them into a hash keyed by
163          # organism name. The hash enables us to spit them out sorted by name.          # organism name. The hash enables us to spit them out sorted by name.
164          my %rows = ();          my %rows = ();
165            # This variable is used to hold the counts.
166            my $num;
167          # Loop through the genomes in the new group.          # Loop through the genomes in the new group.
168          for my $genomeID (@newGenomes) {          for my $genomeID (@newGenomes) {
169                # Count this genome.
170                $groupTotals{genomes}++;
171              # Check to see if this genome is new.              # Check to see if this genome is new.
172              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
173              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
174              # Get the strain name.              # Get the strain name.
175              my $genomeName = $sprout->GenusSpecies($genomeID);              my $genomeName = $sprout->GenusSpecies($genomeID);
176              # If this is a new strain, build the HTML for the NEW! mark.              # Apply a link.
177                my $genomeText = "%SV{\"$genomeName\" id=\"$genomeID\"}%";
178                # If this is a new strain, add the NEW! mark.
179              if ($new) {              if ($new) {
180                  $new = " <span class=\"$markerStyle\">NEW!</span>";                  $genomeText .= " %N%";
181              }              }
182              # Get the genome length.              # Get the genome length.
183              my $genomeLen = Tracer::CommaFormat($sprout->GenomeLength($genomeID));              $num = $sprout->GenomeLength($genomeID);
184                my $genomeLen = Tracer::CommaFormat($num);
185              # Get the number of PEGs.              # Get the number of PEGs.
186              my $pegCount = Tracer::CommaFormat($sprout->FeatureCount($genomeID, 'peg'));              $num = $sprout->FeatureCount($genomeID, 'peg');
187                my $pegCount = Tracer::CommaFormat($num);
188                $groupTotals{pegs} += $num;
189              # Get the number of RNAs.              # Get the number of RNAs.
190              my $rnaCount = Tracer::CommaFormat($sprout->FeatureCount($genomeID, 'rna'));              $num = $sprout->FeatureCount($genomeID, 'rna');
191                my $rnaCount = Tracer::CommaFormat($num);
192                $groupTotals{RNAs} += $num;
193                # If there are no RNAs, we say we don't know the number, since we know there
194                # must be RNAs somewhere.
195                if (! $rnaCount) {
196                    $rnaCount = "n/d";
197                }
198              # Now we have four categories of features to work with, for each              # Now we have four categories of features to work with, for each
199              # combination of named or hypothetical vs. in-subsystem or              # combination of named or hypothetical vs. in-subsystem or
200              # not-in-subsystem. First, we get all of the feature assignments for              # not-in-subsystem. First, we get all of the feature assignments for
201              # the genome.              # the genome.
202              my $assignHash = $sprout->GenomeAssignments($genomeID);              my $assignHash = $sprout->GenomeAssignments($genomeID);
203              # Next, we get all of the features in the genome that belong to a              # Next, we get all of the features in the genome that belong to a
204              # subsystem. This involves a query via the subsystem spreadsheet.              # subsystem.
205              my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'],              my %ssHash = $sprout->GenomeSubsystemData($genomeID);
                                                     "IsGenomeOf(from-link) = ?",  
                                                     [$genomeID], 'ContainsFeature(to-link)');  
206              # Create a hash to track the four categories. "s" or "n" indicates              # Create a hash to track the four categories. "s" or "n" indicates
207              # in or out of a subsystem. "1" or "0" indicates hypothetical or              # in or out of a subsystem. "1" or "0" indicates hypothetical or
208              # real.              # real.
# Line 204  Line 218 
218                  $totalFeatures++;                  $totalFeatures++;
219              }              }
220              Trace("$totalFeatures total features found for $genomeID.") if T(3);              Trace("$totalFeatures total features found for $genomeID.") if T(3);
221              # We have all our data. Next we need to compute the percentages and the links.              for my $counterKey (@columnTypes) {
222              # First, the link stuff.                  $groupTotals{$counterKey} += $counters{$counterKey};
223              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";              }
224              # Now format the counters and percentages.              $groupTotals{features} += $totalFeatures;
225                # We have all our data. Next we need to compute the percentages.
226              for my $type (keys %linkParms) {              for my $type (keys %linkParms) {
227                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },                  my $counterData = sprintf("%d(%.1f%%)", $counters{$type},
228                                       sprintf("%d(%.1f%%)", $counters{$type},                                            Tracer::Percent($counters{$type}, $totalFeatures));
229                                               Tracer::Percent($counters{$type}, $totalFeatures)));                  $counters{$type} = CGI::a({ href => $linkParms{$type} . $genomeID }, $counterData);
230              }              }
231              my @counterValues = map { $counters{$_} } @columnTypes;              my @counterValues = map { $counters{$_} } @columnTypes;
232              # Create the row text. Note that we use the distributive capability of the TD              # The last column is a subsystem count.
233              # function to apply the same style to each one.              my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
234              my $rowHtml = join("",                                                 [$genomeID]);
235                                 td("$genomeName$new"),              my $ssCol = $ssCount;
236                                 td({ class => $numStyle }, $genomeLen),              # Start creating the table cells.
237                                 td({ class => $numStyle }, $pegCount),              my $rowHtml = "| $genomeText |";
238                                 td({ class => $counterStyle }, \@counterValues),              # Add any special columns.
239                                 td({ class => $numStyle }, $rnaCount),              for my $specialCol (keys %specialData) {
240                                );                  # Here we get the attribute value. If there is none, we leave the column blank.
241                    my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
242                    $rowHtml .= " $attribute |";
243                }
244                # Now add the data columns.
245                $rowHtml .= join("", map { "  $_ |" } ($genomeLen, $pegCount, @counterValues, $ssCol, $rnaCount));
246              # Put it in the row hash.              # Put it in the row hash.
247              $rows{$genomeName} = $rowHtml;              $rows{$genomeName} = $rowHtml;
248          }          }
# Line 235  Line 255 
255          # Loop through the rows.          # Loop through the rows.
256          for my $rowID (sort keys %rows) {          for my $rowID (sort keys %rows) {
257              # Format the row.              # Format the row.
258              print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";              push @outputLines, $rows{$rowID};
             # Flip the row type.  
             $rowType = 1 - $rowType;  
259              # Count the row.              # Count the row.
260              $rowCount++;              $rowCount++;
261          }          }
262          # All done, terminate the table and close the file.          # All done, write the Wiki Page.
263          print GROUPFILE "</table>\n";          my $rc = $wiki->Save($outPageName, $outputWeb, 'OrganismDataSummariesStats', join("\n", @outputLines));
         close GROUPFILE;  
264          Trace("$rowCount genomes processed.") if T(2);          Trace("$rowCount genomes processed.") if T(2);
265            if (! $rc) {
266                Confess("Error saving $outPageName: $wiki->{error}");
267      }      }
268            # Now save the group totals.
269            $summaries{$groupID} = \%groupTotals;
270        }
271        # Now produce the summary table.
272        my $sumPageName = "OrganismDataSummariesStats";
273        my @sumLines = ();
274        # Start the table.  Asterisks make a cell a column header. An extra space at the front right-justifies it.
275        push @sumLines, "| " . join("", map { " *$_* |" } ("Group name", "Genomes")) .
276                               join("", map { "  *$_* |"} ("[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)",
277                                                           "Named genes in subsystems",            # s0
278                                                           "Named genes not in subsystems",        # n0
279                                                           "Hypothetical genes in subsystems",     # s1
280                                                           "Hypothetical genes not in subsystems", # n1
281                                                           "RNAs"));
282        # Put in the data rows.
283        for my $groupName (sort keys %summaries) {
284            my $group = $summaries{$groupName};
285            # Create the table row.
286            my $rowHtml = "| [[$groupName]] |" . join("", map { "  " . Tracer::CommaFormat($group->{$_}) . " |" }
287                                                      ('genomes', 'pegs', @columnTypes, 'RNAs'));
288            push @sumLines, $rowHtml;
289        }
290        # Write the page.
291        my $rc = $wiki->Save($sumPageName, $outputWeb, 'WebHome', join("\n", @sumLines));
292      # We're all done.      # We're all done.
293      Trace("Processing complete.") if T(2);      Trace("Processing complete.") if T(2);
294    };
295    if ($@) {
296        Trace("Stats failed with error: $@") if T(0);
297        $rtype = "error";
298    } else {
299        Trace("Stats complete.") if T(2);
300        $rtype = "no error";
301  }  }
302    if ($options->{phone}) {
303  =head3 Fix      my $msgID = Tracer::SendSMS($options->{phone}, "GenomeStats terminated with $rtype.");
304        if ($msgID) {
305  C<< my %fixedHash = Fix(%groupHash); >>          Trace("Phone message sent with ID $msgID.") if T(2);
306        } else {
307  Prepare a genome group hash for processing. Groups with the same primary name will be combined.          Trace("Phone message not sent.") if T(2);
 The primary name is the first capitalized word in the group name.  
   
 =over 4  
   
 =item groupHash  
   
 Hash to be fixed up.  
   
 =item RETURN  
   
 Returns a fixed-up version of the hash.  
   
 =back  
   
 =cut  
   
 sub Fix {  
     # Get the parameters.  
     my (%groupHash) = @_;  
     # Create the result hash.  
     my %retVal = ();  
     # Copy over the genomes.  
     for my $groupID (keys %groupHash) {  
         # Make a safety copy of the group ID.  
         my $realGroupID = $groupID;  
         # Yank the primary name.  
         if ($groupID =~ /([A-Z]\w+)/) {  
             $realGroupID = $1;  
         }  
         # Append this group's genomes into the result hash.  
         Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});  
308      }      }
     # Return the result hash.  
     return %retVal;  
309  }  }
310    
   
311  1;  1;

Legend:
Removed from v.1.14  
changed lines
  Added in v.1.36

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3