[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Sun Jun 18 05:37:14 2006 UTC revision 1.26, Sun Oct 8 05:44:55 2006 UTC
# Line 61  Line 61 
61    
62  Style to use for small-text markers (e.g. NEW!)  Style to use for small-text markers (e.g. NEW!)
63    
64    =item numStyle
65    
66    Style to use for numeric cells.
67    
68    =item counterStyle
69    
70    Style to use for counter cells.
71    
72  =item linkCGI  =item linkCGI
73    
74  Path to the CGI script for displaying detailed statistics.  Path to the CGI script for displaying detailed statistics.
75    
76    =item noNewCheck
77    
78    If specified, the check for new genomes in the group is suppressed. This
79    may need to be done if there's been a change in the database definition. Note
80    that all this really does is keep the B<NEW!> symbol from showing. It does
81    not affect which genomes show up in the table.
82    
83  =back  =back
84    
85  =cut  =cut
# Line 87  Line 102 
102                                             {                                             {
103                                              strict => [0, 'keep related groups separate'],                                              strict => [0, 'keep related groups separate'],
104                                              oddStyle => ['odd', 'style for odd rows'],                                              oddStyle => ['odd', 'style for odd rows'],
105                                                trace => [2, 'tracing level'],
106                                              evenStyle => ['even', 'style for even rows'],                                              evenStyle => ['even', 'style for even rows'],
107                                              tableStyle => ['genomestats', 'style for whole table'],                                              tableStyle => ['genomestats', 'style for whole table'],
108                                              markerStyle => ['tinytext', 'style for markers'],                                              markerStyle => ['tinytext', 'style for markers'],
109                                                numStyle => ['numcell', 'style for cells with numeric values'],
110                                                counterStyle => ['countercell', 'style for cells with counter values'],
111                                              linkCGI => ['../FIG/genome_statistics.cgi',                                              linkCGI => ['../FIG/genome_statistics.cgi',
112                                                          'path to CGI script for detailed statistics'],                                                          'path to CGI script for detailed statistics'],
113                                                groupFile => ["$FIG_Config::sproutData/groups.tbl",
114                                                              'location of the NMPDR group description file'],
115                                                noNewCheck => [0, 'if specified, skips the check for new genomes'],
116                                                targetDir => ["$FIG_Config::nmpdr_base/next/html/includes",
117                                                              'target directory'],
118                                             },                                             },
119                                             "<targetDir>",                                             "",
120                                             @ARGV);                                             @ARGV);
121  # Verify the directory name.  # Verify the directory name.
122  my $targetDir = $parameters[0];  my $targetDir = $options->{targetDir};
123  if (! $targetDir) {  if (! $targetDir) {
124      Confess("No target directory specified.");      Confess("No target directory specified.");
125  } elsif (! -d $targetDir) {  } elsif (! -d $targetDir) {
126      Confess("Target directory $targetDir not found.");      Confess("Target directory $targetDir not found.");
127  } else {  } else {
     # *Get the old Sprout.  
     my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);  
     # Extract the genome group data from the old Sprout.  
     my %oldGroupHash = $oldSprout->GetGroups();  
     if (! $options->{strict}) {  
         %oldGroupHash = FixGroups(%oldGroupHash);  
     }  
128      # Get the new Sprout.      # Get the new Sprout.
129      my $sprout = SFXlate->new_sprout_only();      my $sprout = SFXlate->new_sprout_only();
130      my %newGroupHash = $sprout->GetGroups();      my %newGroupHash = $sprout->GetGroups();
131        # Extract the genome group data from the new Sprout.
132      if (! $options->{strict}) {      if (! $options->{strict}) {
133          %newGroupHash = FixGroups(%newGroupHash);          %newGroupHash = Sprout::Fix(%newGroupHash);
134      }      }
135        # This hash will be used to determine which genomes are new.
136        my %oldGroupHash = ();
137        if ($options->{noNewCheck}) {
138            # Here we can't look at the old Sprout. Set up the hash
139            # so it looks like the old Sprout's data is the same as ours.
140            %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
141        } else {
142            # Get the old Sprout.
143            my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);
144            # Extract the genome group data from the old Sprout.
145            %oldGroupHash = $oldSprout->GetGroups();
146            if (! $options->{strict}) {
147                %oldGroupHash = Sprout::Fix(%oldGroupHash);
148            }
149        }
150        # Read the group file.
151        my %groupData = Sprout::ReadGroupFile($options->{groupFile});
152        # Set up some useful stuff for the four count columns.
153        my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
154                          s1 => "hypo_sub", n1 => "hypo_nosub" );
155        my @columnTypes = ('s0', 'n0', 's1', 'n1');
156        # Get the styles.
157        my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},
158                                                     $options->{evenStyle}, $options->{oddStyle});
159        my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});
160        # Prepare a hash for the summary counters. These will be used on the organism summary page.
161        my %summaries = ();
162      # Loop through the groups.      # Loop through the groups.
163      for my $groupID (keys %newGroupHash) {      for my $groupID (keys %newGroupHash) {
164          Trace("Processing group $groupID.") if T(2);          Trace("Processing group $groupID.") if T(2);
165            # Create a hash for summarizing the counters.
166            my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
167                               map { $_ => } @columnTypes, features => 0 );
168          # Get the genomes from the new hash.          # Get the genomes from the new hash.
169          my @newGenomes = @{$newGroupHash{$groupID}};          my @newGenomes = @{$newGroupHash{$groupID}};
170          # Create a hash for finding if a genome is in the old group. If the entire group is          # Create a hash for finding if a genome is in the old group. If the entire group is
# Line 127  Line 174 
174              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
175          }          }
176          # Create the output file.          # Create the output file.
177          Open(\*GROUPFILE, ">$targetDir/$groupID.inc");          my $outFileName = "stats-" . lc($groupID) . ".inc";
178          # Get the styles.          Open(\*GROUPFILE, ">$targetDir/$outFileName");
         my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},  
                                                      $options->{evenStyle}, $options->{oddStyle});  
179          # Start the table.          # Start the table.
180          print GROUPFILE "<table class=\"$tableStyle\">\n";          print GROUPFILE "<table class=\"$tableStyle\">\n";
181          # Create the header row.          # Create the header row.
182          print GROUPFILE Tr( { class => 'odd' }, th("Strain annotated in NMPDR",          print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR",
183                                                   "Genome size, bp",                                                   "Genome size, bp",
184                                                   "Protein Encoding Genes (PEGs)",                                                   "Protein Encoding Genes (PEGs)",
185                                                   "Named genes in subsystems",            # s0                                                   "Named genes in subsystems",            # s0
186                                                   "Named genes not in subsystems",        # n0                                                   "Named genes not in subsystems",        # n0
187                                                   "Hypothetical genes in subsystems",     # s1                                                   "Hypothetical genes in subsystems",     # s1
188                                                   "Hypothetical genes not in subsystems", # n1                                                   "Hypothetical genes not in subsystems", # n1
189                                                   "RNAs")) . "\n";                                                   "Subsystems",
190          # Set up some useful stuff for the four count columns.                                                   "RNAs",
191          my %linkParms = ( s0 => "nohypo_sub", n0 => "nohypo_nosub",                                                     ])) . "\n";
                           s1 => "hypo_sub", n1 => "hypo_nosub" );  
         my @columnTypes = ('s0', 'n0', 's1', 'n1');  
192          # The data rows will be built next. We'll be putting them into a hash keyed by          # The data rows will be built next. We'll be putting them into a hash keyed by
193          # organism name. The hash enables us to spit them out sorted by name.          # organism name. The hash enables us to spit them out sorted by name.
194          my %rows = ();          my %rows = ();
195            # This variable is used to hold the counts.
196            my $num;
197          # Loop through the genomes in the new group.          # Loop through the genomes in the new group.
198          for my $genomeID (@newGenomes) {          for my $genomeID (@newGenomes) {
199                # Count this genome.
200                $groupTotals{genomes}++;
201              # Check to see if this genome is new.              # Check to see if this genome is new.
202              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
203              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
# Line 161  Line 208 
208                  $new = " <span class=\"$markerStyle\">NEW!</span>";                  $new = " <span class=\"$markerStyle\">NEW!</span>";
209              }              }
210              # Get the genome length.              # Get the genome length.
211              my $genomeLen = $sprout->GenomeLength($genomeID);              $num = $sprout->GenomeLength($genomeID);
212                my $genomeLen = Tracer::CommaFormat($num);
213              # Get the number of PEGs.              # Get the number of PEGs.
214              my $pegCount = $sprout->FeatureCount($genomeID, 'peg');              $num = $sprout->FeatureCount($genomeID, 'peg');
215                my $pegCount = Tracer::CommaFormat($num);
216                $groupTotals{pegs} += $num;
217              # Get the number of RNAs.              # Get the number of RNAs.
218              my $rnaCount = $sprout->FeatureCount($genomeID, 'rna');              $num = $sprout->FeatureCount($genomeID, 'rna');
219                my $rnaCount = Tracer::CommaFormat($num);
220                $groupTotals{RNAs} += $num;
221                # If there are no RNAs, we say we don't know the number, since we know there
222                # must be RNAs somewhere.
223                if (! $rnaCount) {
224                    $rnaCount = "n/d";
225                }
226              # Now we have four categories of features to work with, for each              # Now we have four categories of features to work with, for each
227              # combination of named or hypothetical vs. in-subsystem or              # combination of named or hypothetical vs. in-subsystem or
228              # not-in-subsystem. First, we get all of the feature assignments for              # not-in-subsystem. First, we get all of the feature assignments for
229              # the genome.              # the genome.
230              my $assignHash = $sprout->GenomeAssignments($genomeID);              my $assignHash = $sprout->GenomeAssignments($genomeID);
231              # Next, we get all of the features in the genome that belong to a              # Next, we get all of the features in the genome that belong to a
232              # subsystem. This involves a query via the subsystem spreadsheet.              # subsystem.
233              my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'],              my %ssHash = $sprout->GenomeSubsystemData($genomeID);
                                                     "IsGenomeOf(from-link) = ?",  
                                                     [$genomeID], 'ContainsFeature(to-link)');  
234              # Create a hash to track the four categories. "s" or "n" indicates              # Create a hash to track the four categories. "s" or "n" indicates
235              # in or out of a subsystem. "1" or "0" indicates hypothetical or              # in or out of a subsystem. "1" or "0" indicates hypothetical or
236              # real.              # real.
# Line 190  Line 245 
245                  $counters{$ss} += 1;                  $counters{$ss} += 1;
246                  $totalFeatures++;                  $totalFeatures++;
247              }              }
248                Trace("$totalFeatures total features found for $genomeID.") if T(3);
249                for my $counterKey (@columnTypes) {
250                    $groupTotals{$counterKey} += $counters{$counterKey};
251                }
252                $groupTotals{features} += $totalFeatures;
253              # We have all our data. Next we need to compute the percentages and the links.              # We have all our data. Next we need to compute the percentages and the links.
254              # First, the link stuff.              # First, the link stuff.
255              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";
# Line 197  Line 257 
257              for my $type (keys %linkParms) {              for my $type (keys %linkParms) {
258                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },
259                                       sprintf("%d(%.1f%%)", $counters{$type},                                       sprintf("%d(%.1f%%)", $counters{$type},
260                                               $counters{$type} * 100 / $totalFeatures));                                               Tracer::Percent($counters{$type}, $totalFeatures)));
261              }              }
262              # Create the row text.              my @counterValues = map { $counters{$_} } @columnTypes;
263              my $rowHtml = td( "$genomeName$new", $genomeLen, $pegCount,              # The last link is a button to look at the subsystem summaries.
264                                map { $counters{$_} } @columnTypes,              my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
265                                $rnaCount );                                                 [$genomeID]);
266                my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";
267                my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";
268                # Create the row text. Note that we use the distributive capability of the TD
269                # function to apply the same style to each one.
270                my $rowHtml = join("",
271                                   td("$genomeName$new"),
272                                   td({ class => $numStyle }, $genomeLen),
273                                   td({ class => $numStyle }, $pegCount),
274                                   td({ class => $counterStyle }, \@counterValues),
275                                   td({ class => $numStyle }, $ssCol),
276                                   td({ class => $numStyle }, $rnaCount),
277                                  );
278              # Put it in the row hash.              # Put it in the row hash.
279              $rows{$genomeName} = $rowHtml;              $rows{$genomeName} = $rowHtml;
280          }          }
# Line 221  Line 293 
293              # Count the row.              # Count the row.
294              $rowCount++;              $rowCount++;
295          }          }
296          # All done, close the file.          # All done, terminate the table and close the file.
297            print GROUPFILE "</table>\n";
298          close GROUPFILE;          close GROUPFILE;
299          Trace("$rowCount genomes processed.") if T(2);          Trace("$rowCount genomes processed.") if T(2);
300            # Now save the group totals.
301            $summaries{$groupID} = \%groupTotals;
302      }      }
303        # Now produce the summary table.
304        my $sumFileName = "stats-groups.inc";
305        Open(\*SUMFILE, ">$targetDir/$sumFileName");
306        # Start the table.
307        print SUMFILE "<table class=\"$tableStyle\">\n";
308        # Create the header row.
309        print SUMFILE Tr( { class => 'odd' }, th(["Group name",
310                                                 "Genomes",
311                                                 "Protein Encoding Genes (PEGs)",
312                                                 "Named genes in subsystems",            # s0
313                                                 "Named genes not in subsystems",        # n0
314                                                 "Hypothetical genes in subsystems",     # s1
315                                                 "Hypothetical genes not in subsystems", # n1
316                                                 "RNAs",
317                                                   ])) . "\n";
318        # Set up a flag for the odd-even styling.
319        my $rowFlag = 0;
320        # Put in the data rows.
321        for my $groupName (sort keys %summaries) {
322            my $group = $summaries{$groupName};
323            # Compute the link for the current group.
324            my $groupLink = a({ href => $groupData{$groupName}->[0] }, $groupName);
325            # Create the table row.
326            my $rowHtml = join("",
327                               td($groupLink),
328                               td({ class => $numStyle }, Tracer::CommaFormat($group->{genomes})),
329                               td({ class => $numStyle }, Tracer::CommaFormat($group->{pegs})),
330                               td({ class => $counterStyle }, [ map { Tracer::CommaFormat($group->{$_}) } @columnTypes ]),
331                               td({ class => $numStyle }, Tracer::CommaFormat($group->{RNAs})),
332                              );
333            print SUMFILE Tr( { class => $rowStyle[$rowFlag] }, $rowHtml ) . "\n";
334            # Flip the row style.
335            $rowFlag = 1 - $rowFlag;
336        }
337        # Terminate the table and close the file.
338        print SUMFILE "</table>\n";
339        close SUMFILE;
340      # We're all done.      # We're all done.
341      Trace("Processing complete.") if T(2);      Trace("Processing complete.") if T(2);
342  }  }
343    
 =head3 Fix  
   
 C<< my %fixedHash = Fix(%groupHash); >>  
   
 Prepare a genome group hash for processing. Groups with the same primary name will be combined.  
 The primary name is the first capitalized word in the group name.  
   
 =over 4  
   
 =item groupHash  
   
 Hash to be fixed up.  
   
 =item RETURN  
   
 Returns a fixed-up version of the hash.  
   
 =back  
   
 =cut  
   
 sub Fix {  
     # Get the parameters.  
     my (%groupHash) = @_;  
     # Create the result hash.  
     my %retVal = ();  
     # Copy over the genomes.  
     for my $groupID (keys %groupHash) {  
         # Make a safety copy of the group ID.  
         my $realGroupID = $groupID;  
         # Yank the primary name.  
         if ($groupID =~ /([A-Z]\w+)/) {  
             $realGroupID = $1;  
         }  
         # Append this group's genomes into the result hash.  
         Tracer::AddToListMap(\%retVal, $realGroupID, $groupHash{$groupID});  
     }  
     # Return the result hash.  
     return %retVal;  
 }  
   
   
344  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.26

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3