[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Sun Jun 18 05:37:14 2006 UTC revision 1.31, Tue Feb 5 05:47:32 2008 UTC
# Line 6  Line 6 
6  the genomes in each of the genome groups. Genomes that are new to this version  the genomes in each of the genome groups. Genomes that are new to this version
7  of the Sprout will be specially marked. In order for this to work, both the  of the Sprout will be specially marked. In order for this to work, both the
8  current and previous Sprout databases must be available on this machine.  current and previous Sprout databases must be available on this machine.
 This is one positional parameter: the name of a directory in which to place  
 the include files.  
9    
10  The currently-supported command-line options are as follows.  The currently-supported command-line options are as follows.
11    
# Line 61  Line 59 
59    
60  Style to use for small-text markers (e.g. NEW!)  Style to use for small-text markers (e.g. NEW!)
61    
62    =item numStyle
63    
64    Style to use for numeric cells.
65    
66    =item counterStyle
67    
68    Style to use for counter cells.
69    
70  =item linkCGI  =item linkCGI
71    
72  Path to the CGI script for displaying detailed statistics.  Path to the CGI script for displaying detailed statistics.
73    
74    =item noNewCheck
75    
76    If specified, the check for new genomes in the group is suppressed. This
77    may need to be done if there's been a change in the database definition. Note
78    that all this really does is keep the B<NEW!> symbol from showing. It does
79    not affect which genomes show up in the table.
80    
81  =back  =back
82    
83  =cut  =cut
84    
85  use strict;  use strict;
86  use Tracer;  use Tracer;
 use DocUtils;  
 use TestUtils;  
87  use Cwd;  use Cwd;
88  use File::Copy;  use File::Copy;
89  use File::Path;  use File::Path;
# Line 80  Line 91 
91  use SFXlate;  use SFXlate;
92  use CGI qw(:standard);  use CGI qw(:standard);
93  use FIG;  use FIG;
 no warnings 'once'; # only when coding  
94    
95  # Get the command-line options and parameters.  # Get the command-line options and parameters.
96  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
97                                             {                                             {
98                                              strict => [0, 'keep related groups separate'],                                              strict => [0, 'keep related groups separate'],
99                                              oddStyle => ['odd', 'style for odd rows'],                                              oddStyle => ['odd', 'style for odd rows'],
100                                                trace => [2, 'tracing level'],
101                                              evenStyle => ['even', 'style for even rows'],                                              evenStyle => ['even', 'style for even rows'],
102                                              tableStyle => ['genomestats', 'style for whole table'],                                              tableStyle => ['genomestats', 'style for whole table'],
103                                              markerStyle => ['tinytext', 'style for markers'],                                              markerStyle => ['tinytext', 'style for markers'],
104                                                numStyle => ['numcell', 'style for cells with numeric values'],
105                                                counterStyle => ['countercell', 'style for cells with counter values'],
106                                              linkCGI => ['../FIG/genome_statistics.cgi',                                              linkCGI => ['../FIG/genome_statistics.cgi',
107                                                          'path to CGI script for detailed statistics'],                                                          'path to CGI script for detailed statistics'],
108                                                noNewCheck => [0, 'if specified, skips the check for new genomes'],
109                                                targetDir => ["$FIG_Config::nmpdr_base/next/html/includes",
110                                                              'target directory'],
111                                             },                                             },
112                                             "<targetDir>",                                             "",
113                                             @ARGV);                                             @ARGV);
114    # The return type (error/no error) goes in here.
115    my $rtype;
116    eval {
117        # This table controls the special attribute columns. For each we need to know the attribute name and the
118        # column title. If any genomes in a group have a value for one of the special columns, that column is
119        # displayed along with the attribute values.
120        my %specialCols = (Serotype => 'Serotype_code',
121                           Phenotype => 'Phenotype');
122  # Verify the directory name.  # Verify the directory name.
123  my $targetDir = $parameters[0];      my $targetDir = $options->{targetDir};
124  if (! $targetDir) {  if (! $targetDir) {
125      Confess("No target directory specified.");      Confess("No target directory specified.");
126  } elsif (! -d $targetDir) {  } elsif (! -d $targetDir) {
127      Confess("Target directory $targetDir not found.");      Confess("Target directory $targetDir not found.");
128  } else {  } else {
     # *Get the old Sprout.  
     my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);  
     # Extract the genome group data from the old Sprout.  
     my %oldGroupHash = $oldSprout->GetGroups();  
     if (! $options->{strict}) {  
         %oldGroupHash = FixGroups(%oldGroupHash);  
     }  
129      # Get the new Sprout.      # Get the new Sprout.
130      my $sprout = SFXlate->new_sprout_only();      my $sprout = SFXlate->new_sprout_only();
131      my %newGroupHash = $sprout->GetGroups();      my %newGroupHash = $sprout->GetGroups();
132            # Extract the genome group data from the new Sprout.
133      if (! $options->{strict}) {      if (! $options->{strict}) {
134          %newGroupHash = FixGroups(%newGroupHash);              %newGroupHash = $sprout->Fix(%newGroupHash);
135      }      }
136            # This hash will be used to determine which genomes are new.
137            my %oldGroupHash = ();
138            if ($options->{noNewCheck}) {
139                # Here we can't look at the old Sprout. Set up the hash
140                # so it looks like the old Sprout's data is the same as ours.
141                %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
142            } else {
143                # Get the old Sprout.
144                my $oldSprout = SFXlate->old_sprout_only();
145                # Extract the genome group data from the old Sprout.
146                %oldGroupHash = $oldSprout->GetGroups();
147                if (! $options->{strict}) {
148                    %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
149                }
150            }
151            # Get a FIG object for computing attributes.
152            my $fig = FIG->new();
153            # Get the super-group list.
154            my @superGroups = sort keys %newGroupHash;
155            # Set up some useful stuff for the four count columns.
156            my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
157                              s1 => "hypo_sub", n1 => "hypo_nosub" );
158            my @columnTypes = ('s0', 'n0', 's1', 'n1');
159            # Get the styles.
160            my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},
161                                                         $options->{evenStyle}, $options->{oddStyle});
162            my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});
163            # Prepare a hash for the summary counters. These will be used on the organism summary page.
164            my %summaries = ();
165      # Loop through the groups.      # Loop through the groups.
166      for my $groupID (keys %newGroupHash) {          for my $groupID (@superGroups) {
167          Trace("Processing group $groupID.") if T(2);          Trace("Processing group $groupID.") if T(2);
168                # Create a hash for summarizing the counters.
169                my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
170                                   map { $_ => } @columnTypes, features => 0 );
171          # Get the genomes from the new hash.          # Get the genomes from the new hash.
172          my @newGenomes = @{$newGroupHash{$groupID}};          my @newGenomes = @{$newGroupHash{$groupID}};
173          # Create a hash for finding if a genome is in the old group. If the entire group is          # Create a hash for finding if a genome is in the old group. If the entire group is
# Line 127  Line 177 
177              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
178          }          }
179          # Create the output file.          # Create the output file.
180          Open(\*GROUPFILE, ">$targetDir/$groupID.inc");              my $outFileName = "stats-" . lc($groupID) . ".inc";
181          # Get the styles.              Open(\*GROUPFILE, ">$targetDir/$outFileName");
182          my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},              # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
183                                                       $options->{evenStyle}, $options->{oddStyle});              # a sub-hash that translates each genome ID to its applicable attribute value (if any).
184          # Start the table.              my %specialData = ();
185          print GROUPFILE "<table class=\"$tableStyle\">\n";              for my $specialColumn (keys %specialCols) {
186          # Create the header row.                  # Get the attribute mapping.
187          print GROUPFILE Tr( { class => 'odd' }, th("Strain annotated in NMPDR",                  my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
188                                                   "Genome size, bp",                  # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
189                    # for columns that exist in the output.
190                    if (scalar(keys %specialDataList)) {
191                        $specialData{$specialColumn} = \%specialDataList;
192                    }
193                }
194                # Set up the column names.
195                my @columnNames = "Strain annotated in NMPDR";
196                push @columnNames, sort keys %specialData;
197                push @columnNames,  "Genome size, bp",
198                                                   "Protein Encoding Genes (PEGs)",                                                   "Protein Encoding Genes (PEGs)",
199                                                   "Named genes in subsystems",            # s0                                                   "Named genes in subsystems",            # s0
200                                                   "Named genes not in subsystems",        # n0                                                   "Named genes not in subsystems",        # n0
201                                                   "Hypothetical genes in subsystems",     # s1                                                   "Hypothetical genes in subsystems",     # s1
202                                                   "Hypothetical genes not in subsystems", # n1                                                   "Hypothetical genes not in subsystems", # n1
203                                                   "RNAs")) . "\n";                                  "Subsystems",
204          # Set up some useful stuff for the four count columns.                                  "RNAs";
205          my %linkParms = ( s0 => "nohypo_sub", n0 => "nohypo_nosub",              # Start the table.
206                            s1 => "hypo_sub", n1 => "hypo_nosub" );              print GROUPFILE "<table class=\"$tableStyle\">\n";
207          my @columnTypes = ('s0', 'n0', 's1', 'n1');              # Create the header row.
208                print GROUPFILE Tr( { class => 'odd' }, th(\@columnNames)) . "\n";
209          # The data rows will be built next. We'll be putting them into a hash keyed by          # The data rows will be built next. We'll be putting them into a hash keyed by
210          # organism name. The hash enables us to spit them out sorted by name.          # organism name. The hash enables us to spit them out sorted by name.
211          my %rows = ();          my %rows = ();
212                # This variable is used to hold the counts.
213                my $num;
214          # Loop through the genomes in the new group.          # Loop through the genomes in the new group.
215          for my $genomeID (@newGenomes) {          for my $genomeID (@newGenomes) {
216                    # Count this genome.
217                    $groupTotals{genomes}++;
218              # Check to see if this genome is new.              # Check to see if this genome is new.
219              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
220              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
221              # Get the strain name.              # Get the strain name.
222              my $genomeName = $sprout->GenusSpecies($genomeID);              my $genomeName = $sprout->GenusSpecies($genomeID);
223                    # Apply a link.
224                    my $genomeText = CGI::a({ href => "../FIG/genome_statistics.cgi?genome=$genomeID;SPROUT=1" }, $genomeName);
225              # If this is a new strain, build the HTML for the NEW! mark.              # If this is a new strain, build the HTML for the NEW! mark.
226              if ($new) {              if ($new) {
227                  $new = " <span class=\"$markerStyle\">NEW!</span>";                  $new = " <span class=\"$markerStyle\">NEW!</span>";
228              }              }
229              # Get the genome length.              # Get the genome length.
230              my $genomeLen = $sprout->GenomeLength($genomeID);                  $num = $sprout->GenomeLength($genomeID);
231                    my $genomeLen = Tracer::CommaFormat($num);
232              # Get the number of PEGs.              # Get the number of PEGs.
233              my $pegCount = $sprout->FeatureCount($genomeID, 'peg');                  $num = $sprout->FeatureCount($genomeID, 'peg');
234                    my $pegCount = Tracer::CommaFormat($num);
235                    $groupTotals{pegs} += $num;
236              # Get the number of RNAs.              # Get the number of RNAs.
237              my $rnaCount = $sprout->FeatureCount($genomeID, 'rna');                  $num = $sprout->FeatureCount($genomeID, 'rna');
238                    my $rnaCount = Tracer::CommaFormat($num);
239                    $groupTotals{RNAs} += $num;
240                    # If there are no RNAs, we say we don't know the number, since we know there
241                    # must be RNAs somewhere.
242                    if (! $rnaCount) {
243                        $rnaCount = "n/d";
244                    }
245              # Now we have four categories of features to work with, for each              # Now we have four categories of features to work with, for each
246              # combination of named or hypothetical vs. in-subsystem or              # combination of named or hypothetical vs. in-subsystem or
247              # not-in-subsystem. First, we get all of the feature assignments for              # not-in-subsystem. First, we get all of the feature assignments for
248              # the genome.              # the genome.
249              my $assignHash = $sprout->GenomeAssignments($genomeID);              my $assignHash = $sprout->GenomeAssignments($genomeID);
250              # Next, we get all of the features in the genome that belong to a              # Next, we get all of the features in the genome that belong to a
251              # subsystem. This involves a query via the subsystem spreadsheet.                  # subsystem.
252              my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'],                  my %ssHash = $sprout->GenomeSubsystemData($genomeID);
                                                     "IsGenomeOf(from-link) = ?",  
                                                     [$genomeID], 'ContainsFeature(to-link)');  
253              # Create a hash to track the four categories. "s" or "n" indicates              # Create a hash to track the four categories. "s" or "n" indicates
254              # in or out of a subsystem. "1" or "0" indicates hypothetical or              # in or out of a subsystem. "1" or "0" indicates hypothetical or
255              # real.              # real.
# Line 190  Line 264 
264                  $counters{$ss} += 1;                  $counters{$ss} += 1;
265                  $totalFeatures++;                  $totalFeatures++;
266              }              }
267                    Trace("$totalFeatures total features found for $genomeID.") if T(3);
268                    for my $counterKey (@columnTypes) {
269                        $groupTotals{$counterKey} += $counters{$counterKey};
270                    }
271                    $groupTotals{features} += $totalFeatures;
272              # We have all our data. Next we need to compute the percentages and the links.              # We have all our data. Next we need to compute the percentages and the links.
273              # First, the link stuff.              # First, the link stuff.
274              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";
# Line 197  Line 276 
276              for my $type (keys %linkParms) {              for my $type (keys %linkParms) {
277                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },
278                                       sprintf("%d(%.1f%%)", $counters{$type},                                       sprintf("%d(%.1f%%)", $counters{$type},
279                                               $counters{$type} * 100 / $totalFeatures));                                                   Tracer::Percent($counters{$type}, $totalFeatures)));
280              }              }
281              # Create the row text.                  my @counterValues = map { $counters{$_} } @columnTypes;
282              my $rowHtml = td( "$genomeName$new", $genomeLen, $pegCount,                  # The last link is a button to look at the subsystem summaries.
283                                map { $counters{$_} } @columnTypes,                  my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
284                                $rnaCount );                                                     [$genomeID]);
285                    my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";
286                    my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";
287                    # Start creating the table cells.
288                    my $rowHtml = td("$genomeText$new");
289                    # Add any special columns.
290                    for my $specialCol (keys %specialData) {
291                        # Here we get the attribute value. If there is none, we leave the column blank.
292                        my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
293                        $rowHtml .= td($attribute);
294                    }
295                    # Now add the data columns.
296                    $rowHtml .= join("",
297                                    td({ class => $numStyle }, $genomeLen),
298                                    td({ class => $numStyle }, $pegCount),
299                                    td({ class => $counterStyle }, \@counterValues),
300                                    td({ class => $numStyle }, $ssCol),
301                                    td({ class => $numStyle }, $rnaCount),
302                                    );
303              # Put it in the row hash.              # Put it in the row hash.
304              $rows{$genomeName} = $rowHtml;              $rows{$genomeName} = $rowHtml;
305          }          }
# Line 221  Line 318 
318              # Count the row.              # Count the row.
319              $rowCount++;              $rowCount++;
320          }          }
321          # All done, close the file.              # All done, terminate the table and close the file.
322                print GROUPFILE "</table>\n";
323          close GROUPFILE;          close GROUPFILE;
324          Trace("$rowCount genomes processed.") if T(2);          Trace("$rowCount genomes processed.") if T(2);
325                # Now save the group totals.
326                $summaries{$groupID} = \%groupTotals;
327      }      }
328            # Now produce the summary table.
329            my $sumFileName = "stats-groups.inc";
330            Open(\*SUMFILE, ">$targetDir/$sumFileName");
331            # Start the table.
332            print SUMFILE "<table class=\"$tableStyle\">\n";
333            # Create the header row.
334            print SUMFILE Tr( { class => 'odd' }, th(["Group name",
335                                                     "Genomes",
336                                                     "Protein Encoding Genes (PEGs)",
337                                                     "Named genes in subsystems",            # s0
338                                                     "Named genes not in subsystems",        # n0
339                                                     "Hypothetical genes in subsystems",     # s1
340                                                     "Hypothetical genes not in subsystems", # n1
341                                                     "RNAs",
342                                                       ])) . "\n";
343            # Set up a flag for the odd-even styling.
344            my $rowFlag = 0;
345            # Put in the data rows.
346            for my $groupName (sort keys %summaries) {
347                my $group = $summaries{$groupName};
348                # Compute the link for the current group.
349                my $groupLink = a({ href => $sprout->GroupPageName($groupName) }, $groupName);
350                # Create the table row.
351                my $rowHtml = join("",
352                                   td($groupLink),
353                                   td({ class => $numStyle }, Tracer::CommaFormat($group->{genomes})),
354                                   td({ class => $numStyle }, Tracer::CommaFormat($group->{pegs})),
355                                   td({ class => $counterStyle }, [ map { Tracer::CommaFormat($group->{$_}) } @columnTypes ]),
356                                   td({ class => $numStyle }, Tracer::CommaFormat($group->{RNAs})),
357                                  );
358                print SUMFILE Tr( { class => $rowStyle[$rowFlag] }, $rowHtml ) . "\n";
359                # Flip the row style.
360                $rowFlag = 1 - $rowFlag;
361            }
362            # Terminate the table and close the file.
363            print SUMFILE "</table>\n";
364            close SUMFILE;
365      # We're all done.      # We're all done.
366      Trace("Processing complete.") if T(2);      Trace("Processing complete.") if T(2);
367  }  }
368    };
369  =head3 Fix  if ($@) {
370        Trace("Stats failed with error: $@") if T(0);
371  C<< my %fixedHash = Fix(%groupHash); >>      $rtype = "error";
372    } else {
373  Prepare a genome group hash for processing. Groups with the same primary name will be combined.      Trace("Stats complete.") if T(2);
374  The primary name is the first capitalized word in the group name.      $rtype = "no error";
   
 =over 4  
   
 =item groupHash  
   
 Hash to be fixed up.  
   
 =item RETURN  
   
 Returns a fixed-up version of the hash.  
   
 =back  
   
 =cut  
   
 sub Fix {  
     # Get the parameters.  
     my (%groupHash) = @_;  
     # Create the result hash.  
     my %retVal = ();  
     # Copy over the genomes.  
     for my $groupID (keys %groupHash) {  
         # Make a safety copy of the group ID.  
         my $realGroupID = $groupID;  
         # Yank the primary name.  
         if ($groupID =~ /([A-Z]\w+)/) {  
             $realGroupID = $1;  
375          }          }
376          # Append this group's genomes into the result hash.  if ($options->{phone}) {
377          Tracer::AddToListMap(\%retVal, $realGroupID, $groupHash{$groupID});      my $msgID = Tracer::SendSMS($options->{phone}, "GenomeStats terminated with $rtype.");
378        if ($msgID) {
379            Trace("Phone message sent with ID $msgID.") if T(2);
380        } else {
381            Trace("Phone message not sent.") if T(2);
382      }      }
     # Return the result hash.  
     return %retVal;  
383  }  }
384    
   
385  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.31

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3