[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5, Sun Jun 18 07:16:57 2006 UTC revision 1.35, Mon Jan 19 21:46:21 2009 UTC
# Line 2  Line 2 
2    
3  =head1 Genome Data Generator  =head1 Genome Data Generator
4    
5  This script creates a set of HTML include files that list the statistics for  This script creates a set of Wiki pages that list the statistics for
6  the genomes in each of the genome groups. Genomes that are new to this version  the genomes in each of the genome groups. Genomes that are new to this version
7  of the Sprout will be specially marked. In order for this to work, both the  of the Sprout will be specially marked. In order for this to work, both the
8  current and previous Sprout databases must be available on this machine.  current and previous Sprout databases must be available on this machine.
 This is one positional parameter: the name of a directory in which to place  
 the include files.  
9    
10  The currently-supported command-line options are as follows.  The currently-supported command-line options are as follows.
11    
# Line 39  Line 37 
37    
38  Display this command's parameters and options.  Display this command's parameters and options.
39    
40  =item strict  =item test
41    
42  If specified, strict groups will be used; otherwise, groups with a common primary name  If specified, the output pages will be put into the sandbox instead of the main web.
 will be combined into a single group. (The primary name of the group is the first  
 capitalized word.)  
43    
44  =item oddStyle  =item noNewCheck
45    
46  Style to use for odd rows of the table.  If specified, the check for new genomes in the group is suppressed. This
47    may need to be done if there's been a change in the database definition. Note
48  =item evenStyle  that all this really does is keep the B<NEW!> symbol from showing. It does
49    not affect which genomes show up in the table.
 Style to use for even rows of the table.  
   
 =item tableStyle  
   
 Style to use for the table itself.  
   
 =item markerStyle  
   
 Style to use for small-text markers (e.g. NEW!)  
   
 =item linkCGI  
   
 Path to the CGI script for displaying detailed statistics.  
50    
51  =back  =back
52    
# Line 71  Line 54 
54    
55  use strict;  use strict;
56  use Tracer;  use Tracer;
 use DocUtils;  
 use TestUtils;  
57  use Cwd;  use Cwd;
58  use File::Copy;  use File::Copy;
59  use File::Path;  use File::Path;
60  use Sprout;  use Sprout;
61  use SFXlate;  use SFXlate;
62  use CGI qw(:standard);  use CGI qw(-nosticky);
63  use FIG;  use FIG;
64  no warnings 'once'; # only when coding  use WikiTools;
65    
66  # Get the command-line options and parameters.  # Get the command-line options and parameters.
67  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],  my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
68                                             {                                             {
69                                              strict => [0, 'keep related groups separate'],                                              test => [0, 'if specified, publishes to the wiki sandbox instead of the main web'],
70                                              oddStyle => ['odd', 'style for odd rows'],                                              noNewCheck => [0, 'if specified, skips the check for new genomes'],
                                             evenStyle => ['even', 'style for even rows'],  
                                             tableStyle => ['genomestats', 'style for whole table'],  
                                             markerStyle => ['tinytext', 'style for markers'],  
                                             linkCGI => ['../FIG/genome_statistics.cgi',  
                                                         'path to CGI script for detailed statistics'],  
71                                             },                                             },
72                                             "<targetDir>",                                             "",
73                                             @ARGV);                                             @ARGV);
74  # Verify the directory name.  # The return type (error/no error) goes in here.
75  my $targetDir = $parameters[0];  my $rtype;
76  if (! $targetDir) {  eval {
77      Confess("No target directory specified.");      # This table controls the special attribute columns. For each we need to know the attribute name and the
78  } elsif (! -d $targetDir) {      # column title. If any genomes in a group have a value for one of the special columns, that column is
79      Confess("Target directory $targetDir not found.");      # displayed along with the attribute values.
80  } else {      my %specialCols = (Serotype => 'Serotype_code',
81      # *Get the old Sprout.                         Phenotype => 'Phenotype');
82      my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);      my $outputWeb = ($options->{test} ? 'Sandbox' : 'Main');
     # Extract the genome group data from the old Sprout.  
     my %oldGroupHash = $oldSprout->GetGroups();  
     if (! $options->{strict}) {  
         %oldGroupHash = Fix(%oldGroupHash);  
     }  
83      # Get the new Sprout.      # Get the new Sprout.
84      my $sprout = SFXlate->new_sprout_only();      my $sprout = SFXlate->new_sprout_only();
85      my %newGroupHash = $sprout->GetGroups();      my %newGroupHash = $sprout->GetGroups();
86      if (! $options->{strict}) {      # Get a wiki helper.
87          %newGroupHash = Fix(%newGroupHash);      my $wiki = WikiTools->new();
88        # Extract the genome group data from the new Sprout.
89        %newGroupHash = $sprout->Fix(%newGroupHash);
90        # This hash will be used to determine which genomes are new.
91        my %oldGroupHash = ();
92        if ($options->{noNewCheck}) {
93            # Here we can't look at the old Sprout. Set up the hash
94            # so it looks like the old Sprout's data is the same as ours.
95            %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
96        } else {
97            # Get the old Sprout.
98            my $oldSprout = SFXlate->old_sprout_only();
99            # Extract the genome group data from the old Sprout.
100            %oldGroupHash = $oldSprout->GetGroups();
101            %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
102      }      }
103        # Get a FIG object for computing attributes.
104        my $fig = FIG->new();
105        # Get the super-group list.
106        my @superGroups = sort keys %newGroupHash;
107        # Set up some useful stuff for the four count columns.
108        my $url = "%SCRIPTURL{rest}%/NmpdrPlugin/search?Class=OrgSumSearch;Search=Go";
109        my %linkParms = ( s0 => "$url;hypothetical=named;insubsystem=in;genome=",
110                          n0 => "$url;hypothetical=named;insubsystem=out;genome=",
111                          s1 => "$url;hypothetical=hypo;insubsystem=in;genome=",
112                          n1 => "$url;hypothetical=hypo;insubsystem=out;genome=" );
113        my @columnTypes = ('s0', 'n0', 's1', 'n1');
114        # Prepare a hash for the summary counters. These will be used on the organism summary page.
115        my %summaries = ();
116      # Loop through the groups.      # Loop through the groups.
117      for my $groupID (keys %newGroupHash) {      for my $groupID (@superGroups) {
118          Trace("Processing group $groupID.") if T(2);          Trace("Processing group $groupID.") if T(2);
119            # Create a hash for summarizing the counters.
120            my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
121                               map { $_ => } @columnTypes, features => 0 );
122          # Get the genomes from the new hash.          # Get the genomes from the new hash.
123          my @newGenomes = @{$newGroupHash{$groupID}};          my @newGenomes = @{$newGroupHash{$groupID}};
124          # Create a hash for finding if a genome is in the old group. If the entire group is          # Create a hash for finding if a genome is in the old group. If the entire group is
# Line 126  Line 127 
127          if (exists $oldGroupHash{$groupID}) {          if (exists $oldGroupHash{$groupID}) {
128              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};              %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
129          }          }
130          # Create the output file.          # Compute the name of the wiki page we're building.
131          Open(\*GROUPFILE, ">$targetDir/$groupID.inc");          my $outPageName = "${groupID}Stats";
132          # Get the styles.          # We'll put the data for the page in here.
133          my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},          my @outputLines = ();
134                                                       $options->{evenStyle}, $options->{oddStyle});          # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
135          # Start the table.          # a sub-hash that translates each genome ID to its applicable attribute value (if any).
136          print GROUPFILE "<table class=\"$tableStyle\">\n";          my %specialData = ();
137            for my $specialColumn (keys %specialCols) {
138                # Get the attribute mapping.
139                my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
140                # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
141                # for columns that exist in the output.
142                if (scalar(keys %specialDataList)) {
143                    $specialData{$specialColumn} = \%specialDataList;
144                }
145            }
146            # Set up the column names. Note that an extra space in front of a name will be interpreted by
147            # the Wiki markup as an order to right-justify the text.
148            my @columnNames = "*Strain annotated in NMPDR*";
149            push @columnNames, map { "*$_*" } sort keys %specialData;
150            push @columnNames,  "*Genome size, bp*",
151                                " *%FIG{Protein Encoding Genes}% (PEGs)*",
152                                " *Named genes in subsystems*",            # s0
153                                " *Named genes not in subsystems*",        # n0
154                                " *Hypothetical genes in subsystems*",     # s1
155                                " *Hypothetical genes not in subsystems*", # n1
156                                " *Subsystems*",
157                                " *RNAs*";
158          # Create the header row.          # Create the header row.
159          print GROUPFILE Tr( { class => 'odd' }, th("Strain annotated in NMPDR",          push @outputLines, "| " . join(" | ", @columnNames) . " |";
                                                  "Genome size, bp",  
                                                  "Protein Encoding Genes (PEGs)",  
                                                  "Named genes in subsystems",            # s0  
                                                  "Named genes not in subsystems",        # n0  
                                                  "Hypothetical genes in subsystems",     # s1  
                                                  "Hypothetical genes not in subsystems", # n1  
                                                  "RNAs")) . "\n";  
         # Set up some useful stuff for the four count columns.  
         my %linkParms = ( s0 => "nohypo_sub", n0 => "nohypo_nosub",  
                           s1 => "hypo_sub", n1 => "hypo_nosub" );  
         my @columnTypes = ('s0', 'n0', 's1', 'n1');  
160          # The data rows will be built next. We'll be putting them into a hash keyed by          # The data rows will be built next. We'll be putting them into a hash keyed by
161          # organism name. The hash enables us to spit them out sorted by name.          # organism name. The hash enables us to spit them out sorted by name.
162          my %rows = ();          my %rows = ();
163            # This variable is used to hold the counts.
164            my $num;
165          # Loop through the genomes in the new group.          # Loop through the genomes in the new group.
166          for my $genomeID (@newGenomes) {          for my $genomeID (@newGenomes) {
167                # Count this genome.
168                $groupTotals{genomes}++;
169              # Check to see if this genome is new.              # Check to see if this genome is new.
170              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
171              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
172              # Get the strain name.              # Get the strain name.
173              my $genomeName = $sprout->GenusSpecies($genomeID);              my $genomeName = $sprout->GenusSpecies($genomeID);
174              # If this is a new strain, build the HTML for the NEW! mark.              # Apply a link.
175                my $genomeText = "%SV{\"$genomeName\" id=\"$genomeID\"}%";
176                # If this is a new strain, add the NEW! mark.
177              if ($new) {              if ($new) {
178                  $new = " <span class=\"$markerStyle\">NEW!</span>";                  $genomeText .= " %N%";
179              }              }
180              # Get the genome length.              # Get the genome length.
181              my $genomeLen = $sprout->GenomeLength($genomeID);              $num = $sprout->GenomeLength($genomeID);
182                my $genomeLen = Tracer::CommaFormat($num);
183              # Get the number of PEGs.              # Get the number of PEGs.
184              my $pegCount = $sprout->FeatureCount($genomeID, 'peg');              $num = $sprout->FeatureCount($genomeID, 'peg');
185                my $pegCount = Tracer::CommaFormat($num);
186                $groupTotals{pegs} += $num;
187              # Get the number of RNAs.              # Get the number of RNAs.
188              my $rnaCount = $sprout->FeatureCount($genomeID, 'rna');              $num = $sprout->FeatureCount($genomeID, 'rna');
189                my $rnaCount = Tracer::CommaFormat($num);
190                $groupTotals{RNAs} += $num;
191                # If there are no RNAs, we say we don't know the number, since we know there
192                # must be RNAs somewhere.
193                if (! $rnaCount) {
194                    $rnaCount = "n/d";
195                }
196              # Now we have four categories of features to work with, for each              # Now we have four categories of features to work with, for each
197              # combination of named or hypothetical vs. in-subsystem or              # combination of named or hypothetical vs. in-subsystem or
198              # not-in-subsystem. First, we get all of the feature assignments for              # not-in-subsystem. First, we get all of the feature assignments for
199              # the genome.              # the genome.
200              my $assignHash = $sprout->GenomeAssignments($genomeID);              my $assignHash = $sprout->GenomeAssignments($genomeID);
201              # Next, we get all of the features in the genome that belong to a              # Next, we get all of the features in the genome that belong to a
202              # subsystem. This involves a query via the subsystem spreadsheet.              # subsystem.
203              my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'],              my %ssHash = $sprout->GenomeSubsystemData($genomeID);
                                                     "IsGenomeOf(from-link) = ?",  
                                                     [$genomeID], 'ContainsFeature(to-link)');  
204              # Create a hash to track the four categories. "s" or "n" indicates              # Create a hash to track the four categories. "s" or "n" indicates
205              # in or out of a subsystem. "1" or "0" indicates hypothetical or              # in or out of a subsystem. "1" or "0" indicates hypothetical or
206              # real.              # real.
# Line 190  Line 215 
215                  $counters{$ss} += 1;                  $counters{$ss} += 1;
216                  $totalFeatures++;                  $totalFeatures++;
217              }              }
218              Trace("$totalFeatures total feature found for $genomeID.") if T(3);              Trace("$totalFeatures total features found for $genomeID.") if T(3);
219              # We have all our data. Next we need to compute the percentages and the links.              for my $counterKey (@columnTypes) {
220              # First, the link stuff.                  $groupTotals{$counterKey} += $counters{$counterKey};
221              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";              }
222              # Now format the counters and percentages.              $groupTotals{features} += $totalFeatures;
223                # We have all our data. Next we need to compute the percentages.
224              for my $type (keys %linkParms) {              for my $type (keys %linkParms) {
225                  $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },                  my $counterData = sprintf("%d(%.1f%%)", $counters{$type},
226                                       sprintf("%d(%.1f%%)", $counters{$type},                                            Tracer::Percent($counters{$type}, $totalFeatures));
227                                               $counters{$type} * 100 / $totalFeatures));                  $counters{$type} = CGI::a({ href => $linkParms{$type} . $genomeID }, $counterData);
228              }              }
229              # Create the row text.              my @counterValues = map { $counters{$_} } @columnTypes;
230              my $rowHtml = td( "$genomeName$new", $genomeLen, $pegCount,              # The last column is a subsystem count.
231                                map { $counters{$_} } @columnTypes,              my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
232                                $rnaCount );                                                 [$genomeID]);
233                my $ssCol = $ssCount;
234                # Start creating the table cells.
235                my $rowHtml = "| $genomeText |";
236                # Add any special columns.
237                for my $specialCol (keys %specialData) {
238                    # Here we get the attribute value. If there is none, we leave the column blank.
239                    my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
240                    $rowHtml .= " $attribute |";
241                }
242                # Now add the data columns.
243                $rowHtml .= join("", map { "  $_ |" } ($genomeLen, $pegCount, @counterValues, $ssCol, $rnaCount));
244              # Put it in the row hash.              # Put it in the row hash.
245              $rows{$genomeName} = $rowHtml;              $rows{$genomeName} = $rowHtml;
246          }          }
# Line 216  Line 253 
253          # Loop through the rows.          # Loop through the rows.
254          for my $rowID (sort keys %rows) {          for my $rowID (sort keys %rows) {
255              # Format the row.              # Format the row.
256              print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";              push @outputLines, $rows{$rowID};
             # Flip the row type.  
             $rowType = 1 - $rowType;  
257              # Count the row.              # Count the row.
258              $rowCount++;              $rowCount++;
259          }          }
260          # All done, close the file.          # All done, write the Wiki Page.
261          close GROUPFILE;          my $rc = $wiki->Save($outPageName, $outputWeb, 'OrganismDataSummariesStats', join("\n", @outputLines));
262          Trace("$rowCount genomes processed.") if T(2);          Trace("$rowCount genomes processed.") if T(2);
263            if (! $rc) {
264                Confess("Error saving $outPageName: $wiki->{error}");
265      }      }
266            # Now save the group totals.
267            $summaries{$groupID} = \%groupTotals;
268        }
269        # Now produce the summary table.
270        my $sumPageName = "OrganismDataSummariesStats";
271        my @sumLines = ();
272        # Start the table.  Asterisks make a cell a column header. An extra space at the front right-justifies it.
273        push @sumLines, "| " . join("", map { " *$_* |" } ("Group name", "Genomes")) .
274                               join("", map { "  *$_* |"} ("[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)",
275                                                           "Named genes in subsystems",            # s0
276                                                           "Named genes not in subsystems",        # n0
277                                                           "Hypothetical genes in subsystems",     # s1
278                                                           "Hypothetical genes not in subsystems", # n1
279                                                           "RNAs"));
280        # Put in the data rows.
281        for my $groupName (sort keys %summaries) {
282            my $group = $summaries{$groupName};
283            # Create the table row.
284            my $rowHtml = "| [[$groupName]] |" . join("", map { "  " . Tracer::CommaFormat($group->{$_}) . " |" }
285                                                      ('genomes', 'pegs', @columnTypes, 'RNAs'));
286            push @sumLines, $rowHtml;
287        }
288        # Write the page.
289        my $rc = $wiki->Save($sumPageName, $outputWeb, 'WebHome', join("\n", @sumLines));
290      # We're all done.      # We're all done.
291      Trace("Processing complete.") if T(2);      Trace("Processing complete.") if T(2);
292    };
293    if ($@) {
294        Trace("Stats failed with error: $@") if T(0);
295        $rtype = "error";
296    } else {
297        Trace("Stats complete.") if T(2);
298        $rtype = "no error";
299  }  }
300    if ($options->{phone}) {
301  =head3 Fix      my $msgID = Tracer::SendSMS($options->{phone}, "GenomeStats terminated with $rtype.");
302        if ($msgID) {
303  C<< my %fixedHash = Fix(%groupHash); >>          Trace("Phone message sent with ID $msgID.") if T(2);
304        } else {
305  Prepare a genome group hash for processing. Groups with the same primary name will be combined.          Trace("Phone message not sent.") if T(2);
 The primary name is the first capitalized word in the group name.  
   
 =over 4  
   
 =item groupHash  
   
 Hash to be fixed up.  
   
 =item RETURN  
   
 Returns a fixed-up version of the hash.  
   
 =back  
   
 =cut  
   
 sub Fix {  
     # Get the parameters.  
     my (%groupHash) = @_;  
     # Create the result hash.  
     my %retVal = ();  
     # Copy over the genomes.  
     for my $groupID (keys %groupHash) {  
         # Make a safety copy of the group ID.  
         my $realGroupID = $groupID;  
         # Yank the primary name.  
         if ($groupID =~ /([A-Z]\w+)/) {  
             $realGroupID = $1;  
         }  
         # Append this group's genomes into the result hash.  
         Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});  
306      }      }
     # Return the result hash.  
     return %retVal;  
307  }  }
308    
   
309  1;  1;

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.35

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3