[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Annotation of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.29 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     =head1 Genome Data Generator
4 :    
5 :     This script creates a set of HTML include files that list the statistics for
6 :     the genomes in each of the genome groups. Genomes that are new to this version
7 :     of the Sprout will be specially marked. In order for this to work, both the
8 :     current and previous Sprout databases must be available on this machine.
9 :    
10 :     The currently-supported command-line options are as follows.
11 :    
12 :     =over 4
13 :    
14 :     =item user
15 :    
16 :     Name suffix to be used for log files. If omitted, the PID is used.
17 :    
18 :     =item trace
19 :    
20 :     Numeric trace level. A higher trace level causes more messages to appear. The
21 :     default trace level is 2. Tracing will be directly to the standard output
22 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
23 :     where I<User> is the value of the B<user> option above.
24 :    
25 :     =item sql
26 :    
27 :     If specified, turns on tracing of SQL activity.
28 :    
29 :     =item background
30 :    
31 :     Save the standard and error output to files. The files will be created
32 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
33 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
34 :     B<user> option above.
35 :    
36 :     =item h
37 :    
38 :     Display this command's parameters and options.
39 :    
40 :     =item strict
41 :    
42 :     If specified, strict groups will be used; otherwise, groups with a common primary name
43 :     will be combined into a single group. (The primary name of the group is the first
44 :     capitalized word.)
45 :    
46 :     =item oddStyle
47 :    
48 :     Style to use for odd rows of the table.
49 :    
50 :     =item evenStyle
51 :    
52 :     Style to use for even rows of the table.
53 :    
54 :     =item tableStyle
55 :    
56 :     Style to use for the table itself.
57 :    
58 :     =item markerStyle
59 :    
60 :     Style to use for small-text markers (e.g. NEW!)
61 :    
62 : parrello 1.10 =item numStyle
63 :    
64 :     Style to use for numeric cells.
65 :    
66 :     =item counterStyle
67 :    
68 :     Style to use for counter cells.
69 :    
70 : parrello 1.1 =item linkCGI
71 :    
72 :     Path to the CGI script for displaying detailed statistics.
73 :    
74 : parrello 1.25 =item noNewCheck
75 :    
76 :     If specified, the check for new genomes in the group is suppressed. This
77 :     may need to be done if there's been a change in the database definition. Note
78 :     that all this really does is keep the B<NEW!> symbol from showing. It does
79 :     not affect which genomes show up in the table.
80 :    
81 : parrello 1.1 =back
82 :    
83 :     =cut
84 :    
85 :     use strict;
86 :     use Tracer;
87 :     use DocUtils;
88 :     use TestUtils;
89 :     use Cwd;
90 :     use File::Copy;
91 :     use File::Path;
92 :     use Sprout;
93 :     use SFXlate;
94 :     use CGI qw(:standard);
95 :     use FIG;
96 :    
97 :     # Get the command-line options and parameters.
98 :     my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
99 :     {
100 :     strict => [0, 'keep related groups separate'],
101 :     oddStyle => ['odd', 'style for odd rows'],
102 : parrello 1.24 trace => [2, 'tracing level'],
103 : parrello 1.1 evenStyle => ['even', 'style for even rows'],
104 :     tableStyle => ['genomestats', 'style for whole table'],
105 :     markerStyle => ['tinytext', 'style for markers'],
106 : parrello 1.10 numStyle => ['numcell', 'style for cells with numeric values'],
107 :     counterStyle => ['countercell', 'style for cells with counter values'],
108 : parrello 1.1 linkCGI => ['../FIG/genome_statistics.cgi',
109 :     'path to CGI script for detailed statistics'],
110 : parrello 1.22 groupFile => ["$FIG_Config::sproutData/groups.tbl",
111 : parrello 1.26 'location of the NMPDR group description file'],
112 : parrello 1.25 noNewCheck => [0, 'if specified, skips the check for new genomes'],
113 : parrello 1.26 targetDir => ["$FIG_Config::nmpdr_base/next/html/includes",
114 :     'target directory'],
115 : parrello 1.22 },
116 : parrello 1.26 "",
117 : parrello 1.1 @ARGV);
118 : parrello 1.29 # This table controls the special attribute columns. For each we need to know the attribute name and the
119 :     # column title. If any genomes in a group have a value for one of the special columns, that column is
120 :     # displayed along with the attribute values.
121 :     my %specialCols = (Serotype => 'Serotype_code',
122 :     Phenotype => 'Phenotype');
123 : parrello 1.1 # Verify the directory name.
124 : parrello 1.26 my $targetDir = $options->{targetDir};
125 : parrello 1.1 if (! $targetDir) {
126 :     Confess("No target directory specified.");
127 :     } elsif (! -d $targetDir) {
128 :     Confess("Target directory $targetDir not found.");
129 :     } else {
130 :     # Get the new Sprout.
131 :     my $sprout = SFXlate->new_sprout_only();
132 :     my %newGroupHash = $sprout->GetGroups();
133 : parrello 1.25 # Extract the genome group data from the new Sprout.
134 : parrello 1.1 if (! $options->{strict}) {
135 : parrello 1.22 %newGroupHash = Sprout::Fix(%newGroupHash);
136 : parrello 1.1 }
137 : parrello 1.25 # This hash will be used to determine which genomes are new.
138 :     my %oldGroupHash = ();
139 :     if ($options->{noNewCheck}) {
140 :     # Here we can't look at the old Sprout. Set up the hash
141 :     # so it looks like the old Sprout's data is the same as ours.
142 :     %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
143 :     } else {
144 :     # Get the old Sprout.
145 : parrello 1.28 my $oldSprout = SFXlate->old_sprout_only();
146 : parrello 1.25 # Extract the genome group data from the old Sprout.
147 : parrello 1.26 %oldGroupHash = $oldSprout->GetGroups();
148 : parrello 1.25 if (! $options->{strict}) {
149 :     %oldGroupHash = Sprout::Fix(%oldGroupHash);
150 :     }
151 :     }
152 : parrello 1.28 # Get a FIG object for computing attributes.
153 :     my $fig = FIG->new();
154 : parrello 1.22 # Read the group file.
155 :     my %groupData = Sprout::ReadGroupFile($options->{groupFile});
156 :     # Set up some useful stuff for the four count columns.
157 :     my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
158 :     s1 => "hypo_sub", n1 => "hypo_nosub" );
159 :     my @columnTypes = ('s0', 'n0', 's1', 'n1');
160 :     # Get the styles.
161 :     my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},
162 :     $options->{evenStyle}, $options->{oddStyle});
163 :     my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});
164 :     # Prepare a hash for the summary counters. These will be used on the organism summary page.
165 :     my %summaries = ();
166 : parrello 1.1 # Loop through the groups.
167 :     for my $groupID (keys %newGroupHash) {
168 :     Trace("Processing group $groupID.") if T(2);
169 : parrello 1.22 # Create a hash for summarizing the counters.
170 :     my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
171 :     map { $_ => } @columnTypes, features => 0 );
172 : parrello 1.1 # Get the genomes from the new hash.
173 :     my @newGenomes = @{$newGroupHash{$groupID}};
174 :     # Create a hash for finding if a genome is in the old group. If the entire group is
175 :     # new, we just use an empty hash.
176 :     my %oldGenomes = ();
177 :     if (exists $oldGroupHash{$groupID}) {
178 :     %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
179 :     }
180 :     # Create the output file.
181 : parrello 1.14 my $outFileName = "stats-" . lc($groupID) . ".inc";
182 : parrello 1.15 Open(\*GROUPFILE, ">$targetDir/$outFileName");
183 : parrello 1.29 # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
184 :     # a sub-hash that translates each genome ID to its applicable attribute value (if any).
185 :     my %specialData = ();
186 :     for my $specialColumn (keys %specialCols) {
187 :     # Get the attribute mapping.
188 :     my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
189 :     # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
190 :     # for columns that exist in the output.
191 :     if (scalar(keys %specialDataList)) {
192 :     $specialData{$specialColumn} = \%specialDataList;
193 :     }
194 :     }
195 :     # Set up the column names.
196 : parrello 1.28 my @columnNames = "Strain annotated in NMPDR";
197 : parrello 1.29 push @columnNames, sort keys %specialData;
198 : parrello 1.28 push @columnNames, "Genome size, bp",
199 :     "Protein Encoding Genes (PEGs)",
200 :     "Named genes in subsystems", # s0
201 :     "Named genes not in subsystems", # n0
202 :     "Hypothetical genes in subsystems", # s1
203 :     "Hypothetical genes not in subsystems", # n1
204 :     "Subsystems",
205 :     "RNAs";
206 : parrello 1.1 # Start the table.
207 :     print GROUPFILE "<table class=\"$tableStyle\">\n";
208 :     # Create the header row.
209 : parrello 1.28 print GROUPFILE Tr( { class => 'odd' }, th(\@columnNames)) . "\n";
210 : parrello 1.1 # The data rows will be built next. We'll be putting them into a hash keyed by
211 :     # organism name. The hash enables us to spit them out sorted by name.
212 :     my %rows = ();
213 : parrello 1.22 # This variable is used to hold the counts.
214 :     my $num;
215 : parrello 1.1 # Loop through the genomes in the new group.
216 :     for my $genomeID (@newGenomes) {
217 : parrello 1.22 # Count this genome.
218 :     $groupTotals{genomes}++;
219 : parrello 1.1 # Check to see if this genome is new.
220 :     my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
221 :     Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
222 :     # Get the strain name.
223 :     my $genomeName = $sprout->GenusSpecies($genomeID);
224 : parrello 1.28 # Apply a link.
225 :     my $genomeText = CGI::a({ href => "../FIG/genome_statistics.cgi?genome=$genomeID;SPROUT=1" }, $genomeName);
226 : parrello 1.1 # If this is a new strain, build the HTML for the NEW! mark.
227 :     if ($new) {
228 :     $new = " <span class=\"$markerStyle\">NEW!</span>";
229 :     }
230 :     # Get the genome length.
231 : parrello 1.22 $num = $sprout->GenomeLength($genomeID);
232 :     my $genomeLen = Tracer::CommaFormat($num);
233 : parrello 1.1 # Get the number of PEGs.
234 : parrello 1.22 $num = $sprout->FeatureCount($genomeID, 'peg');
235 :     my $pegCount = Tracer::CommaFormat($num);
236 :     $groupTotals{pegs} += $num;
237 : parrello 1.1 # Get the number of RNAs.
238 : parrello 1.22 $num = $sprout->FeatureCount($genomeID, 'rna');
239 :     my $rnaCount = Tracer::CommaFormat($num);
240 :     $groupTotals{RNAs} += $num;
241 : parrello 1.17 # If there are no RNAs, we say we don't know the number, since we know there
242 :     # must be RNAs somewhere.
243 :     if (! $rnaCount) {
244 :     $rnaCount = "n/d";
245 :     }
246 : parrello 1.1 # Now we have four categories of features to work with, for each
247 :     # combination of named or hypothetical vs. in-subsystem or
248 :     # not-in-subsystem. First, we get all of the feature assignments for
249 :     # the genome.
250 :     my $assignHash = $sprout->GenomeAssignments($genomeID);
251 :     # Next, we get all of the features in the genome that belong to a
252 : parrello 1.18 # subsystem.
253 :     my %ssHash = $sprout->GenomeSubsystemData($genomeID);
254 : parrello 1.1 # Create a hash to track the four categories. "s" or "n" indicates
255 :     # in or out of a subsystem. "1" or "0" indicates hypothetical or
256 :     # real.
257 :     my %counters = ( s0 => 0, n0 => 0, s1 => 0, n1 => 0 );
258 :     # Also keep a count of all the features.
259 :     my $totalFeatures = 0;
260 :     # Loop through the assignments.
261 :     for my $fid (keys %{$assignHash}) {
262 :     # Form the counter key.
263 :     my $ss = ( exists $ssHash{$fid} ? "s" : "n" ) . FIG::hypo($assignHash->{$fid} );
264 :     # Record this feature.
265 :     $counters{$ss} += 1;
266 :     $totalFeatures++;
267 :     }
268 : parrello 1.6 Trace("$totalFeatures total features found for $genomeID.") if T(3);
269 : parrello 1.22 for my $counterKey (@columnTypes) {
270 :     $groupTotals{$counterKey} += $counters{$counterKey};
271 :     }
272 :     $groupTotals{features} += $totalFeatures;
273 : parrello 1.1 # We have all our data. Next we need to compute the percentages and the links.
274 :     # First, the link stuff.
275 :     my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";
276 :     # Now format the counters and percentages.
277 :     for my $type (keys %linkParms) {
278 :     $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },
279 :     sprintf("%d(%.1f%%)", $counters{$type},
280 : parrello 1.7 Tracer::Percent($counters{$type}, $totalFeatures)));
281 : parrello 1.1 }
282 : parrello 1.9 my @counterValues = map { $counters{$_} } @columnTypes;
283 : parrello 1.19 # The last link is a button to look at the subsystem summaries.
284 : parrello 1.21 my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
285 :     [$genomeID]);
286 : parrello 1.19 my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";
287 : parrello 1.21 my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";
288 : parrello 1.28 # Start creating the table cells.
289 :     my $rowHtml = td("$genomeText$new");
290 : parrello 1.29 # Add any special columns.
291 :     for my $specialCol (keys %specialData) {
292 :     # Here we get the attribute value. If there is none, we leave the column blank.
293 :     my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
294 :     $rowHtml .= td($attribute);
295 : parrello 1.28 }
296 :     # Now add the data columns.
297 :     $rowHtml .= join("",
298 :     td({ class => $numStyle }, $genomeLen),
299 :     td({ class => $numStyle }, $pegCount),
300 :     td({ class => $counterStyle }, \@counterValues),
301 :     td({ class => $numStyle }, $ssCol),
302 :     td({ class => $numStyle }, $rnaCount),
303 :     );
304 : parrello 1.1 # Put it in the row hash.
305 :     $rows{$genomeName} = $rowHtml;
306 :     }
307 :     # Set up a counter.
308 :     my $rowCount = 0;
309 :     # Now we have all the rows set up. We want to sort them and then
310 :     # write them to the output file. First, we create a variable
311 :     # we can use to select the even or odd style.
312 :     my $rowType = 0;
313 :     # Loop through the rows.
314 :     for my $rowID (sort keys %rows) {
315 :     # Format the row.
316 :     print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";
317 :     # Flip the row type.
318 :     $rowType = 1 - $rowType;
319 :     # Count the row.
320 :     $rowCount++;
321 :     }
322 : parrello 1.8 # All done, terminate the table and close the file.
323 :     print GROUPFILE "</table>\n";
324 : parrello 1.1 close GROUPFILE;
325 :     Trace("$rowCount genomes processed.") if T(2);
326 : parrello 1.22 # Now save the group totals.
327 :     $summaries{$groupID} = \%groupTotals;
328 :     }
329 :     # Now produce the summary table.
330 : parrello 1.23 my $sumFileName = "stats-groups.inc";
331 : parrello 1.22 Open(\*SUMFILE, ">$targetDir/$sumFileName");
332 :     # Start the table.
333 :     print SUMFILE "<table class=\"$tableStyle\">\n";
334 :     # Create the header row.
335 :     print SUMFILE Tr( { class => 'odd' }, th(["Group name",
336 :     "Genomes",
337 :     "Protein Encoding Genes (PEGs)",
338 :     "Named genes in subsystems", # s0
339 :     "Named genes not in subsystems", # n0
340 :     "Hypothetical genes in subsystems", # s1
341 :     "Hypothetical genes not in subsystems", # n1
342 :     "RNAs",
343 :     ])) . "\n";
344 :     # Set up a flag for the odd-even styling.
345 :     my $rowFlag = 0;
346 :     # Put in the data rows.
347 :     for my $groupName (sort keys %summaries) {
348 :     my $group = $summaries{$groupName};
349 :     # Compute the link for the current group.
350 :     my $groupLink = a({ href => $groupData{$groupName}->[0] }, $groupName);
351 :     # Create the table row.
352 :     my $rowHtml = join("",
353 :     td($groupLink),
354 :     td({ class => $numStyle }, Tracer::CommaFormat($group->{genomes})),
355 :     td({ class => $numStyle }, Tracer::CommaFormat($group->{pegs})),
356 :     td({ class => $counterStyle }, [ map { Tracer::CommaFormat($group->{$_}) } @columnTypes ]),
357 :     td({ class => $numStyle }, Tracer::CommaFormat($group->{RNAs})),
358 :     );
359 :     print SUMFILE Tr( { class => $rowStyle[$rowFlag] }, $rowHtml ) . "\n";
360 :     # Flip the row style.
361 :     $rowFlag = 1 - $rowFlag;
362 : parrello 1.1 }
363 : parrello 1.22 # Terminate the table and close the file.
364 :     print SUMFILE "</table>\n";
365 :     close SUMFILE;
366 : parrello 1.1 # We're all done.
367 :     Trace("Processing complete.") if T(2);
368 :     }
369 :    
370 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3