[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Annotation of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.32 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     =head1 Genome Data Generator
4 :    
5 : parrello 1.32 This script creates a set of Wiki pages that list the statistics for
6 : parrello 1.1 the genomes in each of the genome groups. Genomes that are new to this version
7 :     of the Sprout will be specially marked. In order for this to work, both the
8 :     current and previous Sprout databases must be available on this machine.
9 :    
10 :     The currently-supported command-line options are as follows.
11 :    
12 :     =over 4
13 :    
14 :     =item user
15 :    
16 :     Name suffix to be used for log files. If omitted, the PID is used.
17 :    
18 :     =item trace
19 :    
20 :     Numeric trace level. A higher trace level causes more messages to appear. The
21 :     default trace level is 2. Tracing will be directly to the standard output
22 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
23 :     where I<User> is the value of the B<user> option above.
24 :    
25 :     =item sql
26 :    
27 :     If specified, turns on tracing of SQL activity.
28 :    
29 :     =item background
30 :    
31 :     Save the standard and error output to files. The files will be created
32 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
33 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
34 :     B<user> option above.
35 :    
36 :     =item h
37 :    
38 :     Display this command's parameters and options.
39 :    
40 : parrello 1.32 =item test
41 : parrello 1.1
42 : parrello 1.32 If specified, the output pages will be put into the sandbox instead of the main web.
43 : parrello 1.1
44 : parrello 1.25 =item noNewCheck
45 :    
46 :     If specified, the check for new genomes in the group is suppressed. This
47 :     may need to be done if there's been a change in the database definition. Note
48 :     that all this really does is keep the B<NEW!> symbol from showing. It does
49 :     not affect which genomes show up in the table.
50 :    
51 : parrello 1.1 =back
52 :    
53 :     =cut
54 :    
55 :     use strict;
56 :     use Tracer;
57 :     use Cwd;
58 :     use File::Copy;
59 :     use File::Path;
60 :     use Sprout;
61 :     use SFXlate;
62 :     use CGI qw(:standard);
63 :     use FIG;
64 : parrello 1.32 use WikiTools;
65 : parrello 1.1
66 :     # Get the command-line options and parameters.
67 :     my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
68 :     {
69 : parrello 1.32 test => [0, 'if specified, publishes to the wiki sandbox instead of the main web'],
70 : parrello 1.25 noNewCheck => [0, 'if specified, skips the check for new genomes'],
71 : parrello 1.32 },
72 : parrello 1.26 "",
73 : parrello 1.1 @ARGV);
74 : parrello 1.30 # The return type (error/no error) goes in here.
75 :     my $rtype;
76 :     eval {
77 :     # This table controls the special attribute columns. For each we need to know the attribute name and the
78 :     # column title. If any genomes in a group have a value for one of the special columns, that column is
79 :     # displayed along with the attribute values.
80 :     my %specialCols = (Serotype => 'Serotype_code',
81 :     Phenotype => 'Phenotype');
82 : parrello 1.32 my $outputWeb = ($options->{test} ? 'Sandbox' : 'Main');
83 :     # Get the new Sprout.
84 :     my $sprout = SFXlate->new_sprout_only();
85 :     my %newGroupHash = $sprout->GetGroups();
86 :     # Get a wiki helper.
87 :     my $wiki = WikiTools->new();
88 :     # Extract the genome group data from the new Sprout.
89 :     %newGroupHash = $sprout->Fix(%newGroupHash);
90 :     # This hash will be used to determine which genomes are new.
91 :     my %oldGroupHash = ();
92 :     if ($options->{noNewCheck}) {
93 :     # Here we can't look at the old Sprout. Set up the hash
94 :     # so it looks like the old Sprout's data is the same as ours.
95 :     %oldGroupHash = map { $_ => $newGroupHash{$_} } keys %newGroupHash;
96 : parrello 1.25 } else {
97 : parrello 1.32 # Get the old Sprout.
98 :     my $oldSprout = SFXlate->old_sprout_only();
99 :     # Extract the genome group data from the old Sprout.
100 :     %oldGroupHash = $oldSprout->GetGroups();
101 :     %oldGroupHash = $oldSprout->Fix(%oldGroupHash);
102 :     }
103 :     # Get a FIG object for computing attributes.
104 :     my $fig = FIG->new();
105 :     # Get the super-group list.
106 :     my @superGroups = sort keys %newGroupHash;
107 :     # Set up some useful stuff for the four count columns.
108 :     my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
109 :     s1 => "hypo_sub", n1 => "hypo_nosub" );
110 :     my @columnTypes = ('s0', 'n0', 's1', 'n1');
111 :     # Prepare a hash for the summary counters. These will be used on the organism summary page.
112 :     my %summaries = ();
113 :     # Loop through the groups.
114 :     for my $groupID (@superGroups) {
115 :     Trace("Processing group $groupID.") if T(2);
116 :     # Create a hash for summarizing the counters.
117 :     my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
118 :     map { $_ => } @columnTypes, features => 0 );
119 :     # Get the genomes from the new hash.
120 :     my @newGenomes = @{$newGroupHash{$groupID}};
121 :     # Create a hash for finding if a genome is in the old group. If the entire group is
122 :     # new, we just use an empty hash.
123 :     my %oldGenomes = ();
124 :     if (exists $oldGroupHash{$groupID}) {
125 :     %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
126 : parrello 1.1 }
127 : parrello 1.32 # Compute the name of the wiki page we're building.
128 :     my $outPageName = "${groupID}Stats";
129 :     # We'll put the data for the page in here.
130 :     my @outputLines = ();
131 :     # Get the special columns. We'll stuff them in a hash keyed by column name. Each column name will contain
132 :     # a sub-hash that translates each genome ID to its applicable attribute value (if any).
133 :     my %specialData = ();
134 :     for my $specialColumn (keys %specialCols) {
135 :     # Get the attribute mapping.
136 :     my %specialDataList = map { $_->[0] => $_->[2] } $fig->get_attributes(\@newGenomes, $specialCols{$specialColumn});
137 :     # We only proceed if some attributes were found. As a result, the keys in %specialData will only be keys
138 :     # for columns that exist in the output.
139 :     if (scalar(keys %specialDataList)) {
140 :     $specialData{$specialColumn} = \%specialDataList;
141 : parrello 1.29 }
142 :     }
143 : parrello 1.32 # Set up the column names. Note that an extra space in front of a name will be interpreted by
144 :     # the Wiki markup as an order to right-justify the text.
145 :     my @columnNames = "*Strain annotated in NMPDR*";
146 :     push @columnNames, map { "*$_*" } sort keys %specialData;
147 :     push @columnNames, "*Genome size, bp*",
148 :     " *[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)*",
149 :     " *Named genes in subsystems*", # s0
150 :     " *Named genes not in subsystems*", # n0
151 :     " *Hypothetical genes in subsystems*", # s1
152 :     " *Hypothetical genes not in subsystems*", # n1
153 :     " *Subsystems*",
154 :     " *RNAs*";
155 :     # Create the header row.
156 :     push @outputLines, "| " . join(" | ", @columnNames) . " |";
157 :     # The data rows will be built next. We'll be putting them into a hash keyed by
158 :     # organism name. The hash enables us to spit them out sorted by name.
159 :     my %rows = ();
160 :     # This variable is used to hold the counts.
161 :     my $num;
162 :     # Loop through the genomes in the new group.
163 :     for my $genomeID (@newGenomes) {
164 :     # Count this genome.
165 :     $groupTotals{genomes}++;
166 :     # Check to see if this genome is new.
167 :     my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
168 :     Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
169 :     # Get the strain name.
170 :     my $genomeName = $sprout->GenusSpecies($genomeID);
171 :     # Apply a link.
172 :     my $genomeText = "%SV{\"$genomeName\" id=\"$genomeID\"}%";
173 :     # If this is a new strain, add the NEW! mark.
174 :     if ($new) {
175 :     $genomeText .= " %N%";
176 : parrello 1.17 }
177 : parrello 1.32 # Get the genome length.
178 :     $num = $sprout->GenomeLength($genomeID);
179 :     my $genomeLen = Tracer::CommaFormat($num);
180 :     # Get the number of PEGs.
181 :     $num = $sprout->FeatureCount($genomeID, 'peg');
182 :     my $pegCount = Tracer::CommaFormat($num);
183 :     $groupTotals{pegs} += $num;
184 :     # Get the number of RNAs.
185 :     $num = $sprout->FeatureCount($genomeID, 'rna');
186 :     my $rnaCount = Tracer::CommaFormat($num);
187 :     $groupTotals{RNAs} += $num;
188 :     # If there are no RNAs, we say we don't know the number, since we know there
189 :     # must be RNAs somewhere.
190 :     if (! $rnaCount) {
191 :     $rnaCount = "n/d";
192 : parrello 1.1 }
193 : parrello 1.32 # Now we have four categories of features to work with, for each
194 :     # combination of named or hypothetical vs. in-subsystem or
195 :     # not-in-subsystem. First, we get all of the feature assignments for
196 :     # the genome.
197 :     my $assignHash = $sprout->GenomeAssignments($genomeID);
198 :     # Next, we get all of the features in the genome that belong to a
199 :     # subsystem.
200 :     my %ssHash = $sprout->GenomeSubsystemData($genomeID);
201 :     # Create a hash to track the four categories. "s" or "n" indicates
202 :     # in or out of a subsystem. "1" or "0" indicates hypothetical or
203 :     # real.
204 :     my %counters = ( s0 => 0, n0 => 0, s1 => 0, n1 => 0 );
205 :     # Also keep a count of all the features.
206 :     my $totalFeatures = 0;
207 :     # Loop through the assignments.
208 :     for my $fid (keys %{$assignHash}) {
209 :     # Form the counter key.
210 :     my $ss = ( exists $ssHash{$fid} ? "s" : "n" ) . FIG::hypo($assignHash->{$fid} );
211 :     # Record this feature.
212 :     $counters{$ss} += 1;
213 :     $totalFeatures++;
214 : parrello 1.22 }
215 : parrello 1.32 Trace("$totalFeatures total features found for $genomeID.") if T(3);
216 :     for my $counterKey (@columnTypes) {
217 :     $groupTotals{$counterKey} += $counters{$counterKey};
218 : parrello 1.1 }
219 : parrello 1.32 $groupTotals{features} += $totalFeatures;
220 :     # We have all our data. Next we need to compute the percentages and the links.
221 :     # First, the link stuff.
222 :     my $linkPrefix = "%NMPDR{FIG/genome_statistics.cgi}%?user=;genome=$genomeID;SPROUT=1;request=";
223 :     # Now format the counters and percentages.
224 :     for my $type (keys %linkParms) {
225 :     $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },
226 :     sprintf("%d(%.1f%%)", $counters{$type},
227 :     Tracer::Percent($counters{$type}, $totalFeatures)));
228 :     }
229 :     my @counterValues = map { $counters{$_} } @columnTypes;
230 :     # The last link is a button to look at the subsystem summaries.
231 :     my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
232 :     [$genomeID]);
233 :     my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";
234 :     my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";
235 :     # Start creating the table cells.
236 :     my $rowHtml = "| $genomeText |";
237 :     # Add any special columns.
238 :     for my $specialCol (keys %specialData) {
239 :     # Here we get the attribute value. If there is none, we leave the column blank.
240 :     my $attribute = $specialData{$specialCol}->{$genomeID} || "&nbsp;";
241 :     $rowHtml .= " $attribute |";
242 :     }
243 :     # Now add the data columns.
244 :     $rowHtml .= join("", map { " $_ |" } ($genomeLen, $pegCount, @counterValues, $ssCol, $rnaCount));
245 :     # Put it in the row hash.
246 :     $rows{$genomeName} = $rowHtml;
247 :     }
248 :     # Set up a counter.
249 :     my $rowCount = 0;
250 :     # Now we have all the rows set up. We want to sort them and then
251 :     # write them to the output file. First, we create a variable
252 :     # we can use to select the even or odd style.
253 :     my $rowType = 0;
254 :     # Loop through the rows.
255 :     for my $rowID (sort keys %rows) {
256 :     # Format the row.
257 :     push @outputLines, $rows{$rowID};
258 :     # Count the row.
259 :     $rowCount++;
260 : parrello 1.1 }
261 : parrello 1.32 # All done, write the Wiki Page.
262 :     my $rc = $wiki->Save($outPageName, $outputWeb, 'OrganismDataSummariesStats', join("\n", @outputLines));
263 :     Trace("$rowCount genomes processed.") if T(2);
264 :     if (! $rc) {
265 :     Confess("Error saving $outPageName: $wiki->{error}");
266 : parrello 1.1 }
267 : parrello 1.32 # Now save the group totals.
268 :     $summaries{$groupID} = \%groupTotals;
269 :     }
270 :     # Now produce the summary table.
271 :     my $sumPageName = "OrganismDataSummariesStats";
272 :     my @sumLines = ();
273 :     # Start the table. Asterisks make a cell a column header. An extra space at the front right-justifies it.
274 :     push @sumLines, "| " . join("", map { " *$_* |" } ("Group name", "Genomes")) .
275 :     join("", map { " *$_* |"} ("[[FIG.ProteinEncodingGenes][Protein Encoding Genes]] (PEGs)",
276 :     "Named genes in subsystems", # s0
277 :     "Named genes not in subsystems", # n0
278 :     "Hypothetical genes in subsystems", # s1
279 :     "Hypothetical genes not in subsystems", # n1
280 :     "RNAs"));
281 :     # Put in the data rows.
282 :     for my $groupName (sort keys %summaries) {
283 :     my $group = $summaries{$groupName};
284 :     # Create the table row.
285 :     my $rowHtml = "| [[$groupName]] |" . join("", map { " " . Tracer::CommaFormat($group->{$_}) . " |" }
286 :     ('genomes', 'pegs', @columnTypes, 'RNAs'));
287 :     push @sumLines, $rowHtml;
288 : parrello 1.22 }
289 : parrello 1.32 # Write the page.
290 :     my $rc = $wiki->Save($sumPageName, $outputWeb, 'WebHome', join("\n", @sumLines));
291 :     # We're all done.
292 :     Trace("Processing complete.") if T(2);
293 : parrello 1.30 };
294 :     if ($@) {
295 :     Trace("Stats failed with error: $@") if T(0);
296 :     $rtype = "error";
297 :     } else {
298 :     Trace("Stats complete.") if T(2);
299 :     $rtype = "no error";
300 :     }
301 :     if ($options->{phone}) {
302 :     my $msgID = Tracer::SendSMS($options->{phone}, "GenomeStats terminated with $rtype.");
303 :     if ($msgID) {
304 :     Trace("Phone message sent with ID $msgID.") if T(2);
305 :     } else {
306 :     Trace("Phone message not sent.") if T(2);
307 : parrello 1.1 }
308 :     }
309 :    
310 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3