[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Annotation of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     =head1 Genome Data Generator
4 :    
5 :     This script creates a set of HTML include files that list the statistics for
6 :     the genomes in each of the genome groups. Genomes that are new to this version
7 :     of the Sprout will be specially marked. In order for this to work, both the
8 :     current and previous Sprout databases must be available on this machine.
9 :     This is one positional parameter: the name of a directory in which to place
10 :     the include files.
11 :    
12 :     The currently-supported command-line options are as follows.
13 :    
14 :     =over 4
15 :    
16 :     =item user
17 :    
18 :     Name suffix to be used for log files. If omitted, the PID is used.
19 :    
20 :     =item trace
21 :    
22 :     Numeric trace level. A higher trace level causes more messages to appear. The
23 :     default trace level is 2. Tracing will be directly to the standard output
24 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
25 :     where I<User> is the value of the B<user> option above.
26 :    
27 :     =item sql
28 :    
29 :     If specified, turns on tracing of SQL activity.
30 :    
31 :     =item background
32 :    
33 :     Save the standard and error output to files. The files will be created
34 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
35 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
36 :     B<user> option above.
37 :    
38 :     =item h
39 :    
40 :     Display this command's parameters and options.
41 :    
42 :     =item strict
43 :    
44 :     If specified, strict groups will be used; otherwise, groups with a common primary name
45 :     will be combined into a single group. (The primary name of the group is the first
46 :     capitalized word.)
47 :    
48 :     =item oddStyle
49 :    
50 :     Style to use for odd rows of the table.
51 :    
52 :     =item evenStyle
53 :    
54 :     Style to use for even rows of the table.
55 :    
56 :     =item tableStyle
57 :    
58 :     Style to use for the table itself.
59 :    
60 :     =item markerStyle
61 :    
62 :     Style to use for small-text markers (e.g. NEW!)
63 :    
64 :     =item linkCGI
65 :    
66 :     Path to the CGI script for displaying detailed statistics.
67 :    
68 :     =back
69 :    
70 :     =cut
71 :    
72 :     use strict;
73 :     use Tracer;
74 :     use DocUtils;
75 :     use TestUtils;
76 :     use Cwd;
77 :     use File::Copy;
78 :     use File::Path;
79 :     use Sprout;
80 :     use SFXlate;
81 :     use CGI qw(:standard);
82 :     use FIG;
83 :     no warnings 'once'; # only when coding
84 :    
85 :     # Get the command-line options and parameters.
86 :     my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
87 :     {
88 :     strict => [0, 'keep related groups separate'],
89 :     oddStyle => ['odd', 'style for odd rows'],
90 :     evenStyle => ['even', 'style for even rows'],
91 :     tableStyle => ['genomestats', 'style for whole table'],
92 :     markerStyle => ['tinytext', 'style for markers'],
93 :     linkCGI => ['../FIG/genome_statistics.cgi',
94 :     'path to CGI script for detailed statistics'],
95 :     },
96 :     "<targetDir>",
97 :     @ARGV);
98 :     # Verify the directory name.
99 :     my $targetDir = $parameters[0];
100 :     if (! $targetDir) {
101 :     Confess("No target directory specified.");
102 :     } elsif (! -d $targetDir) {
103 :     Confess("Target directory $targetDir not found.");
104 :     } else {
105 :     # *Get the old Sprout.
106 :     my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);
107 :     # Extract the genome group data from the old Sprout.
108 :     my %oldGroupHash = $oldSprout->GetGroups();
109 :     if (! $options->{strict}) {
110 : parrello 1.2 %oldGroupHash = Fix(%oldGroupHash);
111 : parrello 1.1 }
112 :     # Get the new Sprout.
113 :     my $sprout = SFXlate->new_sprout_only();
114 :     my %newGroupHash = $sprout->GetGroups();
115 :     if (! $options->{strict}) {
116 : parrello 1.3 %newGroupHash = Fix(%newGroupHash);
117 : parrello 1.1 }
118 :     # Loop through the groups.
119 :     for my $groupID (keys %newGroupHash) {
120 :     Trace("Processing group $groupID.") if T(2);
121 :     # Get the genomes from the new hash.
122 :     my @newGenomes = @{$newGroupHash{$groupID}};
123 :     # Create a hash for finding if a genome is in the old group. If the entire group is
124 :     # new, we just use an empty hash.
125 :     my %oldGenomes = ();
126 :     if (exists $oldGroupHash{$groupID}) {
127 :     %oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
128 :     }
129 :     # Create the output file.
130 :     Open(\*GROUPFILE, ">$targetDir/$groupID.inc");
131 :     # Get the styles.
132 :     my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},
133 :     $options->{evenStyle}, $options->{oddStyle});
134 :     # Start the table.
135 :     print GROUPFILE "<table class=\"$tableStyle\">\n";
136 :     # Create the header row.
137 : parrello 1.9 print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR",
138 : parrello 1.1 "Genome size, bp",
139 :     "Protein Encoding Genes (PEGs)",
140 :     "Named genes in subsystems", # s0
141 :     "Named genes not in subsystems", # n0
142 :     "Hypothetical genes in subsystems", # s1
143 :     "Hypothetical genes not in subsystems", # n1
144 : parrello 1.9 "RNAs",
145 :     ])) . "\n";
146 : parrello 1.1 # Set up some useful stuff for the four count columns.
147 :     my %linkParms = ( s0 => "nohypo_sub", n0 => "nohypo_nosub",
148 :     s1 => "hypo_sub", n1 => "hypo_nosub" );
149 :     my @columnTypes = ('s0', 'n0', 's1', 'n1');
150 :     # The data rows will be built next. We'll be putting them into a hash keyed by
151 :     # organism name. The hash enables us to spit them out sorted by name.
152 :     my %rows = ();
153 :     # Loop through the genomes in the new group.
154 :     for my $genomeID (@newGenomes) {
155 :     # Check to see if this genome is new.
156 :     my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
157 :     Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
158 :     # Get the strain name.
159 :     my $genomeName = $sprout->GenusSpecies($genomeID);
160 :     # If this is a new strain, build the HTML for the NEW! mark.
161 :     if ($new) {
162 :     $new = " <span class=\"$markerStyle\">NEW!</span>";
163 :     }
164 :     # Get the genome length.
165 :     my $genomeLen = $sprout->GenomeLength($genomeID);
166 :     # Get the number of PEGs.
167 :     my $pegCount = $sprout->FeatureCount($genomeID, 'peg');
168 :     # Get the number of RNAs.
169 :     my $rnaCount = $sprout->FeatureCount($genomeID, 'rna');
170 :     # Now we have four categories of features to work with, for each
171 :     # combination of named or hypothetical vs. in-subsystem or
172 :     # not-in-subsystem. First, we get all of the feature assignments for
173 :     # the genome.
174 :     my $assignHash = $sprout->GenomeAssignments($genomeID);
175 :     # Next, we get all of the features in the genome that belong to a
176 :     # subsystem. This involves a query via the subsystem spreadsheet.
177 :     my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'],
178 :     "IsGenomeOf(from-link) = ?",
179 :     [$genomeID], 'ContainsFeature(to-link)');
180 :     # Create a hash to track the four categories. "s" or "n" indicates
181 :     # in or out of a subsystem. "1" or "0" indicates hypothetical or
182 :     # real.
183 :     my %counters = ( s0 => 0, n0 => 0, s1 => 0, n1 => 0 );
184 :     # Also keep a count of all the features.
185 :     my $totalFeatures = 0;
186 :     # Loop through the assignments.
187 :     for my $fid (keys %{$assignHash}) {
188 :     # Form the counter key.
189 :     my $ss = ( exists $ssHash{$fid} ? "s" : "n" ) . FIG::hypo($assignHash->{$fid} );
190 :     # Record this feature.
191 :     $counters{$ss} += 1;
192 :     $totalFeatures++;
193 :     }
194 : parrello 1.6 Trace("$totalFeatures total features found for $genomeID.") if T(3);
195 : parrello 1.1 # We have all our data. Next we need to compute the percentages and the links.
196 :     # First, the link stuff.
197 :     my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";
198 :     # Now format the counters and percentages.
199 :     for my $type (keys %linkParms) {
200 :     $counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },
201 :     sprintf("%d(%.1f%%)", $counters{$type},
202 : parrello 1.7 Tracer::Percent($counters{$type}, $totalFeatures)));
203 : parrello 1.1 }
204 : parrello 1.9 my @counterValues = map { $counters{$_} } @columnTypes;
205 : parrello 1.8 # Create the row text. We use a list reference to distribute the TD tag
206 :     # across all the cells.
207 :     my $rowHtml = td(["$genomeName$new", $genomeLen, $pegCount,
208 : parrello 1.9 @counterValues, $rnaCount,
209 : parrello 1.8 ]);
210 : parrello 1.1 # Put it in the row hash.
211 :     $rows{$genomeName} = $rowHtml;
212 :     }
213 :     # Set up a counter.
214 :     my $rowCount = 0;
215 :     # Now we have all the rows set up. We want to sort them and then
216 :     # write them to the output file. First, we create a variable
217 :     # we can use to select the even or odd style.
218 :     my $rowType = 0;
219 :     # Loop through the rows.
220 :     for my $rowID (sort keys %rows) {
221 :     # Format the row.
222 :     print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";
223 :     # Flip the row type.
224 :     $rowType = 1 - $rowType;
225 :     # Count the row.
226 :     $rowCount++;
227 :     }
228 : parrello 1.8 # All done, terminate the table and close the file.
229 :     print GROUPFILE "</table>\n";
230 : parrello 1.1 close GROUPFILE;
231 :     Trace("$rowCount genomes processed.") if T(2);
232 :     }
233 :     # We're all done.
234 :     Trace("Processing complete.") if T(2);
235 :     }
236 :    
237 :     =head3 Fix
238 :    
239 :     C<< my %fixedHash = Fix(%groupHash); >>
240 :    
241 :     Prepare a genome group hash for processing. Groups with the same primary name will be combined.
242 :     The primary name is the first capitalized word in the group name.
243 :    
244 :     =over 4
245 :    
246 :     =item groupHash
247 :    
248 :     Hash to be fixed up.
249 :    
250 :     =item RETURN
251 :    
252 :     Returns a fixed-up version of the hash.
253 :    
254 :     =back
255 :    
256 :     =cut
257 :    
258 :     sub Fix {
259 :     # Get the parameters.
260 :     my (%groupHash) = @_;
261 :     # Create the result hash.
262 :     my %retVal = ();
263 :     # Copy over the genomes.
264 :     for my $groupID (keys %groupHash) {
265 :     # Make a safety copy of the group ID.
266 :     my $realGroupID = $groupID;
267 :     # Yank the primary name.
268 :     if ($groupID =~ /([A-Z]\w+)/) {
269 :     $realGroupID = $1;
270 :     }
271 :     # Append this group's genomes into the result hash.
272 : parrello 1.5 Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});
273 : parrello 1.1 }
274 :     # Return the result hash.
275 :     return %retVal;
276 :     }
277 :    
278 :    
279 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3