[Bio] / Sprout / GenomeStats.pl Repository:
ViewVC logotype

Diff of /Sprout/GenomeStats.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.16, Tue Jun 27 16:44:18 2006 UTC revision 1.24, Tue Oct 3 02:48:59 2006 UTC
# Line 95  Line 95 
95                                             {                                             {
96                                              strict => [0, 'keep related groups separate'],                                              strict => [0, 'keep related groups separate'],
97                                              oddStyle => ['odd', 'style for odd rows'],                                              oddStyle => ['odd', 'style for odd rows'],
98                                                trace => [2, 'tracing level'],
99                                              evenStyle => ['even', 'style for even rows'],                                              evenStyle => ['even', 'style for even rows'],
100                                              tableStyle => ['genomestats', 'style for whole table'],                                              tableStyle => ['genomestats', 'style for whole table'],
101                                              markerStyle => ['tinytext', 'style for markers'],                                              markerStyle => ['tinytext', 'style for markers'],
# Line 102  Line 103 
103                                              counterStyle => ['countercell', 'style for cells with counter values'],                                              counterStyle => ['countercell', 'style for cells with counter values'],
104                                              linkCGI => ['../FIG/genome_statistics.cgi',                                              linkCGI => ['../FIG/genome_statistics.cgi',
105                                                          'path to CGI script for detailed statistics'],                                                          'path to CGI script for detailed statistics'],
106                                                groupFile => ["$FIG_Config::sproutData/groups.tbl",
107                                                              "location of the NMPDR group description file"],
108                                             },                                             },
109                                             "<targetDir>",                                             "<targetDir>",
110                                             @ARGV);                                             @ARGV);
# Line 117  Line 120 
120      # Extract the genome group data from the old Sprout.      # Extract the genome group data from the old Sprout.
121      my %oldGroupHash = $oldSprout->GetGroups();      my %oldGroupHash = $oldSprout->GetGroups();
122      if (! $options->{strict}) {      if (! $options->{strict}) {
123          %oldGroupHash = Fix(%oldGroupHash);          %oldGroupHash = Sprout::Fix(%oldGroupHash);
124      }      }
125      # Get the new Sprout.      # Get the new Sprout.
126      my $sprout = SFXlate->new_sprout_only();      my $sprout = SFXlate->new_sprout_only();
127      my %newGroupHash = $sprout->GetGroups();      my %newGroupHash = $sprout->GetGroups();
128      if (! $options->{strict}) {      if (! $options->{strict}) {
129          %newGroupHash = Fix(%newGroupHash);          %newGroupHash = Sprout::Fix(%newGroupHash);
130      }      }
131        # Read the group file.
132        my %groupData = Sprout::ReadGroupFile($options->{groupFile});
133        # Set up some useful stuff for the four count columns.
134        my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",
135                          s1 => "hypo_sub", n1 => "hypo_nosub" );
136        my @columnTypes = ('s0', 'n0', 's1', 'n1');
137        # Get the styles.
138        my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},
139                                                     $options->{evenStyle}, $options->{oddStyle});
140        my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});
141        # Prepare a hash for the summary counters. These will be used on the organism summary page.
142        my %summaries = ();
143      # Loop through the groups.      # Loop through the groups.
144      for my $groupID (keys %newGroupHash) {      for my $groupID (keys %newGroupHash) {
145          Trace("Processing group $groupID.") if T(2);          Trace("Processing group $groupID.") if T(2);
146            # Create a hash for summarizing the counters.
147            my %groupTotals = ( genomes => 0, pegs => 0, RNAs => 0,
148                               map { $_ => } @columnTypes, features => 0 );
149          # Get the genomes from the new hash.          # Get the genomes from the new hash.
150          my @newGenomes = @{$newGroupHash{$groupID}};          my @newGenomes = @{$newGroupHash{$groupID}};
151          # Create a hash for finding if a genome is in the old group. If the entire group is          # Create a hash for finding if a genome is in the old group. If the entire group is
# Line 139  Line 157 
157          # Create the output file.          # Create the output file.
158          my $outFileName = "stats-" . lc($groupID) . ".inc";          my $outFileName = "stats-" . lc($groupID) . ".inc";
159          Open(\*GROUPFILE, ">$targetDir/$outFileName");          Open(\*GROUPFILE, ">$targetDir/$outFileName");
         # Get the styles.  
         my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},  
                                                      $options->{evenStyle}, $options->{oddStyle});  
         my ($numStyle, $counterStyle) = ($options->{numStyle}, $options->{counterStyle});  
160          # Start the table.          # Start the table.
161          print GROUPFILE "<table class=\"$tableStyle\">\n";          print GROUPFILE "<table class=\"$tableStyle\">\n";
162          # Create the header row.          # Create the header row.
# Line 153  Line 167 
167                                                   "Named genes not in subsystems",        # n0                                                   "Named genes not in subsystems",        # n0
168                                                   "Hypothetical genes in subsystems",     # s1                                                   "Hypothetical genes in subsystems",     # s1
169                                                   "Hypothetical genes not in subsystems", # n1                                                   "Hypothetical genes not in subsystems", # n1
170                                                     "Subsystems",
171                                                   "RNAs",                                                   "RNAs",
172                                                     ])) . "\n";                                                     ])) . "\n";
         # Set up some useful stuff for the four count columns.  
         my %linkParms = ( s0 => "nothypo_sub", n0 => "nothypo_nosub",  
                           s1 => "hypo_sub", n1 => "hypo_nosub" );  
         my @columnTypes = ('s0', 'n0', 's1', 'n1');  
173          # The data rows will be built next. We'll be putting them into a hash keyed by          # The data rows will be built next. We'll be putting them into a hash keyed by
174          # organism name. The hash enables us to spit them out sorted by name.          # organism name. The hash enables us to spit them out sorted by name.
175          my %rows = ();          my %rows = ();
176            # This variable is used to hold the counts.
177            my $num;
178          # Loop through the genomes in the new group.          # Loop through the genomes in the new group.
179          for my $genomeID (@newGenomes) {          for my $genomeID (@newGenomes) {
180                # Count this genome.
181                $groupTotals{genomes}++;
182              # Check to see if this genome is new.              # Check to see if this genome is new.
183              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");              my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
184              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);              Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
# Line 174  Line 189 
189                  $new = " <span class=\"$markerStyle\">NEW!</span>";                  $new = " <span class=\"$markerStyle\">NEW!</span>";
190              }              }
191              # Get the genome length.              # Get the genome length.
192              my $genomeLen = Tracer::CommaFormat($sprout->GenomeLength($genomeID));              $num = $sprout->GenomeLength($genomeID);
193                my $genomeLen = Tracer::CommaFormat($num);
194              # Get the number of PEGs.              # Get the number of PEGs.
195              my $pegCount = Tracer::CommaFormat($sprout->FeatureCount($genomeID, 'peg'));              $num = $sprout->FeatureCount($genomeID, 'peg');
196                my $pegCount = Tracer::CommaFormat($num);
197                $groupTotals{pegs} += $num;
198              # Get the number of RNAs.              # Get the number of RNAs.
199              my $rnaCount = Tracer::CommaFormat($sprout->FeatureCount($genomeID, 'rna'));              $num = $sprout->FeatureCount($genomeID, 'rna');
200                my $rnaCount = Tracer::CommaFormat($num);
201                $groupTotals{RNAs} += $num;
202                # If there are no RNAs, we say we don't know the number, since we know there
203                # must be RNAs somewhere.
204                if (! $rnaCount) {
205                    $rnaCount = "n/d";
206                }
207              # Now we have four categories of features to work with, for each              # Now we have four categories of features to work with, for each
208              # combination of named or hypothetical vs. in-subsystem or              # combination of named or hypothetical vs. in-subsystem or
209              # not-in-subsystem. First, we get all of the feature assignments for              # not-in-subsystem. First, we get all of the feature assignments for
210              # the genome.              # the genome.
211              my $assignHash = $sprout->GenomeAssignments($genomeID);              my $assignHash = $sprout->GenomeAssignments($genomeID);
212              # Next, we get all of the features in the genome that belong to a              # Next, we get all of the features in the genome that belong to a
213              # subsystem. This involves a query via the subsystem spreadsheet.              # subsystem.
214              my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'],              my %ssHash = $sprout->GenomeSubsystemData($genomeID);
                                                     "IsGenomeOf(from-link) = ?",  
                                                     [$genomeID], 'ContainsFeature(to-link)');  
215              # Create a hash to track the four categories. "s" or "n" indicates              # Create a hash to track the four categories. "s" or "n" indicates
216              # in or out of a subsystem. "1" or "0" indicates hypothetical or              # in or out of a subsystem. "1" or "0" indicates hypothetical or
217              # real.              # real.
# Line 204  Line 227 
227                  $totalFeatures++;                  $totalFeatures++;
228              }              }
229              Trace("$totalFeatures total features found for $genomeID.") if T(3);              Trace("$totalFeatures total features found for $genomeID.") if T(3);
230                for my $counterKey (@columnTypes) {
231                    $groupTotals{$counterKey} += $counters{$counterKey};
232                }
233                $groupTotals{features} += $totalFeatures;
234              # We have all our data. Next we need to compute the percentages and the links.              # We have all our data. Next we need to compute the percentages and the links.
235              # First, the link stuff.              # First, the link stuff.
236              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";              my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";
# Line 214  Line 241 
241                                               Tracer::Percent($counters{$type}, $totalFeatures)));                                               Tracer::Percent($counters{$type}, $totalFeatures)));
242              }              }
243              my @counterValues = map { $counters{$_} } @columnTypes;              my @counterValues = map { $counters{$_} } @columnTypes;
244                # The last link is a button to look at the subsystem summaries.
245                my $ssCount = $sprout->GetCount(['ParticipatesIn'], 'ParticipatesIn(from-link) = ?',
246                                                   [$genomeID]);
247                my $ssLink = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&show_subsystems=1";
248                my $ssCol = "<a href=\"$ssLink\">$ssCount</a>";
249              # Create the row text. Note that we use the distributive capability of the TD              # Create the row text. Note that we use the distributive capability of the TD
250              # function to apply the same style to each one.              # function to apply the same style to each one.
251              my $rowHtml = join("",              my $rowHtml = join("",
# Line 221  Line 253 
253                                 td({ class => $numStyle }, $genomeLen),                                 td({ class => $numStyle }, $genomeLen),
254                                 td({ class => $numStyle }, $pegCount),                                 td({ class => $numStyle }, $pegCount),
255                                 td({ class => $counterStyle }, \@counterValues),                                 td({ class => $counterStyle }, \@counterValues),
256                                   td({ class => $numStyle }, $ssCol),
257                                 td({ class => $numStyle }, $rnaCount),                                 td({ class => $numStyle }, $rnaCount),
258                                );                                );
259              # Put it in the row hash.              # Put it in the row hash.
# Line 245  Line 278 
278          print GROUPFILE "</table>\n";          print GROUPFILE "</table>\n";
279          close GROUPFILE;          close GROUPFILE;
280          Trace("$rowCount genomes processed.") if T(2);          Trace("$rowCount genomes processed.") if T(2);
281            # Now save the group totals.
282            $summaries{$groupID} = \%groupTotals;
283      }      }
284        # Now produce the summary table.
285        my $sumFileName = "stats-groups.inc";
286        Open(\*SUMFILE, ">$targetDir/$sumFileName");
287        # Start the table.
288        print SUMFILE "<table class=\"$tableStyle\">\n";
289        # Create the header row.
290        print SUMFILE Tr( { class => 'odd' }, th(["Group name",
291                                                 "Genomes",
292                                                 "Protein Encoding Genes (PEGs)",
293                                                 "Named genes in subsystems",            # s0
294                                                 "Named genes not in subsystems",        # n0
295                                                 "Hypothetical genes in subsystems",     # s1
296                                                 "Hypothetical genes not in subsystems", # n1
297                                                 "RNAs",
298                                                   ])) . "\n";
299        # Set up a flag for the odd-even styling.
300        my $rowFlag = 0;
301        # Put in the data rows.
302        for my $groupName (sort keys %summaries) {
303            my $group = $summaries{$groupName};
304            # Compute the link for the current group.
305            my $groupLink = a({ href => $groupData{$groupName}->[0] }, $groupName);
306            # Create the table row.
307            my $rowHtml = join("",
308                               td($groupLink),
309                               td({ class => $numStyle }, Tracer::CommaFormat($group->{genomes})),
310                               td({ class => $numStyle }, Tracer::CommaFormat($group->{pegs})),
311                               td({ class => $counterStyle }, [ map { Tracer::CommaFormat($group->{$_}) } @columnTypes ]),
312                               td({ class => $numStyle }, Tracer::CommaFormat($group->{RNAs})),
313                              );
314            print SUMFILE Tr( { class => $rowStyle[$rowFlag] }, $rowHtml ) . "\n";
315            # Flip the row style.
316            $rowFlag = 1 - $rowFlag;
317        }
318        # Terminate the table and close the file.
319        print SUMFILE "</table>\n";
320        close SUMFILE;
321      # We're all done.      # We're all done.
322      Trace("Processing complete.") if T(2);      Trace("Processing complete.") if T(2);
323  }  }
324    
 =head3 Fix  
   
 C<< my %fixedHash = Fix(%groupHash); >>  
   
 Prepare a genome group hash for processing. Groups with the same primary name will be combined.  
 The primary name is the first capitalized word in the group name.  
   
 =over 4  
   
 =item groupHash  
   
 Hash to be fixed up.  
   
 =item RETURN  
   
 Returns a fixed-up version of the hash.  
   
 =back  
   
 =cut  
   
 sub Fix {  
     # Get the parameters.  
     my (%groupHash) = @_;  
     # Create the result hash.  
     my %retVal = ();  
     # Copy over the genomes.  
     for my $groupID (keys %groupHash) {  
         # Make a safety copy of the group ID.  
         my $realGroupID = $groupID;  
         # Yank the primary name.  
         if ($groupID =~ /([A-Z]\w+)/) {  
             $realGroupID = $1;  
         }  
         # Append this group's genomes into the result hash.  
         Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});  
     }  
     # Return the result hash.  
     return %retVal;  
 }  
   
   
325  1;  1;

Legend:
Removed from v.1.16  
changed lines
  Added in v.1.24

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3