[Bio] / Sprout / NewStuffCheck.pl Repository:
ViewVC logotype

Diff of /Sprout/NewStuffCheck.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.23, Mon Jul 16 20:05:23 2007 UTC revision 1.24, Thu Dec 6 14:58:03 2007 UTC
# Line 136  Line 136 
136      # Get the group file.      # Get the group file.
137      my $groupFileName = $options->{groupFile};      my $groupFileName = $options->{groupFile};
138      Trace("Reading group file $groupFileName.") if T(2);      Trace("Reading group file $groupFileName.") if T(2);
     # We'll put each group's data into a hash, keyed by group  
     # name, each entry being a 3-tuple of page name, genus,  
     # and species  
     my %groups = Sprout::ReadGroupFile($groupFileName);  
139      Trace("Processing genomes.") if T(2);      Trace("Processing genomes.") if T(2);
140      # Get the current SEED.      # Get the current SEED.
141      my $fig = FIG->new();      my $fig = FIG->new();
# Line 153  Line 149 
149      my @newGenomes = GetGenomes($newSprout);      my @newGenomes = GetGenomes($newSprout);
150      # Compare the two genomes lists.      # Compare the two genomes lists.
151      my ($insertedGenomes, $deletedGenomes) = Tracer::CompareLists(\@newGenomes, \@oldGenomes);      my ($insertedGenomes, $deletedGenomes) = Tracer::CompareLists(\@newGenomes, \@oldGenomes);
152      # Add feature counts to the new genomes.      # Get the super-group data.
153        my %superTable = $newSprout->CheckGroupFile();
154        # Create a list for the new genomes that includes BBH and feature counts. We'll flip this
155        # lists so that the genome names are first and the IDs second.
156        my @insertedGenomeList = ();
157      for my $insertedGenome (@{$insertedGenomes}) {      for my $insertedGenome (@{$insertedGenomes}) {
158          my $genomeID = $insertedGenome->[0];          my $genomeID = $insertedGenome->[0];
159          # For a new genome, display the feature and BBH counts.          # For a new genome, display the feature and BBH counts.
# Line 162  Line 162 
162          my $suffix = ($count == 1 ? " one feature" : "$count features");          my $suffix = ($count == 1 ? " one feature" : "$count features");
163          my $bbhCount = FIGRules::BatchBBHs("fig|$genomeID.%", 1e-10);          my $bbhCount = FIGRules::BatchBBHs("fig|$genomeID.%", 1e-10);
164          $suffix .= "; " . ($bbhCount == 1 ? "one BBH" : "$bbhCount BBHs");          $suffix .= "; " . ($bbhCount == 1 ? "one BBH" : "$bbhCount BBHs");
165          $insertedGenome->[1] .= "($suffix)";          push @insertedGenomeList, [$insertedGenome->[1], "$genomeID ($suffix)"];
166      }      }
167      # Add information about SEED status to the deleted genomes.      # Create a list for the deleted genomes that contains information about SEED status.
168        # This list is flipped, too.
169        my @deletedGenomeList = ();
170      for my $deletedGenome (@{$deletedGenomes}) {      for my $deletedGenome (@{$deletedGenomes}) {
171          my $genomeID = $deletedGenome->[0];          my $genomeID = $deletedGenome->[0];
172            my $suffix = "";
173          if ($fig->is_genome($genomeID)) {          if ($fig->is_genome($genomeID)) {
174                # Here the deleted genome is still in the SEED.
175              my $complete = ($fig->is_complete($genomeID) ? "complete" : "incomplete");              my $complete = ($fig->is_complete($genomeID) ? "complete" : "incomplete");
176              $deletedGenome->[1] .= "(still in SEED, $complete)";              $suffix = " (still in SEED, $complete)";
177            } else {
178                # It's not in the SEED. See if it has been replaced.
179                my ($genus, $species, $strain) = $oldSprout->GetGenomeNameData($genomeID);
180                my @genomeIDs = $newSprout->GetGenomeByNameData($genus, $species, $strain);
181                if (scalar @genomeIDs) {
182                    $suffix = " (replaced)";
183                }
184          }          }
185            push @deletedGenomeList, [$deletedGenome->[1], "$genomeID$suffix"];
186      }      }
187      # Display the lists.      # Display the lists.
188      push @html, ShowLists($cgi, 'New Genomes'     => $insertedGenomes,      push @html, ShowLists($cgi, 'New Genomes'     => \@insertedGenomeList,
189                                  'Deleted Genomes' => $deletedGenomes);                                  'Deleted Genomes' => \@deletedGenomeList);
190      # Now the groups.      # Now the groups.
191      Trace("Comparing groups.") if T(2);      Trace("Comparing groups.") if T(2);
192      my %oldGroups = $oldSprout->GetGroups();      my %oldGroups = $oldSprout->GetGroups();
# Line 189  Line 201 
201              push @html, ShowLists($cgi, "Genomes in new group $newGroup" => \@groupGenomes);              push @html, ShowLists($cgi, "Genomes in new group $newGroup" => \@groupGenomes);
202          } else {          } else {
203              # Here the group is in both versions. Fix the lists and compare them. Note that we'll be comparing              # Here the group is in both versions. Fix the lists and compare them. Note that we'll be comparing
204              # on the genome ID, which will become the second list element after              # on the genome ID, which will become the second list element after the call to NameGenomes.
205              my @newGroupList = sort { $a->[1] <=> $b->[1] } NameGenomes($newSprout, $newGroups{$newGroup});              my @newGroupList = sort { $a->[1] <=> $b->[1] } NameGenomes($newSprout, $newGroups{$newGroup});
206              my @oldGroupList = sort { $a->[1] <=> $b->[1] } NameGenomes($oldSprout, $oldGroups{$newGroup});              my @oldGroupList = sort { $a->[1] <=> $b->[1] } NameGenomes($oldSprout, $oldGroups{$newGroup});
207              Trace("Comparing lists for $newGroup.") if T(4);              Trace("Comparing lists for $newGroup.") if T(4);
# Line 237  Line 249 
249      # coupling counts.      # coupling counts.
250      my $bbhCount = 0;      my $bbhCount = 0;
251      my $couplingCount = 0;      my $couplingCount = 0;
252        # We'll accumulate a report of genomes with missing BBHs in here.
253        my @bbhMissingGenomes = ();
254      # One of the reports is only for genomes common to both sprouts. To help us      # One of the reports is only for genomes common to both sprouts. To help us
255      # make this determination, we get a hash of the inserted genomes.      # make this determination, we get a hash of the inserted genomes.
256      my %skipGenomes = map { $_->[0] => 1 } @{$insertedGenomes};      my %skipGenomes = map { $_->[0] => 1 } @{$insertedGenomes};
# Line 250  Line 264 
264                              $cgi->th({align => 'right'}, ["Size (bp)", "Feats", "Contigs", "Subs",                              $cgi->th({align => 'right'}, ["Size (bp)", "Feats", "Contigs", "Subs",
265                                                            "F In SS", "PEGs", "RNAs", "PPs", "New", "Del"]));                                                            "F In SS", "PEGs", "RNAs", "PPs", "New", "Del"]));
266      FlushData(\*ORGOUT, \@orgHtml);      FlushData(\*ORGOUT, \@orgHtml);
267      # Now we start the loop.      # Now we start the loop. Note that "newGenomes" means all the genomes in the new Sprout,
268        # not the list of genomes that are new!
269      for my $genomeData (@newGenomes) {      for my $genomeData (@newGenomes) {
270          # Get this genome's ID and name.          # Get this genome's ID and name.
271          my $genomeID = $genomeData->[0];          my $genomeID = $genomeData->[0];
# Line 258  Line 273 
273          my $genomeTitle = "$genomeID: $genomeData->[1]";          my $genomeTitle = "$genomeID: $genomeData->[1]";
274          # Compute its size.          # Compute its size.
275          my $genomeSize = $newSprout->GenomeLength($genomeID);          my $genomeSize = $newSprout->GenomeLength($genomeID);
276          # Get a list of the genomes that are not this one.          # Get a list of the genomes in the new Sprout that are not this one.
277          my @otherGenomes = grep { $_ ne $genomeID } map { $_->[0] } @newGenomes;          my @otherGenomes = grep { $_ ne $genomeID } map { $_->[0] } @newGenomes;
278          Trace("Computing BBH matrix for $genomeID.") if T(3);          Trace("Computing BBH matrix for $genomeID.") if T(3);
279          # Get the bbh matrix going from the current gene to the others.          # Get the bbh matrix going from the current genome to the others.
280          my %matrix = $newSprout->BBHMatrix($genomeID, 1e-20, @otherGenomes);          my %matrix = $newSprout->BBHMatrix($genomeID, 1e-20, @otherGenomes);
281          # Set up the subsystem hash. This will be used to track which subsystems are used by          # Set up the subsystem hash. This will be used to track which subsystems are used by
282          # the genome's features.          # the genome's features.
# Line 273  Line 288 
288          my %counters = (peg => 0, in_ss => 0, rna => 0, pp => 0, total => 0);          my %counters = (peg => 0, in_ss => 0, rna => 0, pp => 0, total => 0);
289          # We'll store information about the genome's features (ID and functional role) in here.          # We'll store information about the genome's features (ID and functional role) in here.
290          my @newFeatures = ();          my @newFeatures = ();
291            # Finally, we'll use this flag to warn us if there are no BBHs.
292            my $bbhsFound = 0;
293          # Loop through the genome's features. The order is important here, because we need          # Loop through the genome's features. The order is important here, because we need
294          # to match the order used by "GetFeatures" for the feature difference comparison.          # to match the order used by "GetFeatures" for the feature difference comparison.
295          Trace("Processing feature statistics for $genomeID.") if T(3);          Trace("Processing feature statistics for $genomeID.") if T(3);
# Line 288  Line 305 
305              push @newFeatures, [$fid, $feature->PrimaryValue('Feature(assignment)')];              push @newFeatures, [$fid, $feature->PrimaryValue('Feature(assignment)')];
306              # Check to see if we have BBH data.              # Check to see if we have BBH data.
307              if (exists $matrix{$fid}) {              if (exists $matrix{$fid}) {
308                  $bbhCount += scalar keys %{$matrix{$fid}};                  my $fidBbhCount = scalar keys %{$matrix{$fid}};
309                    if ($fidBbhCount > 0) {
310                        # Denote that this feature has BBHs.
311                        $bbhsFound = 1;
312                        # Add them to the total BBH count.
313                        $bbhCount += $fidBbhCount;
314                    }
315              }              }
316              # Ask for couplings.              # Ask for couplings.
317              my %coupleHash = $newSprout->CoupledFeatures($fid);              my %coupleHash = $newSprout->CoupledFeatures($fid);
# Line 323  Line 346 
346              $addCount = scalar(@{$insertedFeatures});              $addCount = scalar(@{$insertedFeatures});
347              $delCount = scalar(@{$deletedFeatures});              $delCount = scalar(@{$deletedFeatures});
348          }          }
349            # Check to see if this genome is missing its BBHs.
350            if (! $bbhsFound) {
351                # It is, so add a line for it to the missing-BBH list.
352                push @bbhMissingGenomes, ShowDatum($cgi, $genomeData->[1], $genomeID);
353            }
354          push @orgHtml, $cgi->Tr($cgi->td($genomeTitle),          push @orgHtml, $cgi->Tr($cgi->td($genomeTitle),
355                                  $cgi->td({align => 'right'}, [$genomeSize, $counters{total}, scalar(keys %contigHash),                                  $cgi->td({align => 'right'}, [$genomeSize, $counters{total}, scalar(keys %contigHash),
356                                                                scalar(keys %subHash), $counters{in_ss}, $counters{peg},                                                                scalar(keys %subHash), $counters{in_ss}, $counters{peg},
# Line 333  Line 361 
361      push @orgHtml, $cgi->end_table();      push @orgHtml, $cgi->end_table();
362      FlushData(\*ORGOUT, \@orgHtml);      FlushData(\*ORGOUT, \@orgHtml);
363      close ORGOUT;      close ORGOUT;
364      # Flush the genome feature comparison data.      # Check for a missing-BBH report.
365        if (scalar @bbhMissingGenomes) {
366            # There is a report, so put it into the output stream.
367            push @html, ShowTitle($cgi, "Genomes without BBHs");
368            push @html, @bbhMissingGenomes;
369        }
370        # Flush the genome feature comparison data and the missing-BBH report (if any).
371      FlushData(\*OUTPUT, \@html);      FlushData(\*OUTPUT, \@html);
372      # Next, we show some basic counts.      # Next, we show some basic counts.
373      Trace("Displaying counts.") if T(3);      Trace("Displaying counts.") if T(3);
# Line 347  Line 381 
381      # Now we show the genomes that are not in groups but could be. First, we convert      # Now we show the genomes that are not in groups but could be. First, we convert
382      # our group hash from the new Sprout into the form used on the web site.      # our group hash from the new Sprout into the form used on the web site.
383      Trace("Examining possible missing genomes in groups.") if T(2);      Trace("Examining possible missing genomes in groups.") if T(2);
384      my %fixedGroups = Sprout::Fix(%newGroups);      my %fixedGroups = $newSprout->Fix(%newGroups);
385      for my $group (sort keys %groups) {      for my $group (sort keys %superTable) {
386          Trace("Checking group $group.");          Trace("Checking group $group.");
387          # Get this group's genus and species.          # Get this group's genus and species.
388          my $genus = $groups{$group}->[1];          my $genus = $superTable{$group}->{genus};
389          my $species = $groups{$group}->[2];          my $species = $superTable{$group}->{species};
390          # Get a hash of the genomes already in it.          # Get a hash of the genomes already in it.
391          my %inGroup = map { $_ => 1 } @{$fixedGroups{$group}};          my %inGroup = map { $_ => 1 } @{$fixedGroups{$group}};
392          # Get a list of its possible genomes.          # Get a list of its possible genomes.
393          my $filter = 'Genome(genus) = ?';          my $filter = 'Genome(genus) = ?';
394          my @parms = ($genus);          my @parms = ($genus);
395          # The species list is tricky because a given group may involve more than          # The species list is tricky because a given group may involve more than
396          # one target species. The species names will be comma-separated, and          # one target species or no target species (which means we want everything).
397          # we use some PERL trickiness to generate an OR filter for them.          # The species names will be in list references, and we use some PERL trickiness
398          if ($species) {          # to generate an OR filter for them.
399              # Get the individual species.          if (scalar @{$species}) {
             my @speciesList = split /\s*,\s*/, $species;  
400              # Create one species filter per species.              # Create one species filter per species.
401              my @filterClauses = map { 'Genome(species) = ?' } @speciesList;              my @filterClauses = map { 'Genome(species) = ?' } @{$species};
402              # OR the filter clauses together to get a real filter.              # OR the filter clauses together to get a real filter.
403              $filter .= " AND (" . (join " OR ", @filterClauses) . ")";              $filter .= " AND (" . (join " OR ", @filterClauses) . ")";
404              # Add the specieis names to the SQL parameter list.              # Add the specieis names to the SQL parameter list.
405              push @parms, @speciesList;              push @parms, @{$species};
406          }          }
407          my @possibles = $newSprout->GetFlat(['Genome'], $filter, \@parms, 'Genome(id)');          my @possibles = $newSprout->GetFlat(['Genome'], $filter, \@parms, 'Genome(id)');
408          # Get the possibles that aren't in the group and add identifying information.          # Get the possibles that aren't in the group and add identifying information.
# Line 406  Line 439 
439    
440  =head3 FlushData  =head3 FlushData
441    
442  C<< FlushData($handle, \@lines); >>      FlushData($handle, \@lines);
443    
444  Write the specified lines to the output file and clear them out of the list. This  Write the specified lines to the output file and clear them out of the list. This
445  method is called periodically so that even if something goes wrong we can still  method is called periodically so that even if something goes wrong we can still
446  see the data accumulating in the output file.  see the data accumulating in the output file. The key aspect here is that we
447    put new-line characters after each line written and show something in the trace
448    log.
449    
450  =over 4  =over 4
451    
# Line 441  Line 476 
476    
477  =head3 GetGenomes  =head3 GetGenomes
478    
479  C<< my @geneList = GetGenomes($sprout); >>      my @geneList = GetGenomes($sprout);
480    
481  Return a list of the genomes in the specified Sprout instance. The genomes  Return a list of the genomes in the specified Sprout instance. The genomes
482  are returned in alphabetical order by genome ID.  are returned in alphabetical order by genome ID.
# Line 477  Line 512 
512    
513  =head3 NameGenomes  =head3 NameGenomes
514    
515  C<< my @newList = NameGenomes($sprout, \@genomes); >>      my @newList = NameGenomes($sprout, \@genomes);
516    
517  Convert a list of genome IDs to a list of genome IDs with names.  Convert a list of genome IDs to a list of genome IDs with names.
518    
# Line 511  Line 546 
546    
547  =head3 GetSubsystems  =head3 GetSubsystems
548    
549  C<< my @subsystems = GetSubsystems($sprout); >>      my @subsystems = GetSubsystems($sprout);
550    
551  Get a list of the subsystems in the specified Sprout instance.  Get a list of the subsystems in the specified Sprout instance.
552    
# Line 542  Line 577 
577    
578  =head3 GetProperties  =head3 GetProperties
579    
580  C<< my @propertyList = GetProperties($sprout); >>      my @propertyList = GetProperties($sprout);
581    
582  Return a list of properties. Each element in the list will be a 2-tuple containing  Return a list of properties. Each element in the list will be a 2-tuple containing
583  the property name and value in the first column and its ID in the second column.  the property name and value in the first column and its ID in the second column.
# Line 589  Line 624 
624    
625  =head3 GetFeatures  =head3 GetFeatures
626    
627  C<< my @features = GetFeatures($sprout, $genomeID); >>      my @features = GetFeatures($sprout, $genomeID);
628    
629  Return the features of the specified genome in the specified Sprout instance.  Return the features of the specified genome in the specified Sprout instance.
630    
# Line 625  Line 660 
660    
661  =head3 ShowLists  =head3 ShowLists
662    
663  C<< my @htmlLines = ShowLists($cgi, %lists); >>      my @htmlLines = ShowLists($cgi, %lists);
664    
665  Display a set of lists. Each list should consist of 2-tuples, and the list  Display a set of lists. Each list should consist of 2-tuples, and the list
666  entries will be displayed as 2-element table rows with a header row.  entries will be displayed as 2-element table rows with a header row.
# Line 683  Line 718 
718    
719  =head3 ComputeHeader  =head3 ComputeHeader
720    
721  C<< my $header = ComputeHeader($name, $count); >>      my $header = ComputeHeader($name, $count);
722    
723  Return a list header for a list of the specified length.  Return a list header for a list of the specified length.
724    
# Line 723  Line 758 
758    
759  =head3 ShowCounts  =head3 ShowCounts
760    
761  C<< ShowCounts($sprout); >>      ShowCounts($sprout);
762    
763  Display general counts for the specified sprout instance. These counts are  Display general counts for the specified sprout instance. These counts are
764  used in progress reports.  used in progress reports.
# Line 770  Line 805 
805    
806  =head3 ShowDatum  =head3 ShowDatum
807    
808  C<< my $htmlText = ShowDatum($cgi, $label, $value); >>      my $htmlText = ShowDatum($cgi, $label, $value);
809    
810  Return a table row displaying the specified label and value.  Return a table row displaying the specified label and value.
811    
# Line 807  Line 842 
842    
843  =head3 ShowTitle  =head3 ShowTitle
844    
845  C<< my $html = ShowTitle($cgi, $title); >>      my $html = ShowTitle($cgi, $title);
846    
847  Display a title line. This will be a merged table row with bolded text.  Display a title line. This will be a merged table row with bolded text.
848    

Legend:
Removed from v.1.23  
changed lines
  Added in v.1.24

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3