[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Wed Feb 1 03:28:11 2006 UTC revision 1.9, Thu Feb 2 21:30:05 2006 UTC
# Line 4  Line 4 
4  use CGI;  use CGI;
5  use Tracer;  use Tracer;
6  use Genome;  use Genome;
7    use ERDBLoad;
8  use DBKernel;  use DBKernel;
9  use SimBlocks;  use SimBlocks;
10  use File::Path;  use File::Path;
# Line 124  Line 125 
125  A text description of the genome (usually the species name with  A text description of the genome (usually the species name with
126  a strain ID).  a strain ID).
127    
128  =teim groupName  =item groupName
129    
130  The name of the group to which the genome belongs.  The name of the group to which the genome belongs.
131    
# Line 337  Line 338 
338  # Create an error counter and a directory counter.  # Create an error counter and a directory counter.
339  my $errorCount = 0;  my $errorCount = 0;
340  my $dirCount = 0;  my $dirCount = 0;
341    # Get a similarity block object. Note that we've waited until the database
342    # was already created before doing this. The tables, however, may not yet
343    # exist, so we can't do very much with it.
344    my $simBlocks = SimBlocks->new();
345    # Determine if this is a load-only run.
346    my $loadOnly = ($options->{generate} eq 'no');
347    # Create the loader objects for the tables. These are global to the whole
348    # script.
349    my $GenomeLoad = ERDBLoad->new($simBlocks, 'Genome', $outDirectory, $loadOnly);
350    my $ContigLoad = ERDBLoad->new($simBlocks, 'Contig', $outDirectory, $loadOnly);
351    my $GroupBlockLoad = ERDBLoad->new($simBlocks, 'GroupBlock', $outDirectory, $loadOnly);
352    my $RegionLoad = ERDBLoad->new($simBlocks, 'Region', $outDirectory, $loadOnly);
353    my $ConsistsOfLoad = ERDBLoad->new($simBlocks, 'ConsistsOf', $outDirectory, $loadOnly);
354    my $ContainsRegionLoad = ERDBLoad->new($simBlocks, 'ContainsRegion', $outDirectory, $loadOnly);
355    my $HasInstanceOfLoad = ERDBLoad->new($simBlocks, 'HasInstanceOf', $outDirectory, $loadOnly);
356    my $IncludesRegionLoad = ERDBLoad->new($simBlocks, 'IncludesRegion', $outDirectory, $loadOnly);
357    # This hash helps us to clean up when we're done.
358    my $loaderList = { Genome => $GenomeLoad, Contig => $ContigLoad,
359                       GroupBlock => $GroupBlockLoad, Region => $RegionLoad,
360                       ConsistsOf => $ConsistsOfLoad, ContainsRegion => $ContainsRegionLoad,
361                       HasInstanceOf => $HasInstanceOfLoad,
362                       IncludesRegion => $IncludesRegionLoad};
363  # Check to see if we should generate the output files.  # Check to see if we should generate the output files.
364  if ($options->{generate} eq 'no') {  if ($loadOnly) {
365      # Here we are to use existing output files.      # Here we are to use existing output files.
366      Trace("Existing database load files will be used.") if T(2);      Trace("Existing database load files will be used.") if T(2);
367  } else {  } else {
# Line 366  Line 389 
389          }          }
390      }      }
391      Trace("Load files generated from $dirCount directories.") if T(2);      Trace("Load files generated from $dirCount directories.") if T(2);
392        # Now finish up the loading.
393        my $stats = Stats->new();
394        for my $table (keys %{$loaderList}) {
395            Trace("Finishing $table.") if T(2);
396            my $tableThing = $loaderList->{$table};
397            my $newStats = $tableThing->Finish();
398            $stats->Accumulate($newStats);
399        }
400        Trace("Statistics for generate.\n" . $stats->Show()) if T(2);
401  }  }
402  # Check for errors.  # Check for errors.
403  if ($errorCount > 0) {  if ($errorCount > 0) {
# Line 375  Line 407 
407      if ($options->{load} eq 'yes') {      if ($options->{load} eq 'yes') {
408          # Here we have no outstanding errors and the user wants us to load          # Here we have no outstanding errors and the user wants us to load
409          # the database. First, we create a similarity block object.          # the database. First, we create a similarity block object.
         my $simBlocks = SimBlocks->new();  
410          # Use it to load the database. Note we specify that the tables are to be          # Use it to load the database. Note we specify that the tables are to be
411          # dropped and rebuilt.          # dropped and rebuilt.
412          $simBlocks->LoadTables($outDirectory, 1);          $simBlocks->LoadTables($outDirectory, 1);
# Line 387  Line 418 
418  sub ProcessDirectory {  sub ProcessDirectory {
419      my ($inDirectory, $outDirectory, $groupName) = @_;      my ($inDirectory, $outDirectory, $groupName) = @_;
420      Trace("Processing directory $inDirectory.") if T(2);      Trace("Processing directory $inDirectory.") if T(2);
421      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output
422      # and add the genomes to the genome list.      # and add the genomes to the genome list.
423      my %genomes = ();      my %genomes = ();
424      Open(\*GENOMESIN, "<$inDirectory/genome.tbl");      Open(\*GENOMESIN, "<$inDirectory/genome.tbl");
     Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");  
425      # Count the genomes read and errors found.      # Count the genomes read and errors found.
426      my $genomeCount = 0;      my $genomeCount = 0;
427      my $errorCount = 0;      my $errorCount = 0;
# Line 399  Line 429 
429      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
430          # Echo the genome record to the output.          # Echo the genome record to the output.
431          my @fields = Tracer::ParseRecord($genomeData);          my @fields = Tracer::ParseRecord($genomeData);
432          print GENOMESOUT join("\t", @fields, $groupName). "\n";          $GenomeLoad->Put(@fields, $groupName);
433          # Extract the genome ID.          # Extract the genome ID.
434          my $genomeID = $fields[0];          my $genomeID = $fields[0];
435          # Store it in the genomes hash. We start with a value of 0. If          # Store it in the genomes hash. We start with a value of 0. If
# Line 411  Line 441 
441          $genomeCount++;          $genomeCount++;
442      }      }
443      Trace("$genomeCount genomes found.") if T(2);      Trace("$genomeCount genomes found.") if T(2);
444      # Close the files.      # Close the genome file.
445      close GENOMESIN;      close GENOMESIN;
     close GENOMESOUT;  
446      # Create the contig hash used to associate contigs to their parent      # Create the contig hash used to associate contigs to their parent
447      # genomes.      # genomes.
448      my %contigs = ();      my %contigs = ();
449      # Now we begin to read the Block and Region files in parallel. Both      # Now we begin to read the Block and Region files in parallel. Both
450      # are sorted by block ID, so all processing for this section of the      # are sorted by block ID, so all processing for this section of the
451      # script is done a block at a time. The first task is to      # script is done a block at a time. First, we determine which file
452      # open the output files.      # sets we'll be processing. There are at most two: regular and
453      Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");      # intergenic.
     Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");  
     Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");  
     Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");  
     Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");  
     # Determine which file sets we'll be processing.  
454      my @fileSets = ();      my @fileSets = ();
455      my @prefixes = ("", "intergenic_");      my @prefixes = ("", "intergenic_");
456      for my $prefix (@prefixes) {      for my $prefix (@prefixes) {
# Line 529  Line 553 
553                  # Write the block record. Note that the block ID is prefixed by the group name to                  # Write the block record. Note that the block ID is prefixed by the group name to
554                  # make it unique.                  # make it unique.
555                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
556                  print BLOCKSOUT join("\t", "$groupName:$blockID", $blockData{description}, $blockData{len},                  $GroupBlockLoad->Put("$groupName:$blockID", $blockData{description}, $blockData{len},
557                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";                                       $blockData{"snip-count"}, $variance, $blockData{pattern});
558                  # Find all the variance points in the block pattern. We'll use them to create                  # Find all the variance points in the block pattern. We'll use them to create
559                  # the content strings for each region.                  # the content strings for each region.
560                  my @positions = SimBlocks::ParsePattern($blockData{pattern});                  my @positions = SimBlocks::ParsePattern($blockData{pattern});
# Line 548  Line 572 
572                      # Get the region's location data.                      # Get the region's location data.
573                      my $location = BasicLocation->new($region);                      my $location = BasicLocation->new($region);
574                      # Write this region to the output files.                      # Write this region to the output files.
575                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,                      $RegionLoad->Put($region, $location->Contig, $location->Dir,
576                                            $location->Right, $location->Length,                                            $location->Right, $location->Length,
577                                            $regionPeg{$region}, $location->Left, $content) . "\n";                                       $regionPeg{$region}, $location->Left, $content);
578                      print CONTAINSOUT join("\t", $location->Contig, $region,                      $ContainsRegionLoad->Put($location->Contig, $region, $location->Length,
579                                             $location->Length, $location->Left) . "\n";                                               $location->Left);
580                      print INCLUDESOUT join("\t", "$groupName:$blockID", $region) . "\n";                      $IncludesRegionLoad->Put("$groupName:$blockID", $region);
581                  }                  }
582                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
583                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
584                      print INSTANCESOUT join("\t", $genomeID, "$groupName:$blockID") . "\n";                      $HasInstanceOfLoad->Put($genomeID, "$groupName:$blockID");
585                  }                  }
586                  # Count this block's regions.                  # Count this block's regions.
587                  $regionsCount += $regionCounter;                  $regionsCount += $regionCounter;
# Line 568  Line 592 
592          close REGIONSIN;          close REGIONSIN;
593      }      }
594      # Close the output files.      # Close the output files.
     close REGIONSOUT;  
     close BLOCKSOUT;  
     close INSTANCESOUT;  
595      # All the block data has been written. Tell the user what we found.      # All the block data has been written. Tell the user what we found.
596      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
597      # The next task is to write the genome/contig data. This is provided by the      # The next task is to write the genome/contig data. This is provided by the
598      # "%contigs" hash. First, we need to open the files.      # "%contigs" hash.
599      my $contigsCount = 0;      my $contigsCount = 0;
     Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");  
     Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");  
600      for my $contigID (keys %contigs) {      for my $contigID (keys %contigs) {
601          print CONTIGSOUT "$contigID\n";          $ContigLoad->Put("$contigID");
602          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";          $ConsistsOfLoad->Put($contigs{$contigID}, $contigID);
603          $contigsCount++;          $contigsCount++;
604      }      }
605      Trace("$contigsCount contigs found.") if T(2);      Trace("$contigsCount contigs found.") if T(2);
     # Close the output files.  
     close CONTIGSOUT;  
     close CONSISTSOUT;  
606      # Now warn the user about all the genomes that didn't have blocks.      # Now warn the user about all the genomes that didn't have blocks.
607      for my $genomeID (keys %genomes) {      for my $genomeID (keys %genomes) {
608          if (! $genomes{$genomeID}) {          if (! $genomes{$genomeID}) {

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.9

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3