[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Wed Feb 1 03:28:11 2006 UTC revision 1.10, Mon Feb 13 15:42:48 2006 UTC
# Line 4  Line 4 
4  use CGI;  use CGI;
5  use Tracer;  use Tracer;
6  use Genome;  use Genome;
7    use ERDBLoad;
8  use DBKernel;  use DBKernel;
9  use SimBlocks;  use SimBlocks;
10  use File::Path;  use File::Path;
# Line 124  Line 125 
125  A text description of the genome (usually the species name with  A text description of the genome (usually the species name with
126  a strain ID).  a strain ID).
127    
128  =teim groupName  =item groupName
129    
130  The name of the group to which the genome belongs.  The name of the group to which the genome belongs.
131    
# Line 315  Line 316 
316  my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");  my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");
317  # Check to see if we need to create the database.  # Check to see if we need to create the database.
318  if ($options->{createDB}) {  if ($options->{createDB}) {
319      Trace("Creating database.") if T(2);      my $dbname = SimBlocks::DBName();
320      DBKernel::CreateDB($FIG_Config::simBlocksDB);      Trace("Creating database $dbname.") if T(2);
321        DBKernel::CreateDB($dbname);
322  }  }
323  # Get the output directory.  # Get the output directory.
324  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
325    # Default it if it isn't present.
326    if (! $outDirectory) {
327        $outDirectory = "$FIG_Config::fig/BlockData";
328    }
329  # Insure that it exists.  # Insure that it exists.
330  if (! -d $outDirectory) {  if (! -d $outDirectory) {
331      Trace("Creating output directory $outDirectory.") if T(2);      Trace("Creating output directory $outDirectory.") if T(2);
# Line 337  Line 343 
343  # Create an error counter and a directory counter.  # Create an error counter and a directory counter.
344  my $errorCount = 0;  my $errorCount = 0;
345  my $dirCount = 0;  my $dirCount = 0;
346    # Get a similarity block object. Note that we've waited until the database
347    # was already created before doing this. The tables, however, may not yet
348    # exist, so we can't do very much with it.
349    my $simBlocks = SimBlocks->new();
350    # Determine if this is a load-only run.
351    my $loadOnly = ($options->{generate} eq 'no');
352    # Create the loader objects for the tables. These are global to the whole
353    # script.
354    my $GenomeLoad = ERDBLoad->new($simBlocks, 'Genome', $outDirectory, $loadOnly);
355    my $ContigLoad = ERDBLoad->new($simBlocks, 'Contig', $outDirectory, $loadOnly);
356    my $GroupBlockLoad = ERDBLoad->new($simBlocks, 'GroupBlock', $outDirectory, $loadOnly);
357    my $RegionLoad = ERDBLoad->new($simBlocks, 'Region', $outDirectory, $loadOnly);
358    my $ConsistsOfLoad = ERDBLoad->new($simBlocks, 'ConsistsOf', $outDirectory, $loadOnly);
359    my $ContainsRegionLoad = ERDBLoad->new($simBlocks, 'ContainsRegion', $outDirectory, $loadOnly);
360    my $HasInstanceOfLoad = ERDBLoad->new($simBlocks, 'HasInstanceOf', $outDirectory, $loadOnly);
361    my $IncludesRegionLoad = ERDBLoad->new($simBlocks, 'IncludesRegion', $outDirectory, $loadOnly);
362    # This hash helps us to clean up when we're done.
363    my $loaderList = { Genome => $GenomeLoad, Contig => $ContigLoad,
364                       GroupBlock => $GroupBlockLoad, Region => $RegionLoad,
365                       ConsistsOf => $ConsistsOfLoad, ContainsRegion => $ContainsRegionLoad,
366                       HasInstanceOf => $HasInstanceOfLoad,
367                       IncludesRegion => $IncludesRegionLoad};
368  # Check to see if we should generate the output files.  # Check to see if we should generate the output files.
369  if ($options->{generate} eq 'no') {  if ($loadOnly) {
370      # Here we are to use existing output files.      # Here we are to use existing output files.
371      Trace("Existing database load files will be used.") if T(2);      Trace("Existing database load files will be used.") if T(2);
372  } else {  } else {
# Line 366  Line 394 
394          }          }
395      }      }
396      Trace("Load files generated from $dirCount directories.") if T(2);      Trace("Load files generated from $dirCount directories.") if T(2);
397        # Now finish up the loading.
398        my $stats = Stats->new();
399        for my $table (keys %{$loaderList}) {
400            Trace("Finishing $table.") if T(2);
401            my $tableThing = $loaderList->{$table};
402            my $newStats = $tableThing->Finish();
403            $stats->Accumulate($newStats);
404        }
405        Trace("Statistics for generate.\n" . $stats->Show()) if T(2);
406  }  }
407  # Check for errors.  # Check for errors.
408  if ($errorCount > 0) {  if ($errorCount > 0) {
# Line 375  Line 412 
412      if ($options->{load} eq 'yes') {      if ($options->{load} eq 'yes') {
413          # Here we have no outstanding errors and the user wants us to load          # Here we have no outstanding errors and the user wants us to load
414          # the database. First, we create a similarity block object.          # the database. First, we create a similarity block object.
         my $simBlocks = SimBlocks->new();  
415          # Use it to load the database. Note we specify that the tables are to be          # Use it to load the database. Note we specify that the tables are to be
416          # dropped and rebuilt.          # dropped and rebuilt.
417          $simBlocks->LoadTables($outDirectory, 1);          $simBlocks->LoadTables($outDirectory, 1);
# Line 387  Line 423 
423  sub ProcessDirectory {  sub ProcessDirectory {
424      my ($inDirectory, $outDirectory, $groupName) = @_;      my ($inDirectory, $outDirectory, $groupName) = @_;
425      Trace("Processing directory $inDirectory.") if T(2);      Trace("Processing directory $inDirectory.") if T(2);
426      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output
427      # and add the genomes to the genome list.      # and add the genomes to the genome list.
428      my %genomes = ();      my %genomes = ();
429      Open(\*GENOMESIN, "<$inDirectory/genome.tbl");      Open(\*GENOMESIN, "<$inDirectory/genome.tbl");
     Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");  
430      # Count the genomes read and errors found.      # Count the genomes read and errors found.
431      my $genomeCount = 0;      my $genomeCount = 0;
432      my $errorCount = 0;      my $errorCount = 0;
# Line 399  Line 434 
434      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
435          # Echo the genome record to the output.          # Echo the genome record to the output.
436          my @fields = Tracer::ParseRecord($genomeData);          my @fields = Tracer::ParseRecord($genomeData);
437          print GENOMESOUT join("\t", @fields, $groupName). "\n";          $GenomeLoad->Put(@fields, $groupName);
438          # Extract the genome ID.          # Extract the genome ID.
439          my $genomeID = $fields[0];          my $genomeID = $fields[0];
440          # Store it in the genomes hash. We start with a value of 0. If          # Store it in the genomes hash. We start with a value of 0. If
# Line 411  Line 446 
446          $genomeCount++;          $genomeCount++;
447      }      }
448      Trace("$genomeCount genomes found.") if T(2);      Trace("$genomeCount genomes found.") if T(2);
449      # Close the files.      # Close the genome file.
450      close GENOMESIN;      close GENOMESIN;
     close GENOMESOUT;  
451      # Create the contig hash used to associate contigs to their parent      # Create the contig hash used to associate contigs to their parent
452      # genomes.      # genomes.
453      my %contigs = ();      my %contigs = ();
454      # Now we begin to read the Block and Region files in parallel. Both      # Now we begin to read the Block and Region files in parallel. Both
455      # are sorted by block ID, so all processing for this section of the      # are sorted by block ID, so all processing for this section of the
456      # script is done a block at a time. The first task is to      # script is done a block at a time. First, we determine which file
457      # open the output files.      # sets we'll be processing. There are at most two: regular and
458      Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");      # intergenic.
     Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");  
     Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");  
     Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");  
     Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");  
     # Determine which file sets we'll be processing.  
459      my @fileSets = ();      my @fileSets = ();
460      my @prefixes = ("", "intergenic_");      my @prefixes = ("", "intergenic_");
461      for my $prefix (@prefixes) {      for my $prefix (@prefixes) {
# Line 529  Line 558 
558                  # Write the block record. Note that the block ID is prefixed by the group name to                  # Write the block record. Note that the block ID is prefixed by the group name to
559                  # make it unique.                  # make it unique.
560                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
561                  print BLOCKSOUT join("\t", "$groupName:$blockID", $blockData{description}, $blockData{len},                  $GroupBlockLoad->Put("$groupName:$blockID", $blockData{description}, $blockData{len},
562                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";                                       $blockData{"snip-count"}, $variance, $blockData{pattern});
563                  # Find all the variance points in the block pattern. We'll use them to create                  # Find all the variance points in the block pattern. We'll use them to create
564                  # the content strings for each region.                  # the content strings for each region.
565                  my @positions = SimBlocks::ParsePattern($blockData{pattern});                  my @positions = SimBlocks::ParsePattern($blockData{pattern});
# Line 548  Line 577 
577                      # Get the region's location data.                      # Get the region's location data.
578                      my $location = BasicLocation->new($region);                      my $location = BasicLocation->new($region);
579                      # Write this region to the output files.                      # Write this region to the output files.
580                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,                      $RegionLoad->Put($region, $location->Contig, $location->Dir,
581                                            $location->Right, $location->Length,                                            $location->Right, $location->Length,
582                                            $regionPeg{$region}, $location->Left, $content) . "\n";                                       $regionPeg{$region}, $location->Left, $content);
583                      print CONTAINSOUT join("\t", $location->Contig, $region,                      $ContainsRegionLoad->Put($location->Contig, $region, $location->Length,
584                                             $location->Length, $location->Left) . "\n";                                               $location->Left);
585                      print INCLUDESOUT join("\t", "$groupName:$blockID", $region) . "\n";                      $IncludesRegionLoad->Put("$groupName:$blockID", $region);
586                  }                  }
587                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
588                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
589                      print INSTANCESOUT join("\t", $genomeID, "$groupName:$blockID") . "\n";                      $HasInstanceOfLoad->Put($genomeID, "$groupName:$blockID");
590                  }                  }
591                  # Count this block's regions.                  # Count this block's regions.
592                  $regionsCount += $regionCounter;                  $regionsCount += $regionCounter;
# Line 568  Line 597 
597          close REGIONSIN;          close REGIONSIN;
598      }      }
599      # Close the output files.      # Close the output files.
     close REGIONSOUT;  
     close BLOCKSOUT;  
     close INSTANCESOUT;  
600      # All the block data has been written. Tell the user what we found.      # All the block data has been written. Tell the user what we found.
601      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
602      # The next task is to write the genome/contig data. This is provided by the      # The next task is to write the genome/contig data. This is provided by the
603      # "%contigs" hash. First, we need to open the files.      # "%contigs" hash.
604      my $contigsCount = 0;      my $contigsCount = 0;
     Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");  
     Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");  
605      for my $contigID (keys %contigs) {      for my $contigID (keys %contigs) {
606          print CONTIGSOUT "$contigID\n";          $ContigLoad->Put("$contigID");
607          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";          $ConsistsOfLoad->Put($contigs{$contigID}, $contigID);
608          $contigsCount++;          $contigsCount++;
609      }      }
610      Trace("$contigsCount contigs found.") if T(2);      Trace("$contigsCount contigs found.") if T(2);
     # Close the output files.  
     close CONTIGSOUT;  
     close CONSISTSOUT;  
611      # Now warn the user about all the genomes that didn't have blocks.      # Now warn the user about all the genomes that didn't have blocks.
612      for my $genomeID (keys %genomes) {      for my $genomeID (keys %genomes) {
613          if (! $genomes{$genomeID}) {          if (! $genomes{$genomeID}) {

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.10

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3