[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Wed Feb 1 03:28:11 2006 UTC revision 1.11, Thu Dec 6 14:58:03 2007 UTC
# Line 4  Line 4 
4  use CGI;  use CGI;
5  use Tracer;  use Tracer;
6  use Genome;  use Genome;
7    use ERDBLoad;
8  use DBKernel;  use DBKernel;
9  use SimBlocks;  use SimBlocks;
10  use File::Path;  use File::Path;
# Line 72  Line 73 
73  input files in the C</Users/fig/BlastData> directory tree  input files in the C</Users/fig/BlastData> directory tree
74  and run at a trace level of 3.  and run at a trace level of 3.
75    
76  C<< SimLoad -trace=3 /Users/fig/BlastData >>      SimLoad -trace=3 /Users/fig/BlastData
77    
78  The following command line converts the input files in  The following command line converts the input files in
79  the default directory into load files but does not load  the default directory into load files but does not load
80  the database and runs at a trace level of 2.  the database and runs at a trace level of 2.
81    
82  C<< SimLoad -load=no >>      SimLoad -load=no
83    
84  =head2 Input Directory  =head2 Input Directory
85    
# Line 124  Line 125 
125  A text description of the genome (usually the species name with  A text description of the genome (usually the species name with
126  a strain ID).  a strain ID).
127    
128  =teim groupName  =item groupName
129    
130  The name of the group to which the genome belongs.  The name of the group to which the genome belongs.
131    
# Line 315  Line 316 
316  my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");  my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");
317  # Check to see if we need to create the database.  # Check to see if we need to create the database.
318  if ($options->{createDB}) {  if ($options->{createDB}) {
319      Trace("Creating database.") if T(2);      my $dbname = SimBlocks::DBName();
320      DBKernel::CreateDB($FIG_Config::simBlocksDB);      Trace("Creating database $dbname.") if T(2);
321        DBKernel::CreateDB($dbname);
322  }  }
323  # Get the output directory.  # Get the output directory.
324  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
325    # Default it if it isn't present.
326    if (! $outDirectory) {
327        $outDirectory = "$FIG_Config::fig/BlockData";
328    }
329  # Insure that it exists.  # Insure that it exists.
330  if (! -d $outDirectory) {  if (! -d $outDirectory) {
331      Trace("Creating output directory $outDirectory.") if T(2);      Trace("Creating output directory $outDirectory.") if T(2);
# Line 337  Line 343 
343  # Create an error counter and a directory counter.  # Create an error counter and a directory counter.
344  my $errorCount = 0;  my $errorCount = 0;
345  my $dirCount = 0;  my $dirCount = 0;
346    # Get a similarity block object. Note that we've waited until the database
347    # was already created before doing this. The tables, however, may not yet
348    # exist, so we can't do very much with it.
349    my $simBlocks = SimBlocks->new();
350    # Determine if this is a load-only run.
351    my $loadOnly = ($options->{generate} eq 'no');
352    # Create the loader objects for the tables. These are global to the whole
353    # script.
354    my $GenomeLoad = ERDBLoad->new($simBlocks, 'Genome', $outDirectory, $loadOnly);
355    my $ContigLoad = ERDBLoad->new($simBlocks, 'Contig', $outDirectory, $loadOnly);
356    my $GroupBlockLoad = ERDBLoad->new($simBlocks, 'GroupBlock', $outDirectory, $loadOnly);
357    my $RegionLoad = ERDBLoad->new($simBlocks, 'Region', $outDirectory, $loadOnly);
358    my $ConsistsOfLoad = ERDBLoad->new($simBlocks, 'ConsistsOf', $outDirectory, $loadOnly);
359    my $ContainsRegionLoad = ERDBLoad->new($simBlocks, 'ContainsRegion', $outDirectory, $loadOnly);
360    my $HasInstanceOfLoad = ERDBLoad->new($simBlocks, 'HasInstanceOf', $outDirectory, $loadOnly);
361    my $IncludesRegionLoad = ERDBLoad->new($simBlocks, 'IncludesRegion', $outDirectory, $loadOnly);
362    # This hash helps us to clean up when we're done.
363    my $loaderList = { Genome => $GenomeLoad, Contig => $ContigLoad,
364                       GroupBlock => $GroupBlockLoad, Region => $RegionLoad,
365                       ConsistsOf => $ConsistsOfLoad, ContainsRegion => $ContainsRegionLoad,
366                       HasInstanceOf => $HasInstanceOfLoad,
367                       IncludesRegion => $IncludesRegionLoad};
368  # Check to see if we should generate the output files.  # Check to see if we should generate the output files.
369  if ($options->{generate} eq 'no') {  if ($loadOnly) {
370      # Here we are to use existing output files.      # Here we are to use existing output files.
371      Trace("Existing database load files will be used.") if T(2);      Trace("Existing database load files will be used.") if T(2);
372  } else {  } else {
# Line 366  Line 394 
394          }          }
395      }      }
396      Trace("Load files generated from $dirCount directories.") if T(2);      Trace("Load files generated from $dirCount directories.") if T(2);
397        # Now finish up the loading.
398        my $stats = Stats->new();
399        for my $table (keys %{$loaderList}) {
400            Trace("Finishing $table.") if T(2);
401            my $tableThing = $loaderList->{$table};
402            my $newStats = $tableThing->Finish();
403            $stats->Accumulate($newStats);
404        }
405        Trace("Statistics for generate.\n" . $stats->Show()) if T(2);
406  }  }
407  # Check for errors.  # Check for errors.
408  if ($errorCount > 0) {  if ($errorCount > 0) {
# Line 375  Line 412 
412      if ($options->{load} eq 'yes') {      if ($options->{load} eq 'yes') {
413          # Here we have no outstanding errors and the user wants us to load          # Here we have no outstanding errors and the user wants us to load
414          # the database. First, we create a similarity block object.          # the database. First, we create a similarity block object.
         my $simBlocks = SimBlocks->new();  
415          # Use it to load the database. Note we specify that the tables are to be          # Use it to load the database. Note we specify that the tables are to be
416          # dropped and rebuilt.          # dropped and rebuilt.
417          $simBlocks->LoadTables($outDirectory, 1);          $simBlocks->LoadTables($outDirectory, 1);
# Line 387  Line 423 
423  sub ProcessDirectory {  sub ProcessDirectory {
424      my ($inDirectory, $outDirectory, $groupName) = @_;      my ($inDirectory, $outDirectory, $groupName) = @_;
425      Trace("Processing directory $inDirectory.") if T(2);      Trace("Processing directory $inDirectory.") if T(2);
426      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output
427      # and add the genomes to the genome list.      # and add the genomes to the genome list.
428      my %genomes = ();      my %genomes = ();
429      Open(\*GENOMESIN, "<$inDirectory/genome.tbl");      Open(\*GENOMESIN, "<$inDirectory/genome.tbl");
     Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");  
430      # Count the genomes read and errors found.      # Count the genomes read and errors found.
431      my $genomeCount = 0;      my $genomeCount = 0;
432      my $errorCount = 0;      my $errorCount = 0;
# Line 399  Line 434 
434      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
435          # Echo the genome record to the output.          # Echo the genome record to the output.
436          my @fields = Tracer::ParseRecord($genomeData);          my @fields = Tracer::ParseRecord($genomeData);
437          print GENOMESOUT join("\t", @fields, $groupName). "\n";          $GenomeLoad->Put(@fields, $groupName);
438          # Extract the genome ID.          # Extract the genome ID.
439          my $genomeID = $fields[0];          my $genomeID = $fields[0];
440          # Store it in the genomes hash. We start with a value of 0. If          # Store it in the genomes hash. We start with a value of 0. If
# Line 411  Line 446 
446          $genomeCount++;          $genomeCount++;
447      }      }
448      Trace("$genomeCount genomes found.") if T(2);      Trace("$genomeCount genomes found.") if T(2);
449      # Close the files.      # Close the genome file.
450      close GENOMESIN;      close GENOMESIN;
     close GENOMESOUT;  
451      # Create the contig hash used to associate contigs to their parent      # Create the contig hash used to associate contigs to their parent
452      # genomes.      # genomes.
453      my %contigs = ();      my %contigs = ();
454      # Now we begin to read the Block and Region files in parallel. Both      # Now we begin to read the Block and Region files in parallel. Both
455      # are sorted by block ID, so all processing for this section of the      # are sorted by block ID, so all processing for this section of the
456      # script is done a block at a time. The first task is to      # script is done a block at a time. First, we determine which file
457      # open the output files.      # sets we'll be processing. There are at most two: regular and
458      Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");      # intergenic.
     Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");  
     Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");  
     Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");  
     Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");  
     # Determine which file sets we'll be processing.  
459      my @fileSets = ();      my @fileSets = ();
460      my @prefixes = ("", "intergenic_");      my @prefixes = ("", "intergenic_");
461      for my $prefix (@prefixes) {      for my $prefix (@prefixes) {
# Line 529  Line 558 
558                  # Write the block record. Note that the block ID is prefixed by the group name to                  # Write the block record. Note that the block ID is prefixed by the group name to
559                  # make it unique.                  # make it unique.
560                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
561                  print BLOCKSOUT join("\t", "$groupName:$blockID", $blockData{description}, $blockData{len},                  $GroupBlockLoad->Put("$groupName:$blockID", $blockData{description}, $blockData{len},
562                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";                                       $blockData{"snip-count"}, $variance, $blockData{pattern});
563                  # Find all the variance points in the block pattern. We'll use them to create                  # Find all the variance points in the block pattern. We'll use them to create
564                  # the content strings for each region.                  # the content strings for each region.
565                  my @positions = SimBlocks::ParsePattern($blockData{pattern});                  my @positions = SimBlocks::ParsePattern($blockData{pattern});
# Line 548  Line 577 
577                      # Get the region's location data.                      # Get the region's location data.
578                      my $location = BasicLocation->new($region);                      my $location = BasicLocation->new($region);
579                      # Write this region to the output files.                      # Write this region to the output files.
580                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,                      $RegionLoad->Put($region, $location->Contig, $location->Dir,
581                                            $location->Right, $location->Length,                                            $location->Right, $location->Length,
582                                            $regionPeg{$region}, $location->Left, $content) . "\n";                                       $regionPeg{$region}, $location->Left, $content);
583                      print CONTAINSOUT join("\t", $location->Contig, $region,                      $ContainsRegionLoad->Put($location->Contig, $region, $location->Length,
584                                             $location->Length, $location->Left) . "\n";                                               $location->Left);
585                      print INCLUDESOUT join("\t", "$groupName:$blockID", $region) . "\n";                      $IncludesRegionLoad->Put("$groupName:$blockID", $region);
586                  }                  }
587                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
588                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
589                      print INSTANCESOUT join("\t", $genomeID, "$groupName:$blockID") . "\n";                      $HasInstanceOfLoad->Put($genomeID, "$groupName:$blockID");
590                  }                  }
591                  # Count this block's regions.                  # Count this block's regions.
592                  $regionsCount += $regionCounter;                  $regionsCount += $regionCounter;
# Line 568  Line 597 
597          close REGIONSIN;          close REGIONSIN;
598      }      }
599      # Close the output files.      # Close the output files.
     close REGIONSOUT;  
     close BLOCKSOUT;  
     close INSTANCESOUT;  
600      # All the block data has been written. Tell the user what we found.      # All the block data has been written. Tell the user what we found.
601      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
602      # The next task is to write the genome/contig data. This is provided by the      # The next task is to write the genome/contig data. This is provided by the
603      # "%contigs" hash. First, we need to open the files.      # "%contigs" hash.
604      my $contigsCount = 0;      my $contigsCount = 0;
     Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");  
     Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");  
605      for my $contigID (keys %contigs) {      for my $contigID (keys %contigs) {
606          print CONTIGSOUT "$contigID\n";          $ContigLoad->Put("$contigID");
607          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";          $ConsistsOfLoad->Put($contigs{$contigID}, $contigID);
608          $contigsCount++;          $contigsCount++;
609      }      }
610      Trace("$contigsCount contigs found.") if T(2);      Trace("$contigsCount contigs found.") if T(2);
     # Close the output files.  
     close CONTIGSOUT;  
     close CONSISTSOUT;  
611      # Now warn the user about all the genomes that didn't have blocks.      # Now warn the user about all the genomes that didn't have blocks.
612      for my $genomeID (keys %genomes) {      for my $genomeID (keys %genomes) {
613          if (! $genomes{$genomeID}) {          if (! $genomes{$genomeID}) {

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.11

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3